In [64]:
import pandas as pd
import os
# Build the dataset path relative to the parent of the current working
# directory, so the result depends on where the kernel was started.
curr_dir = os.getcwd()
parent_dir = os.path.dirname(curr_dir)
# Location of the CSV below parent_dir.
file_path = "dataset/sarcasm/archive (1)/train-balanced-sarcasm.csv"
final_path = os.path.join(parent_dir, file_path)


In [65]:
# Read the whole CSV, then keep only the rows whose label equals 1.
df = pd.read_csv(final_path)
sarcasm_only = df.loc[df["label"].eq(1)]
# Display one random row as a quick look at the columns.
sarcasm_only.sample()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
807933,1,He will throw them away because they are vulga...,jack55c2,india,1,1,0,2014-06,2014-06-28 04:29:09,"If I were Harshvardhan, I would encourage thes..."


In [66]:
# Narrow the frame to the reply text and the comment it answers.
sarcasm_v2 = sarcasm_only.loc[:, ["comment", "parent_comment"]]
sarcasm_v2.sample(n=5)

Unnamed: 0,comment,parent_comment
299498,Such a drop in their profits for them,"My response to ""You are ineligible to earn loo..."
259120,Is fun when you get on the internet and bitch ...,i don't think all of this nerd rage has anythi...
960500,"I disagree, I think OP needs a bigger frame.","Nice bike, but appears to be too big for you. ..."
997844,You got to love those zero tolerance policies.,The school did rightfully confiscate the inhal...
588428,I see no way in which this could possible go w...,"""U.S. Troops Begin Training 'Syrian Rebels' to..."


In [67]:
import re

# NOTE(review): this is a relative path, whereas final_path above is joined
# onto parent_dir — confirm which working directory csv_to_corpus expects.
INPUT_CSV = "dataset/sarcasm/archive (1)/train-balanced-sarcasm.csv"              # path to the SARC csv
OUTPUT_TXT = "dataset/sarcasm/sarcastic.txt"   # output corpus, one comment per line
# Written by csv_to_corpus after every comment line.
# NOTE(review): it is empty here, so that write is a no-op — confirm whether
# an end-of-text token was intended.
END_OF_TEXT = ""

def clean_comment(text: str) -> str:
    """Basic Reddit text cleaning for language modeling.

    Strips a trailing ``/s`` sarcasm marker, removes URLs, and collapses every
    run of whitespace into a single space.

    Args:
        text: Raw comment text. Non-string values (for example a NaN read by
            pandas) are treated as empty text.

    Returns:
        The cleaned comment without leading or trailing whitespace; an empty
        string when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = text.strip()

    # Remove explicit sarcasm marker
    text = re.sub(r"\s*/s\s*$", "", text, flags=re.IGNORECASE)

    # Remove URLs. The match must start on a word boundary and contain either
    # a scheme ("http://", "https://") or "www." so that ordinary words that
    # merely contain "www" or start with "http" (e.g. "awwww") are left alone.
    text = re.sub(r"\b(?:https?://|www\.)\S+", "", text, flags=re.IGNORECASE)

    # Remove excessive whitespace
    text = re.sub(r"\s+", " ", text)

    return text.strip()


def csv_to_corpus(csv_path: str, output_path: str):
    """Turn the sarcastic comments of a CSV into a plain-text corpus.

    Rows are kept when ``label`` equals 1 and ``comment`` is not missing. Each
    comment is passed through ``clean_comment``; results shorter than 20
    characters are dropped. Every kept comment is written on its own line,
    followed by ``END_OF_TEXT``. A summary line with the number of kept
    comments is printed once the file has been written.

    Args:
        csv_path: Path of the CSV to read.
        output_path: Path of the UTF-8 text file to (over)write.
    """
    frame = pd.read_csv(csv_path)

    # Sarcastic rows only, and of those only the ones that have comment text.
    frame = frame[frame["label"] == 1]
    comments = frame["comment"].dropna()

    with open(output_path, "w", encoding="utf-8") as out:
        kept = 0
        for cleaned in map(clean_comment, comments):
            # Anything under 20 characters after cleaning is not written.
            if len(cleaned) >= 20:
                out.write(cleaned + "\n")
                out.write(END_OF_TEXT)
                kept += 1

    print(f"Saved {kept} sarcastic comments to {output_path}")


In [68]:
# csv_to_corpus(INPUT_CSV, OUTPUT_TXT)

In [69]:
# Keep only the reply text. The index is intentionally NOT reset here: the
# cells below look a comment up by its original row label and filter on
# `index > 10007`, which only makes sense against the original labels.
# (A bare `reset_index(drop=True)` call returns a new frame and changes
# nothing unless its result is assigned.)
sarcasm_v3 = sarcasm_v2[["comment"]]
sarcasm_v3.shape

(505413, 1)

In [70]:
# Find the row label of one specific comment by exact text match.
sarcasm_v3[sarcasm_v3["comment"] == "You're disrespecting the greatness that is David Arquette"]

Unnamed: 0,comment
10007,You're disrespecting the greatness that is Dav...


In [71]:
# Drop every row whose label is at or below 10007 (the label of the comment
# looked up in the previous cell), then renumber the remainder from 0.
# reset_index returns a new frame, so its result has to be assigned to take
# effect; the row count is unchanged by the renumbering.
sarcasm_v4 = sarcasm_v3[sarcasm_v3.index > 10007].reset_index(drop=True)
sarcasm_v4.shape

(501696, 1)

In [72]:
# Keep comments strictly longer than 100 and strictly shorter than 250
# characters, i.e. lengths 101 through 249 inclusive.
sarcasm_v5 = sarcasm_v4[sarcasm_v4["comment"].str.len().between(101, 249)]
# Displayed with a fresh 0-based index; sarcasm_v5 itself keeps its labels.
sarcasm_v5.reset_index(drop=True)

Unnamed: 0,comment
0,Now I know why I had a blackout yesterday.... ...
1,"One driver, one crew chief, one spotter, the c..."
2,I too would like the double barrel to be purel...
3,it's where amaya puts there bitcoin miners so ...
4,Then it would confirm that Scott Cawthon is Ka...
...,...
54439,I'm so glad that I feel safe to practice my ow...
54440,"Yes, and there's no such thing as mental illne..."
54441,"Thank you, Glen Beck, Rush Limbaugh, Sean Hann..."
54442,The real question is why God hasn't killed Bar...


In [73]:
# Draw a reproducible 10,000-row sample (seed 42) and renumber it from 0.
sarcasm_v6 = sarcasm_v5.sample(n=10000, random_state=42).reset_index(drop=True)

In [74]:
# Inspect a single sampled comment: row position 1293, first column.
sarcasm_v6.iloc[1293, 0]

"I mean it's not like Jagr had an islander hanging of his stick during OT, Islanders too disciplined for that"

In [75]:
from sarvamai import SarvamAI
from dotenv import load_dotenv
import os 

# Run load_dotenv() before the key is read from the environment below.
load_dotenv()

# os.getenv returns None when SARVAM_API_KEY is not set.
SARVAM_API_KEY = os.getenv("SARVAM_API_KEY")

def generate_responses(comment):
    """Ask the Sarvam chat model for a neutral question that would prompt ``comment``.

    The few-shot system prompt below instructs the model to output only the
    question; ``comment`` is sent as the user message.

    Args:
        comment: The sarcastic reply to generate a question for.

    Returns:
        The text of the first choice in the response, with surrounding
        whitespace removed. An empty string is returned when the response
        carries no text.
    """

    prompt = """
        Given the sarcastic reply below, generate a realistic, neutral user question
        that would naturally cause this reply.

        Rules:
        - The question must be neutral (not sarcastic)
        - Do NOT repeat the reply
        - Do NOT mention sarcasm
        - Output ONLY the question


        ### EXAMPLE 1
            USER: Oh sure, because that worked perfectly last time.
            ASSISTANT : What do you think about this plan?

        ### EXAMPLE 2
            USER: Yeah, totally flawless execution as always.
            ASSISTANT : Does this seem like a good idea to you?

    """

    client = SarvamAI(api_subscription_key=SARVAM_API_KEY)

    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": comment},
    ]

    response = client.chat.completions(messages=messages)

    # Replies have been observed to start with a space, and the content field
    # may be empty; normalise both so callers always get a trimmed string.
    reply = response.choices[0].message.content
    return (reply or "").strip()


In [76]:
# Spot-check one example: print the generated question, then the sarcastic
# comment it was generated for.
print(generate_responses(sarcasm_v6.iloc[1293, 0]))
print(sarcasm_v6.iloc[1293, 0])

 What do you think about the team's performance in overtime?
I mean it's not like Jagr had an islander hanging of his stick during OT, Islanders too disciplined for that


In [92]:
import json
import time
def generate_instruction_jsonl(
    df,
    output_path=os.path.join(parent_dir, "dataset/sarcasm/sarcasm_instruction_pairs9.jsonl"),
    sleep_time=1
):
    """Generate a question for every comment in ``df`` and save the pairs as JSON Lines.

    For each value of the ``comment`` column, ``generate_responses`` is called
    to obtain a question. The question is whitespace-normalised, trimmed and
    given a trailing ``?`` if it lacks one, then written together with the
    stripped comment as one JSON object per line. Comments for which no
    question text comes back are skipped rather than stored as a bare ``?``.

    Args:
        df: DataFrame with a ``comment`` column of strings.
        output_path: File to (over)write with the JSONL records.
        sleep_time: Seconds to wait after each API call.
    """
    records_written = 0

    with open(output_path, "w", encoding="utf-8") as f:
        for comment in df["comment"]:
            question = generate_responses(comment)

            # Basic sanitation: collapse whitespace and trim both ends so the
            # stored instruction never starts or ends with a blank.
            question = re.sub(r"\s+", " ", question or "").strip()

            if question:
                if not question.endswith("?"):
                    question += "?"

                record = {
                    "instruction": question,
                    "response": comment.strip()
                }

                f.write(json.dumps(record, ensure_ascii=False) + "\n")
                # Push each record to the OS right away; the run is long and
                # API-bound, so buffered records should not be lost if the
                # kernel is killed.
                f.flush()
                records_written += 1

                print(f"[{records_written}] OK")
            else:
                # Nothing usable came back for this comment.
                print(f"[skip] empty question for comment: {comment[:60]!r}")

            time.sleep(sleep_time)

    print(f"\nSaved {records_written} instruction–response pairs to {output_path}")


In [93]:
# Generate pairs for a 1,000-row slice of the sample: row positions 7294
# through 8293 of sarcasm_v6.
generate_instruction_jsonl(sarcasm_v6.iloc[7294:7294+1000])

[1] OK
[2] OK
[3] OK
[4] OK
[5] OK
[6] OK
[7] OK
[8] OK
[9] OK
[10] OK
[11] OK
[12] OK
[13] OK
[14] OK
[15] OK
[16] OK
[17] OK
[18] OK
[19] OK
[20] OK
[21] OK
[22] OK
[23] OK
[24] OK
[25] OK
[26] OK
[27] OK
[28] OK
[29] OK
[30] OK
[31] OK
[32] OK
[33] OK
[34] OK
[35] OK
[36] OK
[37] OK
[38] OK
[39] OK
[40] OK
[41] OK
[42] OK
[43] OK
[44] OK
[45] OK
[46] OK
[47] OK
[48] OK
[49] OK
[50] OK
[51] OK
[52] OK
[53] OK
[54] OK
[55] OK
[56] OK
[57] OK
[58] OK
[59] OK
[60] OK
[61] OK
[62] OK
[63] OK
[64] OK
[65] OK
[66] OK
[67] OK
[68] OK
[69] OK
[70] OK
[71] OK
[72] OK
[73] OK
[74] OK
[75] OK
[76] OK
[77] OK
[78] OK
[79] OK
[80] OK
[81] OK
[82] OK
[83] OK
[84] OK
[85] OK
[86] OK
[87] OK
[88] OK
[89] OK
[90] OK
[91] OK
[92] OK
[93] OK
[94] OK
[95] OK
[96] OK
[97] OK
[98] OK
[99] OK
[100] OK
[101] OK
[102] OK
[103] OK
[104] OK
[105] OK
[106] OK
[107] OK
[108] OK
[109] OK
[110] OK
[111] OK
[112] OK
[113] OK
[114] OK
[115] OK
[116] OK
[117] OK
[118] OK
[119] OK
[120] OK
[121] OK
[122] OK
[123] OK
[