In [2]:
import pandas as pd
# Load the balanced SARC training CSV and keep only the rows labelled 1 (sarcastic).
df = pd.read_csv("dataset/sarcasm/archive (1)/train-balanced-sarcasm.csv")
sarcasm_only = df.loc[df["label"].eq(1)]
# Show one random sarcastic row as a quick sanity check.
sarcasm_only.sample(n=1)

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
215069,1,I don't care which console you play on Because...,arhanv,AskReddit,30,30,0,2016-03,2016-03-02 11:55:00,Video game/console wars/brand loyalty. It's fr...


In [3]:
# Narrow the frame to the sarcastic reply and the comment it answers.
sarcasm_v2 = sarcasm_only.loc[:, ["comment", "parent_comment"]]
sarcasm_v2.sample(n=5)

Unnamed: 0,comment,parent_comment
535826,Two houses is just greedy,"He means buy one house, get one house free."
183489,Can't wait for mines headshotting!,"Mine stuck to ceiling, literally unplayable"
921315,so troll XD,Stuck this on my dad's car before he left on h...
873885,"The curl rack, some asshole with a funny looki...",What piece of gym equipment do you find annoyi...
417879,"Oh no, consoles get a crappy $1 CD, a sheet of...",Farcry Primals pre-order bonuses


In [4]:
import re

# Source CSV with the labelled comments; intended as the csv_path argument of csv_to_corpus.
INPUT_CSV = "dataset/sarcasm/archive (1)/train-balanced-sarcasm.csv"              # path to your SARC csv
# Destination text file; intended as the output_path argument of csv_to_corpus.
OUTPUT_TXT = "dataset/sarcasm/sarcastic.txt"   # output corpus
# Written by csv_to_corpus after every corpus line.
# NOTE(review): this is an empty string, so no separator is actually emitted —
# confirm whether an end-of-text token was meant to be here.
END_OF_TEXT = ""

def clean_comment(text: str) -> str:
    """Normalize one Reddit comment for use as language-model training text.

    Steps, in order:
      1. Drop URLs (``http://`` / ``https://`` links and ``www.`` hosts).
      2. Collapse every run of whitespace into a single space.
      3. Remove a trailing ``/s`` sarcasm marker (case-insensitive).

    Args:
        text: Raw comment text.

    Returns:
        The cleaned comment with no leading or trailing whitespace.
    """
    # URLs go first: in a comment such as "nice one /s http://x.y" the "/s"
    # marker only ends up at the end of the string once the link is gone.
    # Anchoring on "://" and "www." keeps ordinary words that merely contain
    # "http" or "www" (for example "awwww") intact.
    text = re.sub(r"\bhttps?://\S+|\bwww\.\S+", "", text)

    # Collapse whitespace (including the gaps left by removed URLs).
    text = re.sub(r"\s+", " ", text).strip()

    # Remove explicit sarcasm marker
    text = re.sub(r"\s*/s\s*$", "", text, flags=re.IGNORECASE)

    return text.strip()


def csv_to_corpus(csv_path: str, output_path: str, min_length: int = 20) -> None:
    """Write the cleaned sarcastic comments of a labelled CSV to a text corpus.

    The CSV must provide a ``label`` column (1 marks a sarcastic row) and a
    ``comment`` column. Every sarcastic, non-missing comment is passed through
    ``clean_comment``; results shorter than ``min_length`` characters are
    dropped. Each kept comment is written on its own line, followed by
    ``END_OF_TEXT``.

    Args:
        csv_path: Path of the CSV file to read.
        output_path: Path of the text file to create (overwritten if present).
        min_length: Minimum length, in characters, a cleaned comment needs in
            order to be kept. Defaults to 20.

    Returns:
        None. A one-line summary with the number of kept comments is printed.
    """
    df = pd.read_csv(csv_path)

    # Sarcastic rows only, and only those that actually carry comment text.
    is_usable = (df["label"] == 1) & df["comment"].notna()
    comments = df.loc[is_usable, "comment"]

    kept = 0
    with open(output_path, "w", encoding="utf-8") as f:
        for comment in comments:
            cleaned = clean_comment(comment)

            # Skip very short or empty comments
            if len(cleaned) < min_length:
                continue

            f.write(cleaned + "\n")
            f.write(END_OF_TEXT)
            kept += 1

    print(f"Saved {kept} sarcastic comments to {output_path}")


In [7]:
# Uncomment to (re)build the plain-text corpus from the CSV; this overwrites OUTPUT_TXT.
# csv_to_corpus(INPUT_CSV, OUTPUT_TXT)

In [14]:
# Keep only the sarcastic reply text.
sarcasm_v3 = sarcasm_v2[["comment"]]
# The row labels inherited from the CSV are kept on purpose: later cells select
# rows by label (sarcasm_v3.index > 10007). A bare reset_index(drop=True) call
# would only return a renumbered copy and leave sarcasm_v3 untouched.
sarcasm_v3.shape

(505413, 1)

In [None]:
# Find the row label of one specific comment by exact text match.
sarcasm_v3.loc[sarcasm_v3["comment"].eq("You're disrespecting the greatness that is David Arquette")]

Unnamed: 0,comment
10007,You're disrespecting the greatness that is Dav...


In [17]:
# Keep only rows whose CSV row label is greater than 10007, the label of the
# "David Arquette" comment looked up in the previous cell.
# Labels are not renumbered here: a bare reset_index(drop=True) call would only
# return a copy and leave sarcasm_v4 untouched.
sarcasm_v4 = sarcasm_v3[sarcasm_v3.index > 10007]
sarcasm_v4.shape

(501696, 1)

In [32]:
# Keep comments strictly longer than 120 and strictly shorter than 200 characters
# (i.e. 121..199 inclusive, since lengths are whole numbers).
sarcasm_v5 = sarcasm_v4.loc[
    lambda frame: frame["comment"].str.len().between(121, 199)
]
# Display only: shows a renumbered copy, sarcasm_v5 itself keeps its labels.
sarcasm_v5.reset_index(drop=True)

Unnamed: 0,comment
0,Now I know why I had a blackout yesterday.... ...
1,it's where amaya puts there bitcoin miners so ...
2,Then it would confirm that Scott Cawthon is Ka...
3,I'm sorry you're too retarded to realize that ...
4,"Yes, clearly people doing illegal chemicals wh..."
...,...
26927,Why don't we instead simply mandate all busine...
26928,"Well, since *sensible* people know that false-..."
26929,"Yes, and there's no such thing as mental illne..."
26930,"Thank you, Glen Beck, Rush Limbaugh, Sean Hann..."


In [43]:
# Draw 1000 comments at random; random_state=42 pins the draw so reruns pick the same rows.
sarcasm_v6 = sarcasm_v5.sample(n=1000, random_state=42)
# Display only: the result is not assigned, so sarcasm_v6 keeps its original row labels.
sarcasm_v6.reset_index(drop=True)

Unnamed: 0,comment
0,Much like homosexuality it was invented in the...
1,no but you're just ignorant you dont understan...
2,"Yes, every single person's privacy compromised..."
3,They were worried about the kids seeing gay pe...
4,I'm so glad that companies throughout history ...
...,...
995,Hey he did communicate and told us how much fu...
996,No you must cut the deficit and reduce investm...
997,Agreed...Its bullshit I have to leave my gun i...
998,"i'm, sorry but this is the internet and here y..."


In [45]:
# Show the untruncated text of one sampled comment (row position 998, first column).
sarcasm_v6.iloc[998, 0]

'i\'m, sorry but this is the internet and here you\'re not allowed to just change your opinion based on nothing but "new information"'

In [66]:
from sarvamai import SarvamAI
from dotenv import load_dotenv
import os 

# load_dotenv() runs before the key is read so that a value supplied through a
# local .env file can be picked up — the key itself is never written in the notebook.
load_dotenv()

# os.getenv returns None when SARVAM_API_KEY is not set in the environment.
SARVAM_API_KEY = os.getenv("SARVAM_API_KEY")

def generate_responses(comment):
    """Ask the Sarvam chat model for a neutral question that could precede ``comment``.

    ``comment`` is sent as the user message together with a fixed system
    prompt (rules plus two worked examples). The content of the first choice
    in the response is returned as-is.
    """
    system_prompt = """
        Given the sarcastic reply below, generate a realistic, neutral user question
        that would naturally cause this reply.

        Rules:
        - The question must be neutral (not sarcastic)
        - Do NOT repeat the reply
        - Do NOT mention sarcasm
        - Output ONLY the question


        ### EXAMPLE 1
            USER: Oh sure, because that worked perfectly last time.
            ASSISTANT : What do you think about this plan?

        ### EXAMPLE 2
            USER: Yeah, totally flawless execution as always.
            ASSISTANT : Does this seem like a good idea to you?

    """

    # A client is built per call, authenticated with the key read from the environment.
    sarvam = SarvamAI(api_subscription_key=SARVAM_API_KEY)

    completion = sarvam.chat.completions(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": comment},
        ]
    )

    return completion.choices[0].message.content





In [67]:
# Smoke-test the generator on one sampled comment and show both halves of the pair.
sample_comment = sarcasm_v6.iloc[997, 0]
print(sample_comment)
# Print the generated question as well: calling generate_responses without using
# its return value spends an API request and displays nothing.
print(generate_responses(sample_comment))

Agreed...Its bullshit I have to leave my gun in my car just because of that...Good thing I live in Charlotte and can go to SC for that...


In [68]:
import json
import time
def _normalize_question(raw):
    """Collapse whitespace in a model reply and make sure a non-empty one ends with '?'."""
    # `raw or ""` tolerates a missing (None) reply instead of failing inside re.sub.
    question = re.sub(r"\s+", " ", raw or "").strip()
    # Strip before the check: a reply like "Why? " must not become "Why? ?".
    if question and not question.endswith("?"):
        question += "?"
    return question


def generate_instruction_jsonl(
    df,
    output_path="dataset/sarcasm/sarcasm_instruction_pairs.jsonl",
    sleep_time=10
):
    """Build a JSONL file of instruction/response pairs from sarcastic comments.

    For every value in ``df["comment"]`` a question is requested from
    ``generate_responses``; the question becomes ``"instruction"`` and the
    stripped comment becomes ``"response"``. One JSON object is written per line.

    Args:
        df: DataFrame with a ``comment`` column of strings.
        output_path: File to create (overwritten if it already exists).
        sleep_time: Seconds to wait between two consecutive requests.

    Returns:
        None. Progress and a final summary are printed.

    Comments for which no usable question comes back (``None`` or only
    whitespace) are reported and skipped rather than written as a bare "?".
    """
    records_written = 0

    with open(output_path, "w", encoding="utf-8") as f:
        for position, comment in enumerate(df["comment"]):
            # Throttle only *between* requests: no wait before the first one
            # and none after the last.
            if position:
                time.sleep(sleep_time)

            print(comment)

            question = _normalize_question(generate_responses(comment))
            if not question:
                print("[skipped] no question returned")
                continue

            record = {
                "instruction": question,
                "response": comment.strip()
            }

            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            records_written += 1

            print(f"[{records_written}] OK")

    print(f"\nSaved {records_written} instruction–response pairs to {output_path}")


In [None]:
# Long-running: one API request per row of sarcasm_v6 (sampled with n=1000) and a
# default 10-second pause between rows — close to three hours of waiting alone.
# The output file is opened in "w" mode, so rerunning this cell starts it from scratch.
generate_instruction_jsonl(sarcasm_v6)