In [1]:
import pandas as pd
import os
curr_dir = os.getcwd()
parent_dir = os.path.dirname(curr_dir)
file_path = "dataset/sarcasm/archive (1)/train-balanced-sarcasm.csv"
final_path = os.path.join(parent_dir, file_path)


In [2]:
df = pd.read_csv(final_path)
sarcasm_only = df[df["label"] == 1]
sarcasm_only.sample()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
740580,1,"No, drivestyle.",Alm1ghtyy,hardstyle,3,3,0,2015-06,2015-06-03 09:35:16,"The drive in this track is unreal, but should ..."


In [3]:
sarcasm_v2 = sarcasm_only[["comment", "parent_comment"]]
sarcasm_v2.sample(5)

Unnamed: 0,comment,parent_comment
382199,"Oh God, don't say it's a taxi driver, they'll ...",Could the Croydon cat ripper be a taxi driver?...
15687,Years?,5
916832,"I don't actually care about the statistics, I ...",I would also be interested in this too.
453216,Well it *iisss* water vapor bruh!,"Vaping feels too wet? Hey Everyone, I recently..."
607211,As if America would ever fund terrorists,Terrorists who were largely funded by donation...


In [4]:
import re

INPUT_CSV = "dataset/sarcasm/archive (1)/train-balanced-sarcasm.csv"              # path to your SARC csv
OUTPUT_TXT = "dataset/sarcasm/sarcastic.txt"   # output corpus
END_OF_TEXT = ""

def clean_comment(text: str) -> str:
    """Basic Reddit text cleaning for language modeling."""
    text = text.strip()

    # Remove explicit sarcasm marker
    text = re.sub(r"\s*/s\s*$", "", text, flags=re.IGNORECASE)

    # Remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)

    # Remove excessive whitespace
    text = re.sub(r"\s+", " ", text)

    return text.strip()


def csv_to_corpus(csv_path: str, output_path: str):
    df = pd.read_csv(csv_path)

    # Keep only sarcastic comments
    df = df[df["label"] == 1]

    # Drop missing comments
    df = df[df["comment"].notna()]

    with open(output_path, "w", encoding="utf-8") as f:
        kept = 0
        for comment in df["comment"]:
            cleaned = clean_comment(comment)

            # Skip very short or empty comments
            if len(cleaned) < 20:
                continue

            f.write(cleaned + "\n")
            f.write(END_OF_TEXT)

            kept += 1

    print(f"Saved {kept} sarcastic comments to {output_path}")


In [5]:
# csv_to_corpus(INPUT_CSV, OUTPUT_TXT)

In [6]:
sarcasm_v3 = sarcasm_v2[["comment"]]
sarcasm_v3.reset_index(drop=True)
sarcasm_v3.shape

(505413, 1)

In [7]:
sarcasm_v3[sarcasm_v3["comment"] == "You're disrespecting the greatness that is David Arquette"]

Unnamed: 0,comment
10007,You're disrespecting the greatness that is Dav...


In [8]:
sarcasm_v4 = sarcasm_v3[sarcasm_v3.index > 10007]
sarcasm_v4.reset_index(drop=True)
sarcasm_v4.shape

(501696, 1)

In [9]:
sarcasm_v5 = sarcasm_v4[(sarcasm_v4["comment"].str.len() > 100) & (sarcasm_v4["comment"].str.len() < 250)]
sarcasm_v5.reset_index(drop=True)

Unnamed: 0,comment
0,Now I know why I had a blackout yesterday.... ...
1,"One driver, one crew chief, one spotter, the c..."
2,I too would like the double barrel to be purel...
3,it's where amaya puts there bitcoin miners so ...
4,Then it would confirm that Scott Cawthon is Ka...
...,...
54439,I'm so glad that I feel safe to practice my ow...
54440,"Yes, and there's no such thing as mental illne..."
54441,"Thank you, Glen Beck, Rush Limbaugh, Sean Hann..."
54442,The real question is why God hasn't killed Bar...


In [10]:
sarcasm_v6 = sarcasm_v5.sample(n=10000, random_state=42)
sarcasm_v6.reset_index(drop=True)

Unnamed: 0,comment
0,"Right, because Austin has trouble hosting huge..."
1,In advanced economies such as Sweden and Japan...
2,I honestly think McDormett has the potential t...
3,But people only don't watch women's basketball...
4,public interface SomeService { void provideSer...
...,...
9995,I would feel a lot better if every other game ...
9996,"""Please do not purchase this game unless you w..."
9997,And how lucky he is that of the tens of thousa...
9998,"Yes, because apparently that just changes and ..."


In [11]:
sarcasm_v6.iloc[998, 0]

"It doesn't really matter who they pick up imo, they could pick up Doloshi and they would still improve."

In [12]:
from sarvamai import SarvamAI
from dotenv import load_dotenv
import os 

load_dotenv()

SARVAM_API_KEY = os.getenv("SARVAM_API_KEY")

def generate_responses(comment):

    prompt = """
        Given the sarcastic reply below, generate a realistic, neutral user question
        that would naturally cause this reply.

        Rules:
        - The question must be neutral (not sarcastic)
        - Do NOT repeat the reply
        - Do NOT mention sarcasm
        - Output ONLY the question


        ### EXAMPLE 1
            USER: Oh sure, because that worked perfectly last time.
            ASSISTANT : What do you think about this plan?

        ### EXAMPLE 2
            USER: Yeah, totally flawless execution as always.
            ASSISTANT : Does this seem like a good idea to you?

    """

    client = SarvamAI(api_subscription_key=SARVAM_API_KEY)

    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": comment},
    ]



    response = client.chat.completions(
        messages=messages
        )
    
    reply = response.choices[0].message.content
    return reply
    # print(reply)


In [13]:
generate_responses(sarcasm_v6.iloc[997, 0])
print(sarcasm_v6.iloc[997, 0])

The QR Code got added in Anniversary Update, because QR codes are much quicker to search with than a readable error code!


In [22]:
import json
import time
def generate_instruction_jsonl(
    df,
    output_path=os.path.join(parent_dir, "dataset/sarcasm/sarcasm_instruction_pairs3.jsonl"),
    sleep_time=1
):
    records_written = 0

    with open(output_path, "w", encoding="utf-8") as f:
        for idx, row in df.iterrows():
            comment = row["comment"]
            print(comment)
            
            question = generate_responses(comment)

            # Basic sanitation
            question = re.sub(r"\s+", " ", question)
            if not question.endswith("?"):
                question += "?"

            record = {
                "instruction": question,
                "response": comment.strip()
            }

            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            records_written += 1

            print(f"[{records_written}] OK")
            time.sleep(sleep_time)



    print(f"\nSaved {records_written} instruction–response pairs to {output_path}")


In [None]:
generate_instruction_jsonl(sarcasm_v6.iloc[1294:])

Right, because Austin has trouble hosting huge events that draw in tons more money and people than a single day football event.
[1] OK
In advanced economies such as Sweden and Japan, 3yo kids learn how to solve 2 unknown equations already
[2] OK
I honestly think McDormett has the potential to be the white Larry Bird if taught by the right people.
[3] OK
But people only don't watch women's basketball because they want to oppress women not because it sucks.
[4] OK
public interface SomeService { void provideService() throws CheckedExceptionOfEveryPossibleImplementationType } neat abstraction
[5] OK
no of course not we are all robots and nothing matters, social and cultural customs are all petty and trivial
[6] OK
Yeah because making something slightly more visible after you've already spotted them is totally going to make a difference
[7] OK
Yeah, but the testimony of a police officer is always the truth, the whole truth, and nothing but the truth.
[8] OK
Should probably trade Rask while 

ConnectError: [Errno -3] Temporary failure in name resolution