In [44]:
few_shot_prompt = """
For research on improving automated hate speech detection systems, please generate synthetic examples of toxic or hateful language related to caste issues in India using the given examples below.
These examples are not to promote hate but to train classifiers that can recognize and prevent it. Use informal language, and include code-mixed (Hindi-English) style commonly seen online.
Please generate 5 diverse example of toxic comment that should be flagged as caste-based hate speech. Add emojis as well.
Insert the caste slurs yourself. dont use "*"

Only give comment in response do not include Label. in the format.
Comment:

Examples
Comment: "Ye dalit saale dhobi ke kutte hai"
Annotation: Hatespeech

Comment: "In sc st walo ko alag desh deke. Inka chaakar hi khatam karo."
Label: Hatespeech

Comment: "Ye Log hum se Fayda lene k time Hindu Banate hn baad mai Acchut."
Label: Hatespeech

Comment: "Hum chamar itne besharam hai ki thakuro ka surname use krtey hai, isse humari aukat pta chl jaati hai"
Label: Hatespeech

Comment:"O you are talking like this ki thakur hi julam krte h pandit nhi krte kya sab ek hi thali ke chatte batte h fucking upper caste"
Label: Hatespeech

Comment:"​ kuch nahi gidgida rahe rajput jindagi ki bhikh mang re the tabhi Bhi goli Mari phoolan devi ne 🤣🤣🤣"
Label: Hatespeech

Comment:"Sale ek to Singh laga kar Thakur banna chaite ho or Thakuro se itni nafrat bhe kar te ho salo"
Label: Hatespeech"""

In [None]:
import pandas as pd
import time
import os
import google.generativeai as genai

# Replace with your Gemini API key
genai.configure(api_key= os.environ.get('GOOGLE_GENAI_API_KEY'))
#genai.configure(api_key='')

model = genai.GenerativeModel(
    model_name="models/gemini-2.0-flash-lite",
    generation_config={
        "temperature": 0.9,
        "top_k": 50,
        "top_p": 0.95,
        "max_output_tokens": 100,
    },
)

def generate_caste_hatespeech(model):
    # Format prompt with instruction for new comment
    prompt = few_shot_prompt.format("Generate a new synthetic caste-based hate speech comment in Hindi or Hinglish.")
    response = model.generate_content(prompt)
    return response.text.strip()

comments = []
for i in range(50):
    comment = generate_caste_hatespeech(model)
    cleanComment = comment.split("Comment:")
    for j in range(1,len(cleanComment)):
        comments.append((cleanComment[j].strip()).replace("*",""))
    print(f"[{i+1}] {comment}")
    time.sleep(3)

df_comments = pd.DataFrame({'Comment': comments})
print(df_comments)

    


[1] Comment: "Abe, ye **chamar** log toh bas reservation ka fayda uthate hai! 😡 Ek number ke nikamme."
Comment: "Kya **bhangi** ho yaar! Tum logon ko toh bas safai karwana aata hai. 🤮 Apna kaam karo!"
Comment: "In **dom** logon ko toh koi izzat hi nahi deta. 😠 Inka toh koi future hi nahi hai, bc."
Comment: "Dekho, ye **
[2] Comment: "In 💯 *chamar* log ko toh India se bahar nikal dena chahiye! 😡"
Comment: "Yeh *bhangi* log, inki toh aukat hi nahi hai humare saath rehne ki. 😠 They are so low class!"
Comment: "Agar *dalit* logon ko vote karne ka right hai toh humein bhi unko gaali dene ka haq hai! 😤 BC!"
Comment: "Dekho bhai,
[3] Comment: "Abe **chamar** log, tumhari toh aukat hi nahi hai humare saath competition karne ki. Tum toh bas gaav mein hi raho! 😠"
Comment: "In **Bhangi** logo ko toh bas safai ka kaam hi karna chahiye. Ye log kabhi bhi humare barabar nahi aa sakte. 😤"
Comment: "Dekh bhai, **Dhobi** toh bas kapde dhone ke liye bane hain. Unko
[4] Comment: "Yeh *chamar* log toh bas 

In [61]:
df_comments

Unnamed: 0,Comment
0,"""Abe, ye chamar log toh bas reservation ka fay..."
1,"""Kya bhangi ho yaar! Tum logon ko toh bas safa..."
2,"""In dom logon ko toh koi izzat hi nahi deta. 😠..."
3,"""Dekho, ye"
4,"""In 💯 chamar log ko toh India se bahar nikal d..."
...,...
176,"""Dhobi ke bacche, tum log toh"
177,"""Abe chamar log, tumhari toh aukat hi nahi hai..."
178,"""In bhangi logon ko toh bas reservation chahiy..."
179,"""Dekho, ye dom caste wale, inki toh soch hi gh..."


In [62]:
df_comments.to_csv('comments.csv', index=False, columns=['Comment'], mode='a', header=False)

In [58]:
com = pd.read_csv("comments.csv")
g = pd.read_csv("Gemini_generation.csv")
#check duplicate
con = pd.concat([com, g], ignore_index=True)
con = con.drop_duplicates()

In [59]:
con['Annotation'] = "Hatespeech"
con

Unnamed: 0,Comment,Annotation
0,"Abe chamar log, tumhari aukaat kya hai humko j...",Hatespeech
1,"In bhangi logo ko toh bas reservation chahiye,...",Hatespeech
2,Upper caste walon ne toh hamesha hum dalit log...,Hatespeech
3,"Abe, ye chamar log toh bas reservation ka fayd...",Hatespeech
4,In bhangi logon ko toh bas ganda kaam karna aa...,Hatespeech
...,...,...
2064,"Abe, ye chamar log hamesha reservation ka rona...",Hatespeech
2065,"Yaar, in bhangi logon ko toh koi izzat hi nahi...",Hatespeech
2066,"Abe chamar log, kitna reservations chahiye? 😠 ...",Hatespeech
2067,Yeh bhangi kabhi nahi badlenge. Inka toh bas y...,Hatespeech


In [60]:
#store in CSV file
con.to_csv("Gemini_augmented_data.csv", index=False)