In [1]:
import pandas as pd
import numpy as np
from google import genai
from tqdm import tqdm
import time
from google.genai import types
import json
import re
import os
from pathlib import Path


In [2]:
# Project root: two directory levels above the notebook's current working directory.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), "..", "..")
)

# Seed-data CSV, kept as a Path object (the input CSV path below is a plain string).
seed_file_path = Path(project_root) / 'Data Annotation/Human Annotation/Sample_annotation.csv'

# Input CSV of comments (presumably the rows that are still unannotated -- verify).
file_path = os.path.join(project_root, 'Data Annotation/Data Cleaning/remaining_file.csv')
data = pd.read_csv(file_path, encoding='utf-8')

In [3]:
# Display the loaded frame as the cell output to check its columns before annotating.
data

Unnamed: 0,Comment
0,90% joote case hai .....sab peso ke liye lagat...
1,Suicide kar diya isne?
2,To kar lo na sucide
3,Tu bhai abhi billi bana h act ki wajah se kya ...
4,मरने से कुछ नहीं होता भाई जेल जाने से पहले उनक...
...,...
4980,​​वाह भाई। क्या जवाब दिया है आपने। 😂👏
4981,Bra man vaadi Ager nai maanee to aane waale sa...
4982,​😂😂😂😂😂 अच्छा maza lete ho... desh ka moolnivas...
4983,​ Or apni bahan beti ki dete the. Ye kyu bhul ...


In [None]:
# Gemini API client. `genai.Client` only accepts keyword arguments, so the key
# must be passed as `api_key=`; a positional argument raises TypeError.
# The key is read from the environment so that it is never stored in the notebook.
client = genai.Client(api_key=os.environ.get('GOOGLE_GENAI_API_KEY'))

In [7]:
def create_annot_prompt(sent: str) -> str:
    """Build the hate-speech annotation prompt for a single comment.

    Args:
        sent: The comment to annotate. Non-string values are converted with
            ``str()`` before being embedded.

    Returns:
        The full prompt text. It asks the model to answer with a one-element
        JSON array of the form ``[{"Sentence": ..., "Annotation": ...}]`` where
        the annotation is either ``Hatespeech`` or ``Not Hatespeech``.
    """
    # Serialise the comment as a JSON string: this adds the surrounding double
    # quotes and escapes embedded quotes, backslashes and newlines, so a comment
    # containing those characters can no longer break out of its quotes.
    # ensure_ascii=False keeps Devanagari text and emoji readable. For a plain
    # sentence the result is identical to wrapping it in double quotes.
    quoted_sent = json.dumps(str(sent), ensure_ascii=False)

    # NOTE(review): the instruction text below contains typos ("Hapespeech",
    # "contenxt", ...). It is deliberately left unchanged here so that labels stay
    # comparable with any rows annotated earlier -- confirm before rewording.
    return f"""
You are an multi indic language expert that understands nuances of Indian socio-politi-cultural contenxt.
You're tasked with annotating whether a sentence is hatespeech or not.
Hapespeech that include abusive, disrespectful, discriminatory language, as well as ridiculing and mockery, and content that aims to delegitimize certain individuals or groups.
Informative sentences, even if they discuss sensitive topics like discrimination, will not be classified as hate speech unless they contain the elements mentioned above.

Respond ONLY with a **valid JSON array** with **exactly** this shape:
[
  {{"Sentence": "...", "Annotation": "..."}}
]
The sentence is: {quoted_sent}
The possible annotations are: Hatespeech, Not Hatespeech.

NOTE: do not generate any other text like ``` json ```

Response:
"""

In [8]:
def _clean_json_block(raw: str) -> str:
    """
    Remove ```json ... ``` or ``` ... ``` fences and trim.
    If still malformed, fallback to the slice from the first '[' to last ']'.
    """
    txt = raw.strip()

    # 1️⃣ strip opening / closing triple backticks (optionally with 'json')
    if txt.startswith("```"):
        txt = re.sub(r"^```(?:json)?\s*|\s*```$", "", txt, flags=re.I | re.S).strip()

    # 2️⃣ if it STILL isn't valid JSON, carve out the first […] block
    if not (txt.startswith("[") and txt.endswith("]")):
        m = re.search(r"\[[\s\S]*\]", txt)   # DOTALL
        if m:
            txt = m.group(0).strip()

    return txt


In [9]:
# Start every row as unannotated; rows whose request fails keep this placeholder.
data['Annotation'] = 'N/A'

# The seed CSV is the same for every request, so read it from disk once
# instead of once per comment.
seed_bytes = seed_file_path.read_bytes()

# Pause (in seconds) between consecutive requests to avoid rate limiting.
REQUEST_DELAY_S = 1.5

for i in tqdm(range(len(data))):
    sent = data.at[i, 'Comment']
    prompt = create_annot_prompt(sent)  # Create the annotation prompt for the comment

    try:
        # Send the seed CSV as an attachment together with the prompt.
        response = client.models.generate_content(
            model="gemini-2.0-flash-lite",
            contents=[
                types.Part.from_bytes(
                    data=seed_bytes,
                    mime_type='text/csv',
                ),
                prompt,
            ],
            config=types.GenerateContentConfig(
                temperature=0.0
            ),
        )
        # `response.text` may be None when the model returns no text
        # (NOTE(review): e.g. a blocked response -- confirm against the SDK docs);
        # fail with a clear message instead of an AttributeError on None.
        if response.text is None:
            raise ValueError("model returned no text")

        response_text = _clean_json_block(response.text)
        response_json = json.loads(response_text)
        # The prompt asks for a one-element array, but tolerate a bare object too.
        record = response_json[0] if isinstance(response_json, list) else response_json
        data.at[i, 'Annotation'] = record['Annotation']
    except Exception as e:
        # Best effort: report the failure and continue; the row stays 'N/A'.
        print(f"Error processing sentence {i}: {sent}")
        print(f"Exception: {e}")

    # Throttle after every attempt, including failed ones, so that an error such
    # as a rate-limit response is not followed immediately by the next request.
    time.sleep(REQUEST_DELAY_S)

  0%|          | 0/4985 [00:00<?, ?it/s]

  0%|          | 2/4985 [00:05<4:07:05,  2.98s/it]


KeyboardInterrupt: 

In [12]:
# Write the frame, including the 'Annotation' column, to a UTF-8 CSV without the index.
output_file = os.path.join(
    project_root,
    "Data Exploration/final_annotated_data_1.csv",
)
data.to_csv(output_file, encoding='utf-8', index=False)