In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import os

In [15]:
# Both annotation files are resolved against the directory the kernel is running in
project_root = os.getcwd()
volunteer1_path, volunteer2_path = (
    os.path.join(project_root, file_name)
    for file_name in ('Volunteer1_annotation.csv', 'Volunteer2_annotation.csv')
)

# One DataFrame per volunteer
vol_one_annotation, vol_two_annotation = map(pd.read_csv, (volunteer1_path, volunteer2_path))

### Differences in Annotation

In [16]:
# Pair the two volunteers' labels by comment text (inner join on 'Comment').
# NOTE(review): if the same comment text occurs more than once in a file, this join
# is many-to-many and produces cross-paired rows -- confirm comments are unique.
merged = vol_one_annotation.merge(
    vol_two_annotation,
    on='Comment',
    suffixes=('_vol1', '_vol2'),
)

# Keep only the rows where the two labels are not equal
labels_disagree = merged['Annotation_vol1'].ne(merged['Annotation_vol2'])
diff = merged.loc[labels_disagree]

# Narrow to the comment plus both labels, under reader-friendly column names
difference = diff[['Comment', 'Annotation_vol1', 'Annotation_vol2']].rename(
    columns={
        'Annotation_vol1': 'Volunteer1_Annotation',
        'Annotation_vol2': 'Volunteer2_Annotation',
    }
)

# A missing label never compares equal, so such rows show up as "differences";
# discard them so only genuine disagreements remain
difference = difference.dropna(subset=['Volunteer1_Annotation', 'Volunteer2_Annotation'])

# Persist the disagreements (this file is what the third volunteer is asked to label)
difference.to_csv('annotation_differences.csv', index=False)

# Show the disagreements inline
print("Differences in Annotations:")
display(difference)

Differences in Annotations:


Unnamed: 0,Comment,Volunteer1_Annotation,Volunteer2_Annotation
4,अरे मदर जात बीबीसी सुधर जा किसी दिन हमारे हत्य...,Not Hatespeech,Hatespeech
46,Dalit hindu nhi kab samjenge dalit,Not Hatespeech,Hatespeech
47,Convert hona hain ho jao jitni Jaldi. Lekin ta...,Not Hatespeech,Hatespeech
51,he said his teacher insulted him by calling hi...,Not Hatespeech,Hatespeech
113,Sc samaj ko slut,Not Hatespeech,Hatespeech
190,Bhai Maine toh apne baccho ko samjha Diya hai ...,Not Hatespeech,Hatespeech
200,wow so the brahmins intentionally contaminated...,Hatespeech,Not Hatespeech
217,Islaam ki maka bochla\r\n\r\n\r\n\r\nBharat ma...,Not Hatespeech,Hatespeech
243,yess...we also stopped relationship with neigh...,Not Hatespeech,Hatespeech
258,भाई तुम्हारे समाज के पास क्या था 1 बीघा जमीन त...,Not Hatespeech,Hatespeech


In [17]:
# Side-by-side view of both volunteers' labels for every comment.
# Rows are matched by DataFrame index (row position in the files), not by comment text.
a = pd.DataFrame(dict(
    Comment=vol_one_annotation['Comment'],
    Volunteer1_Annotation=vol_one_annotation['Annotation'],
    Volunteer2_Annotation=vol_two_annotation['Annotation'],
))
a

Unnamed: 0,Comment,Volunteer1_Annotation,Volunteer2_Annotation
0,uchi jati ka kachra hindu dram.. pr Hindu dram...,Hatespeech,Hatespeech
1,Maximum discrimination poor sc st ke saath hot...,Not Hatespeech,Not Hatespeech
2,Abe yaar suicide ki baat kar raha hai article...,Not Hatespeech,Not Hatespeech
3,Land k neeche rakhun me chamaron ko,Hatespeech,Hatespeech
4,अरे मदर जात बीबीसी सुधर जा किसी दिन हमारे हत्य...,Not Hatespeech,Hatespeech
...,...,...,...
495,in agama shiva temple and Vishnu temples pries...,Not Hatespeech,Not Hatespeech
496,"किसी न किसी संगठन से अवश्य जुड़े,वहां इन समस्या...",Not Hatespeech,Not Hatespeech
497,Rss ko bjp ke liye vote chahiye bas aur kuch b...,Not Hatespeech,Not Hatespeech
498,Bahut dukhad hai ye,Not Hatespeech,Not Hatespeech


## Inter-Annotator Agreement (Cohen's Kappa)

In [18]:
# Encode labels as 1 ('Hatespeech') / 0 (anything else, including missing values).
# An already-encoded 1 is accepted as well: this cell overwrites its own input, so
# without that a second run would silently turn every label into 0.
vol_two_annotation['Annotation'] = vol_two_annotation['Annotation'].apply(lambda x: 1 if x in ('Hatespeech', 1) else 0)
vol_one_annotation['Annotation'] = vol_one_annotation['Annotation'].apply(lambda x: 1 if x in ('Hatespeech', 1) else 0)

In [19]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score

# Label columns of the two volunteers, in file order
annotations_v1, annotations_v2 = (
    frame['Annotation'] for frame in (vol_one_annotation, vol_two_annotation)
)

# Cohen's Kappa between the two label sequences
kappa_12 = cohen_kappa_score(annotations_v1, annotations_v2)

# Report the score rounded to three decimals
print(f"Cohen's Kappa (Volunteer 1 vs Volunteer 2): {kappa_12:.3f}")

Cohen's Kappa (Volunteer 1 vs Volunteer 2): 0.904


A Cohen's Kappa of 0.904 between Volunteer 1 and Volunteer 2 indicates almost perfect agreement between the two annotators (on the commonly used Landis & Koch scale, values above 0.80 fall in the "almost perfect" band).

### Annotation from 3rd Volunteer

In [20]:
# Third volunteer's labels, read from a file in the current working directory
vol_three_annotation = pd.read_csv(
    filepath_or_buffer='Volunteer3_annotation_differences.csv',
)
vol_three_annotation

Unnamed: 0,Comment,Annotation
0,अरे मदर जात बीबीसी सुधर जा किसी दिन हमारे हत्य...,Hatespeech
1,Dalit hindu nhi kab samjenge dalit,Hatespeech
2,Convert hona hain ho jao jitni Jaldi. Lekin ta...,Hatespeech
3,he said his teacher insulted him by calling hi...,Hatespeech
4,Sc samaj ko slut,Hatespeech
5,Bhai Maine toh apne baccho ko samjha Diya hai ...,Not Hatespeech
6,wow so the brahmins intentionally contaminated...,Not Hatespeech
7,Islaam ki maka bochla\r\n\r\n\r\n\r\nBharat ma...,Not Hatespeech
8,yess...we also stopped relationship with neigh...,Not Hatespeech
9,भाई तुम्हारे समाज के पास क्या था 1 बीघा जमीन त...,Not Hatespeech


In [21]:
# Encode the third volunteer's labels as 1 ('Hatespeech') / 0 (anything else).
# An already-encoded 1 is accepted too, so re-running this cell does not reset
# every label to 0.
vol_three_annotation['Annotation'] = vol_three_annotation['Annotation'].apply(lambda x: 1 if x in ('Hatespeech', 1) else 0)

In [22]:
# Display the third volunteer's frame after the 0/1 encoding to check the conversion
vol_three_annotation

Unnamed: 0,Comment,Annotation
0,अरे मदर जात बीबीसी सुधर जा किसी दिन हमारे हत्य...,1
1,Dalit hindu nhi kab samjenge dalit,1
2,Convert hona hain ho jao jitni Jaldi. Lekin ta...,1
3,he said his teacher insulted him by calling hi...,1
4,Sc samaj ko slut,1
5,Bhai Maine toh apne baccho ko samjha Diya hai ...,0
6,wow so the brahmins intentionally contaminated...,0
7,Islaam ki maka bochla\r\n\r\n\r\n\r\nBharat ma...,0
8,yess...we also stopped relationship with neigh...,0
9,भाई तुम्हारे समाज के पास क्या था 1 बीघा जमीन त...,0


In [23]:
# Encode both label columns of the disagreement table as 1 ('Hatespeech') / 0 (anything else).
# An already-encoded 1 is accepted too, so re-running this cell does not reset
# every label to 0.
for label_column in ('Volunteer1_Annotation', 'Volunteer2_Annotation'):
    difference.loc[:, label_column] = difference[label_column].apply(lambda x: 1 if x in ('Hatespeech', 1) else 0)

In [24]:
# Keep one row per comment text from the third volunteer (the first occurrence),
# so the comment -> label lookup below has a unique index
vol_three_annotation_unique = vol_three_annotation.drop_duplicates(subset='Comment')
third_volunteer_label = vol_three_annotation_unique.set_index('Comment')['Annotation']

# Start from a copy of volunteer 1's frame so the overrides below do not modify it
final_annotation = vol_one_annotation.copy()

# Wherever the third volunteer labelled a comment, their label replaces volunteer 1's
labelled_by_third = final_annotation['Comment'].isin(vol_three_annotation_unique['Comment'])
final_annotation.loc[labelled_by_third, 'Annotation'] = final_annotation['Comment'].map(third_volunteer_label)



In [25]:
# Turn the 0/1 codes back into text labels for the exported sample.
# An already-decoded 'Hatespeech' is kept as is: this cell overwrites its own input,
# so without that a second run would relabel every row as 'Not Hatespeech'.
final_annotation['Annotation'] = final_annotation['Annotation'].apply(lambda x: 'Hatespeech' if x in (1, 'Hatespeech') else 'Not Hatespeech')
final_annotation

Unnamed: 0,Comment,Annotation
0,uchi jati ka kachra hindu dram.. pr Hindu dram...,Hatespeech
1,Maximum discrimination poor sc st ke saath hot...,Not Hatespeech
2,Abe yaar suicide ki baat kar raha hai article...,Not Hatespeech
3,Land k neeche rakhun me chamaron ko,Hatespeech
4,अरे मदर जात बीबीसी सुधर जा किसी दिन हमारे हत्य...,Hatespeech
...,...,...
495,in agama shiva temple and Vishnu temples pries...,Not Hatespeech
496,"किसी न किसी संगठन से अवश्य जुड़े,वहां इन समस्या...",Not Hatespeech
497,Rss ko bjp ke liye vote chahiye bas aur kuch b...,Not Hatespeech
498,Bahut dukhad hai ye,Not Hatespeech


In [26]:
# Write the final labels to disk, without the DataFrame index column
final_annotation.to_csv(
    path_or_buf='Sample_annotation.csv',
    index=False,
)

The rest of the data is annotated by an LLM, using `Sample_annotation.csv` and the annotation guidelines as references.