### Libraries 

In [1]:
import pandas as pd 

import numpy as np 
import json
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, cohen_kappa_score

### Pre-processing

In [2]:
data_directory = "annotations/"
file_names = ["nlp_aryan.csv", "nlp_parthiv.csv"]

# Read both annotators' exports in the order listed in file_names.
df_aryan, df_parthiv = (pd.read_csv(data_directory + csv_name) for csv_name in file_names)

df_aryan.head()

Unnamed: 0,annotation_id,annotator,created_at,id,label,lead_time,text,updated_at
0,66,1,2025-01-24T16:25:26.929162Z,1,"[{""start"":0,""end"":4,""text"":""भारत"",""labels"":[""P...",1378.586,भारत के पूर्वोत्तर राज्यों में internet का एक ...,2025-01-24T16:25:26.929178Z
1,64,1,2025-01-24T16:22:55.095999Z,2,"[{""start"":0,""end"":2,""text"":""आज"",""labels"":[""ADV...",455.641,आज भारत में हम Space Technology का इस्तेमाल गर...,2025-01-24T16:22:55.096014Z
2,65,1,2025-01-24T16:25:23.353033Z,3,"[{""start"":0,""end"":2,""text"":""इस"",""labels"":[""DET...",115.451,इस वर्ष मार्च में दिल्ली में International Sol...,2025-01-24T16:25:23.353048Z
3,67,1,2025-01-24T17:18:15.053771Z,4,"[{""start"":0,""end"":4,""text"":""इसके"",""labels"":[""D...",3197.933,"इसके अलावा दिग्लीपुर, Car Nicobar और Campbell-...",2025-01-24T17:25:34.416729Z
4,68,1,2025-01-24T18:37:41.746239Z,5,"[{""start"":0,""end"":6,""text"":""स्वराज"",""labels"":[...",4311.57,"स्वराज द्वीप, शहीद द्वीप और Long Island में Pa...",2025-01-24T18:37:41.746261Z


Here, observe that the only relevant column is the one named **label**. 
Additionally, note that the input sentences are in the same order in both annotated CSV files. 

In [3]:
def transform_df(df):
    """
    Reduce an annotation export to the single column used for label comparison.

    Takes the 'label' column of `df`, casts every value to `str`, and returns it
    as a new one-column dataframe named 'sample'. The index of `df` is carried
    over and `df` itself is not modified.
    """
    sample_as_text = df['label'].astype(str)
    return sample_as_text.to_frame(name='sample')

In [4]:
# Apply the same column extraction to each annotator's export.
transformed_df_aryan, transformed_df_parthiv = (
    transform_df(raw_export) for raw_export in (df_aryan, df_parthiv)
)

In [5]:
# Preview the result: only the label column (renamed to 'sample') is kept.
transformed_df_parthiv.head()

Unnamed: 0,sample
0,"[{""start"":0,""end"":4,""text"":""भारत"",""labels"":[""P..."
1,"[{""start"":3,""end"":8,""text"":""भारत "",""labels"":[""..."
2,"[{""start"":0,""end"":3,""text"":""इस "",""labels"":[""DE..."
3,"[{""start"":0,""end"":5,""text"":""इसके "",""labels"":[""..."
4,"[{""start"":0,""end"":7,""text"":""स्वराज "",""labels"":..."


### Cleaning 

In [6]:
def clean_and_create_dict(sample):
    """Parse one annotated sentence into a word-to-label dictionary.

    Parameters
    ----------
    sample : str
        JSON-encoded list of span objects. Each span is read for its "text"
        string and its "labels" list; any other keys are ignored.

    Returns
    -------
    dict
        Maps each span's whitespace-stripped text to the first entry of its
        "labels" list. Spans with a missing or empty "labels" list, or whose
        text is empty after stripping, are skipped instead of raising or
        producing an empty-string key.

    Raises
    ------
    json.JSONDecodeError
        If `sample` is not valid JSON.

    Notes
    -----
    The dictionary is keyed by the word itself, so a word that occurs more than
    once in a sentence keeps only the label of its last occurrence.
    """
    word_label_dict = {}
    for entry in json.loads(sample):
        # Annotators may include trailing spaces in a span; strip so both agree on the key.
        word = (entry.get("text") or "").strip()
        labels = entry.get("labels") or []
        if not word or not labels:
            # Nothing to compare for an unlabeled or whitespace-only span.
            continue
        # Only the first label of a span is used.
        word_label_dict[word] = labels[0]
    return word_label_dict

def compare_dictionaries(dict1, dict2):
    """Pair up the labels that two word-label dictionaries assign to shared words.

    Walks `dict1` in its own key order and keeps every word that is also a key
    of `dict2`; words present in only one dictionary are ignored. Returns two
    equal-length lists: the labels from `dict1` and, position by position, the
    labels `dict2` gives to the same words.
    """
    shared_words = [word for word in dict1 if word in dict2]
    labels_from_first = [dict1[word] for word in shared_words]
    labels_from_second = [dict2[word] for word in shared_words]
    return labels_from_first, labels_from_second

df1 = transformed_df_aryan
df2 = transformed_df_parthiv
# Add each sentence's parsed word -> label mapping as a new column on both frames.
for annotated_frame in (df1, df2):
    annotated_frame["word_label_dict"] = annotated_frame["sample"].apply(clean_and_create_dict)


In [7]:
# The character offsets in each sample string are not needed for the comparison,
# so each existing sample string is mapped to a word -> label dictionary.
df1.head()

Unnamed: 0,sample,word_label_dict
0,"[{""start"":0,""end"":4,""text"":""भारत"",""labels"":[""P...","{'भारत': 'PROPN', 'के': 'ADP', 'पूर्वोत्तर': '..."
1,"[{""start"":0,""end"":2,""text"":""आज"",""labels"":[""ADV...","{'आज': 'ADV', 'भारत': 'PROPN', 'में': 'ADP', '..."
2,"[{""start"":0,""end"":2,""text"":""इस"",""labels"":[""DET...","{'इस': 'DET', 'वर्ष': 'NOUN', 'मार्च': 'PROPN'..."
3,"[{""start"":0,""end"":4,""text"":""इसके"",""labels"":[""D...","{'इसके': 'DET', 'अलावा': 'ADP', 'दिग्लीपुर': '..."
4,"[{""start"":0,""end"":6,""text"":""स्वराज"",""labels"":[...","{'स्वराज': 'PROPN', 'द्वीप': 'PROPN', ',': 'X'..."


### Cohen's Kappa from confusion matrix calculation 

In [8]:
# Compare the two annotators sentence by sentence (rows are paired by index label),
# then pool the paired labels of all sentences into two flat lists.
per_sentence = [
    compare_dictionaries(df1.loc[row, "word_label_dict"], df2.loc[row, "word_label_dict"])
    for row in range(len(df1))
]
all_labels1 = [label for first, _ in per_sentence for label in first]
all_labels2 = [label for _, second in per_sentence for label in second]

#### Viewing labels from df1 vs df2

Here we can see that there are a few differences between the annotations, but otherwise they are similar. 

In [9]:
# First ten pooled labels taken from df1's dictionaries.
all_labels1[:10]

['PROPN', 'ADP', 'ADV', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NUM', 'NOUN', 'PROPN']

In [10]:
# First ten pooled labels taken from df2's dictionaries, for the same words.
all_labels2[:10]

['PROPN', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NUM', 'NOUN', 'PROPN']

#### Metric calculations 
Using the built-in sklearn functions to calculate the confusion matrix and Cohen's kappa coefficient

In [11]:
# Label inventory: every tag used in either list, sorted.
labels = sorted(set(all_labels1).union(all_labels2))
# Passing `labels` fixes the row/column order of the matrix to that inventory.
confuse_matrix = confusion_matrix(all_labels1, all_labels2, labels=labels)
# Agreement between the two pooled label lists.
c_kappa_score = cohen_kappa_score(all_labels1, all_labels2)

In [12]:
# Report the score to four decimal places.
kappa_rounded = round(c_kappa_score, 4)
print("\nCohen's Kappa Score: " + str(kappa_rounded))


Cohen's Kappa Score: 0.8529


In [13]:
print("\nConfusion Matrix:\n")
# Put the tag names on both axes so the counts are readable as a table.
confusion_table = pd.DataFrame(confuse_matrix, index=labels, columns=labels)
confusion_table


Confusion Matrix:



Unnamed: 0,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PART,PART_NEG,PRON,PRON_WH,PROPN,VERB,X
ADJ,16,0,1,0,0,0,0,0,0,0,0,0,0,0
ADP,4,51,0,1,0,0,0,0,0,0,0,0,0,0
ADV,2,3,8,0,0,1,0,0,0,0,0,0,0,0
CONJ,1,0,0,9,0,0,0,0,0,0,0,0,0,0
DET,0,0,0,0,6,0,4,0,0,0,0,0,0,0
NOUN,0,0,0,0,0,63,0,0,0,0,0,3,6,0
NUM,0,0,0,0,0,0,9,0,0,0,0,0,0,0
PART,0,0,0,0,0,0,0,0,0,0,0,0,1,0
PART_NEG,0,1,0,0,0,0,0,0,0,0,0,0,0,0
PRON,0,0,0,0,1,0,0,0,0,10,0,0,0,0


Here, we can observe that this is a "diagonal-heavy" matrix. This means that both annotators, Aryan and Parthiv, mostly agree. 

Additionally, we can note that there are a few disagreements, for example on PART_NEG (negative particle) and PART (particle). 

### Summary

In [14]:
# Count position-wise matches between the two pooled label lists in one pass.
total_compared = len(all_labels1)
n_agree = sum(first == second for first, second in zip(all_labels1, all_labels2))
print(f"Total Words Compared: {total_compared}")
print(f"Agreement: {n_agree}")
print(f"Disagreement: {total_compared - n_agree}")

Total Words Compared: 316
Agreement: 276
Disagreement: 40
