In [None]:
import json
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
import statsmodels

DOMAIN = 'COMP'

GROUP_NAME = 'GROUP_16' # TODO: update

# Path of the exported annotation file. It is defined once so the two loads of
# the export cannot drift apart; file_path is kept as an alias of JSON_FILE.
JSON_FILE = 'json/group16.json' # TODO: update
file_path = JSON_FILE

# Load the JSON file (explicit UTF-8 so Turkish characters decode the same on every platform)
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# email -> {task_<n>_full_annotation: annotation}, where n is the 1-based
# position of the task inside the export.
annotator_data_dict = {}
for counter_for_id, item in enumerate(data, start=1):
    task_id = f'task_{counter_for_id}_full_annotation'
    for annotation in item['annotations']:
        email = annotation['completed_by']['email']
        annotator_data_dict.setdefault(email, {})[task_id] = annotation

# One row per annotator, one column per task, with the e-mail as first column
df = pd.DataFrame.from_dict(annotator_data_dict, orient='index')
df.insert(0, 'mail', df.index)
df = df.reset_index(drop=True)

COLUMNS = df.columns.tolist()

# COLUMNS still contains 'mail' (never null), hence the "- 1"
df["number_of_tasks_completed"] = df[COLUMNS].notnull().sum(axis=1) - 1

REMOVE_MAILS = ['gosahin@ku.edu.tr', 'efe.ozkara@metu.edu.tr', 'yigitcankarakas@gmail.com', 'abulut20@ku.edu.tr'] # TODO: update
df = df[~df['mail'].isin(REMOVE_MAILS)]
ANNOTATOR_MAILS = df['mail'].tolist()

# Columns are task_1_full_annotation, task_2_full_annotation, ...
# A task that nobody annotated has no column at all, so only keep the task
# columns that actually exist (in task order) instead of assuming 1..N.
task_columns = [
    column_name
    for column_name in (f'task_{i}_full_annotation' for i in range(1, len(data) + 1))
    if column_name in df.columns
]
new_columns = ["mail", "number_of_tasks_completed"] + task_columns

TOTAL_NUMBER_OF_TASKS = len(task_columns)
df = df[new_columns]



def read_tasks_as_json(input_file):
    """
    Load a JSON file and return its parsed content.

    The file is decoded as UTF-8 explicitly so that non-ASCII text (e.g. Turkish
    characters) is read identically regardless of the platform's default encoding.

    :param input_file: path of the JSON file to read.
    :return: the object produced by ``json.load`` for that file.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        return json.load(f)

# Parsed content of JSON_FILE; later indexed by task position (tasks_json[task_num - 1])
tasks_json = read_tasks_as_json(JSON_FILE)

def extract_paragraph_data(task_data):
    """
    Return a task's full text and the character intervals of its later sections.

    :param task_data: task dict holding the text under ``['data']['my_text_1']``.
    :return: ``(all_paragraph, intervals)`` where ``intervals`` is a list of five
             ``(start, end)`` tuples running from each marker ('TR-1', 'EN-2',
             'TR-2', 'EN-3', 'TR-3') to the next one, the last ending at the end
             of the text. A marker that is not found contributes ``-1``
             (the result of ``str.find``).
    """
    all_paragraph = task_data['data']['my_text_1']

    boundaries = []
    for marker in ('TR-1', 'EN-2', 'TR-2', 'EN-3', 'TR-3'):
        boundaries.append(all_paragraph.find(marker))
    boundaries.append(len(all_paragraph))

    # Pair every boundary with its successor
    intervals = list(zip(boundaries[:-1], boundaries[1:]))
    return all_paragraph, intervals

def separate_clean_data(paragraph):
    """
    Split a task paragraph into its English and Turkish sentences.

    The paragraph is cut on ``' \\n'``; every piece that starts with ``EN-`` or
    ``TR-`` is split once on ``': '`` into a key (e.g. ``EN-1``) and its text.
    Pieces with any other prefix are ignored.

    :param paragraph: full text of one task.
    :return: ``(english_data, turkish_data)`` dicts mapping section key to text.
    """
    by_prefix = {'EN-': {}, 'TR-': {}}

    for piece in paragraph.strip().split(' \n'):
        entry = piece.strip()
        for prefix, bucket in by_prefix.items():
            if entry.startswith(prefix):
                label, content = entry.split(': ', 1)
                bucket[label] = content
                break

    return by_prefix['EN-'], by_prefix['TR-']

def count_words(data, keys):
    """
    Count whitespace-separated words over the selected entries of ``data``.

    :param data: dict mapping a key to a text string.
    :param keys: keys to include; keys missing from ``data`` are skipped.
    :return: total number of words across the selected texts.
    """
    total = 0
    for key in keys:
        if key in data:
            total += len(data[key].split())
    return total

def _word_spans(text, offset):
    """Return ``(word, start, end)`` for every whitespace-separated word of ``text``.

    Offsets are taken from the real position of each word inside ``text`` and
    shifted by ``offset``, so runs of several whitespace characters do not make
    the positions drift.
    """
    return [
        (match.group(), offset + match.start(), offset + match.end())
        for match in re.finditer(r'\S+', text)
    ]


def get_annotation_results(annotation, mail, intervals_for_sentences):
    """
    Collect the spans one annotator labelled in one task.

    Every labelled span is cleaned before it is stored: one non-alphanumeric
    character is dropped from each end, then a leading "a", "an", "the" or "bir"
    that is followed by whitespace is removed. ``start``/``end`` are adjusted so
    they keep pointing at the cleaned text.

    :param annotation: annotation dict; its ``'result'`` list holds the labelled
                       spans. Entries without ``value['labels']`` are skipped.
    :param mail: annotator e-mail, stored under ``'email'`` in the result.
    :param intervals_for_sentences: kept for interface compatibility. The section
                       of a span is not part of the returned data, so the
                       intervals are not read here.
    :return: dict with ``'email'`` plus five lists:
             ``english_detected_terms`` and ``turkish_detected_terms`` hold
             ``(word, start, end)`` per word; ``turkish_detected_labels`` holds
             ``(word, start, end, label)``; ``english_terimler_org_detected`` and
             ``turkish_detected_corrections`` hold ``(text, start, end, meta)``
             per span, with ``"EMPTY LINK"`` / ``"EMPTY CORRECTION"`` when the
             span has no meta text.
    """
    results = {'email': mail}
    features = {
        'english_detected_terms': [],
        'turkish_detected_terms': [],
        'turkish_detected_labels': [],
        'turkish_detected_corrections': [],
        'english_terimler_org_detected': []
    }
    # Checked in this order, one after the other, exactly like the original chain
    leading_articles = ('a', 'an', 'the', 'bir')

    for item in annotation['result']:
        if 'value' not in item or 'labels' not in item['value']:
            continue
        value = item['value']
        if not value['labels']:
            # A span without any label cannot be classified
            continue

        label = value['labels'][0]
        text, start, end = value['text'], value['start'], value['end']
        # Missing or empty meta text both mean "no meta text"
        meta_entries = item.get('meta', {}).get('text') or [None]
        meta_text = meta_entries[0]

        if text:
            # Remove one non-alphanumeric character from the end
            if not text[-1].isalnum():
                text = text[:-1]
                end -= 1

            # Remove one non-alphanumeric character from the beginning
            if text and not text[0].isalnum():
                text = text[1:]
                start += 1

        # Trim a leading article ("a term" --> "term"). The length check keeps a
        # span that consists of the article alone from raising IndexError.
        for article in leading_articles:
            size = len(article)
            if (text and len(text) > size
                    and text[:size].lower() == article
                    and text[size].isspace()):
                text = text[size + 1:]
                start += size + 1

        if not text:
            # Nothing left after cleaning
            continue

        if label == 'TERM':
            # Each word of the span counts as a separate detected term
            features['english_detected_terms'].extend(_word_spans(text, start))

            link = meta_text.lower() if meta_text else "EMPTY LINK"
            features['english_terimler_org_detected'].append((text, start, end, link))

        elif label in ('CORRECT_TRANSLATION', 'WRONG_TRANSLATION'):
            for word, word_start, word_end in _word_spans(text, start):
                features['turkish_detected_terms'].append((word, word_start, word_end))
                features['turkish_detected_labels'].append((word, word_start, word_end, label))

            if label == 'WRONG_TRANSLATION':
                correction = meta_text.lower() if meta_text else "EMPTY CORRECTION"
                features['turkish_detected_corrections'].append((text, start, end, correction))

    results.update(features)
    return results

import re

def word_positions_with_check(sentence):
    """
    List the words of ``sentence`` together with their character positions.

    Words are the matches of ``\\b\\w+\\b``. The tokens "EN", "TR", "1", "2" and
    "3" are left out.

    :param sentence: text to scan.
    :return: list of ``(word, start, end)`` tuples in order of appearance.
    """
    skipped_tokens = {"EN", "TR", "1", "2", "3"}

    positions = []
    for hit in re.finditer(r'\b\w+\b', sentence):
        token = hit.group()
        if token in skipped_tokens:
            continue
        positions.append((token, hit.start(), hit.end()))
    return positions


def get_lists(word_list, intervals_for_sentences):
    """
    Split positioned words into English and Turkish words by their location.

    A word lying before the start of the first interval is English. Otherwise the
    first interval (of five) that fully contains the word decides: intervals
    0, 2 and 4 are Turkish, intervals 1 and 3 are English. Words that fit
    nowhere are dropped.

    :param word_list: iterable of ``(word, start, end)`` tuples.
    :param intervals_for_sentences: list of five ``(start, end)`` intervals.
    :return: ``(english_list, turkish_list)`` holding the original tuples.
    """
    english_list, turkish_list = [], []

    for entry in word_list:
        start, end = entry[1], entry[2]

        # Everything before the first interval is English
        if 0 <= start < end <= intervals_for_sentences[0][0]:
            english_list.append(entry)
            continue

        for idx in range(5):
            if intervals_for_sentences[idx][0] <= start <= end <= intervals_for_sentences[idx][1]:
                # Even intervals are Turkish, odd ones English
                target = turkish_list if idx % 2 == 0 else english_list
                target.append(entry)
                break

    return english_list, turkish_list


def term_position(word_list, term_list):
    """
    Flag which words of ``word_list`` appear in ``term_list``.

    :param word_list: iterable of ``(word, start, end)`` tuples.
    :param term_list: collection of ``(word, start, end)`` tuples marked as terms.
    :return: list with 1 where the exact tuple is in ``term_list`` and 0 otherwise,
             in the order of ``word_list``.
    """
    flags = []
    for word, start, end in word_list:
        flags.append(int((word, start, end) in term_list))
    return flags


def calculate_fleiss_kappa(annotators, num_items):
    """
    Calculate Fleiss' Kappa for multiple annotators.

    :param annotators: A list of lists where each sublist represents the annotations by a single annotator.
                       Each element in the sublist should be binary (0 for not detected, 1 for detected).
    :param num_items: The total number of items (e.g., terms) being evaluated.
    :return: Fleiss' kappa score. ``nan`` when there are fewer than two annotators
             or no items (kappa is undefined there), and 0 when the expected
             chance agreement is exactly 1.
    """
    num_raters = len(annotators)

    # The formula divides by num_raters * (num_raters - 1) and by
    # num_raters * num_items; bail out before dividing by zero.
    if num_raters < 2 or num_items == 0:
        return float('nan')

    # Per item: how many annotators chose category 0 (not detected) and 1 (detected)
    category_counts = np.zeros((num_items, 2))
    for annotator in annotators:
        for item_idx, decision in enumerate(annotator):
            category_counts[item_idx, decision] += 1

    # Agreement per item, then averaged over all items
    P_i = ((category_counts ** 2).sum(axis=1) - num_raters) / (num_raters * (num_raters - 1))
    P_bar = np.mean(P_i)

    # Overall share of each category, and the agreement expected by chance
    p_j = category_counts.sum(axis=0) / (num_raters * num_items)
    P_e_bar = (p_j ** 2).sum()

    # Fleiss' kappa; when chance agreement is 1 the ratio is undefined and 0 is returned
    kappa = (P_bar - P_e_bar) / (1 - P_e_bar) if (1 - P_e_bar) != 0 else 0

    return kappa




In [None]:
# Inspect the annotator table: one row per remaining annotator, one column per task
df

In [None]:
# # if some cells are NaN in the first row, fill them with the last row
# 
# df.iloc[0] = df.iloc[0].fillna(df.iloc[-1])
# 
# # If you want to update the DataFrame in place, this line does it
# df.update(df.iloc[0])
# 
# # drop the last row
# df = df.drop(df.tail(1).index)
# 
# 
# 
# 
# 


In [None]:
# Inspect df again; the preceding cell is fully commented out, so nothing has changed
df

In [None]:

def _tally_agreement(term_votes, num_raters):
    """Build Fleiss count rows and agreement tallies from per-word term votes.

    :param term_votes: for each word, how many annotators marked it as a term.
    :param num_raters: number of annotators who annotated the task.
    :return: ``(count_rows, tallies)``; every count row is
             ``[not_term_votes, term_votes]`` and ``tallies`` counts the rows
             equal to [0, 3], [1, 2], [3, 0] and [2, 1].
    """
    count_rows = [[num_raters - votes, votes] for votes in term_votes]
    tallies = {
        'gold_terms_0_3': count_rows.count([0, 3]),
        'silver_terms_1_2': count_rows.count([1, 2]),
        'gold_nan_terms_3_0': count_rows.count([3, 0]),
        'silver_nan_terms_2_1': count_rows.count([2, 1]),
    }
    return count_rows, tallies


# Kept for compatibility; nothing in this cell appends to these lists
english_fleiss_kappas = []
turkish_fleiss_kappas = []

# One dict per task; turned into results_df in a single step after the loop
result_rows = []

for task_column in task_columns:

    # Column names look like task_<n>_full_annotation; n is the 1-based task position
    task_num = int(task_column.split('_')[1])
    all_paragraph, intervals_for_sentences = extract_paragraph_data(tasks_json[task_num - 1])
    english_data, turkish_data = separate_clean_data(all_paragraph)
    # Word counts are computed here but are not stored in results_df
    num_words_en = count_words(english_data, ['EN-1', 'EN-2', 'EN-3'])
    num_words_tr = count_words(turkish_data, ['TR-1', 'TR-2', 'TR-3'])
    tasks_annotations = df[task_column].to_list()

    word_list = word_positions_with_check(all_paragraph)
    english_list, turkish_list = get_lists(word_list, intervals_for_sentences)

    task_results = []
    for annotation in tasks_annotations:
        # An annotator who skipped this task shows up as NaN (a float), not None,
        # in the DataFrame column, so test for the expected dict instead.
        if not isinstance(annotation, dict):
            continue
        mail = annotation['completed_by']['email']
        results = get_annotation_results(annotation, mail, intervals_for_sentences)
        results['task'] = task_column
        results['term_position_eng'] = term_position(english_list, results['english_detected_terms'])
        results['term_position_tr'] = term_position(turkish_list, results['turkish_detected_terms'])
        task_results.append(results)

    # One 0/1 vector per annotator, aligned with english_list / turkish_list
    english_results = [result['term_position_eng'] for result in task_results]
    turkish_results = [result['term_position_tr'] for result in task_results]

    # Per word: number of annotators who marked it as a term
    sum_of_english_results = (
        [sum(votes) for votes in zip(*english_results)]
        if english_results else [0] * len(english_list)
    )
    sum_of_turkish_results = (
        [sum(votes) for votes in zip(*turkish_results)]
        if turkish_results else [0] * len(turkish_list)
    )

    # Use the real number of annotators for this task, so the count rows match the
    # annotator vectors passed to calculate_fleiss_kappa (identical to the former
    # hard-coded 3 whenever exactly three annotators completed the task).
    num_raters = len(task_results)
    final_format_english, en_tallies = _tally_agreement(sum_of_english_results, num_raters)
    final_format_turkish, tr_tallies = _tally_agreement(sum_of_turkish_results, num_raters)

    english_fleiss_kappa = calculate_fleiss_kappa(english_results, len(english_list))
    turkish_fleiss_kappa = calculate_fleiss_kappa(turkish_results, len(turkish_list))

    # SAVE RESULTS
    result_rows.append({
        'group_name': GROUP_NAME,
        'task_name': task_column,
        'domain': DOMAIN,
        'word_list_en': english_list,
        'word_list_tr': turkish_list,
        'english_annotations': english_results,
        'turkish_annotations': turkish_results,
        'final_format_english': final_format_english,
        'final_format_turkish': final_format_turkish,
        'en_num_of_gold_terms_0_3': en_tallies['gold_terms_0_3'],
        'en_num_of_silver_terms_1_2': en_tallies['silver_terms_1_2'],
        'en_num_of_gold_nan_terms_3_0': en_tallies['gold_nan_terms_3_0'],
        'en_num_of_silver_nan_terms_2_1': en_tallies['silver_nan_terms_2_1'],
        'tr_num_of_gold_terms_0_3': tr_tallies['gold_terms_0_3'],
        'tr_num_of_silver_terms_1_2': tr_tallies['silver_terms_1_2'],
        'tr_num_of_gold_nan_terms_3_0': tr_tallies['gold_nan_terms_3_0'],
        'tr_num_of_silver_nan_terms_2_1': tr_tallies['silver_nan_terms_2_1'],
        'english_fleiss_kappa_1': english_fleiss_kappa,
        'turkish_fleiss_kappa_1': turkish_fleiss_kappa,
    })

# Build the frame once instead of concatenating inside the loop
results_df = pd.DataFrame(result_rows)

In [None]:
import torch
from torchmetrics.nominal import FleissKappa
# Fleiss' kappa per task, recomputed with torchmetrics from the count rows
# stored in results_df (one [category_1, category_2] row per word).
kappas_by_column = {'final_format_english': [], 'final_format_turkish': []}

for _, row in results_df.iterrows():
    # A fresh metric object is created for every task row
    metric = FleissKappa(mode='counts')
    for column, collected in kappas_by_column.items():
        counts = torch.tensor(row[column]).long()
        collected.append(metric(counts).item())

fleiss_kappa_english = kappas_by_column['final_format_english']
fleiss_kappa_turkish = kappas_by_column['final_format_turkish']
    

In [None]:
# Attach the torchmetrics-based kappa values as new columns. The lists were built by
# iterating results_df row by row, so they are in the same order as its rows.
results_df['english_fleiss_kappa_2'] = fleiss_kappa_english
results_df['turkish_fleiss_kappa_2'] = fleiss_kappa_turkish

In [None]:
# to_csv does not create missing parent folders, so make sure 'results/' exists first
os.makedirs('results', exist_ok=True)
results_df.to_csv('results/' + GROUP_NAME + '_results.csv', index=False)

In [None]:
# Show the final per-task results table, including both kappa variants
results_df
