In [None]:
# Importing the libraries
import pandas as pd
import json

# Load the raw annotation export; the scoring loop further down treats each
# row as one annotator.
df = pd.read_csv('new_annotation_results.csv')

# The annotator's mail address followed by the ten per-task annotation columns.
COLUMNS = ['mail'] + [f'task_{task_num}_full_annotation' for task_num in range(1, 11)]

# Count the non-null cells per row. COLUMNS also contains 'mail', so one is
# subtracted to leave only the task columns (this assumes 'mail' is never null).
non_null_cells_per_row = df[COLUMNS].notnull().sum(axis=1)
df["number_of_tasks_completed"] = non_null_cells_per_row - 1

# Row(s) whose mail matches this address serve as the reference annotation.
GROUND_TRUTH_MAIL = "gosahin@ku.edu.tr"
is_ground_truth_row = df["mail"] == GROUND_TRUTH_MAIL
ground_truth = df[is_ground_truth_row]







def read_tasks_as_json(input_file):
    """Load a JSON file and return its parsed contents.

    Args:
        input_file: Path of the JSON file to read.

    Returns:
        The deserialized document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Pin UTF-8 instead of relying on the platform default encoding, so the
    # Turkish characters in the task texts decode the same way everywhere.
    with open(input_file, 'r', encoding='utf-8') as f:
        return json.load(f)

# Parse the task definitions once at load time; the scoring loop reads entry
# task_num - 1 of this sequence for task number task_num.
tasks_json = read_tasks_as_json('tasks_results.json')

def extract_paragraph_data(task_data):
    """Return a task's full text and the character span of each later section.

    Args:
        task_data: Mapping holding the text under ``['data']['my_text_1']``.

    Returns:
        A ``(text, spans)`` tuple. ``spans`` has five ``(start, end)`` pairs,
        one for each of the markers 'TR-1', 'EN-2', 'TR-2', 'EN-3', 'TR-3' in
        that order. Each span runs from the first occurrence of its marker to
        the first occurrence of the next marker; the last span ends at the
        length of the text. A marker that does not occur contributes -1,
        because ``str.find`` is used and its result is passed through as is.
    """
    paragraph = task_data['data']['my_text_1']

    # Offset of every section marker, closed off by the end of the text.
    boundaries = []
    for marker in ('TR-1', 'EN-2', 'TR-2', 'EN-3', 'TR-3'):
        boundaries.append(paragraph.find(marker))
    boundaries.append(len(paragraph))

    # Pair each boundary with its successor to get half-open spans.
    spans = list(zip(boundaries, boundaries[1:]))
    return paragraph, spans

def separate_clean_data(paragraph):
    """Split a task paragraph into its English and Turkish sentences.

    Every line is stripped; a line starting with ``EN-`` or ``TR-`` is
    expected to look like ``"<KEY>: <sentence>"``. All other lines are ignored.

    Args:
        paragraph: The full task text, one section per line.

    Returns:
        A ``(english_data, turkish_data)`` tuple of dicts mapping the section
        key (the part before the first ``': '``) to its sentence.

    Raises:
        ValueError: If an ``EN-``/``TR-`` line contains no ``': '`` separator.
    """
    english_data, turkish_data = {}, {}
    for line in paragraph.strip().split('\n'):
        line = line.strip()
        if line.startswith('EN-'):
            target = english_data
        elif line.startswith('TR-'):
            target = turkish_data
        else:
            continue
        # Split on the first separator only: the sentence itself may contain
        # ': ', which previously broke the two-name unpacking.
        key, value = line.split(': ', 1)
        target[key] = value
    return english_data, turkish_data

def count_words(data, keys):
    """Return the total number of whitespace-separated words in ``data[key]``
    over all ``key`` in ``keys`` (0 when ``keys`` is empty)."""
    total = 0
    for key in keys:
        total += len(data[key].split())
    return total

def get_annotation_results(annotation, mail, intervals_for_sentences):
    """Collect the labelled spans of one annotation, grouped by kind.

    Args:
        annotation: Mapping with a ``'result'`` list. Only items that have a
            ``'value'`` containing ``'labels'`` are used; from those the first
            label, ``'text'``, ``'start'`` and ``'end'`` are read, plus the
            first entry of the optional ``item['meta']['text']`` list.
        mail: Annotator mail; compared with ``GROUND_TRUTH_MAIL``.
        intervals_for_sentences: Indexable of ``(start, end)`` pairs, read at
            positions 0-4 for the sections TR-1, EN-2, TR-2, EN-3, TR-3.

    Returns:
        A dict with ``'email'``, ``'ground_truth'`` and five lists of tuples:
        ``'english_detected_terms'`` and ``'turkish_detected_terms'`` hold
        ``(text, start, end, section)``; ``'turkish_detected_labels'``,
        ``'turkish_detected_corrections'`` and
        ``'english_terimler_org_detected'`` append a fifth field (the label,
        or the meta text).
    """
    is_ground_truth = mail == GROUND_TRUTH_MAIL

    english_terms = []
    turkish_terms = []
    turkish_labels = []
    turkish_corrections = []
    english_links = []

    # Sections whose bounds come from intervals_for_sentences, by position.
    later_sections = ('TR-1', 'EN-2', 'TR-2', 'EN-3', 'TR-3')

    for item in annotation['result']:
        if 'value' not in item or 'labels' not in item['value']:
            continue

        value = item['value']
        label = value['labels'][0]
        text = value['text']
        start = value['start']
        end = value['end']
        meta_text = item.get('meta', {}).get('text', [None])[0]

        # Trim at most one non-alphanumeric character from each end and move
        # the matching offset so it still describes the trimmed text.
        if text and not text[-1].isalnum():
            text = text[:-1]
            end -= 1
        if text and not text[0].isalnum():
            text = text[1:]
            start += 1

        # EN-1 is everything before the first interval; the remaining sections
        # are tried in order and the first one containing the span wins.
        section = None
        if 0 <= start < end <= intervals_for_sentences[0][0]:
            section = 'EN-1'
        else:
            for position, section_name in enumerate(later_sections):
                lower = intervals_for_sentences[position][0]
                upper = intervals_for_sentences[position][1]
                if lower <= start <= end <= upper:
                    section = section_name
                    break

        # Spans that are empty (or became empty after trimming) are dropped.
        if not text:
            continue

        span = (text, start, end, section)
        if label == 'TERM':
            english_terms.append(span)
            if meta_text:
                english_links.append(span + (meta_text,))
        elif label in ('CORRECT_TRANSLATION', 'WRONG_TRANSLATION'):
            turkish_terms.append(span)
            turkish_labels.append(span + (label,))
            if label == 'WRONG_TRANSLATION' and meta_text:
                turkish_corrections.append(span + (meta_text,))

    return {
        'email': mail,
        'ground_truth': is_ground_truth,
        'english_detected_terms': english_terms,
        'turkish_detected_terms': turkish_terms,
        'turkish_detected_labels': turkish_labels,
        'turkish_detected_corrections': turkish_corrections,
        'english_terimler_org_detected': english_links,
    }

def calculate_term_detection_score(annotation, ground_truth, num_words, language='english'):
    """Score an annotator's detected terms against the reference, in words.

    Terms are compared as whole tuples; a term counts with the number of
    whitespace-separated words in its text (first tuple field).

    Args:
        annotation: Dict holding the annotator's term tuples.
        ground_truth: Dict holding the reference term tuples.
        num_words: Total word count used to derive the true negatives.
        language: ``'english'`` reads ``'english_detected_terms'``; any other
            value reads ``'turkish_detected_terms'``.

    Returns:
        A dict with precision, recall, f1_score, accuracy, the word counts
        tp/fp/fn/tn, the matching term sets tp_set/fp_set/fn_set and ``name``
        (the feature key that was read). Ratios are 0 when their denominator
        is not positive.
    """
    if language == 'english':
        feature = 'english_detected_terms'
    else:
        feature = 'turkish_detected_terms'

    predicted = set(annotation[feature])
    expected = set(ground_truth[feature])

    def word_total(terms):
        # Number of words over the text field of every term.
        return sum(len(term[0].split()) for term in terms)

    tp_set = predicted & expected
    fp_set = predicted - expected
    fn_set = expected - predicted

    tp = word_total(tp_set)
    fp = word_total(fp_set)
    fn = word_total(fn_set)
    tn = num_words - tp - fp - fn

    precision = 0
    if tp + fp > 0:
        precision = tp / (tp + fp)

    recall = 0
    if tp + fn > 0:
        recall = tp / (tp + fn)

    f1_score = 0
    if precision + recall > 0:
        f1_score = (2 * precision * recall) / (precision + recall)

    accuracy = 0
    if tp + tn + fp + fn > 0:
        accuracy = (tp + tn) / (tp + tn + fp + fn)

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'accuracy': accuracy,
        'tp': tp,
        'fp': fp,
        'fn': fn,
        'tn': tn,
        'tp_set': tp_set,
        'fp_set': fp_set,
        'fn_set': fn_set,
        'name': feature,
    }

def calculate_exact_match(annotation, ground_truth, feature):
    """Compare the fifth field of spans that both sides annotated.

    Entries are 5-tuples ``(text, start, end, section, payload)``. Only spans
    whose first four fields occur on both sides are considered; for those the
    payloads are compared.

    Args:
        annotation: Dict holding the annotator's entries under ``feature``.
        ground_truth: Dict holding the reference entries under ``feature``.
        feature: Key of the entry list to compare.

    Returns:
        A dict with ``'intersection_set'`` (shared spans with equal payload),
        ``'difference_set'`` (the annotator's entries for shared spans whose
        payload differs), ``'exact_match'`` (share of agreeing spans among the
        shared ones, 0 when there are none) and ``'name'`` (``feature``).
    """
    def split_entries(entries):
        # Return the span keys and a span -> payload lookup of the entries.
        unique = set(entries)
        keys = {(e[0], e[1], e[2], e[3]) for e in unique}
        payload_by_key = {(e[0], e[1], e[2], e[3]): e[4] for e in unique}
        return keys, payload_by_key

    pred_keys, pred_payload = split_entries(annotation[feature])
    truth_keys, truth_payload = split_entries(ground_truth[feature])

    # Restrict both sides to the spans they have in common.
    shared_keys = pred_keys.intersection(truth_keys)
    pred_shared = {key + (pred_payload[key],) for key in shared_keys}
    truth_shared = {key + (truth_payload[key],) for key in shared_keys}

    agreeing = pred_shared.intersection(truth_shared)
    # The count is taken from the reference side and the reported set from the
    # annotator side; both have the same size because the span keys coincide.
    disagreeing_count = len(truth_shared - pred_shared)
    disagreeing = pred_shared - truth_shared

    compared = len(agreeing) + disagreeing_count
    if compared > 0:
        exact_match = len(agreeing) / compared
    else:
        exact_match = 0

    return {
        'intersection_set': agreeing,
        'difference_set': disagreeing,
        'exact_match': exact_match,
        'name': feature,
    }


In [None]:
# Score every annotator (one row of df) against the ground-truth annotation,
# task by task, and append the per-task and cumulative metrics to df as columns.
all_annotator_results = []

# Column-name fragments, listed in the order the columns are written.
_DETECTION_LANGUAGES = ('english', 'turkish')
_DETECTION_COUNT_KEYS = ('tp', 'fp', 'fn', 'tn')
_DETECTION_SCORE_KEYS = ('precision', 'recall', 'f1_score', 'accuracy')
_DETECTION_SET_KEYS = ('tp_set', 'fp_set', 'fn_set')
# (column-name fragment, feature key in the dict from get_annotation_results)
_EXACT_MATCH_FEATURES = (
    ('turkish_labels', 'turkish_detected_labels'),
    ('turkish_corrections', 'turkish_detected_corrections'),
    ('english_term_linking', 'english_terimler_org_detected'),
)
_NUM_TASKS = 10

# Annotator-independent data per task number, filled lazily by _task_context.
_task_context_cache = {}


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


def _parse_annotation(raw):
    """Turn the stored string form of an annotation back into a Python object."""
    # SECURITY NOTE: eval() executes whatever code the CSV cell contains. Only
    # run this on exports you trust. If the cells hold nothing but Python
    # literals, ast.literal_eval would be the safe replacement (to be confirmed
    # against the real data before switching).
    return eval(raw)


def _task_context(task_num):
    """Return the data of task ``task_num`` that does not depend on the annotator.

    The ground-truth annotation, the section intervals and the word counts are
    the same for every annotator, so they are computed on first use and cached
    instead of being rebuilt for every row. Lazy evaluation keeps the original
    behavior of only touching the ground truth of tasks that at least one
    annotator completed.
    """
    if task_num not in _task_context_cache:
        ground_truth_task = _parse_annotation(
            ground_truth[f'task_{task_num}_full_annotation'].values[0])
        all_paragraph, intervals_for_sentences = extract_paragraph_data(tasks_json[task_num - 1])
        english_data, turkish_data = separate_clean_data(all_paragraph)
        _task_context_cache[task_num] = {
            'intervals': intervals_for_sentences,
            'num_words': {
                'english': count_words(english_data, ['EN-1', 'EN-2', 'EN-3']),
                'turkish': count_words(turkish_data, ['TR-1', 'TR-2', 'TR-3']),
            },
            'annotation_results': get_annotation_results(
                ground_truth_task, GROUND_TRUTH_MAIL, intervals_for_sentences),
        }
    return _task_context_cache[task_num]


# Loop through each annotator
for _index, row in df.iterrows():
    ANNOTATOR_MAIL = row['mail']
    annotator_results = {}

    # Running totals over all tasks this annotator completed.
    detection_totals = {
        language: dict.fromkeys(_DETECTION_COUNT_KEYS, 0)
        for language in _DETECTION_LANGUAGES
    }
    exact_match_totals = {
        name: {'intersection_num': 0, 'difference_num': 0}
        for name, _feature in _EXACT_MATCH_FEATURES
    }

    for task_num in range(1, _NUM_TASKS + 1):
        prefix = f'task_{task_num}'
        annotator_task = row[f'{prefix}_full_annotation']

        if pd.isna(annotator_task):
            # Task not done: write NaN for every per-task column so all rows
            # share the same column schema and a skipped task does not count
            # towards the cumulative totals.
            missing = float('nan')
            for language in _DETECTION_LANGUAGES:
                for key in _DETECTION_COUNT_KEYS + _DETECTION_SCORE_KEYS:
                    annotator_results[f'{prefix}_{language}_term_detection_{key}'] = missing
            for name, _feature in _EXACT_MATCH_FEATURES:
                annotator_results[f'{prefix}_{name}_exact_match'] = missing
            for language in _DETECTION_LANGUAGES:
                for key in _DETECTION_SET_KEYS:
                    annotator_results[f'{prefix}_{language}_term_detection_{key}'] = missing
            for name, _feature in _EXACT_MATCH_FEATURES:
                annotator_results[f'{prefix}_{name}_intersection_set'] = missing
                annotator_results[f'{prefix}_{name}_difference_set'] = missing
                # The *_num columns used to be left out of this branch.
                annotator_results[f'{prefix}_{name}_intersection_num'] = missing
                annotator_results[f'{prefix}_{name}_difference_num'] = missing
            continue

        context = _task_context(task_num)
        ground_truth_annotation_results = context['annotation_results']
        annotator_annotation_results = get_annotation_results(
            _parse_annotation(annotator_task), ANNOTATOR_MAIL, context['intervals'])

        # Term detection per language, then exact matches per linked feature.
        detection = {
            language: calculate_term_detection_score(
                annotator_annotation_results, ground_truth_annotation_results,
                context['num_words'][language], language)
            for language in _DETECTION_LANGUAGES
        }
        exact_match = {
            name: calculate_exact_match(
                annotator_annotation_results, ground_truth_annotation_results, feature)
            for name, feature in _EXACT_MATCH_FEATURES
        }

        # Store the per-task columns. The insertion order below decides the
        # column order of the resulting DataFrame, so it is kept unchanged.
        for language in _DETECTION_LANGUAGES:
            for key in _DETECTION_COUNT_KEYS + _DETECTION_SCORE_KEYS:
                annotator_results[f'{prefix}_{language}_term_detection_{key}'] = detection[language][key]
        for name, _feature in _EXACT_MATCH_FEATURES:
            annotator_results[f'{prefix}_{name}_exact_match'] = exact_match[name]['exact_match']
        for language in _DETECTION_LANGUAGES:
            for key in _DETECTION_SET_KEYS:
                # Sets are stored as strings so they survive the CSV export.
                annotator_results[f'{prefix}_{language}_term_detection_{key}'] = str(detection[language][key])
        for name, _feature in _EXACT_MATCH_FEATURES:
            intersection_set = exact_match[name]['intersection_set']
            difference_set = exact_match[name]['difference_set']
            annotator_results[f'{prefix}_{name}_intersection_set'] = str(intersection_set)
            annotator_results[f'{prefix}_{name}_difference_set'] = str(difference_set)
            annotator_results[f'{prefix}_{name}_intersection_num'] = len(intersection_set)
            annotator_results[f'{prefix}_{name}_difference_num'] = len(difference_set)
            exact_match_totals[name]['intersection_num'] += len(intersection_set)
            exact_match_totals[name]['difference_num'] += len(difference_set)

        # Update cumulative term-detection counts
        for language in _DETECTION_LANGUAGES:
            for key in _DETECTION_COUNT_KEYS:
                detection_totals[language][key] += detection[language][key]

    # Cumulative term-detection scores, recomputed from the summed counts
    # (not an average of the per-task scores).
    for language in _DETECTION_LANGUAGES:
        totals = detection_totals[language]
        tp, fp, fn, tn = (totals[key] for key in _DETECTION_COUNT_KEYS)
        precision = _ratio(tp, tp + fp)
        recall = _ratio(tp, tp + fn)
        scores = {
            'precision': precision,
            'recall': recall,
            'f1_score': _ratio(2 * precision * recall, precision + recall),
            'accuracy': _ratio(tp + tn, tp + tn + fp + fn),
        }
        column = f'cumulative_{language}_term_detection'
        for key in _DETECTION_COUNT_KEYS:
            annotator_results[f'{column}_{key}'] = totals[key]
        for key in _DETECTION_SCORE_KEYS:
            annotator_results[f'{column}_{key}'] = scores[key]

    # Cumulative exact matches
    for name, _feature in _EXACT_MATCH_FEATURES:
        intersection_num = exact_match_totals[name]['intersection_num']
        difference_num = exact_match_totals[name]['difference_num']
        annotator_results[f'cumulative_{name}_intersection_num'] = intersection_num
        annotator_results[f'cumulative_{name}_difference_num'] = difference_num
        annotator_results[f'cumulative_{name}_exact_match'] = _ratio(
            intersection_num, intersection_num + difference_num)

    # Append results to the list
    all_annotator_results.append(annotator_results)

# Convert the list of dictionaries into a DataFrame and concatenate it with the original DataFrame
all_annotator_results_df = pd.DataFrame(all_annotator_results, index=df.index)
df = pd.concat([df, all_annotator_results_df], axis=1)


In [None]:
# Full export: the original columns plus every computed metric.
df.to_csv('results/3_all_results_old_ground_truth_ibrahim.csv', index=False)

# Metrics-only export: remove the ten raw per-task annotation columns from df
# in place, then write what is left.
raw_annotation_columns = [f'task_{task_num}_full_annotation' for task_num in range(1, 11)]
df.drop(columns=raw_annotation_columns, inplace=True)

df.to_csv('results/4_only_metrics_old_ground_truth_ibrahim.csv', index=False)

In [None]:
# Also write the current df to an Excel workbook, without the index column.
df.to_excel('results/4_only_metrics_old_ground_truth_ibrahim.xlsx', index=False)