In [11]:
import pandas as pd
import os

# Directory structure: <base_dir>/<annotator>/<file_name>
base_dir = '../data/hitl'
annotators = ['HITL_Alex', 'HITL_Atheer', 'HITL_Hend', 'HITL_Xiaoyuan']
file_names = ['Copy of answers_samples.xlsx', 'Copy of questions_samples.xlsx', 'Copy of questions_zs_samples.xlsx']

# Cleaned per-annotator frames, grouped by sample type. They are collected in
# lists and concatenated once after the loop: growing a DataFrame with
# pd.concat on every iteration re-copies all accumulated rows each time, and
# seeding it from an empty DataFrame triggers a FutureWarning in recent pandas.
answers_frames = []
questions_frames = []
questions_zs_frames = []

# Iterate over each annotator and each file
for annotator in annotators:
    # 'HITL_Alex' -> 'alex'; used as the suffix of the annotation column
    annotator_name = annotator.split('_')[-1].lower()

    for file_name in file_names:
        file_path = os.path.join(base_dir, annotator, file_name)
        df = pd.read_excel(file_path)

        # Drop the unnamed columns. Labels are cast to str first so a
        # non-string header (e.g. a numeric one) cannot break the .str accessor.
        df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed')]
        # Drop the "instructions" and "Comments" columns when they are present
        df = df.drop(columns=['instructions', 'Comments'], errors='ignore')
        # Drop columns whose names contain non-ASCII characters
        df = df.drop(columns=[col for col in df.columns if not str(col).isascii()])

        # Rename "annotation" to include the annotator's name; rename() is a
        # no-op when the column is absent.
        df = df.rename(columns={'annotation': f'annotation_{annotator_name}'})

        # Route the frame to its sample type. The generic 'questions' case
        # stays last because 'questions_zs' file names also contain 'questions'.
        if 'answers' in file_name:
            answers_frames.append(df)
        elif 'questions_zs' in file_name:
            questions_zs_frames.append(df)
        else:  # 'questions' in file_name
            questions_frames.append(df)

# Stack the rows of all annotators for each sample type. Columns missing from
# a given annotator's frame (the other annotators' annotation columns) are
# filled with NaN. pd.concat raises on an empty list, so fall back to an empty
# DataFrame in that case.
df_answers = pd.concat(answers_frames, ignore_index=True) if answers_frames else pd.DataFrame()
df_questions = pd.concat(questions_frames, ignore_index=True) if questions_frames else pd.DataFrame()
df_questions_zs = pd.concat(questions_zs_frames, ignore_index=True) if questions_zs_frames else pd.DataFrame()



In [12]:
# Show the combined answers frame built in the loading cell. Rows from all
# annotators are stacked, so each annotation_<name> column is only filled on
# that annotator's rows and is NaN elsewhere.
df_answers

Unnamed: 0,question_id,answer_id,fact,question,answer,annotation_alex,annotation_atheer,annotation_hend,annotation_xiaoyuan
0,39710,79419,"In an article about 'Treasure Planet', section...",What company released a line-up of action figu...,The company that released a line-up of action ...,1.0,,,
1,5328,10655,"In an article about 'We Can 't Stop', section ...","How many copies of ""We Can't Stop"" had been so...","As of January 2014, ""We Can't Stop"" by Miley C...",2.0,,,
2,16513,33025,In an article about 'Tropical Storm Marco ( 20...,Where did the center of Tropical Storm Marco m...,"According to records, the center of Tropical S...",1.0,,,
3,19669,39337,In an article about 'California Southern Railr...,Where is the preserved California Southern Rai...,The preserved California Southern Railroad sta...,1.0,,,
4,16778,33555,"In an article about 'Vincent van Gogh', sectio...",What activities did Vincent van Gogh's family ...,"During their time in Zundert, Vincent van Gogh...",4.0,,,
...,...,...,...,...,...,...,...,...,...
395,25949,51897,In an article about 'Gloucester Road tube stat...,Which two companies jointly operated the Middl...,The Metropolitan Railway and the District Rail...,,,,1.0
396,9880,19759,"In an article about 'Bennie Oosterbaan', secti...",What position did Bennie Oosterbaan hold on th...,Bennie Oosterbaan played end (wide receiver) d...,,,,1.0
397,28776,57551,"In an article about 'Forza Motorsport 4', sect...","According to Turn 10, what features of the UNS...","According to Turn 10, the developers of Forza ...",,,,5.0
398,8110,16219,"In an article about 'Panzer 35 ( t )', section...",What is the diameter of the Panzer 35(t)'s tur...,The diameter of the Panzer 35(t)'s turret ring...,,,,1.0
