In [26]:
import pandas as pd
import os

df_10k = pd.read_csv('../data/askme-qa/raw_data_10k.csv')
base_dir = '../data/hitl'
annotators = ['HITL_Alex', 'HITL_Atheer', 'HITL_Hend', 'HITL_Xiaoyuan']
file_names = ['Copy of answers_samples.xlsx', 'Copy of questions_samples.xlsx', 'Copy of questions_zs_samples.xlsx']

# Initialize empty DataFrames for each type of samples
df_answers = pd.DataFrame()
df_questions = pd.DataFrame()
df_questions_zs = pd.DataFrame()

# Iterate over each annotator and each file
for annotator in annotators:
    for file_name in file_names:
        file_path = os.path.join(base_dir, annotator, file_name)
        df = pd.read_excel(file_path)

        # Drop "instructions" column and unnamed columns
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        if 'instructions' in df.columns:
            df = df.drop(columns=['instructions'])
        if 'Comments' in df.columns:
            df = df.drop(columns=['Comments'])
        # Drop columns with names that are not english
        df = df.drop(columns=[col for col in df.columns if not col.isascii()])

        # Rename "annotation" column to include the annotator's name
        if 'annotation' in df.columns:
            annotator_name = annotator.split('_')[-1].lower()
            df = df.rename(columns={'annotation': f'annotation_{annotator_name}'})

        # Concatenate data to respective DataFrames
        if 'answers' in file_name:
            df_answers = pd.concat([df_answers, df], ignore_index=True)
        elif 'questions_zs' in file_name:
            df_questions_zs = pd.concat([df_questions_zs, df], ignore_index=True)
        else:  # 'questions' in file_name
            df_questions = pd.concat([df_questions, df], ignore_index=True)

df_answers = df_answers.merge(df_10k[['id_answer', 'value']], left_on='answer_id', right_on='id_answer', how='left').drop(columns=['id_answer'])
df_questions_zs = df_questions_zs.merge(df_10k[['question_id', 'is_answerable_zs']], on='question_id', how='left')
df_questions = df_questions.merge(df_10k[['question_id', 'is_answerable_ic']], on='question_id', how='left')

df_questions = df_questions.rename(columns={'is_answerable_ic': 'annotation_llama3_70b'})
df_questions_zs = df_questions_zs.rename(columns={'is_answerable_zs': 'annotation_llama3_70b'})
df_answers = df_answers.rename(columns={'value': 'annotation_llama3_70b'})

# replace Y with True and N with False
df_questions['annotation_llama3_70b'] = df_questions['annotation_llama3_70b'].replace({'Y': True, 'N': False})
df_questions_zs['annotation_llama3_70b'] = df_questions_zs['annotation_llama3_70b'].replace({'Y': True, 'N': False})
df_answers['annotation_llama3_70b'] = df_answers['annotation_llama3_70b'].replace({'Y': True, 'N': False})

df_questions.to_csv('../data/hitl/questions.csv', index=False)
df_questions_zs.to_csv('../data/hitl/questions_zs.csv', index=False)
df_answers.to_csv('../data/hitl/answers.csv', index=False)

In [25]:
df_questions_zs

Unnamed: 0,question_id,question,annotation_alex,annotation_atheer,annotation_hend,annotation_xiaoyuan,annotation_llama3_70b,annotation_llama3_70b.1
0,35859,"Who provides vocals for the song ""Shivers""?",Y,,,,False,False
1,35859,"Who provides vocals for the song ""Shivers""?",Y,,,,False,False
2,35859,"Who provides vocals for the song ""Shivers""?",Y,,,,False,False
3,35859,"Who provides vocals for the song ""Shivers""?",Y,,,,False,False
4,29022,Who responded enthusiastically to Kevin Spacey...,Y,,,,True,True
...,...,...,...,...,...,...,...,...
1595,23389,What was the amount offered by Ralston Purina ...,,,,Y,False,False
1596,32187,What is the title of the song that features a ...,,,,Y,False,False
1597,32187,What is the title of the song that features a ...,,,,Y,False,False
1598,32187,What is the title of the song that features a ...,,,,Y,False,False
