In [50]:
import pandas as pd
import os

# Load the master dataframe
df_10k = pd.read_csv('../data/askme-qa/raw_data_10k.csv')

# Directory structure
base_dir = '../data/hitl'
annotators = ['HITL_Alex', 'HITL_Atheer', 'HITL_Hend', 'HITL_Xiaoyuan']
file_names = ['Copy of answers_samples.xlsx', 'Copy of questions_samples.xlsx', 'Copy of questions_zs_samples.xlsx']

# Initialize empty DataFrames for each type of samples
df_answers = None
df_questions = None
df_questions_zs = None

# Iterate over each annotator and each file
for annotator in annotators:
    for file_name in file_names:
        file_path = os.path.join(base_dir, annotator, file_name)
        df = pd.read_excel(file_path)

        # Drop "instructions" column and unnamed columns
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        if 'instructions' in df.columns:
            df = df.drop(columns=['instructions'])
        if 'Comments' in df.columns:
            df = df.drop(columns=['Comments'])
        # Drop columns with names that are not in English
        df = df.drop(columns=[col for col in df.columns if not col.isascii()])

        # Rename "annotation" column to include the annotator's name
        if 'annotation' in df.columns:
            annotator_name = annotator.split('_')[-1].lower()
            df = df.rename(columns={'annotation': f'annotation_{annotator_name}'})
            annotation_col = df[[f'annotation_{annotator_name}']]  # Select only the renamed annotation column

            # Merge data to respective DataFrames horizontally
            if 'answers' in file_name:
                if df_answers is None:
                    df_answers = df[['answer_id']].join(annotation_col)
                else:
                    df_answers = df_answers.join(annotation_col, rsuffix=f'_{annotator_name}')
            elif 'questions_zs' in file_name:
                if df_questions_zs is None:
                    df_questions_zs = df[['question_id']].join(annotation_col)
                else:
                    df_questions_zs = df_questions_zs.join(annotation_col, rsuffix=f'_{annotator_name}')
            else:  # 'questions' in file_name
                if df_questions is None:
                    df_questions = df[['question_id']].join(annotation_col)
                else:
                    df_questions = df_questions.join(annotation_col, rsuffix=f'_{annotator_name}')

# Append additional fields from df_10k
df_answers = df_answers.merge(df_10k[['id_answer', 'value']], left_on='answer_id', right_on='id_answer', how='left').drop(columns=['id_answer'])
df_questions_zs = df_questions_zs.merge(df_10k[['question_id', 'is_answerable_zs']], on='question_id', how='left')
df_questions = df_questions.merge(df_10k[['question_id', 'is_answerable_ic']], on='question_id', how='left')

# Rename columns
df_questions = df_questions.rename(columns={'is_answerable_ic': 'annotation_llama3_70b'})
df_questions_zs = df_questions_zs.rename(columns={'is_answerable_zs': 'annotation_llama3_70b'})
df_answers = df_answers.rename(columns={'value': 'annotation_llama3_70b'})

# Replace Y with True and N with False for all annotation columns
for col in df_questions.columns:
    if col.startswith('annotation_'):
        df_questions[col] = df_questions[col].replace({'Y': True, 'N': False})
for col in df_questions_zs.columns:
    if col.startswith('annotation_'):
        df_questions_zs[col] = df_questions_zs[col].replace({'Y': True, 'N': False})
for col in df_answers.columns:
    if col.startswith('annotation_'):
        df_answers[col] = df_answers[col].replace({'Y': True, 'N': False})

# Save the updated DataFrames to CSV files
df_questions.to_csv('../data/hitl/questions.csv', index=False)
df_questions_zs.to_csv('../data/hitl/questions_zs.csv', index=False)
df_answers.to_csv('../data/hitl/answers.csv', index=False)


In [51]:
df_questions

Unnamed: 0,question_id,annotation_alex,annotation_atheer,annotation_hend,annotation_xiaoyuan,annotation_llama3_70b
0,36261,True,,True,True,True
1,36261,True,,True,True,True
2,39026,True,,True,True,True
3,39026,True,,True,True,True
4,4635,True,,True,True,True
...,...,...,...,...,...,...
195,3509,False,True,,False,True
196,2839,True,False,,True,True
197,2839,True,False,,True,True
198,33316,True,True,,False,True


In [52]:
# Function to calculate average correlation between human annotators
def average_human_correlation(df):
    annotator_columns = [col for col in df.columns if col.startswith('annotation_') and col != 'annotation_llama3_70b']
    n = len(annotator_columns)
    correlations = []

    for i in range(n):
        for j in range(i + 1, n):
            common_data = df[[annotator_columns[i], annotator_columns[j]]].dropna()
            if not common_data.empty and common_data[annotator_columns[i]].nunique() > 1 and common_data[annotator_columns[j]].nunique() > 1:
                corr, _ = pearsonr(common_data[annotator_columns[i]], common_data[annotator_columns[j]])
                correlations.append(corr)

    return sum(correlations) / len(correlations) if correlations else 0

# Function to calculate average correlation between human annotators and machine annotator
def average_human_machine_correlation(df):
    human_columns = [col for col in df.columns if col.startswith('annotation_') and col != 'annotation_llama3_70b']
    machine_column = 'annotation_llama3_70b'
    correlations = []

    for human_col in human_columns:
        common_data = df[[human_col, machine_column]].dropna()
        if not common_data.empty and common_data[human_col].nunique() > 1 and common_data[machine_column].nunique() > 1:
            corr, _ = pearsonr(common_data[human_col], common_data[machine_column])
            correlations.append(corr)

    return sum(correlations) / len(correlations) if correlations else 0

# Calculate correlations for each type of sample
avg_human_corr_answers = average_human_correlation(df_answers)
avg_human_machine_corr_answers = average_human_machine_correlation(df_answers)

avg_human_corr_questions = average_human_correlation(df_questions)
avg_human_machine_corr_questions = average_human_machine_correlation(df_questions)

avg_human_corr_questions_zs = average_human_correlation(df_questions_zs)
avg_human_machine_corr_questions_zs = average_human_machine_correlation(df_questions_zs)

# Print the results
print("Average Human Correlation for Answers:", avg_human_corr_answers)
print("Average Human-Machine Correlation for Answers:", avg_human_machine_corr_answers)

print("Average Human Correlation for Questions:", avg_human_corr_questions)
print("Average Human-Machine Correlation for Questions:", avg_human_machine_corr_questions)

print("Average Human Correlation for Questions ZS:", avg_human_corr_questions_zs)
print("Average Human-Machine Correlation for Questions ZS:", avg_human_machine_corr_questions_zs)


Average Human Correlation for Answers: 0.8512799168548875
Average Human-Machine Correlation for Answers: 0.76844814051456
Average Human Correlation for Questions: 0.09792867290069808
Average Human-Machine Correlation for Questions: 0.0442966417395686
Average Human Correlation for Questions ZS: 0.2775558123462398
Average Human-Machine Correlation for Questions ZS: 0.29006448642906507
