# MRL ST2024 MC Questions

## Create MC Format 

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import random, os
import plotly.express as px
BASE_PATH = Path('/home/dug/Dropbox/MRL/2024_export/')
OUT_REF = BASE_PATH / 'out'
OUT_PUBLIC = BASE_PATH / 'public'

os.makedirs(OUT_REF, exist_ok=True)
os.makedirs(OUT_PUBLIC, exist_ok=True)
os.makedirs(OUT_REF / 'MC', exist_ok=True)
os.makedirs(OUT_REF / 'QA', exist_ok=True)
os.makedirs(OUT_PUBLIC / 'MC', exist_ok=True)
os.makedirs(OUT_PUBLIC / 'QA', exist_ok=True)

char2idx = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
idx2char = {0: 'A', 1: 'B', 2: 'C', 3: 'D'}

OUT_REF

PosixPath('/home/dug/Dropbox/MRL/2024_export/out')

In [2]:
def shuffle_answers(answers):
    """shuffle the answers, keep true answer label

    Args:
        answers (list): 0th index is the correct answer, the rest are incorrect

    Returns:
        label (str): the correct answer label
        shuffled_answers (list): the shuffled answers
    """
    keys = ['A', 'B', 'C', 'D'] # A: correct, B-D: incorrect
    answer_choices = list(zip(keys, answers))
    rand_answers = random.sample(answer_choices, len(answer_choices))
    rand_answers = dict(rand_answers)
    labels = list(rand_answers.keys())

    label = labels.index('A') # Correct answer pos
    label = idx2char[label]

    return label, list(rand_answers.values())




In [3]:
langs = ['ALS', 'YO', 'TR', 'AZ', 'IG']
val_lens = [200, 200, 195, 200, 200] # remaining are for test

for i, lang in enumerate(langs):
    df = pd.read_csv(BASE_PATH / f'{lang}.csv')

    df = df[['annotation_id', 'text', 'question_updated', 'answer_updated', 'wrong_answer_1', 'wrong_answer_2', 'wrong_answer_3']]
    df.rename(columns={'question_updated': 'question', 'answer_updated': 'A', 'wrong_answer_1': 'B', 'wrong_answer_2': 'C', 'wrong_answer_3': 'D'}, inplace=True)
    
    # df for open QA
    df_open = df.copy()
    df_open['answer'] = df_open['A']
    df_open.drop(columns=['A', 'B', 'C', 'D'], inplace=True)
    
    # create MC format
    df['label'] = 'A'
    for index, row in df.iterrows():
        answers = [row['A'], row['B'], row['C'], row['D']]
        label, answers_shuffled = shuffle_answers(answers)
        
        assert answers[0] == answers_shuffled[char2idx[label]], f'issue with shuffling: \ngold label: {label}, \ngold answer: {answers[0]} \nAnswers shuffled: {answers_shuffled}'
        
        df.at[index, 'label'] = label
        df.at[index, 'A'] = answers_shuffled[0]
        df.at[index, 'B'] = answers_shuffled[1]
        df.at[index, 'C'] = answers_shuffled[2]
        df.at[index, 'D'] = answers_shuffled[3]

    
    # reference data
    open_test = df_open.loc[val_lens[i]:]
    open_val = df_open.loc[:val_lens[i]]
    print(f'Len ref open val: {len(open_val)}')
    df_test = df.loc[val_lens[i]:]
    df_val = df.loc[:val_lens[i]]
    print(f'Len ref MC val: {len(df_val)}')
    
    open_test.to_csv(OUT_REF / 'QA' / f'QA_{lang}_reference_test.csv', index=False)
    open_val.to_csv(OUT_REF / 'QA' / f'QA_{lang}_reference_val.csv', index=False)
    df_test.to_csv(OUT_REF / 'MC' / f'MC_{lang}_reference_test.csv', index=False)
    df_val.to_csv(OUT_REF / 'MC' / f'MC_{lang}_reference_val.csv', index=False)
    
    print(f'Ref: {lang} (val={val_lens[i]}): {len(open_test)}, {len(open_val)}, {len(df_test)}, {len(df_val)}')
    
    
    # prediction data
    print(f'DF columns: {df.columns.tolist()}')
    unlabeled_open = df.drop(columns=['A', 'B', 'C', 'D', 'label'])
    unlabeled_open['prediction'] = ''
    df_unlabeled = df.drop(columns=['label'])
    df_unlabeled['prediction'] = ''
    
    open_test = unlabeled_open.loc[val_lens[i]:]
    open_val = unlabeled_open.loc[:val_lens[i]]
    df_test = df_unlabeled.loc[val_lens[i]:]
    df_val = df_unlabeled.loc[:val_lens[i]]
    
    assert 'label' not in open_test.columns.tolist(), 'label column should not be in the open test set'
    assert 'label' not in open_val.columns.tolist(), 'label column should not be in the open val set'
    assert 'label' not in df_test.columns.tolist(), 'label column should not be in the MC test set'
    assert 'label' not in df_val.columns.tolist(), 'label column should not be in the MC val set'
    
    open_test.to_csv(OUT_PUBLIC / 'QA' / f'QA_{lang}_test.predict', index=False)
    open_val.to_csv(OUT_PUBLIC / 'QA' / f'QA_{lang}_val.predict', index=False)
    df_test.to_csv(OUT_PUBLIC / 'MC' / f'MC_{lang}_test.predict', index=False)
    df_val.to_csv(OUT_PUBLIC / 'MC' / f'MC_{lang}_val.predict', index=False)
    
    print(f'Pred / Public: {lang} (val={val_lens[i]}): {len(open_test)}, {len(open_val)}, {len(df_test)}, {len(df_val)}')
    
    hist = px.histogram(df, x="label", nbins=4)
    
    
    
    

hist.show()


Len ref open val: 201
Len ref MC val: 201
Ref: ALS (val=200): 650, 201, 650, 201
DF columns: ['annotation_id', 'text', 'question', 'A', 'B', 'C', 'D', 'label']
Pred / Public: ALS (val=200): 650, 201, 650, 201
Len ref open val: 201
Len ref MC val: 201
Ref: YO (val=200): 672, 201, 672, 201
DF columns: ['annotation_id', 'text', 'question', 'A', 'B', 'C', 'D', 'label']
Pred / Public: YO (val=200): 672, 201, 672, 201
Len ref open val: 196
Len ref MC val: 196
Ref: TR (val=195): 147, 196, 147, 196
DF columns: ['annotation_id', 'text', 'question', 'A', 'B', 'C', 'D', 'label']
Pred / Public: TR (val=195): 147, 196, 147, 196
Len ref open val: 201
Len ref MC val: 201
Ref: AZ (val=200): 290, 201, 290, 201
DF columns: ['annotation_id', 'text', 'question', 'A', 'B', 'C', 'D', 'label']
Pred / Public: AZ (val=200): 290, 201, 290, 201
Len ref open val: 201
Len ref MC val: 201
Ref: IG (val=200): 747, 201, 747, 201
DF columns: ['annotation_id', 'text', 'question', 'A', 'B', 'C', 'D', 'label']
Pred / Publ

## Evaluate MC Answers

In [10]:
y_true = df['label'].to_list()
answer_options = ['A', 'B', 'C', 'D']
y_pred = random.choices(answer_options, k=len(y_true))

len(y_true), len(y_pred)


(747, 747)

In [11]:
import sklearn.metrics as metrics

accuracy = metrics.accuracy_score(y_true, y_pred, normalize=True)
f1 = metrics.f1_score(y_true, y_pred, average='micro')
precision = metrics.precision_score(y_true, y_pred, average='micro')
recall = metrics.recall_score(y_true, y_pred, average='micro')

recall

0.2222222222222222