# In [None]:
import pandas as pd
import random
import string
from collections import Counter

datasets = ['MCQA-train.csv', 'MCQA-validation.csv', 'MCQA-test.csv']

# Each CSV is loaded, has its answer options shuffled within every row and is
# cleaned here; the remainder of the loop body labels the rows and saves them.
for dataset in datasets:
  df = pd.read_csv(dataset)

  # Shuffle the five option cells of every row independently, in place.
  # No seed is set, so the resulting order differs from run to run.
  option_columns = ['A', 'B', 'C', 'D', 'E']
  shuffled_rows = df[option_columns].values.tolist()
  for options in shuffled_rows:
    random.shuffle(options)

  # Write the shuffled options back over the original columns.
  df.loc[:, option_columns] = pd.DataFrame(shuffled_rows).values

  # Drop 'answer_alternative' (the original comment calls it the wrong column;
  # presumably it is stale once the options have been shuffled).
  df = df.drop(columns='answer_alternative')

  # Strip carriage returns, newlines and tabs from every string cell.
  df = df.replace(r'\r+|\n+|\t+', '', regex=True)

  # F1 score
  def normalize_answer(s):
    """Return ``s`` lower-cased, without ASCII punctuation, single-spaced.

    The steps run in this order: lower-case the text, delete every character
    listed in ``string.punctuation``, then collapse all whitespace runs
    (including leading and trailing whitespace) into single spaces.
    """
    lowered = s.lower()
    # A deletion table removes all punctuation characters in one pass.
    punctuation_free = lowered.translate(str.maketrans('', '', string.punctuation))
    # split() without arguments discards empty pieces, so re-joining with a
    # single space also trims both ends.
    return ' '.join(punctuation_free.split())

  def f1_score(prediction, ground_truth):
    """Return the token-level F1 overlap of two answers as a percentage.

    Both inputs are normalised with ``normalize_answer`` and split on
    whitespace. Tokens are matched as multisets, so a repeated word only
    counts as often as it appears on both sides.

    Args:
      prediction: Candidate answer (normally a string).
      ground_truth: Reference answer (normally a string).

    Returns:
      A float between 0.0 and 100.0. It is 0.0 when the answers share no
      token, which includes the case where either value is missing.
    """
    def to_tokens(value):
      # Empty CSV cells are read by pandas as NaN (a float), which has no
      # .lower(); treat any missing value as an answer without tokens
      # instead of crashing inside normalize_answer.
      if not isinstance(value, str):
        if pd.isna(value):
          return []
        # Other non-string scalars (e.g. numeric cells) are compared as text.
        value = str(value)
      return normalize_answer(value).split()

    prediction_tokens = to_tokens(prediction)
    ground_truth_tokens = to_tokens(ground_truth)

    # Counter intersection keeps, per token, the smaller of the two counts.
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
      # Also protects the divisions below from empty token lists.
      return 0.0

    precision = num_same / len(prediction_tokens)
    recall = num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall) * 100

  # Score every option against the text in the 'correct' column, storing the
  # results in temporary 'value_<letter>' columns.
  score_columns = []
  for letter in ['A', 'B', 'C', 'D', 'E']:
    score_column = 'value_' + letter
    df[score_column] = df.apply(
        lambda row, letter=letter: f1_score(row[letter], row['correct']),
        axis=1)
    score_columns.append(score_column)

  # The option letter is the last character of the best-scoring column name.
  # NOTE(review): idxmax returns the first column holding the row maximum, so
  # ties, including rows where every score is 0, are labelled with the
  # earliest letter.
  df['alternative'] = df[score_columns].idxmax(axis=1).str[-1]

  # The scores were only needed to pick the letter.
  df = df.drop(columns=score_columns)

  # NOTE(review): this overwrites the input file and, with to_csv's default
  # index=True, also writes the row index as an extra unnamed first column.
  df.to_csv(dataset)