# Finetuning QA model

User-defined fine-tuning parameters in variables:

1. **qa_model**: name of the model to finetune. Change only when you want to fine tune a particular model checkpoint.

2. **data_path**: path to the CSV file used for training and validation.

3. **data_path_eval**: path to the CSV file on which the checkpoints in save_path_eval are evaluated.

4. **save_path**: the path to save the checkpoints during finetune.

5. **save_path_eval**: the path containing checkpoints to evaluate

6. **batch_size**: Batch size for finetuning. default=32

7. **split**: shares of the data_path CSV assigned to the train, validation and test sets for fine-tuning, given as a list such as [80, 20, 0].

8. **mode**: selects what the notebook does. Use 'ft+eval_same' to fine-tune and evaluate on the same data, 'ft+eval_custom' to fine-tune on one dataset and evaluate on a different one, and 'eval_only' to evaluate existing checkpoints on a given dataset without fine-tuning.

## FOR FINETUNE AND VAL BOTH('ft+eval_same', 'ft+eval_custom'):

  File Generation and preprocessing---> Imports ----> Variables(set required mode-'ft+eval_same' or'ft+eval_custom')---->FineTuning---->Validation csv Generation---> 4 csv generation---> QA evaluation

## FOR VAL only ('eval_only'):

  File Generation and preprocessing---> Imports ----> Variables(set required mode-'eval_only')---->Validation csv Generation---> 4 csv generation---> QA evaluation






## File Generation and Preprocessing

In [None]:
import os
import numpy as np

In [None]:
'''
The 'para_data' and 'question_ans' variables take the files provided in the 
first stage of evaluation, i.e., in morning
'''


import pandas as pd
# Paragraph mapping file
para_data = pd.read_csv('https://drive.google.com/uc?export=download&id=1GNHhH81J1pEZSB-OSGRTtqIUQCCNhJ6I')
# Questions mapping file
question_ans = pd.read_csv('https://drive.google.com/uc?export=download&id=1GQ3-E7k60K16f47A18ptQfKUBkhxQudM')

In [None]:
df = question_ans
df2 = para_data

In [None]:
# Attach to every question the text of the paragraph it refers to.
para_id_list = df.paragraph_id.tolist()
paras = [
    df2['paragraph'][df2.id == para_id].tolist()[0]
    for para_id in para_id_list
]
df['paragraph'] = paras

# Switch to the column names used by the rest of the notebook.
df.rename(
    columns={
        'question': 'Question',
        'theme': 'Theme',
        'answer': 'Answer_text',
        'paragraph': 'Paragraph',
    },
    inplace=True,
)

# Every row is flagged as answerable.
poss = ['TRUE'] * len(df)
df['Answer_possible'] = poss

# Character offset of the answer inside its paragraph, wrapped in a
# one-element list (str.find yields -1 when the answer text is not found).
start = [
    [paragraph.find(str(answer))]
    for paragraph, answer in zip(df['Paragraph'], df['Answer_text'])
]
df['Answer_start'] = start

# Answers are stored as one-element lists as well.
answers = df['Answer_text'].tolist()
ans = [[answer] for answer in answers]
df['Answer_text'] = ans

df.to_csv('/content/finaldataprocessed.csv')

In [None]:
csv_save_path='/content/finaldataprocessed.csv'

In [None]:
df = pd.read_csv(csv_save_path)

In [None]:
df = df.rename(columns={'Unnamed: 0':'id'})
df['id'] =df['id']+1
df.to_csv('/content/finaldataprocessed.csv')

In [None]:
if not os.path.exists('/content/Final_data'): os.mkdir('/content/Final_data')

In [None]:
m='/content/Final_data'
df = pd.read_csv('/content/finaldataprocessed.csv')

In [None]:
paras = df.Paragraph.unique()

In [None]:
df

In [None]:
df.columns

## Imports

In [None]:
# Install the Transformers, Datasets, and Evaluate libraries to run this notebook.
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

import torch
import pandas as pd
from transformers import AutoModelForQuestionAnswering
import collections
import evaluate
from tqdm.auto import tqdm
import datasets
from transformers import AutoTokenizer

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
!pip install gdown

In [None]:
import gdown

url = "https://drive.google.com/uc?export=download&id=19dxbN5KR2O0eVA4XA76PVD5HJVszm3Nk"
output = "synth_data_stage1.csv"
gdown.download(url, output)

## Variables

In [None]:
# mode: 'ft+eval_same', 'ft+eval_custom' or 'eval_only'
#   'ft+eval_same'   - fine-tune and evaluate on the same dataset
#   'ft+eval_custom' - fine-tune on one dataset, evaluate on another
#   'eval_only'      - only evaluate existing checkpoints


mode='eval_only'
data_path = ''
split = [80, 20, 0]

if mode == 'ft+eval_same':
  # QA model to fine-tune and evaluate
  qa_model = 'mrm8488/electra-small-finetuned-squadv2'
  # Dataset to fine-tune on
  data_path = csv_save_path
  # [train, validation, test] shares of the dataset
  split_ft = [80,20,0]
  split = split_ft
  # Checkpoints save_path for saving model checkpoints during fine tuning
  save_path = 'electra-small-finetuned-stage1_'

elif mode == 'ft+eval_custom':
  # QA model to fine-tune/evaluate
  qa_model = 'mrm8488/electra-small-finetuned-squadv2'
  # Dataset to fine-tune on
  data_path = csv_save_path
  # Dataset to evaluate on
  data_path_eval='synth_data_stage1.csv'
  # [train, validation, test] shares for fine-tuning and for evaluation
  split_ft = [80,20,0]
  split = split_ft
  split_eval=[0,100,0]
  # Checkpoints save_path for model checkpoints for fine tuning
  save_path = 'electra-small-finetuned-stage1_'
  # Checkpoints path for evaluation.
  # This path should have one or more folders containing model checkpoints
  save_path_eval = 'electra-small-finetuned-stage1_'

elif mode == 'eval_only':
  # QA model to evaluate
  qa_model = 'mrm8488/electra-small-finetuned-squadv2'
  # Dataset to evaluate on
  data_path_eval='synth_data_stage1.csv'
  data_path = data_path_eval
  split_eval=[0,100,0] #default----don't change if you want to validate whole of your data
  # Checkpoints path for evaluation.
  # This path should have one or more folders containing model checkpoints
  # save_path_eval = '/content/drive/MyDrive/Assets/Finetuned-QA/electra-small-finetuned-stage1_Abhijitbatch_size_32'
  save_path_eval = 'electra-small-finetuned-stage1_'

else:
  # Fail here rather than with a NameError in a later cell when the mode
  # string is mistyped.
  raise ValueError(
      f"Unknown mode {mode!r}; expected 'ft+eval_same', 'ft+eval_custom' or 'eval_only'"
  )

batch_size=32


## Fine Tuning (To be run only during fine tune)

### Helpers

In [None]:
# Preprocess the validation examples
def preprocess_validation_examples(examples):
    """Tokenize a batch of question/context pairs for evaluation.

    Uses the module-level ``tokenizer``, ``max_length`` and ``stride``.
    Only the context (second sequence) is truncated, and overflowing
    tokens are returned as additional features.

    Returns the tokenizer output with two changes: ``offset_mapping``
    entries of tokens whose sequence id is not 1 are replaced by ``None``,
    and an ``example_id`` list maps every feature back to the ``id`` of
    the example it came from.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    feature_to_sample = inputs.pop("overflow_to_sample_mapping")
    source_ids = examples["id"]
    example_ids = []

    for feature_idx in range(len(inputs["input_ids"])):
        example_ids.append(source_ids[feature_to_sample[feature_idx]])

        # Keep character offsets only for tokens with sequence id 1.
        segment_ids = inputs.sequence_ids(feature_idx)
        inputs["offset_mapping"][feature_idx] = [
            span if segment_ids[token_idx] == 1 else None
            for token_idx, span in enumerate(inputs["offset_mapping"][feature_idx])
        ]

    inputs["example_id"] = example_ids
    return inputs
# Preprocessing the training examples
def preprocess_training_examples(examples):
    """Tokenize a batch of training examples and label answer token spans.

    Uses the module-level ``tokenizer``, ``max_length`` and ``stride``.
    Every produced feature receives a ``start_positions`` / ``end_positions``
    pair. The pair is (0, 0) when the example has no ``answer_start`` entry
    or when the first answer does not lie fully inside the feature's span of
    tokens with sequence id 1.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions, end_positions = [], []

    def add_label(start_token, end_token):
        start_positions.append(start_token)
        end_positions.append(end_token)

    for feature_idx, offsets in enumerate(offset_mapping):
        answer = answers[sample_map[feature_idx]]
        if len(answer['answer_start']) == 0:
            add_label(0, 0)
            continue

        # Only the first listed answer is used for the label.
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(feature_idx)

        # Locate the first and last token whose sequence id is 1.
        cursor = 0
        while sequence_ids[cursor] != 1:
            cursor += 1
        context_start = cursor
        while sequence_ids[cursor] == 1:
            cursor += 1
        context_end = cursor - 1

        # Answer not fully inside this feature's context: label is (0, 0).
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            add_label(0, 0)
            continue

        # Walk forward past the last token starting at or before start_char ...
        first = context_start
        while first <= context_end and offsets[first][0] <= start_char:
            first += 1
        # ... and backward past the first token ending at or after end_char.
        last = context_end
        while last >= context_start and offsets[last][1] >= end_char:
            last -= 1
        add_label(first - 1, last + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


# To compute the metrics for question answering transformer frameworks
def compute_metrics(start_logits, end_logits, features, examples):
    """Turn start/end logits into text predictions and score them.

    Relies on the module-level ``n_best``, ``max_answer_length`` and
    ``metric`` objects.

    Args:
        start_logits: per-feature start logits, indexed like ``features``.
        end_logits: per-feature end logits, indexed like ``features``.
        features: tokenized features carrying ``example_id`` and
            ``offset_mapping`` (``None`` for tokens outside the context).
        examples: original examples with ``id``, ``context`` and ``answers``.

    Returns:
        Whatever ``metric.compute`` returns for the predictions and references.
    """
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []
        null_scores = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            # Score of predicting position 0 for both ends, taken from THIS
            # example's feature (previously feature 0 was used for everyone).
            null_scores.append(start_logit[0] + end_logit[0])

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answers.append(
                        {
                            "text": context[offsets[start_index][0] : offsets[end_index][1]],
                            "logit_score": start_logit[start_index] + end_logit[end_index],
                        }
                    )

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            # Lowest null score over the example's features.
            score_null = min(null_scores)
            score_diff = score_null - best_answer["logit_score"]
            predicted_answers.append(
                {
                    "id": example_id,
                    "prediction_text": best_answer["text"],
                    'no_answer_probability': float(score_diff),
                }
            )
        else:
            # Same keys as the branch above so every prediction record has
            # an identical schema.
            predicted_answers.append(
                {"id": example_id, "prediction_text": "", 'no_answer_probability': 1.0}
            )

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)


In [None]:
#load the model and the respective tokenizer from the checkpoint link in variable qa_model

model_checkpoint = qa_model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.is_fast

In [None]:
# Split the dataset theme by theme according to `split`, which holds the
# [train, validation, test] shares.

from ast import literal_eval

full_data = pd.read_csv(data_path)
# Number the rows 0..n-1; the value (converted to a string below) becomes
# each sample's id. A plain list is used so the builtin `id` stays intact.
full_data['Unnamed: 0'] = list(range(len(full_data)))
themes = full_data.Theme.unique()
# Parse the string form of these columns back into Python objects.
full_data.Answer_start = full_data.Answer_start.apply(literal_eval)
full_data.Answer_text = full_data.Answer_text.apply(literal_eval)
full_data['Unnamed: 0'] = full_data['Unnamed: 0'].astype(str)
train_samples = []
dev_samples = []
test_samples = []

for theme in themes:
  theme_df = full_data[full_data['Theme']==theme]
  n = len(theme_df)
  # Row positions (within this theme) at which the train and validation
  # portions end; computed once per theme instead of once per row.
  train_end = int(split[0]*n/sum(split))
  dev_end = int((split[0]+split[1])*n/sum(split))
  for i, (_, theme_row) in enumerate(theme_df.iterrows()):
    # Named `sample` so the builtin `input` is not shadowed.
    sample = {
              'answers': {'answer_start':theme_row['Answer_start'],'text':theme_row['Answer_text']},
              'context':theme_row['Paragraph'],
              'id':theme_row['Unnamed: 0'],
              'question': theme_row['Question'],
              'title': theme_row['Theme']
          }
    if i < train_end:
      train_samples.append(sample)
    elif i < dev_end:
      dev_samples.append(sample)
    else:
      test_samples.append(sample)

In [None]:
# dev_samples

In [None]:
#dataset for the trainer #dataset preprocessing to calculate the specific squadV2 metrics

train_dataset2 = datasets.Dataset.from_list(train_samples)
val_dataset2 = datasets.Dataset.from_list(dev_samples)

In [None]:
# Tokenized datasets for the trainer, plus the metric and the model.

max_length, stride = 384, 128

# Shared map() options: process in batches and drop the raw columns.
train_map_options = {"batched": True, "remove_columns": train_dataset2.column_names}
val_map_options = {"batched": True, "remove_columns": val_dataset2.column_names}

# Training split with start/end labels.
train_dataset = train_dataset2.map(preprocess_training_examples, **train_map_options)

# Training split again, this time in the evaluation format.
train_dataset_eval = train_dataset2.map(preprocess_validation_examples, **train_map_options)

# Validation split in the evaluation format.
validation_dataset = val_dataset2.map(preprocess_validation_examples, **val_map_options)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

metric = evaluate.load("squad_v2")
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

In [None]:
from transformers import TrainingArguments

# Training configuration for the fine-tuning run.
# `save_path` (from the Variables cell) is passed as the first positional
# argument; evaluation, logging and checkpoint saving are all set to "epoch".
args = TrainingArguments(
    save_path,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,  # batch_size comes from the Variables cell
    num_train_epochs=1,
    weight_decay=0.01,
    fp16=False,
    push_to_hub=False,
    # NOTE(review): presumably has no effect while logging_strategy is "epoch" - confirm
    logging_steps=100
)

In [None]:
from transformers import Trainer

# Build the trainer. Note that eval_dataset is `train_dataset_eval` (the
# training split in evaluation format); `validation_dataset` is not passed here.
# NOTE(review): compute_metrics in this notebook takes four positional
# arguments (start_logits, end_logits, features, examples) - confirm that the
# Trainer is expected to call it with that signature.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset_eval,
    tokenizer=tokenizer,
    compute_metrics = compute_metrics
)
# The string below is disabled code, kept for reference; it also holds the
# only assignments of n_best and max_answer_length in this part of the notebook.
'''
#generate base results
import numpy as np
n_best=20
max_answer_length = 30
predictions,_,_ = trainer.predict(train_dataset)
start_logits, end_logits = predictions
metrics=compute_metrics(start_logits, end_logits, train_dataset, train_dataset2)
print(metrics)
df=pd.DataFrame(metrics, index=[0])'''

In [None]:
#Start Fine Tune

trainer.train()

In [None]:
# trainer.push_to_hub(commit_message="Training complete")

## Validation CSV Generation

In [None]:
# Choose the dataset and split used to build the validation CSV.
if mode == 'ft+eval_same':
  # Same data as fine-tuning: data_path is left as it is.
  split = split_ft
elif mode in ('ft+eval_custom', 'eval_only'):
  # Separate evaluation data.
  data_path = data_path_eval
  split = split_eval



In [None]:
from ast import literal_eval

# Split the evaluation dataset theme by theme according to `split`, which
# holds the [train, validation, test] shares.
full_data = pd.read_csv(data_path)
# Number the rows 0..n-1; the value (converted to a string below) becomes
# each sample's id. A plain list is used so the builtin `id` stays intact.
full_data['Unnamed: 0'] = list(range(len(full_data)))
themes = full_data.Theme.unique()
# Parse the string form of these columns back into Python objects.
full_data.Answer_start = full_data.Answer_start.apply(literal_eval)
full_data.Answer_text = full_data.Answer_text.apply(literal_eval)
full_data['Unnamed: 0'] = full_data['Unnamed: 0'].astype(str)
train_samples = []
dev_samples = []
test_samples = []

for theme in themes:
  theme_df = full_data[full_data['Theme']==theme]
  n = len(theme_df)
  # Row positions (within this theme) at which the train and validation
  # portions end; computed once per theme instead of once per row.
  train_end = int(split[0]*n/sum(split))
  dev_end = int((split[0]+split[1])*n/sum(split))
  for i, (_, theme_row) in enumerate(theme_df.iterrows()):
    # Named `sample` so the builtin `input` is not shadowed.
    sample = {
              #'Answer_possible': theme_row['Answer_possible'],
              'Answer_start':theme_row['Answer_start'],
              'Answer_Text':theme_row['Answer_text'],
              'Paragraph':theme_row['Paragraph'],
              'id':theme_row['Unnamed: 0'],
              'Question': theme_row['Question'],
              'Theme': theme_row['Theme']
          }
    if i < train_end:
      train_samples.append(sample)
    elif i < dev_end:
      dev_samples.append(sample)
    else:
      test_samples.append(sample)

In [None]:
'''
train_dataset2 = datasets.Dataset.from_list(train_samples)
val_dataset2 = datasets.Dataset.from_list(dev_samples)

'''

In [None]:
import pandas as pd
df_val = pd.DataFrame(dev_samples)

In [None]:
df_val.head()

In [None]:
val_csv_name=data_path.split('/')[-1][:-4]+'_'+str(split[0])+'%_validation.csv'
val_csv_name

In [None]:
df_val.to_csv(val_csv_name)

## Evaluation Files(4 CSVs) Generation

In [None]:
val_folder=val_csv_name[:-4]
val_folder

In [None]:
import os
if not os.path.exists(val_folder): os.makedirs(val_folder)

In [None]:
df = pd.read_csv(val_csv_name)

In [None]:
df

In [None]:
paras = df.Paragraph.unique()

In [None]:
df[df["Paragraph"]==paras[0]].Theme[0]

In [None]:
# One record per distinct paragraph: a 1-based id, the paragraph text and
# the theme of the first row that contains that paragraph.
data = [
    {
        'id': position + 1,
        'paragraph': para,
        'theme': df[df["Paragraph"] == para].iloc[0].Theme,
    }
    for position, para in enumerate(paras)
]

In [None]:
# data

In [None]:
df2 = pd.DataFrame(data)

In [None]:
df2.head()

In [None]:
df_theme=df2.copy()

In [None]:
input_para=val_folder+'/input_para.csv'
df2.to_csv(input_para, header=True, index=False)

In [None]:
# One record per question: a 1-based id, the question text and its theme.
data = [
    {'id': number, 'question': question_text, 'theme': theme_name}
    for number, (question_text, theme_name) in enumerate(
        zip(df.Question, df.Theme), start=1
    )
]

In [None]:
df2 = pd.DataFrame(data)
input_question=val_folder+'/input_question.csv'
df2.to_csv(input_question, header=True, index=False)

In [None]:
# Ground truth: for every question, the 1-based id of its paragraph
# (-1 when Answer_start is the string '[]') and the answer text.
data = []
for row_idx in range(len(df.Question)):
    if df.Answer_start[row_idx] == '[]':
        paragraph_id = -1
    else:
        # Position of the matching paragraph in df_theme, shifted to 1-based.
        matching = df_theme[df_theme.paragraph == df.Paragraph[row_idx]]
        paragraph_id = matching.index.tolist()[0] + 1
    data.append(
        {
            'question_id': row_idx + 1,
            'paragraph_id': paragraph_id,
            'answers': df.Answer_Text[row_idx],
        }
    )

df2 = pd.DataFrame(data)
ground_truth = val_folder + '/ground_truth.csv'
df2.to_csv(ground_truth, header=True, index=False)

In [None]:
# For every theme, record the 1-based positions of its first and last row.
data = []
for theme_name in df.Theme.unique():
    theme_rows = df.index[df['Theme'] == theme_name]
    data.append(
        {
            "theme": theme_name,
            "start": int(theme_rows[0]) + 1,
            'end': int(theme_rows[-1]) + 1,
        }
    )

df2 = pd.DataFrame(data)
theme_interval = val_folder + '/theme_interval.csv'
df2.to_csv(theme_interval, header=True, index=False)

In [None]:
print(input_para+'\n'+input_question+'\n'+ground_truth+'\n'+theme_interval+'\n')

# QA Evaluation

## Variable

In [None]:
ques_data_path = val_csv_name
theme_path = theme_interval
truth_path = ground_truth
threshold = 0.1

## Imports

In [None]:
import collections
import json
import pandas as pd
import re
import string
import timeit
from ast import literal_eval
!pip install transformers
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import time

In [None]:
questions = pd.read_csv(ques_data_path)
theme_intervals = pd.read_csv(theme_path)
truth =pd.read_csv(truth_path)

## Helpers

In [None]:
def normalize_answer(s):
  """Lower text and remove punctuation, articles and extra whitespace."""
  text = str(s).lower()
  # Drop every ASCII punctuation character.
  text = text.translate(str.maketrans('', '', string.punctuation))
  # Replace the standalone words a / an / the with a space.
  text = re.sub(r'\b(a|an|the)\b', ' ', text, flags=re.UNICODE)
  # Collapse runs of whitespace and trim both ends.
  return ' '.join(text.split())

def get_tokens(s):
  """Return the normalized whitespace-separated tokens of s.

  Empty or missing input yields an empty list. A float NaN counts as
  missing: without this check NaN is truthy and would be tokenized as the
  literal word "nan".
  """
  if not s or (isinstance(s, float) and s != s):  # s != s is true only for NaN
    return []
  return normalize_answer(s).split()
# gold_toks is the preprocessed text
def calc_f1(a_gold, a_pred):
  """Return the token-level F1 score between a gold and a predicted answer.

  When either side has no tokens, the result is 1 if both are empty and
  0 otherwise.
  """
  gold_toks = get_tokens(a_gold)
  pred_toks = get_tokens(a_pred)
  if not gold_toks or not pred_toks:
    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
    return int(gold_toks == pred_toks)

  # Number of tokens shared by both sides, counting repeats.
  shared = collections.Counter(gold_toks) & collections.Counter(pred_toks)
  overlap = sum(shared.values())
  if overlap == 0:
    return 0

  precision = 1.0 * overlap / len(pred_toks)
  recall = 1.0 * overlap / len(gold_toks)
  return (2 * precision * recall) / (precision + recall)

def calc_max_f1(predicted, ground_truths):
  """Return the best F1 of predicted against any entry of ground_truths.

  The result is 0 when ground_truths is empty. calc_f1 is called with the
  prediction as its first argument and the ground truth as its second.
  """
  scores = [calc_f1(predicted, ground_truth) for ground_truth in ground_truths]
  # The leading 0 is the floor used when there is nothing to compare with.
  return max([0] + scores)

In [None]:
def Average(lis):
  """Return the arithmetic mean of the values in lis."""
  total = sum(lis)
  count = len(lis)
  return total / count

In [None]:
def get_theme_model(theme):
  """Return the model for theme.

  The theme argument is not used: every theme gets the module-level nlp object.
  """
  return nlp

def pred_theme_ans(questions, theme_model, pred_out):
  """Answer every question with theme_model and append the results to pred_out.

  Args:
    questions: records with "id", "Question" and "Paragraph" keys.
    theme_model: callable taking question= and context= keyword arguments
      and returning a mapping with "score" and "answer".
    pred_out: list extended in place with
      {"question_id": ..., "answers": ...} records.

  An answer scoring below the module-level threshold is stored as ''.
  An empty questions list is a no-op (it used to raise IndexError because
  of an unused lookup of questions[0]).
  """
  for question in questions:
    ans = {"question_id": question["id"]}
    result = theme_model(question=question['Question'], context=question['Paragraph'])
    ans["answers"] = '' if result['score'] < threshold else result['answer']
    pred_out.append(ans)
 
    

## Check point evaluation(Validation)

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch


### Base eval

In [None]:

# Generate base results.
#
# By default this evaluates the model named in `qa_model`
# ("mrm8488/electra-small-finetuned-squadv2" in the Variables cell).
# To get base results for another checkpoint, point `qa_model` at it.

model_name = qa_model
m = model_name.split('/')[-1] + 'validation'
nlp = pipeline("question-answering", model=model_name)

# ---- Prediction: answer all questions theme by theme and time each theme.
questions = json.loads(pd.read_csv(ques_data_path).to_json(orient="records"))
theme_intervals = json.loads(pd.read_csv(theme_path).to_json(orient="records"))
pred_out = []
theme_inf_time = {}
execution_times = []
# The loop variable is named `interval` so it does not overwrite the
# `theme_interval` path variable defined earlier in the notebook.
for interval in tqdm(theme_intervals):
  theme_ques = questions[int(interval["start"]) - 1: int(interval["end"])]
  theme = theme_ques[0]["Theme"]
  # Load model fine-tuned for this theme.
  theme_model = get_theme_model(theme)
  execution_time = timeit.timeit(lambda: pred_theme_ans(theme_ques, theme_model, pred_out), number=1)
  execution_times.append(execution_time)
  theme_inf_time[interval["theme"]] = execution_time * 1000  # in milliseconds.

pred_df = pd.DataFrame.from_records(pred_out)
# Write prediction to a CSV file. Teams are required to submit this csv file.
pred_df.to_csv(f'{m}_pred.csv', index=False)
theme_inf_df = pd.DataFrame(list(theme_inf_time.items()), columns=['theme', 'avg_inf_time'])
theme_inf_df.to_csv(f'{m}_inf_time.csv', index=False)
# Scale to milliseconds first, then round, so no precision is lost.
print("avg_inference_time:", round(sum(execution_times) / len(questions) * 1000, 3))

# Per-question inference time (ms) of every theme.
lst = theme_inf_df['avg_inf_time']
theme_int = pd.read_csv(theme_path)
theme_times = [ex_time/(theme_int['end'][i]-theme_int['start'][i]+1) for i, ex_time in enumerate(lst)]

# ---- Scoring: compare the stored predictions with the ground truth.
metrics = {}
total_f1 = 0
pred = pd.read_csv(f'{m}_pred.csv')
truth = pd.read_csv(truth_path)
# truth.paragraph_id = truth.paragraph_id.apply(literal_eval)
truth.answers = truth.answers.apply(literal_eval)
questions = pd.read_csv(ques_data_path)
for idx in pred.index:
  q_id = pred.index[idx]
  q_rows = questions.loc[questions.index == q_id].iloc[-1]
  theme = q_rows["Theme"]
  predicted_ans = pred["answers"][idx]

  if theme not in metrics:
    metrics[theme] = {"true_positive": 0, "true_negative": 0, "total_predictions": 0, "f1_sum": 0}

  truth_row = truth.loc[truth.index == q_id].iloc[-1]
  # An empty prediction is read back from the CSV as NaN.
  if truth_row["answers"] == [] and str(predicted_ans) == 'nan':
    metrics[theme]["true_negative"] += 1
    f1 = 1
  else:
    metrics[theme]["true_positive"] += 1
    f1 = calc_max_f1(predicted_ans, truth_row["answers"])
  metrics[theme]["total_predictions"] += 1
  metrics[theme]["f1_sum"] += f1
  total_f1 += f1
final_f1 = round(total_f1/len(questions), 3)
theme_inf_df = pd.read_csv(f'{m}_inf_time.csv')
theme_inf_time = {theme: theme_inf_df[theme_inf_df['theme']==theme]['avg_inf_time'].tolist()[0] for theme in metrics}
no_of_themes = len(theme_inf_df)

# ---- Final score: per-theme mean F1, scaled down when the average
# inference time per question exceeds the threshold.
inf_time_threshold = 1000.0  # milliseconds.
final_qa_score = 0.0

for theme in metrics:
  inf_time_score = 1.0
  # Not named `metric`: that global holds the loaded squad_v2 metric.
  theme_metric = metrics[theme]
  qa_score = theme_metric["f1_sum"] / theme_metric["total_predictions"]
  avg_inf_time = theme_inf_time[theme] / theme_metric["total_predictions"]
  if avg_inf_time > inf_time_threshold:
    inf_time_score = inf_time_threshold / avg_inf_time
  final_qa_score += 1/(no_of_themes) * inf_time_score * qa_score

# The median needs sorted values; the middle element of the unsorted list
# is just an arbitrary theme.
median_theme_time = sorted(theme_times)[len(theme_times)//2]
metrics = [
    final_f1,
    round(final_qa_score * 100, 1),
    round(Average(theme_times), 3),
    round(median_theme_time, 3),
    round(max(theme_times), 3),
]
print(*metrics)
df = pd.DataFrame(metrics).T
df.columns = ['final_f1', 'qa_score','averege inf time', 'median inf time', 'max inf time']

In [None]:
df.head()

### Checkpoint Eval

In [None]:
import os 
import numpy as np


def _checkpoint_step(name):
  """Sort key: the number after the last '-' in name; names without one sort last."""
  suffix = name.rsplit('-', 1)[-1]
  return (int(suffix) if suffix.isdigit() else float('inf'), name)


if mode=='ft+eval_same':
  path = save_path
else:
  path = save_path_eval

# os.listdir returns entries in arbitrary order, so sort the checkpoint
# folders to make the printed epoch counter meaningful. The 'runs' folder
# and anything that is not a directory are skipped.
checkpoints = sorted(
    (
        entry for entry in os.listdir(path)
        if entry != 'runs' and os.path.isdir(os.path.join(path, entry))
    ),
    key=_checkpoint_step,
)

for j, checkpoint in enumerate(checkpoints):
  # `m` prefixes the output file names. `qa_model` is deliberately left
  # untouched so the base-evaluation cell can be re-run afterwards.
  m = checkpoint
  ft_model_path = path+'/'+checkpoint
  print(ft_model_path)
  print("epoch =", j)
  print("checkpoint = ", checkpoint)
  df1 = df.copy()
  nlp = pipeline("question-answering", model=ft_model_path)

  # ---- Prediction: answer all questions theme by theme and time each theme.
  questions = json.loads(pd.read_csv(ques_data_path).to_json(orient="records"))
  theme_intervals = json.loads(pd.read_csv(theme_path).to_json(orient="records"))
  pred_out = []
  theme_inf_time = {}
  execution_times = []
  # The loop variable is named `interval` so it does not overwrite the
  # `theme_interval` path variable defined earlier in the notebook.
  for interval in tqdm(theme_intervals):
    theme_ques = questions[int(interval["start"]) - 1: int(interval["end"])]
    theme = theme_ques[0]["Theme"]
    # Load model fine-tuned for this theme.
    theme_model = get_theme_model(theme)
    execution_time = timeit.timeit(lambda: pred_theme_ans(theme_ques, theme_model, pred_out), number=1)
    execution_times.append(execution_time)
    theme_inf_time[interval["theme"]] = execution_time * 1000  # in milliseconds.

  pred_df = pd.DataFrame.from_records(pred_out)
  # Write prediction to a CSV file. Teams are required to submit this csv file.
  pred_df.to_csv(f'{m}_pred.csv', index=False)
  theme_inf_df = pd.DataFrame(list(theme_inf_time.items()), columns=['theme', 'avg_inf_time'])
  theme_inf_df.to_csv(f'{m}_inf_time.csv', index=False)
  # Scale to milliseconds first, then round, so no precision is lost.
  print("avg_inference_time:", round(sum(execution_times) / len(questions) * 1000, 3))

  # Per-question inference time (ms) of every theme.
  lst = theme_inf_df['avg_inf_time']
  theme_int = pd.read_csv(theme_path)
  theme_times = [ex_time/(theme_int['end'][i]-theme_int['start'][i]+1) for i, ex_time in enumerate(lst)]

  # ---- Scoring: compare the stored predictions with the ground truth.
  metrics = {}
  total_f1 = 0
  pred = pd.read_csv(f'{m}_pred.csv')
  truth = pd.read_csv(truth_path)
  # truth.paragraph_id = truth.paragraph_id.apply(literal_eval)
  truth.answers = truth.answers.apply(literal_eval)
  questions = pd.read_csv(ques_data_path)
  for idx in pred.index:
    q_id = pred.index[idx]
    q_rows = questions.loc[questions.index == q_id].iloc[-1]
    theme = q_rows["Theme"]
    predicted_ans = pred["answers"][idx]

    if theme not in metrics:
      metrics[theme] = {"true_positive": 0, "true_negative": 0, "total_predictions": 0, "f1_sum": 0}

    truth_row = truth.loc[truth.index == q_id].iloc[-1]
    # An empty prediction is read back from the CSV as NaN.
    if truth_row["answers"] == [] and str(predicted_ans) == 'nan':
      metrics[theme]["true_negative"] += 1
      f1 = 1
    else:
      metrics[theme]["true_positive"] += 1
      f1 = calc_max_f1(predicted_ans, truth_row["answers"])
    metrics[theme]["total_predictions"] += 1
    metrics[theme]["f1_sum"] += f1
    total_f1 += f1
  final_f1 = round(total_f1/len(questions), 3)
  theme_inf_df = pd.read_csv(f'{m}_inf_time.csv')
  theme_inf_time = {theme: theme_inf_df[theme_inf_df['theme']==theme]['avg_inf_time'].tolist()[0] for theme in metrics}
  no_of_themes = len(theme_inf_df)

  # ---- Final score: per-theme mean F1, scaled down when the average
  # inference time per question exceeds the threshold.
  inf_time_threshold = 1000.0  # milliseconds.
  final_qa_score = 0.0

  for theme in metrics:
    inf_time_score = 1.0
    # Not named `metric`: that global holds the loaded squad_v2 metric.
    theme_metric = metrics[theme]
    qa_score = theme_metric["f1_sum"] / theme_metric["total_predictions"]
    avg_inf_time = theme_inf_time[theme] / theme_metric["total_predictions"]
    if avg_inf_time > inf_time_threshold:
      inf_time_score = inf_time_threshold / avg_inf_time
    final_qa_score += 1/(no_of_themes) * inf_time_score * qa_score

  # The median needs sorted values; the middle element of the unsorted list
  # is just an arbitrary theme.
  median_theme_time = sorted(theme_times)[len(theme_times)//2]
  metrics = [
      final_f1,
      round(final_qa_score * 100, 1),
      round(Average(theme_times), 3),
      round(median_theme_time, 3),
      round(max(theme_times), 3),
  ]
  print(*metrics)

  # Append this checkpoint's row to the running results table.
  df2 = pd.DataFrame(metrics).T
  df2.columns = ['final_f1', 'qa_score','averege inf time', 'median inf time', 'max inf time']
  df = pd.concat([df1, df2], axis=0)
  print(df)
  print("\n\n\n===========================================\n\n\n")

  print(metrics)

  print("\n\n\n===========================================\n\n\n")



In [None]:
df

In [None]:
# Save the accumulated results table to a CSV named after the last
# component of `path`.
out=path.split('/')[-1]+'_final_eval.csv'
df.to_csv(out)
out # Evaluation results path