In [1]:
import collections
import json
import pandas as pd
import re
import string
import timeit
from ast import literal_eval

In [2]:
# NOTE: make changes in this cell only.
def pred_theme_ans(questions, pred_out):
  """Append one prediction record per question of a single theme to pred_out.

  Args:
    questions: list of question records (dicts) belonging to one theme. The
      "theme" key of the first record and the "id" key of every record are read.
    pred_out: list that is extended in place with one dict per question, each
      holding the keys "question_id", "paragraph_id" and "answers".

  Returns:
    None. Predictions are delivered through pred_out.
  """
  if not questions:
    # Empty theme slice: nothing to predict, and questions[0] would raise.
    return
  #-----------------------------
  # Warmup module for a theme model goes here.
  #------------------------------
  theme = questions[0]["theme"]
  # Dummy method: a fixed (paragraph_id, answer) pair per known theme.
  dummy_predictions = {
    "Kubernetes": (2, "Google"),
    "ChatGPT": (4, "2022"),
    "Football world cup": (6, "Qatar"),
  }
  # If no prediction for a paragraph, predict -1 for paragraph prediction
  # and empty string for answers. This also covers themes not listed above,
  # so every record always carries all three keys.
  paragraph_id, answer = dummy_predictions.get(theme, (-1, ""))
  for question in questions:
    #-------------------------------------
    # add your prediction methodology here.
    #-------------------------------------
    pred_out.append({
      "question_id": question["id"],
      "paragraph_id": paragraph_id,
      "answers": answer,
    })

In [3]:
# All theme prediction: run pred_theme_ans once per theme interval, time it,
# and write every prediction to the submission CSV.
# The CSV -> JSON -> Python round trip yields plain lists of dicts.
questions = json.loads(pd.read_csv("sample_input_question.csv").to_json(orient="records"))
theme_intervals = json.loads(pd.read_csv("sample_theme_interval.csv").to_json(orient="records"))
pred_out = []
theme_inf_time = {}
for theme_interval in theme_intervals:
  # The slice treats "start"/"end" as 1-based inclusive row positions.
  theme_ques = questions[int(theme_interval["start"]) - 1: int(theme_interval["end"])]
  execution_time = timeit.timeit(lambda: pred_theme_ans(theme_ques, pred_out), number=1)
  # Accumulate instead of overwrite: if a theme is listed in more than one
  # interval, the scoring cell divides this total by all of the theme's
  # predictions, so every interval's time has to be counted.
  theme_name = theme_interval["theme"]
  theme_inf_time[theme_name] = theme_inf_time.get(theme_name, 0.0) + execution_time * 1000 # in milliseconds.
pred_df = pd.DataFrame.from_records(pred_out)
# Write prediction to a CSV file. Teams are required to submit this csv file.
pred_df.to_csv('sample_output_prediction.csv', index=False)

In [None]:
# Compiled once instead of on every call: matches the English articles that
# normalization strips out.
_ARTICLES_RE = re.compile(r'\b(a|an|the)\b', re.UNICODE)
# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def normalize_answer(s):
  """Lower text and remove punctuation, articles and extra whitespace.

  Args:
    s: the answer text (str).

  Returns:
    The normalized string, with tokens separated by single spaces.
  """
  text = s.lower().translate(_PUNCT_TABLE)
  text = _ARTICLES_RE.sub(' ', text)
  # split()/join collapses whitespace runs and trims both ends.
  return ' '.join(text.split())


def get_tokens(s):
  """Return the normalized whitespace tokens of an answer.

  None and float NaN (what pandas yields for an empty CSV field) count as
  "no answer" and give []. Any other non-string value, e.g. a number that
  pandas parsed from a numeric-looking answer, is converted with str() first
  instead of raising AttributeError.
  """
  if s is None or (isinstance(s, float) and s != s):  # s != s is true only for NaN
    return []
  return normalize_answer(str(s)).split()


def calc_f1(a_gold, a_pred):
  """Token-level F1 between a gold answer and a predicted answer.

  Args:
    a_gold: ground-truth answer.
    a_pred: predicted answer.

  Returns:
    1 or 0 when either side has no tokens (1 only if both are empty), 0 when
    no token is shared, otherwise the harmonic mean of token precision and
    recall as a float.
  """
  gold_toks = get_tokens(a_gold)
  pred_toks = get_tokens(a_pred)
  if len(gold_toks) == 0 or len(pred_toks) == 0:
    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
    return int(gold_toks == pred_toks)
  # Multiset intersection: a repeated token only counts as often as it
  # appears on both sides.
  common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
  num_same = sum(common.values())
  if num_same == 0:
    return 0
  precision = 1.0 * num_same / len(pred_toks)
  recall = 1.0 * num_same / len(gold_toks)
  f1 = (2 * precision * recall) / (precision + recall)
  return f1


def calc_max_f1(predicted, ground_truths):
  """Best F1 of one predicted answer against several acceptable answers.

  Args:
    predicted: the predicted answer.
    ground_truths: iterable of acceptable ground-truth answers.

  Returns:
    The largest calc_f1 value over ground_truths, or 0 when it is empty.
  """
  # NOTE(review): an empty ground_truths scores 0 even for an empty
  # prediction; confirm whether no-answer questions are encoded as [] or [""].
  # Gold answer goes first to match calc_f1's signature (F1 is symmetric, so
  # the scores are unchanged).
  return max((calc_f1(ground_truth, predicted) for ground_truth in ground_truths), default=0)

In [None]:
# Evaluation methodology: compare each prediction with the ground truth and
# accumulate per-theme counters in `metrics`.
metrics = {}
# Read answers as text and map missing values to "". An empty-string answer
# is written as an empty CSV field and would otherwise come back as NaN, and
# a purely numeric answer column would be inferred as int/float; both make
# the string-based answer normalization fail.
pred = pd.read_csv("sample_output_prediction.csv", dtype={"answers": str}).fillna({"answers": ""})
truth = pd.read_csv("sample_ground_truth.csv")
# The ground-truth columns hold Python list literals stored as text.
truth.paragraph_id = truth.paragraph_id.apply(literal_eval)
truth.answers = truth.answers.apply(literal_eval)
# Note: this rebinds `questions` to a DataFrame.
questions = pd.read_csv("sample_input_question.csv")
for idx in pred.index:
  q_id = pred.loc[idx, "question_id"]
  # If an id occurs more than once, the last matching row wins.
  q_rows = questions.loc[questions['id'] == q_id].iloc[-1]
  theme = q_rows["theme"]
  predicted_paragraph = pred.loc[idx, "paragraph_id"]
  predicted_ans = pred.loc[idx, "answers"]

  theme_metrics = metrics.setdefault(
    theme,
    {"true_positive": 0, "true_negative": 0, "total_predictions": 0, "f1_sum": 0},
  )

  truth_row = truth.loc[truth['question_id'] == q_id].iloc[-1]
  truth_paragraph_id = [ int(i) for i in truth_row["paragraph_id"] ]
  if predicted_paragraph in truth_paragraph_id:
    # Increase TP for that theme.
    theme_metrics["true_positive"] += 1
  # -1 prediction in case there is no paragraph which can answer the query.
  if predicted_paragraph == -1 and not truth_paragraph_id:
    # Increase TN.
    theme_metrics["true_negative"] += 1
  # Increase total predictions for that theme.
  theme_metrics["total_predictions"] += 1
  f1 = calc_max_f1(predicted_ans, truth_row["answers"])
  theme_metrics["f1_sum"] += f1


In [None]:
# Final score: weighted sum over themes of the paragraph and QA scores, each
# scaled down when a theme's average inference time exceeds the threshold.
inf_time_threshold = 200.0
final_para_score = 0.0
final_qa_score = 0.0
# Weight would stay hidden from teams.
theme_weights = {"Kubernetes": 0.5, "ChatGPT": 0.4, "Football world cup": 0.1}
for theme, metric in metrics.items():
  n_predictions = metric["total_predictions"]
  # Share of questions whose paragraph prediction was right (TP or TN).
  para_score = (metric["true_positive"] + metric["true_negative"]) / n_predictions
  # Mean of the per-question best F1.
  qa_score = metric["f1_sum"] / n_predictions
  avg_inf_time = theme_inf_time[theme] / n_predictions
  # No penalty up to the threshold; above it the score shrinks proportionally.
  inf_time_score = (
    inf_time_threshold / avg_inf_time if avg_inf_time > inf_time_threshold else 1.0
  )
  scaled_weight = theme_weights[theme] * inf_time_score
  final_qa_score += scaled_weight * qa_score
  final_para_score += scaled_weight * para_score
print(final_para_score)
print(final_qa_score)


0.7
0.5666666666666667
