# HOW TO RUN!!!

Set Variables Accordingly

model_name: Huggingface Model Id

m: Informal Model Name(Used for saving results)

### Run These Sections:

1) Imports

2) Loading Base(Inside Model)

3) ||Optional|| Run one of the Pruning, Quantization, or Dynamic Quantization sections, or none of them, according to requirement (Pruning and Quantization are inside the Model section)

4) Prediction


### Can Also Change Type or amount of pruning by uncommenting cells in Pruning Section.

# Variables

In [1]:
# Hugging Face model id of the question-answering checkpoint to load.
model_name = "navteca/electra-base-squad2"
# Informal model name (used for saving results).
m='ELECTRA'

# Imports

In [2]:
import collections
import json
import pandas as pd
import re
import string
import timeit
from ast import literal_eval
!pip install transformers
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import time
import torch.nn.utils.prune as prune
import numpy as np
import os

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [35]:
# `torch` has to be imported here: the imports cell only runs
# `import torch.nn.utils.prune as prune`, which binds `prune` but not `torch`,
# so on a fresh kernel this cell would otherwise fail with a NameError.
import torch

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
device = 0 if torch.cuda.is_available() else -1

# Model

## Loading Base

In [99]:
# Fetch the tokenizer and the question-answering model for `model_name`
# from Hugging Face.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [100]:
# Wrap model and tokenizer in a question-answering pipeline on `device`.
nlp = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

In [101]:
# Materialise every module yielded by the pipeline's model into a list.
modules = list(nlp.model.modules())

In [102]:
# Collect the model's leaf layers (modules without children) as (class name, module) pairs
mods = []
def get_modules(mod,parent):
    """Recursively append every leaf sub-module of ``mod`` to the global ``mods``.

    A module counts as a leaf when its ``_modules`` mapping is empty. Children
    registered under the name ``qa_outputs`` are skipped at every depth, and so
    are children that are ``None``.

    Args:
        mod: Module to walk; must expose a ``_modules`` mapping of name -> child.
        parent: Module that contains ``mod`` (``None`` for the root). Not used
            by the traversal; kept so existing calls keep working.

    Returns:
        None. Results are appended to ``mods`` as ``(class_name, module)``.
    """
    children = mod._modules
    if not children:
        # type(...).__name__ gives the bare class name directly instead of
        # slicing it out of the "<class '...'>" repr.
        mods.append((type(mod).__name__, mod))
        return
    for child_name, child in children.items():
        # Skip the QA output head, and empty child slots that cannot be walked.
        if child_name == 'qa_outputs' or child is None:
            continue
        get_modules(child, mod)

In [103]:
# Empty the accumulator first so that re-running this cell does not append the
# same layers a second time, then walk the model starting from its first module.
mods.clear()
get_modules(modules[0],None)

In [104]:
# Keep only the layers that will be pruned: layers whose class name is in the
# exclusion list below are dropped.
final = [
    (name, layer)
    for name, layer in mods
    if name not in (
        'Dropout',
        'GELUActivation',
        'LayerNorm',
        'Softmax',
        'Tanh',
        'MatMulWrapper',
        'SqueezeBertLayerNorm',
    )
]

In [105]:
# Distinct layer class names left in `final`, with 'Linear' always first and
# the rest in order of first appearance.
unc = list(dict.fromkeys(['Linear'] + [name for name, _ in final]))
unc

['Linear', 'Embedding']

## Pruning ([Article](https://towardsdatascience.com/how-to-prune-neural-networks-with-pytorch-ebef60316b91))

In [86]:
# Global unstructured pruning over the "weight" tensor of every layer in `final`
parameters_to_prune = [(layer, "weight") for _, layer in final]
prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,  # rank weights by L1 magnitude across all listed layers
    amount=0.3,  # fraction of weights to prune; author's note: works best in the 0.2 to 0.3 range
)
# Pruning keeps a mask next to each pruned weight, which costs memory and inference time.
# Removing the reparametrization writes the pruned values in as the actual weights;
# after this the previous state can no longer be restored.
for _, layer in final:
    if prune.is_pruned(layer):
        prune.remove(layer, 'weight')

In [87]:
# Local Structured Pruning
# for i in range(len(final)):
#     prune.ln_structured(final[i][1], name="weight", amount=1, n=2, dim=1)
#     if prune.is_pruned(final[i][1]):
#         print(i)
#         prune.remove(final[i][1],'weight')

In [88]:
# Local Unstructured Pruning
# for i in range(len(final)):
#     prune.l1_unstructured(final[i][1], name="weight", amount=0.1)
#     if prune.is_pruned(final[i][1]):
#         print(i)
#         prune.remove(final[i][1],'weight')

In [89]:
# Local Random Unstructured Pruning
# for i in range(len(final)):
#     prune.random_unstructured(final[i][1], name="weight", amount=0.2)
#     if prune.is_pruned(final[i][1]):
#         print(i)
#         prune.remove(final[i][1],'weight')

In [90]:
import torch

In [91]:
# Save the whole `model` object to disk.
torch.save(model,'/content/pruned.pt')

## Quantization ([Article](https://pytorch.org/blog/quantization-in-practice/#post-training-static-quantization-ptq))

In [62]:
## EAGER MODE
import torch
from torch import nn
from torch.quantization import quantize_dynamic

# Reload the original checkpoint, then dynamically quantize its nn.Linear
# layers to qint8. The result is bound to `model_q`; `inplace=False` is passed
# so the quantized model is returned rather than written over `model`.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
model_q = quantize_dynamic(
    model=model,
    qconfig_spec={nn.Linear},
    dtype=torch.qint8,
    inplace=False,
)

In [63]:
# Rebuild the pipeline around the quantized model. No `device` argument is
# passed here, and the tokenizer is given by model id rather than as an object.
nlp = pipeline("question-answering", model = model_q, tokenizer= model_name)

In [64]:
# Save the whole quantized model object to disk.
torch.save(model_q,'/content/quantized.pt')

# Dynamic Quantization

In [72]:
pip install git+https://github.com/huggingface/optimum-intel.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/huggingface/optimum-intel.git
  Cloning https://github.com/huggingface/optimum-intel.git to /tmp/pip-req-build-wkooc_h2
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/optimum-intel.git /tmp/pip-req-build-wkooc_h2
  Resolved https://github.com/huggingface/optimum-intel.git to commit e761f21d30ba45bd49362efd276d354d9cfd8601
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting datasets>=1.4.0
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K

In [73]:
!python -m pip install optimum[neural-compressor]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting neural-compressor>=2.0.0
  Downloading neural_compressor-2.0-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting onnxruntime
  Downloading onnxruntime-1.14.0-cp38-cp38-manylinux_2_27_x86_64.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m94.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting onnx
  Downloading onnx-1.13.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m92.1 MB/s[0m eta [36m0:00:00[0m
Collecting deprecated
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Collecting schema
  Downloading schema-0.7.5-py2.py3-none-any.whl (17 kB)
Collecting py-cpuinfo
  Downloading py_cpuinfo-9.0.0-py3-none-an

In [74]:
from neural_compressor.config import PostTrainingQuantConfig
from optimum.intel.neural_compressor import INCQuantizer
from transformers import AutoModelForQuestionAnswering

# Directory that receives the quantized model
save_dir = "quantized_model"
# Post-training quantization configuration using the "dynamic" approach
quantization_config = PostTrainingQuantConfig(approach="dynamic")

# Start again from the original checkpoint
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
quantizer = INCQuantizer.from_pretrained(model)
# Quantize the model and write the result into `save_dir`
quantizer.quantize(quantization_config=quantization_config, save_directory=save_dir)

2023-02-17 18:03:45 [INFO] Because both eval_dataloader_cfg and user-defined eval_func are None, automatically setting 'tuning.exit_policy.performance_only = True'.
2023-02-17 18:03:45 [INFO] Generate a fake evaluation function.
2023-02-17 18:03:45 [INFO] Pass query framework capability elapsed time: 568.19 ms
2023-02-17 18:03:45 [INFO] Get FP32 model baseline.
2023-02-17 18:03:45 [INFO] Save tuning history to /content/nc_workspace/2023-02-17_18-03-42/./history.snapshot.
2023-02-17 18:03:45 [INFO] FP32 baseline is: [Accuracy: 1.0000, Duration (seconds): 0.0000]
2023-02-17 18:03:45 [INFO] Fx trace of the entire model failed, We will conduct auto quantization
2023-02-17 18:03:49 [INFO] |******Mixed Precision Statistics******|
2023-02-17 18:03:49 [INFO] +-----------------+----------+---------+
2023-02-17 18:03:49 [INFO] |     Op Type     |  Total   |   INT8  |
2023-02-17 18:03:49 [INFO] +-----------------+----------+---------+
2023-02-17 18:03:49 [INFO] |    Embedding    |    3     |    3

In [75]:
from transformers import AutoTokenizer, pipeline
tokenizer = AutoTokenizer.from_pretrained(model_name)

from optimum.intel.neural_compressor import INCModelForQuestionAnswering

# Load the quantized model back from the local directory the quantizer wrote it to.
# `save_dir` is a relative path, so the previous hard-coded "/content/quantized_model"
# only matched it when the working directory happened to be /content.
loaded_model_from_hub = INCModelForQuestionAnswering.from_pretrained(save_dir)

# Point the question-answering pipeline at the quantized model (no `device` is passed).
nlp = pipeline("question-answering", model = loaded_model_from_hub, tokenizer= tokenizer)

# Prediction

In [106]:
def pred_theme_ans(questions, pred_out,threshold=0.05):
  """Answer every question in ``questions`` with the global ``nlp`` pipeline.

  Args:
    questions: Iterable of records, each providing the keys ``"id"``,
      ``"Question"`` and ``"Paragraph"``. May be empty, in which case nothing
      is appended.
    pred_out: List that receives one ``{"question_id", "answers"}`` dict per
      question, in input order. It is mutated in place.
    threshold: Minimum confidence score for an answer to be kept; below it the
      question is treated as unanswerable and ``''`` is recorded.

  Returns:
    None.
  """
  for question in questions:
    ans = {"question_id": question["id"]}
    # The pipeline result provides the extracted answer text and its confidence score.
    res = nlp({
      'question': question['Question'],
      'context': question['Paragraph'],
    })
    # Low-confidence predictions are reported as "no answer".
    ans["answers"] = '' if res['score'] < threshold else res['answer']
    pred_out.append(ans)

# Lower-cases a string and strips punctuation, English articles and extra whitespace
def normalize_answer(s):
  """Return ``str(s)`` lower-cased, without punctuation or articles, single-spaced.

  Steps, in order: lower-case, delete every ``string.punctuation`` character,
  replace the whole words ``a``/``an``/``the`` with a space, then collapse all
  whitespace runs to single spaces and trim the ends.
  """
  text = str(s).lower()
  text = text.translate(str.maketrans('', '', string.punctuation))
  text = re.sub(r'\b(a|an|the)\b', ' ', text, flags=re.UNICODE)
  return ' '.join(text.split())

# Split a string into its normalized words
def get_tokens(s):
  """Return the whitespace-separated words of ``normalize_answer(s)``.

  Any falsy ``s`` (empty string, ``None``, 0, ...) yields an empty list
  without being normalized.
  """
  return normalize_answer(s).split() if s else []

# Token-level F1 between a gold answer and a predicted answer
def calc_f1(a_gold, a_pred):
  """Return the F1 overlap of the normalized tokens of ``a_gold`` and ``a_pred``.

  When either side has no tokens the result is 1 if both are empty and 0
  otherwise. When there is no shared token the result is 0. Otherwise it is
  the harmonic mean of token precision and recall.
  """
  gold_toks = get_tokens(a_gold)
  pred_toks = get_tokens(a_pred)
  if not gold_toks or not pred_toks:
    # No-answer on at least one side: full credit only when both sides agree.
    return int(gold_toks == pred_toks)
  # Multiset intersection counts each shared token as often as it appears on both sides.
  shared = collections.Counter(gold_toks) & collections.Counter(pred_toks)
  overlap = sum(shared.values())
  if overlap == 0:
    return 0
  precision = overlap / len(pred_toks)
  recall = overlap / len(gold_toks)
  return (2 * precision * recall) / (precision + recall)

# Calculate the best f1 score between a prediction and multiple ground truths
def calc_max_f1(predicted, ground_truths):
  """Return the highest ``calc_f1`` score of ``predicted`` against ``ground_truths``.

  Args:
    predicted: Predicted answer. ``None``, NaN and blank strings all count as
      "no answer".
    ground_truths: Sequence of accepted answers; empty when the question has
      no answer.

  Returns:
    With no ground truths: 1 if the prediction is also "no answer", else 0.
    Otherwise the maximum F1 over all ground truths.
  """
  # A missing prediction can arrive as NaN (e.g. an empty CSV cell read back by
  # pandas); NaN is the only value that is not equal to itself.
  if predicted is None or predicted != predicted:
    predicted = ''
  if len(ground_truths) == 0:
    # Unanswerable question: correct only if nothing was predicted.
    return int(str(predicted).strip() == '')
  return max(calc_f1(predicted, ground_truth) for ground_truth in ground_truths)

In [107]:
# Download the question table (read below through its id / Question / Paragraph / Theme
# columns) and the per-theme table (read below through its start / end / theme columns).
questions_df = pd.read_csv('https://drive.google.com/uc?export=download&id=14Semm5RXuaQ2UwlIKfi5HtFX_5LzNSGX')
theme_df = pd.read_csv('https://drive.google.com/uc?export=download&id=1gaB73lLklzYxFIj3nmttG7UGisptz7mB')
# para_df = pd.read_csv('https://drive.google.com/uc?export=download&id=10LO_evy-N3uGR6QF9DatWIFb_nHPH6mv')

In [108]:
# All theme prediction.
from tqdm import tqdm
questions = json.loads(questions_df.to_json(orient="records"))
theme_intervals = json.loads(theme_df.to_json(orient="records"))
pred_out = []
theme_inf_time = {}  # theme -> total inference time for that theme, in milliseconds
execution_times = []  # per theme: average seconds per question
for theme_interval in tqdm(theme_intervals):
  # "start" is shifted down by one and "end" is used as the exclusive slice bound,
  # i.e. the interval is read as 1-based and inclusive.
  theme_ques = questions[int(theme_interval["start"]) - 1: int(theme_interval["end"])]
  # Time one pass over the theme's questions; answers are appended to pred_out.
  execution_time = timeit.timeit(lambda: pred_theme_ans(theme_ques, pred_out), number=1)
  execution_times.append(execution_time/len(theme_ques))
  theme_inf_time[theme_interval["theme"]] = execution_time * 1000 # in milliseconds.
pred_df = pd.DataFrame.from_records(pred_out)
# Write prediction to a CSV file. Teams are required to submit this csv file.
pred_df.to_csv('/content/output_prediction.csv', index=False)
# Convert to milliseconds BEFORE rounding; rounding the value in seconds to 3
# decimals first would cut the reported average down to whole milliseconds.
print("\navg_inference_time:",round(sum(execution_times)/len(theme_intervals)*1000,3))

100%|██████████| 327/327 [00:16<00:00, 19.42it/s]


avg_inference_time: 17.0





In [109]:
# Display the prediction table (question_id, answers).
pred_df

Unnamed: 0,question_id,answers
0,1,1988 Summer Olympics
1,2,1200
2,3,the French flag
3,4,Baichung Bhutia
4,5,
...,...,...
995,996,view
996,997,
997,998,
998,999,small people


In [110]:
# Download the ground-truth answers and display them; at this point the
# `answers` column has not yet been parsed with literal_eval.
truth = pd.read_csv('https://drive.google.com/uc?export=download&id=1LghSTve3IzjZBmukNNpMOsjwDggTedyi')
truth

Unnamed: 0.1,Unnamed: 0,question_id,answers
0,0,1,['1988 Summer Olympics']
1,1,2,['1200']
2,2,3,['the French flag']
3,3,4,['Baichung Bhutia']
4,4,5,[]
...,...,...,...
995,995,996,['view']
996,996,997,[]
997,997,998,[]
998,998,999,['small people']


In [111]:
# Evaluation methodology.
# Per theme, `metrics` accumulates: the number of questions with a ground-truth
# answer ("true_positive"), the number without one ("true_negative"), the number
# of predictions, and the sum of their F1 scores.
metrics = {}
total_f1 = 0
pred = pd.read_csv('/content/output_prediction.csv')
truth = pd.read_csv('https://drive.google.com/uc?export=download&id=1LghSTve3IzjZBmukNNpMOsjwDggTedyi')
# The answers column holds list literals as text; parse them into Python lists.
truth.answers = truth.answers.apply(literal_eval)
questions = questions_df.copy()
for idx in pred.index:
  q_id = pred["question_id"][idx]
  # If an id occurs more than once, the last matching row is used.
  q_rows = questions.loc[questions['id'] == q_id].iloc[-1]
  theme = q_rows["Theme"]
  predicted_ans = pred["answers"][idx]

  if theme not in metrics.keys():
    metrics[theme] = {"true_positive": 0, "true_negative": 0, "total_predictions": 0, "f1_sum": 0}

  truth_row = truth.loc[truth['question_id'] == q_id].iloc[-1]
  # After literal_eval the answers are a list, so "no answer" is the empty list.
  # (Comparing the list with ' ' was always False and never counted a negative.)
  if len(truth_row["answers"]) == 0:
     metrics[theme]["true_negative"] = metrics[theme]["true_negative"] + 1
  else:
     metrics[theme]["true_positive"] = metrics[theme]["true_positive"] + 1

  # Increase total predictions for that theme.
  metrics[theme]["total_predictions"] = metrics[theme]["total_predictions"] + 1
  f1 = calc_max_f1(predicted_ans, truth_row["answers"])
  metrics[theme]["f1_sum"] = metrics[theme]["f1_sum"] + f1
  total_f1+=f1
# Mean F1 over all questions in questions_df.
print(round(total_f1/len(questions),3))

0.952


In [112]:
# Final score.
# Each theme contributes an equal share: its mean F1, scaled down when the
# theme's mean inference time per prediction exceeds the threshold.
inf_time_threshold = 200.0  # compared against per-prediction time in milliseconds
# final_para_score = 0.0
final_qa_score = 0.0
# Weight would stay hidden from teams.
# theme_weights = {"Computer_security": 0.11, "Uranium": 0.44, "Canadian_football": 0.22,"Seven_Years%27_War": 0.11,"ASCII": 0.11}
for theme, metric in metrics.items():
  qa_score = metric["f1_sum"] / metric["total_predictions"]
  avg_inf_time = theme_inf_time[theme] / metric["total_predictions"]
  # No penalty at or below the threshold; above it the score shrinks proportionally.
  if avg_inf_time > inf_time_threshold:
    inf_time_score = inf_time_threshold / avg_inf_time
  else:
    inf_time_score = 1.0
  final_qa_score += 1/len(metrics) * inf_time_score * qa_score
print (round(final_qa_score,3))


0.954
