## Task 1. Removing unanswerable QA pairs
Check back Tutorial 2 on how to fix a specific random seed for reproducibility!

In [1]:
#imports
import numpy as np
import pandas as pd
import torch as tc
import random
import sklearn as sk
from sklearn.model_selection import GroupShuffleSplit
import transformers
from transformers import EncoderDecoderModel, AutoTokenizer

In [12]:
import tensorflow as tf

### Dataset download

In [2]:
#Code provided in assignment docs
#Dataset download
import os
import urllib.request
from tqdm import tqdm

class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)
        
def download_url(url, output_path):
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)

def download_data(data_path, url_path, suffix):    
    if not os.path.exists(data_path):
        os.makedirs(data_path)
        
    data_path = os.path.join(data_path, f'{suffix}.json')
    if not os.path.exists(data_path):
        print(f"Downloading CoQA {suffix} data split... (it may take a while)")
        download_url(url=url_path, output_path=data_path)
        print("Download completed!")

In [4]:
# Train and test data loading
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
download_data(data_path='coqa', url_path=train_url, suffix='train')

test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"
download_data(data_path='coqa', url_path=test_url, suffix='test')

### Reading data from JSON

In [2]:
#reading train.json to dataframe
train_df = pd.read_json('coqa/train.json')

#deleting the "version" json property
del train_df["version"]

cols = ["source", "id", "filename", "story", "question", "qturn_id", "span_start","span_end", "span_text", "answer", "aturn_id"]
comp_list = []
for index, row in train_df.iterrows():
   for i in range(len(row["data"]["questions"])):
        row_insert = []
        row_insert.append(row["data"]["source"])
        row_insert.append(row["data"]["id"])
        row_insert.append(row["data"]["filename"])
        row_insert.append(row["data"]["story"])
        for key in row["data"]["questions"][i]:
            row_insert.append(row["data"]["questions"][i][key])
        for key in row["data"]["answers"][i]:
            row_insert.append(row["data"]["answers"][i][key])
        #truncating bad_turn property if presented
        comp_list.append(row_insert[:11])

train_df = pd.DataFrame(comp_list, columns=cols)
#droping the unanswerable QA pairs
train_df = train_df[train_df["answer"] != "unknown"]

In [3]:
%pip install scikit-learn

Collecting scikit-learnNote: you may need to restart the kernel to use updated packages.

  Downloading scikit_learn-1.2.0-cp310-cp310-win_amd64.whl (8.2 MB)
     ---------------------------------------- 8.2/8.2 MB 223.9 kB/s eta 0:00:00
Collecting joblib>=1.1.1
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting scipy>=1.3.2
  Downloading scipy-1.9.3-cp310-cp310-win_amd64.whl (40.1 MB)
     -------------------------------------- 40.1/40.1 MB 252.9 kB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.0 scipy-1.9.3 threadpoolctl-3.1.0


### Splitting the train set into train and validation

In [3]:
#creating the group splitter
spl = GroupShuffleSplit(n_splits=2, test_size=0.2, random_state=42)
split = spl.split(train_df, groups=train_df["story"])

train_inds, val_inds = next(split)

train = train_df.iloc[train_inds]
validation = train_df.iloc[val_inds]

In [None]:
train.head()

In [None]:
validation.head()

### Model definition - DistilRoBERTa and BERTTiny

In [None]:
drbt = EncoderDecoderModel.from_encoder_decoder_pretrained("distilroberta-base", "distilroberta-base")
berttiny = EncoderDecoderModel.from_encoder_decoder_pretrained("prajjwal1/bert-tiny", "prajjwal1/bert-tiny")
bert_tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
drbt_tokenizer = AutoTokenizer.from_pretrained( "distilroberta-base")

In [19]:
def input_tokens(tokenizer, train_set):
    question = train_set["question"][:10000].tolist()
    story = train_set["story"][:10000].tolist()
    inputs = tokenizer(question,
                       story,
                       truncation=True,
                       padding=True,
                       max_length=512,
                       return_tensors="pt")
    return inputs

In [20]:
bt_inputs = input_tokens(bert_tokenizer, train)
dt_inputs = input_tokens(drbt_tokenizer, train)

  question = train_set["question"][:10000].tolist()
  story = train_set["story"][:10000].tolist()


In [21]:
input_ids = bt_inputs["input_ids"]
labels = bt_inputs["input_ids"]

loss = berttiny(input_ids = input_ids, decoder_input_ids=labels, labels=labels).loss
loss.backward()

berttiny.eval()

"""
input_ids = tokenizer('example question', return_tensors='pt').input_ids
labels = tokenizer('example answerr', return_tensors='pt').input_ids

loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
loss.backward()

bert2bert.eval()
greedy_output = bert2bert.generate(input_ids, decoder_start_token_id=bert2bert.config.decoder.pad_token_id)
"""