<a href="https://colab.research.google.com/github/DhumneMrinmayee/MLProjects/blob/main/EM800BioQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 # Shell escape: print the status of the GPU attached to this runtime.
 !nvidia-smi

In [None]:
# Install PyTorch Lightning from the head of its GitHub repository (no version pin).
!pip install git+https://github.com/PyTorchLightning/pytorch-lightning

In [None]:
# Install the Hugging Face model library and the SentencePiece backend.
# NOTE(review): the PyPI package `tokenizer` appears to be a different project from
# Hugging Face `tokenizers` — confirm whether this line is needed.
!pip install  transformers
!pip install  tokenizer
!pip install sentencepiece


In [None]:
# NOTE(review): `pytorch.lightning` does not look like an apt package name, and
# Lightning is already installed with pip in an earlier cell — confirm this cell is needed.
!apt install pytorch.lightning

In [None]:
import pandas as pd
import numpy as np


import re
import time
import random
import logging        # logging API 
import json           #provides a simple command line interface to validate and pretty-print JSON objects
import  os
import glob          # the glob module is used to retrieve files/pathnames matching a specified pattern
import argparse      #Parser for command-line options, arguments and sub-command
import textwrap


from sklearn.model_selection import train_test_split   
from string import punctuation        
from itertools import chain    #an iterator that returns elements from the first iterable until it is exhausted, then proceeds to the next iterable,just like a chain
from pathlib import Path     # provides Object-oriented filesystem paths
from torch.utils.data import Dataset, DataLoader
from termcolor import colored  # module for ANSII Color formatting for output in the terminal
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)


In [None]:
# Seed the pseudo-random number generators of pytorch, numpy and python.random with a fixed value.

pl.seed_everything(42)

In [None]:
from zipfile import ZipFile

In [None]:
# Unpack the QA archive into its own folder.
# NOTE(review): paths are relative to the working directory; presumably Google Drive
# is mounted under ./drive — confirm before running.
with ZipFile('drive/MyDrive/QA.zip', 'r') as zipObj:
  zipObj.extractall('drive/MyDrive/train_BioASQ')

In [None]:
# Load one of the extracted JSON files so its structure can be inspected in the next cells.
with Path('drive/MyDrive/train_BioASQ/BioASQ/BioASQ-train-factoid-4b.json').open() as json_file:
  data = json.load(json_file)

In [None]:
# Top-level keys of the loaded JSON document.
data.keys()

In [None]:
# Value stored under the 'version' key.
data['version']

In [None]:
# Number of top-level keys (not the number of questions).
len(data)

In [None]:
# Keys of the first entry in the 'data' list.
data['data'][0].keys()

In [None]:
# Title of that first entry.
data['data'][0]['title']

In [None]:
# How many paragraph records the first entry holds.
len(data['data'][0]['paragraphs'])

In [None]:
# Keep a handle on the paragraph records of the first entry.
queries = data['data'][0]['paragraphs']

In [None]:
# Look at the second paragraph record as an example.
queries[1]

In [None]:
def extract_question_and_answers(factoid_path: Path):
  """Flatten one SQuAD-style BioASQ JSON file into a DataFrame, one row per answer.

  The file is expected to hold a top-level 'data' list whose entries each carry a
  'paragraphs' list; every paragraph has a 'context' string and a 'qas' list, and
  every QA item has a 'question' and a list of 'answers' with 'text' and
  'answer_start'.

  Args:
    factoid_path: path of the JSON file to read.

  Returns:
    A DataFrame with the columns 'question', 'context', 'answer_text',
    'answer_start' and 'answer_end'. 'answer_end' is the exclusive end offset
    (answer_start + length of the answer text). The columns are present even
    when the file yields no rows.
  """
  # JSON is UTF-8 by specification; do not depend on the platform default encoding.
  with factoid_path.open(encoding='utf-8') as json_file:
    data = json.load(json_file)

  columns = ['question', 'context', 'answer_text', 'answer_start', 'answer_end']
  data_rows = []
  # Walk every entry of 'data', not only the first one, so nothing is silently skipped.
  for article in data['data']:
    for paragraph in article['paragraphs']:
      context = paragraph['context']
      for qa in paragraph['qas']:
        question = qa['question']
        for answer in qa['answers']:
          ans_text = answer['text']
          ans_start = answer['answer_start']
          data_rows.append({
              'question': question,
              'context': context,
              'answer_text': ans_text,
              'answer_start': ans_start,
              'answer_end': ans_start + len(ans_text),
          })

  # Passing the columns explicitly keeps the schema stable for an empty result.
  return pd.DataFrame(data_rows, columns=columns)


In [None]:
# Try the extractor on a single file and preview the first rows.
extract_question_and_answers(Path('drive/MyDrive/train_BioASQ/BioASQ/BioASQ-train-factoid-4b.json')).head()

In [None]:
# Collect every extracted file whose name starts with "BioASQ-train-", sorted by path.
# NOTE(review): despite the variable name, the pattern is not limited to factoid files — confirm that is intended.
factoid_path = sorted(list(Path('drive/MyDrive/train_BioASQ/BioASQ/').glob('BioASQ-train-*')))
factoid_path

In [None]:
#Extract data into dataframes and combine all train files

# Use a distinct loop variable: reusing the name `factoid_path` would rebind the
# list to its last element and make this cell fail when it is run a second time.
dataframe = [extract_question_and_answers(train_file) for train_file in factoid_path]

#concatenate all the dataframes (the original per-file row indices are kept)
df = pd.concat(dataframe)

In [None]:
# Preview the combined rows.
df.head()

In [None]:
# (rows, columns) before de-duplication.
df.shape

In [None]:

# Keep only the first row for each distinct context, then renumber the index from 0.
# NOTE(review): rows that share a context but carry a different question or answer
# are dropped as well — confirm that is intended.
df = df.drop_duplicates(subset= ['context']).reset_index(drop=True)

In [None]:
# (rows, columns) after de-duplication.
df.shape

In [None]:
# Number of distinct questions left.
len(df.question.unique())

In [None]:
# Number of distinct contexts left.
len(df.context.unique())

In [None]:
# Pick the row at position 100 as a running example for the cells that follow.
sample_question = df.iloc[100]
sample_question

In [None]:
def highlight_ans(question):
  """Return the row's context with the answer span coloured yellow.

  Args:
    question: a row/mapping providing 'context', 'answer_start' and 'answer_end'.
      'answer_end' is an exclusive offset (answer_start + length of the answer
      text, as computed by extract_question_and_answers).

  Returns:
    The context as a single string with ANSI colour codes: white before the
    answer, yellow for the answer, white after it.
  """
  answer_start, answer_end = question['answer_start'], question['answer_end']
  context = question['context']

  # answer_end is already exclusive, so slice up to it directly; adding 1 would
  # colour one character past the answer.
  return (
      colored(context[:answer_start], "white")
      + colored(context[answer_start:answer_end], "yellow")
      + colored(context[answer_end:], "white")
  )


In [None]:
# Show the example question, then its context with the answer highlighted,
# wrapped to 100 characters per line.
print(sample_question["question"])
print()  # blank separator line (the original printed the literal text "/n")
print("Answer:" )
for wrap in textwrap.wrap(highlight_ans(sample_question), width = 100):
  print(wrap)

In [None]:
#Tokenization

# Name of the pretrained checkpoint (a string, not a model object).
model = 't5-base'


In [None]:
# Load the tokenizer that belongs to that checkpoint.
tokenizer = T5Tokenizer.from_pretrained(model)

In [None]:
# Encode a toy question/answer text pair to see what the tokenizer returns.
encoding_sample = tokenizer(
    "what is your preferred public transport?",
    "Mostly subway. Also it depends on what time of the day I have to travel." 
)

In [None]:
# Fields contained in the encoding.
encoding_sample.keys() 

In [None]:
print(encoding_sample['input_ids'])    # token ids of the encoded text pair

In [None]:
# Attention mask that accompanies the token ids.
print(encoding_sample['attention_mask']) 

In [None]:
# Decode each token id on its own, dropping special tokens.
p =[
    tokenizer.decode(input_id,skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for input_id in encoding_sample['input_ids']
]


In [None]:
# Join the decoded pieces with spaces to read them back as text.
' '.join(p)

In [None]:
# Encode the example question together with its context, padded/truncated to a
# fixed length and returned as PyTorch tensors.
# NOTE(review): 1000 is larger than the input length T5 checkpoints are usually
# configured with (512) — confirm this value is only meant for inspection.
encoding_question = tokenizer(
    sample_question["question"],
    sample_question['context'],
    max_length = 1000,
    padding= "max_length",
    truncation = True,
    return_attention_mask =True,
    add_special_tokens = True,
    return_tensors = 'pt'
)

In [None]:
# Fields contained in the encoding.
encoding_question.keys()

In [None]:
#understand the types of special tokens 
tokenizer.special_tokens_map

In [None]:
# Decode the padded ids back to text (special tokens are kept, so padding is visible).
tokenizer.decode(encoding_question['input_ids'].squeeze())

In [None]:
# Encode the example answer on its own, padded to a fixed length.
# NOTE(review): truncation is not requested here, so an answer longer than
# max_length would presumably not be cut to 100 tokens — confirm.
encoding_answer = tokenizer(
    sample_question["answer_text"],
    max_length = 100,
    padding= "max_length",
    return_attention_mask =True,
    add_special_tokens = True,
    return_tensors = 'pt'
)

In [None]:
# Decode the padded answer ids back to text.
tokenizer.decode(encoding_answer['input_ids'].squeeze())

In [None]:
# The answer's token ids are what will serve as labels.
labels = encoding_answer['input_ids']
labels

In [None]:
class BioASQDataset(Dataset):
  """Map-style dataset that tokenizes question/context/answer rows for T5.

  Each item encodes the question and context as one padded source sequence and
  the answer text as a padded target sequence used as labels.

  Args:
    data: DataFrame with the columns 'question', 'context' and 'answer_text'.
    tokenizer: tokenizer used for both source and target text.
    source_max_token_len: fixed token length of the encoded question + context.
    target_max_token_len: fixed token length of the encoded answer.
  """

  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer: T5Tokenizer,
      source_max_token_len: int = 400,
      target_max_token_len: int = 70
  ):
    self.tokenizer = tokenizer
    self.data = data
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def __len__(self):
    """Return the number of rows in the wrapped DataFrame."""
    return len(self.data)

  def __getitem__(self, index: int):
    """Return the raw texts and encoded tensors for the row at position `index`.

    Returns:
      dict with the raw 'question', 'context' and 'answer_text' strings plus
      1-D tensors 'input_ids', 'attention_mask' (length source_max_token_len)
      and 'labels' (length target_max_token_len, padding replaced by -100).
    """
    row = self.data.iloc[index]

    # Use the tokenizer handed to this dataset, not whatever global happens to
    # be named `tokenizer` in the notebook.
    encoding_source = self.tokenizer(
        row['question'],
        row['context'],
        max_length=self.source_max_token_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    # truncation=True guarantees a fixed label length; without it an answer
    # longer than target_max_token_len yields a longer tensor and the default
    # batch collation cannot stack the items.
    encoding_target = self.tokenizer(
        row['answer_text'],
        max_length=self.target_max_token_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    # Replace padding ids with -100, the label value the cross-entropy loss
    # ignores, so padded positions do not contribute to training.
    labels = encoding_target['input_ids']
    labels[labels == self.tokenizer.pad_token_id] = -100

    return dict(
        question=row['question'],
        context=row['context'],
        answer_text=row['answer_text'],
        input_ids=encoding_source['input_ids'].flatten(),   # (1, L) -> (L,)
        attention_mask=encoding_source['attention_mask'].flatten(),
        labels=labels.flatten()
    )

In [None]:
sample_data = BioASQDataset(df, tokenizer)


In [None]:
for data in sample_data:
  print(data['question'])
  print(data['answer_text'])
  print(data['input_ids'][:20])
  print(data['labels'][:20])
  break

In [None]:
df_train, df_val = train_test_split(df, test_size= 0.10)   #test size default
df_train.shape

In [None]:
df_val.shape

In [None]:
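# Background note: whitening (sphering) is a data pre-processing step that removes
# correlation or dependencies between the features of a dataset, which may help
# to train a machine learning model. It is not applied by the data module defined
# below; that class only wraps the train/validation DataFrames in DataLoaders.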

In [None]:
# Lightning data module that builds the training and validation/test datasets
# and serves them through DataLoaders.
class BioASQDatamodule(pl.LightningDataModule):
  """Create BioASQDataset instances from two DataFrames and expose their loaders.

  Args:
    df_train: rows used for training.
    df_test: rows used for both validation and testing.
    tokenizer: tokenizer forwarded to every BioASQDataset.
    batch_size: batch size of the training loader (validation and test loaders
      always use a batch size of 1).
    source_max_token_len: token length of the encoded question + context.
    target_max_token_len: token length of the encoded answer.
    num_workers: number of worker processes used by every DataLoader.
  """

  def __init__(
    self,
    df_train: pd.DataFrame,
    df_test: pd.DataFrame,
    tokenizer: T5Tokenizer,
    batch_size: int = 16,
    source_max_token_len: int = 400,
    target_max_token_len: int = 70,
    num_workers: int = 4
  ):
    super().__init__()
    self.df_train = df_train
    self.df_test = df_test
    self.batch_size = batch_size
    self.tokenizer = tokenizer
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len
    self.num_workers = num_workers

  def frame(self):
    """Build the training and validation/test datasets from the stored DataFrames."""
    self.train_dataset = BioASQDataset(
        self.df_train,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
    )

    self.test_data = BioASQDataset(
        self.df_test,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
    )

  def setup(self, stage=None):
    """Standard Lightning hook; delegates to frame().

    Lets a Trainer build the datasets itself, so the module also works when
    frame() has not been called by hand. Calling frame() manually remains valid.
    """
    self.frame()

  def train_dataloader(self):
    """Return the shuffled training loader using the configured batch size."""
    return DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=self.num_workers
    )

  def val_dataloader(self):
    """Return the validation loader (one example per batch, original order)."""
    return DataLoader(
        self.test_data,
        batch_size=1,
        num_workers=self.num_workers
    )

  def test_dataloader(self):
    """Return the test loader; it reads the same dataset as the validation loader."""
    return DataLoader(
        self.test_data,
        batch_size=1,
        num_workers=self.num_workers
    )
                     

In [None]:
# Batch size handed to the data module (used by its training loader).
Batch_size = 8
# Not used in this cell; presumably the number of training epochs — confirm against the trainer setup.
N_Epochs =6
# The validation split doubles as the module's test data.
DataModule = BioASQDatamodule(df_train, df_val, tokenizer, batch_size= Batch_size)


# Build the train/validation datasets that the module's dataloaders read from.
DataModule.frame()