<a href="https://colab.research.google.com/github/Arun-nexus/deep_learning/blob/main/question_answering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
# Load the training CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell will not
# run on another machine (or through the Colab badge) without editing it.
df=pd.read_csv(r"C:\Users\Arun\Downloads\train.csv\train.csv")
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,question,distractor3,distractor1,distractor2,correct_answer,support
0,What type of organism is commonly used in prep...,viruses,protozoa,gymnosperms,mesophilic organisms,"Mesophiles grow best in moderate temperature, ..."
1,What phenomenon makes global winds blow northe...,tropical effect,muon effect,centrifugal effect,coriolis effect,Without Coriolis Effect the global winds would...
2,Changes from a less-ordered state to a more-or...,endothermic,unbalanced,reactive,exothermic,Summary Changes of state are examples of phase...
3,What is the least dangerous radioactive decay?,zeta decay,beta decay,gamma decay,alpha decay,All radioactive decay is dangerous to living t...
4,Kilauea in hawaii is the world’s most continuo...,magma,greenhouse gases,carbon and smog,smoke and ash,Example 3.5 Calculating Projectile Motion: Hot...
...,...,...,...,...,...,...
11674,The enzyme pepsin plays an important role in t...,lipids,protons,proteins,peptides,Protein A large part of protein digestion take...
11675,What remains a constant of radioactive substan...,acidity,temperature,volatility,rate of decay,The rate of decay of a radioactive substance i...
11676,"Terrestrial ecosystems, also known for their d...",substrates,bisomes,monomes,biomes,"Terrestrial ecosystems, also known for their d..."
11677,High explosives create shock waves that exceed...,turbulence,light speed,ion speed,supersonic,The modern day formulation of gun powder is ca...


In [None]:
# Replace every missing value in the frame with the literal string "<pad>".
# inplace=True mutates df, so the frame displayed by the earlier cell is now stale.
df.fillna("<pad>",inplace=True)

In [None]:
# Model inputs: the "question" column as a Python list of strings.
x=df["question"].to_list()
# Model targets: the "support" column as a Python list of strings.
y=df["support"].to_list()

In [None]:
from nltk import word_tokenize
def tokenize(text):
    """Word-tokenize every string in ``text`` with NLTK's ``word_tokenize``.

    Args:
        text: iterable of strings.

    Returns:
        A list holding one ``word_tokenize`` result per input string, in input order.
    """
    tokenized_rows=[]
    for sentence in text:
        tokenized_rows.append(word_tokenize(sentence))
    return tokenized_rows
# One token list per question / per support passage.
x_tokens=tokenize(x)
y_tokens=tokenize(y)

In [None]:
def sequence(tokens):
    """Flatten a list of token lists into one flat list, preserving order.

    Args:
        tokens: iterable of iterables of tokens.

    Returns:
        A single list containing every token of every row, row by row.
    """
    return [word for row in tokens for word in row]
# Flat token streams over the whole corpus; used below to build the vocabularies.
x_seq=sequence(x_tokens)
y_seq=sequence(y_tokens)

In [None]:
from collections import Counter
def dict_creater(tokens):
    """Build a token -> integer id vocabulary.

    Ids 0 and 1 are reserved for "<pad>" and "<unk>". Every other distinct token
    receives the next free id in order of first appearance in ``tokens``.

    Args:
        tokens: iterable of hashable tokens.

    Returns:
        dict mapping each token to its id.
    """
    vocab={"<pad>":0,"<unk>":1}
    for token in Counter(tokens):
        # setdefault leaves already-known tokens (including the reserved ones) untouched.
        vocab.setdefault(token,len(vocab))
    return vocab
# Separate vocabularies for the question side (x) and the support side (y).
x_dict=dict_creater(x_seq)
y_dict=dict_creater(y_seq)
# Vocabulary sizes, including the reserved "<pad>" and "<unk>" entries.
y_vocab_size=len(y_dict)
x_vocab_size=len(x_dict)


In [None]:
import torch
# Fixed sequence length: every encoded row is padded or truncated to this many ids.
max_len = 20

def data_creator(tokens, dicti, max_len):
    """Encode token rows as fixed-length lists of vocabulary ids.

    Tokens missing from ``dicti`` map to the "<unk>" id. Rows shorter than
    ``max_len`` are right-padded with the "<pad>" id; longer rows are truncated.

    Args:
        tokens: iterable of token lists.
        dicti: token -> id mapping containing "<unk>" and "<pad>".
        max_len: length of every returned row.

    Returns:
        List of id lists, one per input row.
    """
    encoded_rows = []
    for row in tokens:
        ids = []
        for word in row:
            ids.append(dicti.get(word, dicti["<unk>"]))
        shortfall = max_len - len(ids)
        if shortfall >= 0:
            ids = ids + [dicti["<pad>"]] * shortfall
        else:
            ids = ids[:max_len]
        encoded_rows.append(ids)
    return encoded_rows

# Encode questions and support passages with their own vocabularies, both to max_len ids.
x_data = data_creator(x_tokens, x_dict, max_len)
y_data = data_creator(y_tokens, y_dict, max_len)


In [None]:
# Convert the id lists to int64 tensors (rebinds x_data / y_data from lists to tensors).
y_data=torch.tensor(y_data,dtype=torch.long)
x_data=torch.tensor(x_data,dtype=torch.long)

In [None]:
import  torch.nn as nn
import math
class positional_encooding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    Even feature columns hold sin(pos * freq) and odd columns hold cos(pos * freq).
    The table is precomputed once and stored as the non-trainable buffer ``pe`` of
    shape (1, max_len, d_model).

    Args:
        max_len: number of positions to precompute; inputs passed to ``forward``
            must not be longer than this along dimension 1.
        d_model: embedding width (odd or even).
    """

    def __init__(self,max_len,d_model):
        super().__init__()

        pe=torch.zeros(max_len,d_model)
        pos=torch.arange(0,max_len).unsqueeze(1)
        div_term=torch.exp(torch.arange(0,d_model,2)*(-math.log(10000.0)/d_model))
        angles=pos*div_term
        pe[:,0::2]=torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine columns,
        # so trim the angle table to the number of odd-indexed columns.
        pe[:,1::2]=torch.cos(angles[:,:d_model//2])
        pe=pe.unsqueeze(0)
        self.register_buffer("pe",pe)

    def forward(self,x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions."""
        return x+self.pe[:,:x.size(1)]


In [None]:
class attention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: width of the input and output features; must be divisible by
            ``num_heads``.
        num_heads: number of attention heads.
    """

    def __init__(self,d_model,num_heads):
        super().__init__()

        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.num_heads = num_heads

        self.q_linear = nn.Linear(d_model,d_model)
        self.k_linear = nn.Linear(d_model,d_model)
        self.v_linear = nn.Linear(d_model,d_model)
        self.out = nn.Linear(d_model,d_model)

    def forward(self,q,k,v,mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q, k, v: tensors of shape (batch, seq_len, d_model); ``k`` and ``v``
                must share their sequence length.
            mask: optional tensor broadcastable to
                (batch, num_heads, q_len, k_len). Positions where ``mask == 0``
                are excluded from attention. A query row whose keys are all
                masked yields NaN after the softmax.

        Returns:
            Tensor of shape (batch, q_len, d_model).
        """
        batch_size=q.shape[0]

        # Each input gets its own projection (previously all three used q_linear).
        q = self.q_linear(q).view(batch_size,-1,self.num_heads,self.d_k).transpose(1,2)
        k = self.k_linear(k).view(batch_size,-1,self.num_heads,self.d_k).transpose(1,2)
        v = self.v_linear(v).view(batch_size,-1,self.num_heads,self.d_k).transpose(1,2)

        scores = torch.matmul(q,k.transpose(-2,-1))/math.sqrt(self.d_k)

        if mask is not None:
            # -inf (not +inf) so masked positions receive zero weight after softmax.
            scores=scores.masked_fill(mask==0,float("-inf"))

        attn=torch.softmax(scores,dim=-1)
        output=torch.matmul(attn,v)
        # Merge the heads back into a single d_model-wide feature dimension.
        output=output.transpose(1,2).contiguous().view(batch_size,-1,self.num_heads*self.d_k)

        return self.out(output)


In [None]:
class feedforward(nn.Module):
    """Position-wise MLP: Linear(d_model, d_ff) -> ReLU -> Dropout -> Linear(d_ff, d_model).

    Args:
        d_model: input and output feature width.
        d_ff: hidden width.
        drop: dropout probability applied after the activation.
    """

    def __init__(self,d_model,d_ff,drop):
        super().__init__()
        stages=[
            nn.Linear(d_model,d_ff),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(d_ff,d_model),
        ]
        self.features = nn.Sequential(*stages)

    def forward(self,x):
        """Apply the MLP to the last dimension of ``x``."""
        projected=self.features(x)
        return projected

In [None]:
class trasnformerencoder(nn.Module):
    """One post-norm encoder layer: self-attention then feed-forward, each with a
    residual connection, dropout and LayerNorm.

    Args:
        d_model: feature width.
        num_heads: number of attention heads.
        d_ff: hidden width of the feed-forward sub-layer.
        drop: dropout probability.
    """

    def __init__(self,d_model,num_heads,d_ff,drop=0.2):
        super().__init__()
        self.attn=attention(d_model,num_heads)
        self.ff=feedforward(d_model,d_ff,drop)
        self.norm1=nn.LayerNorm(d_model)
        self.norm2=nn.LayerNorm(d_model)
        self.dropout=nn.Dropout(drop)

    def forward(self,x,mask=None):
        """Run the layer on ``x``; ``mask`` is forwarded to self-attention."""
        # Sub-layer 1: self-attention + residual + norm.
        x2=self.norm1(x+self.dropout(self.attn(x,x,x,mask)))
        # Sub-layer 2: feed-forward + residual + norm. The feed-forward output is
        # added directly; it previously went through self.attn a second time.
        x3=self.ff(x2)
        x=self.norm2(x2+self.dropout(x3))
        return x

In [None]:
class transformer_decoder(nn.Module):
    """One post-norm decoder layer: self-attention, cross-attention and
    feed-forward, each followed by dropout, a residual connection and LayerNorm.

    Args:
        d_model: feature width.
        num_heads: number of attention heads.
        d_ff: hidden width of the feed-forward sub-layer.
        drop: dropout probability.
    """

    def __init__(self,d_model,num_heads,d_ff,drop=0.2):
        super().__init__()

        self.attn=attention(d_model,num_heads)
        self.cross_attnn=attention(d_model,num_heads)
        self.ff=feedforward(d_model,d_ff,drop)
        self.norm1=nn.LayerNorm(d_model)
        self.norm2=nn.LayerNorm(d_model)
        self.norm3=nn.LayerNorm(d_model)
        self.dropout=nn.Dropout(drop)

    def forward(self,src,tgt,x,src_mask=None,mask=None):
        """Run the layer.

        Args:
            src: tensor used as the keys of cross-attention.
            tgt: tensor used as the values of cross-attention.
            x: decoder-side sequence; it is self-attended and provides the
                cross-attention queries.
            src_mask: optional mask for cross-attention.
            mask: optional mask for self-attention.

        Returns:
            The transformed decoder-side sequence.
        """
        self_attn=self.attn(x,x,x,mask)
        norm_attn=self.norm1(x+self.dropout(self_attn))
        cross_attn=self.cross_attnn(norm_attn,src,tgt,src_mask)
        # Each residual continues from the previous sub-layer's output. Adding the
        # raw input x here would drop the self-attention result from the stream.
        cross_norm=self.norm2(norm_attn+self.dropout(cross_attn))
        ff=self.ff(cross_norm)
        ff_norm=self.norm3(cross_norm+self.dropout(ff))

        return ff_norm

In [None]:
# Number of encoded question rows (size of the first dimension of x_data).
x_data.shape[0]

11679

In [None]:
class transformer(nn.Module):
    """Encoder-decoder transformer over token ids.

    Args:
        d_model: embedding / feature width.
        xvocab_size: size of the source vocabulary.
        yvocab_size: size of the target vocabulary.
        max_len: number of positions in the positional-encoding table.
        num_heads: attention heads per layer (must divide ``d_model``).
        d_ff: hidden width of the feed-forward sub-layers.
        drop: dropout probability.
        num_layers: number of encoder layers and of decoder layers.
        num_classes: width of the output projection.
    """

    def __init__(self,d_model,xvocab_size=10,yvocab_size=10,max_len=10,num_heads=10,d_ff=1024,drop=0.2,num_layers=6,num_classes=8):
        super().__init__()

        self.src_embedding=nn.Embedding(xvocab_size,d_model)
        self.tgt_embedding=nn.Embedding(yvocab_size,d_model)

        self.pe=positional_encooding(max_len,d_model)

        # Build a fresh module per layer. Repeating one instance in the list would
        # make every layer share the same weights.
        self.decoder_layers=nn.ModuleList(
            [transformer_decoder(d_model,num_heads,d_ff,drop) for _ in range(num_layers)]
        )
        self.encoder_layers=nn.ModuleList(
            [trasnformerencoder(d_model,num_heads,d_ff,drop) for _ in range(num_layers)]
        )
        self.fc_out=nn.Linear(d_model,num_classes)

    def forward(self,src,tgt,src_mask=None,tgt_mask=None):
        """Compute output scores for every target position.

        Args:
            src: source token ids, indexed as (batch, src_len).
            tgt: target-side input token ids, indexed as (batch, tgt_len).
            src_mask: optional mask passed to encoder self-attention and to
                decoder cross-attention.
            tgt_mask: optional mask passed to decoder self-attention.

        Returns:
            Tensor of shape (batch, tgt_len, num_classes).
        """
        x=self.pe(self.src_embedding(src))
        y=self.pe(self.tgt_embedding(tgt))

        for layers in self.encoder_layers:
            x=layers(x,src_mask)

        for layers in self.decoder_layers:
            # The decoder layer's signature is (src, tgt, x, src_mask, mask): the
            # encoder output supplies cross-attention keys and values, and the
            # target stream goes in as x. Keywords keep the slots explicit.
            y=layers(src=x,tgt=x,x=y,src_mask=src_mask,mask=tgt_mask)

        return self.fc_out(y)

In [None]:
from torch.utils.data import TensorDataset,DataLoader
from sklearn.model_selection import train_test_split
# Pair each encoded question with its encoded support passage.
dataset=TensorDataset(x_data,y_data)
# 70/30 train/validation split; random_state fixes the split for reproducibility.
# train_test_split is applied to the TensorDataset object itself.
train_data,val_data=train_test_split(dataset,test_size=0.3,random_state=42)
# Batches of 256; shuffle is left at the DataLoader default for both loaders.
train_data_loader=DataLoader(train_data,batch_size=256)
val_data_loader=DataLoader(val_data,batch_size=256)

In [None]:
# Use the GPU when one is available, otherwise the CPU.
device=("cuda" if torch.cuda.is_available() else "cpu")
# The output layer has one class per target-vocabulary entry (num_classes=len(y_dict)).
# max_len=200 sizes the positional-encoding table; the encoded sequences built above are 20 ids long.
model=transformer(xvocab_size=x_vocab_size,yvocab_size=y_vocab_size,d_model=512,max_len=200,num_layers=6,num_heads=8,d_ff=2048,num_classes=len(y_dict),drop=0.2).to(device)
# NOTE(review): no ignore_index is set, so padded target positions (id 0) contribute to the loss — confirm this is intended.
model_loss=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=0.0001)
# Halves the learning rate when the monitored metric has not decreased for more than `patience` epochs.
scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,mode="min",factor=0.5,patience=2)

In [None]:
import torch.nn as nn
class early_stopping:
    """Signals when the validation loss has stopped improving.

    Call the instance once per epoch with the current validation loss. When the
    loss fails to beat the best recorded loss by more than ``min_delta`` for
    ``patience`` consecutive calls, ``early_stop`` becomes True.

    This is a plain helper with no parameters or buffers, so it does not need to
    be an ``nn.Module``.

    Args:
        min_delta: minimum decrease in loss that counts as an improvement.
        patience: number of consecutive non-improving calls tolerated.
    """

    def __init__(self,min_delta,patience):
        self.min_delta=min_delta
        self.patience=patience
        self.early_stop=False
        self.best_loss=float("inf")
        self.counter=0

    @property
    def partience(self):
        """Backward-compatible alias for the original (misspelled) attribute name."""
        return self.patience

    def __call__(self,val_loss):
        """Record ``val_loss`` and update ``counter`` / ``early_stop``."""
        # Improvement means the loss dropped by MORE than min_delta. The previous
        # test (min_delta >= val_loss - best_loss) also accepted slightly worse
        # losses and overwrote best_loss with them.
        if self.best_loss-val_loss > self.min_delta:
            self.counter=0
            self.best_loss=val_loss
        else:
            self.counter+=1
            if self.counter>=self.patience:
                self.early_stop=True

In [None]:
from torch.amp import GradScaler,autocast
# Training configuration and per-epoch history.
epochs=50
stopper=early_stopping(1e-4,4)
training_loss_store=[]
validation_loss_store=[]
training_accuracy_store=[]
validation_accuracy_store=[]

# Mixed precision is only enabled on CUDA; on CPU autocast and the scaler are no-ops.
amp_device=torch.device(device).type
use_amp=amp_device=="cuda"
scaler=GradScaler(enabled=use_amp)

# NOTE(review): the model is called without src/tgt masks, so decoder self-attention
# can see future target tokens during teacher forcing — confirm whether a causal
# mask should be passed here.
for epoch in range(epochs):
    # ---- training ----
    # Re-enable dropout every epoch; model.eval() below would otherwise stick.
    model.train()
    running,total,correct=0.0,0,0
    # Batch names avoid overwriting the global x / y lists and the builtin input().
    for src_batch,tgt_batch in train_data_loader:
        src_batch,tgt_batch=src_batch.to(device),tgt_batch.to(device)

        # Teacher forcing: feed tokens [0, T-1) and predict tokens [1, T).
        decoder_input=tgt_batch[:,:-1]
        target=tgt_batch[:,1:]

        optimizer.zero_grad()
        with autocast(amp_device,enabled=use_amp):
            output=model(src_batch,decoder_input)
            output=output.reshape(-1,output.size(-1))
            tgt_output=target.reshape(-1)
            loss=model_loss(output,tgt_output)
        scaler.scale(loss).backward()
        # Unscale first so the clipping threshold applies to the true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        running+=loss.item()
        pred=output.argmax(dim=1)
        # Token-level accuracy against the shifted, flattened targets.
        correct+=(pred==tgt_output).sum().item()
        total+=tgt_output.size(0)

    # Mean of the per-batch losses.
    training_loss=running/len(train_data_loader)
    training_acc=correct/total
    training_loss_store.append(training_loss)
    training_accuracy_store.append(training_acc)

    # ---- validation ----
    model.eval()
    vrunning,vtotal,vcorrect=0.0,0,0
    with torch.no_grad():
        for vsrc,vtgt in val_data_loader:
            vsrc,vtgt=vsrc.to(device),vtgt.to(device)
            # Same shifting as in training, taken from the target sequence.
            vinput=vtgt[:,:-1]
            vtarget=vtgt[:,1:]
            with autocast(amp_device,enabled=use_amp):
                voutput=model(vsrc,vinput)
                voutput=voutput.reshape(-1,voutput.size(-1))
                vtgt_output=vtarget.reshape(-1)
                vloss=model_loss(voutput,vtgt_output)

            vrunning+=vloss.item()
            vpred=voutput.argmax(dim=1)
            vcorrect+=(vpred==vtgt_output).sum().item()
            vtotal+=vtgt_output.size(0)
    val_loss=vrunning/len(val_data_loader)
    val_acc=vcorrect/vtotal
    validation_loss_store.append(val_loss)
    validation_accuracy_store.append(val_acc)

    # ReduceLROnPlateau only acts when it is stepped with the monitored metric.
    scheduler.step(val_loss)

    print(f"epoch: {epoch+1} training_loss: {training_loss:.4f} training acc:{training_acc:.2f} validation_loss: {val_loss:.4f} validation_acc: {val_acc:.2f} ")
    stopper(val_loss)
    if stopper.early_stop:
        print("early_stopping triggered")
        break