# Baseline Solution v0.0.1

In [None]:
# "Magic" cell: appends 100,000,000-character strings to a list inside an endless
# loop, so it never finishes on its own and memory use keeps growing.
# NOTE(review): presumably a deliberate trick to exhaust the Colab runtime's RAM so
# that a larger runtime is offered -- confirm. It has to be skipped for "Run All".
i = []
while(True):
  i.append("1"*100000000)

# Packages

In [None]:
from google.colab import files

# Ask the user to upload files through google.colab's `files.upload()`; the result
# is indexed by file name below.
uploaded = files.upload()

# Report every uploaded file name together with the length of its content.
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))


# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 67 bytes


In [None]:
!kaggle datasets download -d bminixhofer/roberta-transferred-to-hindi-tamil-with-wechsel

Downloading roberta-transferred-to-hindi-tamil-with-wechsel.zip to /content
100% 2.31G/2.31G [00:16<00:00, 135MB/s]
100% 2.31G/2.31G [00:16<00:00, 149MB/s]


In [None]:
!unzip roberta-transferred-to-hindi-tamil-with-wechsel

Archive:  roberta-transferred-to-hindi-tamil-with-wechsel.zip
  inflating: roberta-large-wechsel-hindi/config.json  
  inflating: roberta-large-wechsel-hindi/pytorch_model.bin  
  inflating: roberta-large-wechsel-hindi/special_tokens_map.json  
  inflating: roberta-large-wechsel-hindi/tokenizer.json  
  inflating: roberta-large-wechsel-hindi/tokenizer_config.json  
  inflating: roberta-large-wechsel-hindi/training_args.bin  
  inflating: roberta-large-wechsel-hindi/vocab.txt  
  inflating: roberta-large-wechsel-tamil/config.json  
  inflating: roberta-large-wechsel-tamil/pytorch_model.bin  
  inflating: roberta-large-wechsel-tamil/special_tokens_map.json  
  inflating: roberta-large-wechsel-tamil/tokenizer.json  
  inflating: roberta-large-wechsel-tamil/tokenizer_config.json  
  inflating: roberta-large-wechsel-tamil/training_args.bin  
  inflating: roberta-large-wechsel-tamil/vocab.txt  


In [None]:
%%writefile setup.sh

git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./

Writing setup.sh


In [None]:
!sh setup.sh

In [None]:
!pip install sentencepiece
!pip install transformers[sentencepiece]
!pip install transformers
!pip install optuna
#!pip install pyngrok

In [None]:
# !unzip drive/MyDrive/Kaggle/Hindi/checkpoints/5foldsroberta.zip

In [None]:
%pdb on
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.utils as utils
from transformers import AutoModel,AutoConfig,AutoTokenizer,logging
import transformers
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter
import os
import re
import random
from tqdm import tqdm
import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings("ignore", category=UserWarning)                                     
logging.set_verbosity_warning()
logging.set_verbosity_error()
# apex is an optional dependency: APEX records whether mixed-precision training
# through `apex.amp` is available (Config.fp16 is initialised from this flag).
try:
    from apex import amp
    APEX = True
except Exception:
    # `Exception` instead of a bare `except:` so that KeyboardInterrupt/SystemExit
    # are not swallowed while any import-time failure of apex still disables fp16.
    APEX = False

Automatic pdb calling has been turned ON


In [None]:
# %reload_ext tensorboard
# %tensorboard --logdir drive/MyDrive/Kaggle/Hindi/logs/train_loss

# Data Base

In [None]:
base = 'drive/MyDrive/Kaggle/Hindi/'
#train = pd.read_csv(base+"input/simple_corpus.csv")
#valid = train.loc[train["kfold"]==1,:]
#train = train.loc[train["kfold"]!=1,:]
#test = pd.read_csv(base+"input/test.csv")

# Config

In [None]:
class Config:
  """Global settings read (and partly mutated) as class attributes by the notebook.

  Nothing instantiates this class in the visible cells; `Config.fold` and
  `Config.best_acc` are overwritten at run time by `run()` and `QA_Trainer.fit()`.
  """
  # random
  seed = 2021  # passed to fix_all_seeds() and to StratifiedKFold(random_state=...)

  # preprocessing
  # Path handed to AutoTokenizer / AutoConfig / AutoModel.from_pretrained.
  model_name = "/content/drive/MyDrive/Kaggle/Hindi/models/pretrained/checkpoint-60000"
  model_type = 'roberta'  # NOTE(review): not read by any visible cell

  # tokenize
  doc_stride = 135   # `stride` argument of the tokenizer call in tokenize()
  max_seq_len = 400  # `max_length` argument of the tokenizer call in tokenize()

  # model
  hidden_dropout_prob = 0.1  # NOTE(review): only referenced from commented-out code
  initializer_range = 0.02   # std of the normal init applied by the `_init_weights` helpers
  hidden_size = 1024         # input width of the QA head (multiples of it in the extra heads)
  max_query_length=64        # NOTE(review): not read by any visible cell

  # train
  n_epochs = 2
  # NOTE(review): `optimizer_mode` is not read by the visible code; qa_optimizer()
  # always builds transformers.AdamW.
  optimizer_mode = "ChildTuning-F"
  learning_rate = 2e-5
  # down_specific / down_lr are only read by get_optimizer_grouped_parameters(),
  # whose call inside qa_optimizer() is commented out.
  down_specific = True
  down_lr = 3e-5
  epsilon = 1e-8       # `eps` of AdamW
  correct_bias = True  # `correct_bias` of AdamW
  weight_decay = 1e-2
  # NOTE(review): `LR_schedule` is not read; qa_scheduler() always creates the
  # cosine-with-warmup schedule, which disagrees with this value.
  LR_schedule = 'linear-warmup'
  warm_frac = 0.2      # fraction of the training steps used as warm-up
  log_steps = 100      # QA_Trainer prints running loss statistics every `log_steps` steps
  upload_steps = 100   # NOTE(review): not read by any visible cell
  train_batch_size = 4
  valid_batch_size = 128
  acc_gradient_steps = 2  # gradient-accumulation factor used by QA_Trainer
  valid_intervals = 500   # NOTE(review): only referenced from commented-out code
  # Early-stopping budget, counted in per-epoch validations without improvement.
  # NOTE(review): with n_epochs = 2 the counter cannot reach 5.
  patience = 5


  # cuda 
  device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
  fp16 = APEX           # use apex.amp whenever the apex import succeeded
  fp16_opt_level = "O1" # `opt_level` passed to amp.initialize

  # log
  train_loss_path = 'drive/MyDrive/Kaggle/Hindi/logs/train_loss'  # SummaryWriter directory of Metric
  valid_loss_path = 'drive/MyDrive/Kaggle/Hindi/logs/valid_loss'  # NOTE(review): not read by any visible cell
  best_loss = 1  # NOTE(review): only referenced from commented-out code
  best_acc = 0   # best validation Jaccard so far; updated by QA_Trainer.fit()
  fold = 0       # current fold index; set by run()
  # Checkpoint path template: validation() substitutes the fold number into "fold_<digit>".
  checkpoint = "output/fold_0.bin"
  



In [None]:
def fix_all_seeds(seed):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) random generators with `seed`.

    Also exports the value as the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Text Processing

Text Preprocessing

In [None]:
def text_preprocess(df):
  """Return a copy of `df` with whitespace normalised in "context" and "question".

  Every run of whitespace is collapsed into one space and leading/trailing
  whitespace is removed; the input frame itself is left untouched.
  """
  def collapse_whitespace(text):
    return " ".join(text.split())

  cleaned = df.copy()
  for column in ("context", "question"):
    cleaned[column] = cleaned[column].apply(collapse_whitespace)
  return cleaned

# Tokenize

In [None]:
def tokenize(tokenizer,df):
  """Encode one question/context pair with `tokenizer` and return its output.

  `df` is indexed with "question" and "context". Only the second sequence (the
  context) may be truncated; overflowing tokens are returned as extra windows
  using a stride of Config.doc_stride, every window is padded to
  Config.max_seq_len, and offset mappings are requested.
  """
  encode_options = dict(
      padding='max_length',
      truncation="only_second",
      stride=Config.doc_stride,
      max_length=Config.max_seq_len,
      return_overflowing_tokens=True,
      return_offsets_mapping=True,
  )
  return tokenizer(df["question"], df["context"], **encode_options)

# Make Examples

In [None]:
def make_examples(df):
  """Turn every row of `df` into one example dict per tokenizer window.

  Rows are tokenized one at a time with `tokenize`, which may return several
  overflowing windows per row. When `df` has an "answer_text" column each example
  additionally gets "answer_start"/"answer_end" token indices for its window; a
  window that does not fully contain the answer is labelled with the position of
  the CLS token. Without that column only the model inputs and metadata are stored.

  NOTE: the "question" column of the passed `df` is overwritten in place
  (left-stripped), so the caller's frame is modified.

  Returns a list of dicts.
  """
  tokenizer = AutoTokenizer.from_pretrained(Config.model_name)
  examples = []
  df["question"] = df["question"].str.lstrip()

  # train 
  if "answer_text" in df.columns:
    for i in range(df.shape[0]):
      tokens_info = tokenize(tokenizer,df.iloc[i,:])
      # One example per window produced for this row.
      for j in range(len(tokens_info["input_ids"])):
        example = {}
        #example["id"] = df["id"].iloc[i]
        example["context"] = df["context"].iloc[i]
        example["question"] = df["question"].iloc[i]
        example["example_id"] = df["id"].iloc[i]
        example["input_ids"] = tokens_info["input_ids"][j]
        example["attention_mask"] = tokens_info["attention_mask"][j]
        example["offset_mapping"] = tokens_info["offset_mapping"][j]
        # Stored copy of the sequence ids with None entries replaced by 0.
        example["sequence_ids"] = [0 if i == None else i for i in tokens_info.sequence_ids(j)]

        cls_index = tokens_info["input_ids"][j].index(tokenizer.cls_token_id)
        # Raw sequence ids (None entries kept) used for the span search below.
        sequence_ids = tokens_info.sequence_ids(j)

        # NOTE(review): `sample_index` is never read afterwards.
        sample_index = tokens_info["overflow_to_sample_mapping"][j]

        # if len([df["answer_start"].iloc[i]) == 0:
        #   example["answer_start"] = cls_index
        #   example["answer_end"] = cls_index
        # else:
        # Character span of the answer, derived from the row's answer_start and
        # the length of its answer_text.
        start_char = df["answer_start"].iloc[i]
        end_char = start_char + len(df["answer_text"].iloc[i])

        # First token of this window whose sequence id is 1.
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
          token_start_index += 1

        # Last token of this window whose sequence id is 1.
        token_end_index = len(example["input_ids"]) - 1
        while sequence_ids[token_end_index] != 1:
          token_end_index -= 1

        # The answer is not completely inside this window: label it with CLS.
        if not (example["offset_mapping"][token_start_index][0] <= start_char and example["offset_mapping"][token_end_index][1] >= end_char):
          example["answer_start"] = cls_index
          example["answer_end"] = cls_index
        else:
          # Advance past every token starting at or before start_char; the token
          # just before the stopping point becomes the answer start.
          while token_start_index < len(example["offset_mapping"]) and example["offset_mapping"][token_start_index][0] <= start_char:
            token_start_index += 1
          example["answer_start"] = token_start_index - 1
          # Walk backwards past every token ending at or after end_char; the token
          # just after the stopping point becomes the answer end.
          while example["offset_mapping"][token_end_index][1] >= end_char:
            token_end_index -= 1
          example["answer_end"] = token_end_index + 1

        examples.append(example)
 
  # test
  else:
     for i in range(df.shape[0]):
      tokens_info = tokenize(tokenizer,df.iloc[i,:])
      for j in range(len(tokens_info["input_ids"])):
        example = {}
        #example["id"] = df["id"].iloc[i]
        example["example_id"] = df["id"].iloc[i]
        example["context"] = df["context"].iloc[i]
        example["question"] = df["question"].iloc[i]
        example["input_ids"] = tokens_info["input_ids"][j]
        # Sequence ids with None entries replaced by 0.
        example["sequence_ids"] = [0 if i == None else i for i in tokens_info.sequence_ids(j)]
        example["attention_mask"] = tokens_info["attention_mask"][j]
        example["offset_mapping"] = tokens_info["offset_mapping"][j]
        examples.append(example)

  return examples

# Make Dataset

In [None]:
class QA_Dataset(utils.data.Dataset):
  """Dataset over the example dicts built by `make_examples`.

  The first example decides the mode: when it has an "answer_start" key, items
  are `(inputs, labels)` pairs; otherwise a single dict is returned that also
  carries the raw context, question and sequence ids.
  """

  def __init__(self,examples):
    self.examples = examples
    # 1 -> labelled (train/valid) examples, 0 -> unlabelled (test) examples
    self.is_train = 1 if "answer_start" in self.examples[0].keys() else 0

  def __len__(self):
    return len(self.examples)

  def __getitem__(self,idx):
    record = self.examples[idx]

    # Only these fields are converted to tensors; key order follows the example.
    item = {}
    for field, value in record.items():
      if field in ("input_ids","attention_mask","offset_mapping"):
        item[field] = torch.tensor(value)
    item["id"] = record["example_id"]

    if self.is_train != 1:
      for field in ("context","question","sequence_ids"):
        item[field] = record[field]
      return item

    label = {name: torch.tensor(record[name]) for name in ("answer_start","answer_end")}
    return item,label


# Make DataLoader

In [None]:
def make_loader(*args):
  """Build DataLoaders from one or two data frames.

  With exactly two arguments `(train_df, valid_df)` a list
  `[train_loader, valid_loader]` is returned: the training loader shuffles and
  uses Config.train_batch_size, the validation loader is sequential and uses
  Config.valid_batch_size. With any other argument count a single sequential
  loader over the first argument is returned (not wrapped in a list).
  """
  def sequential_loader(frame):
    dataset = QA_Dataset(make_examples(frame))
    return utils.data.DataLoader(dataset,
                                 batch_size=Config.valid_batch_size,
                                 pin_memory=True,
                                 sampler=utils.data.SequentialSampler(dataset),
                                 drop_last=False)

  if len(args) != 2:
    return sequential_loader(args[0])

  train_set = QA_Dataset(make_examples(args[0]))
  train_loader = utils.data.DataLoader(train_set,
                                       batch_size=Config.train_batch_size,
                                       shuffle=True,
                                       pin_memory=True,
                                       drop_last=False)
  return [train_loader, sequential_loader(args[1])]
  


# Model

# Child Tuning

In [None]:
class MODEL(nn.Module):
  """Extractive question-answering model.

  A pretrained encoder loaded from Config.model_name followed by one linear layer
  that maps every token representation to two logits (answer start, answer end).
  The encoder attribute keeps the name `xlm_roberta` because parameter names, and
  therefore checkpoint keys, depend on it.
  """
  def __init__(self):
    super(MODEL, self).__init__()
    config = AutoConfig.from_pretrained(Config.model_name)
    # The configuration has to be passed by keyword: positional arguments after the
    # model name are forwarded to the model constructor as `*model_args` and are
    # not interpreted as the `config` argument.
    self.xlm_roberta = AutoModel.from_pretrained(Config.model_name,config=config)
    self.qa_outputs = nn.Linear(Config.hidden_size, 2)
    # Not applied in forward(); kept so the module layout stays unchanged.
    self.dropout = nn.Dropout(0.1)
    self._init_weights(self.qa_outputs)

  def _init_weights(self, module):
    """Initialise an nn.Linear: normal weights (std=Config.initializer_range), zero bias."""
    if isinstance(module, nn.Linear):
        module.weight.data.normal_(mean=0.0, std=Config.initializer_range)
        if module.bias is not None:
            module.bias.data.zero_()

  def forward(
        self, 
        input
    ):
        """Return `(start_logits, end_logits)` for a batch.

        `input` is a dict of keyword arguments for the encoder (the callers in
        this notebook pass "input_ids" and "attention_mask"). Both outputs have
        the shape of the encoder's last hidden state without its final dimension.
        """
        # Only the last hidden state is consumed, so the per-layer hidden states
        # are no longer requested from the encoder.
        output = self.xlm_roberta(**input)
        output = output.last_hidden_state # B*L*H

        qa_logits = self.qa_outputs(output)

        # Split the two output channels into separate start / end logits.
        start_logits, end_logits = qa_logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits


In [None]:
# class MODEL(nn.Module):
#   def __init__(self):
#     super(MODEL, self).__init__()
#     config = AutoConfig.from_pretrained(Config.model_name)
#     self.roberta = AutoModel.from_pretrained(Config.model_name,config)
#     # self.net = nn.Sequential(nn.Dropout(Config.hidden_dropout_prob),
#     #                          nn.Linear(Config.hidden_size,Config.hidden_size),
#     #                          nn.ReLU(),
#     #                          )
#     # self.net = nn.LSTM(Config.hidden_size,Config.hidden_size,num_layers=1,batch_first=True,
#     #                    bidirectional=True)
#     self.fusion_head = FusionHead()
#     # self.deform_head = DeformHead()
#     # self.cnn_head = CNNHead()
#     self.qa_outputs = nn.Linear(Config.hidden_size*8, 2)
                                  
#     # self.start_label_weight = nn.Linear(Config.max_seq_len,Config.max_seq_len)
#     # self.end_label_weight = nn.Linear(Config.max_seq_len,Config.max_seq_len)
#     self.dropout = nn.Dropout(0.1)
#     self._init_weights(self.qa_outputs)
      
#   def _init_weights(self, module):
#     if isinstance(module, nn.Linear):
#         module.weight.data.normal_(mean=0.0, std=Config.initializer_range)
#         if module.bias is not None:
#             module.bias.data.zero_()

#   def forward(
#         self, 
#         input
#     ):
#         output = self.roberta(
#             **input,output_hidden_states=True
#         )
        
#         output = output.hidden_states # B*L*H
        
#         last_8_layers = torch.cat([output[i] for i in [9,14,19,23]],dim=-1) # B*L*4H
#         #last_8_layers = self.dropout(last_8_layers)
#         #cls_8_layers = last_8_layers[:,:]
#         # cls_8_layers = cls_8_layers.repeat(1,Config.max_seq_len,1)
#         last_8_layers = self.dropout(last_8_layers)

#         fusion_output = self.fusion_head(last_8_layers)
#         # cnn_output = self.cnn_head(last_8_layers)
#         # deform_cnn = self.deform_head(cnn_output)
#         fusion_output = torch.cat([last_8_layers,fusion_output],dim=-1)

#         #sequence_output = self.dropout()
#         # sequence_output,_ = self.net(sequence_output)

#         #sequence_output = self.dropout(sequence_output)
#         #fusion_output = self.dropout(fusion_output)
#         qa_logits = self.qa_outputs(fusion_output)
        
#         start_logits, end_logits = qa_logits.split(1, dim=-1)
#         start_logits = start_logits.squeeze(-1)
#         end_logits = end_logits.squeeze(-1)

#         #del fusion_output,cls_8_layers,last_8_layers,output
#         #torch.cuda.empty_cache()
    
#         return start_logits, end_logits


Head

In [None]:
class FusionHead(nn.Module):
  """Re-weights a sequence with softmax attention scores computed over positions.

  Each position is scored by a two-layer projection (Linear -> tanh -> Linear);
  the scores are normalised with a softmax over dim=1 and multiplied back onto
  the input.
  """
  def __init__(self):
    super(FusionHead,self).__init__()
    self.q = nn.Linear(Config.hidden_size*4,512)
    self.v = nn.Linear(512,1)
    self._init_weights(self.q)
    self._init_weights(self.v)

  def forward(self,seq):
    """Return `seq` scaled position-wise by its attention score (same shape as `seq`)."""
    # seq : (B,L,H*4)
    # torch.tanh replaces nn.functional.tanh, which older PyTorch releases flag as
    # deprecated; the computed values are the same.
    score = self.v(torch.tanh(self.q(seq))) # B*L*1
    score = nn.functional.softmax(score,dim=1)
    return score*seq

  def _init_weights(self, module):
    """Initialise an nn.Linear: normal weights (std=Config.initializer_range), zero bias."""
    if isinstance(module, nn.Linear):
      module.weight.data.normal_(mean=0.0, std=Config.initializer_range)
      if module.bias is not None:
          module.bias.data.zero_()

class DeformHead(nn.Module):
  """Projects a sequence to a wider feature size and scales it by attention scores.

  `q` maps the features from 2*hidden_size to 4*hidden_size (followed by ReLU);
  `k` and `v` produce one score per position, normalised with a softmax over dim=1.
  """
  def __init__(self):
    super(DeformHead,self).__init__()
    self.q = nn.Linear(Config.hidden_size*2,Config.hidden_size*4)
    self.k = nn.Linear(Config.hidden_size*2,512)
    self.v = nn.Linear(512,1)

  def forward(self,seq):
    """Return the ReLU-activated projection of `seq`, weighted per position."""
    # seq : (B,L,2H)
    # torch.tanh replaces nn.functional.tanh, which older PyTorch releases flag as
    # deprecated; the computed values are the same.
    score = self.v(torch.tanh(self.k(seq))) # B*L*1 (one score per position)
    score = nn.functional.softmax(score,dim=1)
    return score*nn.functional.relu(self.q(seq)) # B*L*4H


In [None]:
class CNNHead(nn.Module):
  """Two parallel 1-D convolutions (kernel sizes 1 and 3) over the sequence axis.

  Both branches map 4*hidden_size channels to hidden_size channels, go through a
  ReLU and are concatenated on the feature axis.
  """
  def __init__(self):
    super(CNNHead,self).__init__()
    # input features: B*L*4H
    self.conv1 = nn.Conv1d(Config.hidden_size*4,Config.hidden_size,1,1,0) # kernel 1, no padding
    self.conv2 = nn.Conv1d(Config.hidden_size*4,Config.hidden_size,3,1,1) # kernel 3, padding 1

  def forward(self,seq):
    """Return the concatenated branch outputs for `seq` of layout (B, L, 4H)."""
    # Conv1d convolves over the last axis, so move the features to axis 1 first.
    channels_first = seq.transpose(1, 2)
    branches = []
    for conv in (self.conv1, self.conv2):
      activated = nn.functional.relu(conv(channels_first))
      # back to (B, L, channels)
      branches.append(activated.transpose(1, 2))
    x = torch.cat(branches,dim=-1)
    del branches
    return x


# Training/Testing


Loss

In [None]:
def qa_loss(start_pred,end_pred,start_ans,end_ans):
  """Mean of the cross-entropy losses of the start and end predictions.

  `start_pred` / `end_pred` are logits scored against the target indices
  `start_ans` / `end_ans`. The CUDA cache is emptied before returning.
  """
  head_losses = [
      nn.functional.cross_entropy(logits, target)
      for logits, target in ((start_pred, start_ans), (end_pred, end_ans))
  ]
  loss = (head_losses[0]+head_losses[1])/2
  del head_losses
  torch.cuda.empty_cache()
  return loss
  

Scheduler

In [None]:
def qa_scheduler(optimizer,warm_steps,num_training_steps):
  """Create the learning-rate schedule for `optimizer`.

  Returns transformers' cosine schedule with `warm_steps` warm-up steps spread
  over `num_training_steps` total steps.
  NOTE(review): Config.LR_schedule says 'linear-warmup' but is not consulted here.
  """
  return transformers.get_cosine_schedule_with_warmup(
      optimizer,
      num_warmup_steps=warm_steps,
      num_training_steps=num_training_steps,
  )


Optimizer

In [None]:
def get_optimizer_grouped_parameters(model):
    """Build per-parameter optimizer groups with layer-wise learning rates.

    The first 389 named parameters are treated as the encoder and everything from
    index 391 on as the task head; the two parameters in between are left out of
    every group (presumably the encoder's pooler -- TODO confirm; the indices are
    hard-coded for one specific architecture).

    Encoder parameters whose dotted name has a number as its fourth component get
    `Config.learning_rate * (1/0.95) ** layer_index`; all other encoder parameters
    get `Config.learning_rate`. Names containing "bias" use no weight decay.
    Head parameters get `Config.down_lr` when `Config.down_specific` is set,
    otherwise they are added as one group that uses the optimizer's defaults.

    Returns a list of parameter-group dicts.
    """
    named_parameters = list(model.named_parameters())
    bert_parameters = named_parameters[:389]
    net_parameters = named_parameters[391:]

    def make_group(name, param, lr):
        return {"params":param,
                "weight_decay":Config.weight_decay if "bias" not in name else 0.0,
                "lr":lr}

    # Multiplier per encoder layer: deeper layers train with larger rates.
    bert_increase_lr = [(1/0.95)**i for i in range(24)]

    groups = []
    for name,param in bert_parameters:
        parts = name.split(".")
        if len(parts)>=4 and str.isdigit(parts[3]):
            lr = Config.learning_rate*bert_increase_lr[int(parts[3])]
        else:
            lr = Config.learning_rate
        groups.append(make_group(name,param,lr))

    if Config.down_specific:
        for name,param in net_parameters:
            groups.append(make_group(name,param,Config.down_lr))
    else:
        # This branch used to reference `head_default_parameters`, whose definition
        # is commented out, and therefore raised NameError. Only the head ("net")
        # parameters exist now, so they form the single default group.
        net_default_parameters = [param for name,param in net_parameters]
        if net_default_parameters:
            groups.append({"params":net_default_parameters})

    return groups

In [None]:
# Execute the external childtuning.py script stored on Drive.
# NOTE(review): `execfile` is not a Python 3 builtin -- confirm the runtime provides
# it. The names that script defines are not visible in this notebook.
execfile("/content/drive/MyDrive/Kaggle/Hindi/models/childtuning.py")

def qa_optimizer(model):
  """Create the AdamW optimizer for `model`.

  Parameters whose name contains "bias" or "LayerNorm.weight" are placed in a
  group without weight decay; all others use Config.weight_decay. Learning rate,
  epsilon and bias correction come from Config.
  NOTE(review): Config.optimizer_mode ("ChildTuning-F") is not consulted here.
  """
  no_decay = ["bias", "LayerNorm.weight"]

  decayed, undecayed = [], []
  for name, param in model.named_parameters():
    exempt = any(nd in name for nd in no_decay)
    (undecayed if exempt else decayed).append(param)

  opt_params = [
      {"params": decayed, "weight_decay": Config.weight_decay},
      {"params": undecayed, "weight_decay": 0.0},
  ]

  return transformers.AdamW(
      opt_params,
      lr=Config.learning_rate,
      eps=Config.epsilon,
      correct_bias=Config.correct_bias,
  )

Training

In [None]:
class Metric():
  """Running statistics of a scalar (the training loss) plus a TensorBoard writer.

  Attributes:
    loss:  value passed to the most recent `step`.
    sum / count / ave: accumulated over the current window (cleared by `reset`).
    min / max: extremes over every value seen so far (not cleared by `reset`).
    index: total number of `step` calls; used as the x-axis value in `log`.
  """
  def __init__(self,name):
    self._init_stats()
    self.writer = SummaryWriter(name)
    self.name = name

  def _init_stats(self):
    """Set all statistics to their initial state."""
    # min/max start at +/-infinity so the first recorded value always becomes the
    # extreme. The previous start values (min=1, max=0) reported a minimum of 1
    # whenever every loss was larger than 1.
    self.min = float("inf")
    self.max = float("-inf")
    self.loss = 0
    self.sum = 0
    self.count = 0
    self.ave = 0
    self.index = 0

  def step(self,loss):
    """Record one value and update the window average and the global extremes."""
    self.sum += loss
    self.count += 1
    self.index += 1
    self.ave = self.sum/self.count
    self.max = max(self.max,loss)
    self.min = min(self.min,loss)
    self.loss = loss

  def reset(self):
    """Start a new averaging window; `min`, `max` and `index` are kept."""
    self.loss = 0
    self.sum = 0
    self.count = 0
    self.ave = 0

  def log(self,tag):
    """Write the current window average to TensorBoard under `tag`."""
    self.writer.add_scalar(tag,self.ave,self.index)


In [None]:
class QA_Trainer():
  """Training loop with gradient accumulation, per-epoch validation and early stopping."""
  def __init__(self,model,optimizer,scler,train_loader,df_valid,valid_loader,valid_examples):
    self.model = model
    self.optimizer = optimizer
    self.scler = scler
    self.train_loader = train_loader
    self.df_valid = df_valid
    self.valid_loader = valid_loader
    self.valid_examples = valid_examples

  @staticmethod
  def _is_update_step(step,acc_steps,n_batches):
    """Tell whether the optimizer should step after the 0-based batch `step`.

    An update happens after every `acc_steps` batches and after the last batch
    of the epoch, so a trailing partial accumulation window is not lost.
    """
    return (step+1)%acc_steps == 0 or step == n_batches - 1

  def fit(self):
    """Train for up to Config.n_epochs epochs and return the model.

    After every epoch `validation` is called; when its score does not exceed
    Config.best_acc the patience counter grows, otherwise Config.best_acc is
    updated. Training stops early once the counter reaches Config.patience.
    """
    epoch = 0
    patience = 0
    metric = Metric(Config.train_loss_path)
    n_batches = len(self.train_loader)

    while epoch < Config.n_epochs:
      self.model.train()
      for step,(item,label) in enumerate(self.train_loader):
        input = {key:item[key].to(Config.device) for key in item.keys() if key in ["input_ids","attention_mask"]}
        start_ans,end_ans = label["answer_start"].to(Config.device),label["answer_end"].to(Config.device)

        start_pred,end_pred = self.model(input)
        loss = qa_loss(start_pred,end_pred,start_ans,end_ans)
        # Scale so that the accumulated gradient is the mean over the window.
        loss = loss/Config.acc_gradient_steps

        if Config.fp16:
          with amp.scale_loss(loss,self.optimizer) as scaled_loss:
            scaled_loss.backward()
        else:
          loss.backward()

        metric.step(loss.item())

        # The previous test `step % acc == 0` fired on the very first batch, so
        # the first update used one batch only and every later window was
        # shifted by one.
        if self._is_update_step(step,Config.acc_gradient_steps,n_batches):
          self.optimizer.step()
          self.scler.step()
          self.optimizer.zero_grad()

        if step % Config.log_steps == 0:
          print("Epoch: {}\t||\tStep: {}/{}\t||\tAverage loss: {}\t||Max loss: {}\t||\t Min loss: {}".format(epoch,step,len(self.train_loader),metric.ave,metric.max,metric.min))
          metric.reset()

        # Release per-step tensors before the next batch.
        del item,label,loss,start_pred,end_pred,input,start_ans,end_ans
        torch.cuda.empty_cache()

      acc = validation(self.model,self.df_valid,self.valid_loader,self.valid_examples)
      if acc <= Config.best_acc:
        patience += 1
      else:
        Config.best_acc = acc
        patience = 0
      if patience == Config.patience:
        print("Early stopping has reached!")
        break

      epoch += 1

    return self.model

  def evaluate(self):
      """Return the average validation loss computed by QA_Tester.evaluate."""
      return QA_Tester.evaluate(self.model,self.valid_loader)





In [None]:
class QA_Trainer_HT():
  """Trainer variant for hyper-parameter search.

  Differences from QA_Trainer that are visible here: the epoch count comes from
  `params["n_epochs"]`, there is no gradient accumulation, logging or early
  stopping, and `fit` returns the last validation score instead of the model.
  """
  def __init__(self,model,optimizer,scler,train_loader,df_valid,valid_loader,valid_examples,params):
    self.model = model
    self.optimizer = optimizer
    self.scler = scler
    self.train_loader = train_loader
    self.df_valid = df_valid
    self.valid_loader = valid_loader
    self.valid_examples = valid_examples
    self.params = params

  def fit(self):
    """Train for `params["n_epochs"]` epochs, validating after each one.

    Returns the score of the last validation, or None when no epoch ran
    (previously that case failed because `acc` was never assigned). The model,
    optimizer and scheduler references are dropped via `exit()` before returning.
    """
    epoch = 0
    acc = None
    metric = Metric(Config.train_loss_path)

    while epoch < self.params["n_epochs"]:
      self.model.train()
      for step,(item,label) in enumerate(self.train_loader):
        input = {key:item[key].to(Config.device) for key in item.keys() if key in ["input_ids","attention_mask"]}
        start_ans,end_ans = label["answer_start"].to(Config.device),label["answer_end"].to(Config.device)

        start_pred,end_pred = self.model(input)
        loss = qa_loss(start_pred,end_pred,start_ans,end_ans)

        if Config.fp16:
          with amp.scale_loss(loss,self.optimizer) as scaled_loss:
            scaled_loss.backward()
        else:
          loss.backward()

        metric.step(loss.item())

        self.optimizer.step()
        self.scler.step()
        self.optimizer.zero_grad()

        # Release per-step tensors before the next batch.
        del item,label,loss,start_pred,end_pred,input,start_ans,end_ans
        torch.cuda.empty_cache()

      acc = validation(self.model,self.df_valid,self.valid_loader,self.valid_examples)

      epoch += 1

    self.exit()
    return acc

  def evaluate(self):
      """Return the average validation loss computed by QA_Tester.evaluate."""
      return QA_Tester.evaluate(self.model,self.valid_loader)

  def exit(self):
    """Drop the references to optimizer, scheduler and model, then empty the CUDA cache.

    Each attribute is removed independently, so a missing one no longer stops
    the remaining ones from being released; calling this twice is harmless.
    """
    for name in ("optimizer","scler","model"):
      self.__dict__.pop(name,None)
    torch.cuda.empty_cache()




Testing

In [None]:

class QA_Tester():
  """Namespace for evaluation helpers.

  The methods take no `self`; they are declared as static methods so that they
  work both as `QA_Tester.evaluate(...)` and when called on an instance.
  """
  @staticmethod
  def evaluate(model,valid_loader):
    """Return the mean `qa_loss` of `model` over `valid_loader`.

    `valid_loader` has to yield `(item, label)` pairs. The model is switched to
    eval mode for the pass and is always left in training mode afterwards.
    """
    model.eval()
    losses = []

    with torch.no_grad():
      for item,label in valid_loader:
        input = {key:item[key].to(Config.device) for key in item.keys() if key in ["input_ids","attention_mask"]}
        start_ans,end_ans = label["answer_start"].to(Config.device),label["answer_end"].to(Config.device)

        start_pred,end_pred = model(input)
        loss = qa_loss(start_pred,end_pred,start_ans,end_ans)
        losses.append(loss.item())

        del input,item,label,start_ans,end_ans,start_pred,end_pred,loss
        torch.cuda.empty_cache()

    ave = np.mean(losses)
    print("Validation Loss -------------->: {}".format(ave))
    model.train()
    return ave

  @staticmethod
  def predict(model,test_loader):
    """Placeholder; not implemented (returns None)."""
    pass


# Hyperparameter Tuning Class

In [None]:
#!pip install optuna

# Validation

In [None]:
# Execute the external validation.py script stored on Drive.
# NOTE(review): `computeJaccard`, used by the functions below, is not defined in
# this notebook and presumably comes from that script -- confirm. `execfile` is not
# a Python 3 builtin, so the runtime has to provide it.
execfile("/content/drive/MyDrive/Kaggle/Hindi/models/validation.py")

def validation(model,df_valid,valid_loader,valid_examples,if_save=True):
  """Score `model` on the validation data and optionally store a checkpoint.

  Start/end logits are collected for every batch of `valid_loader` (which yields
  `(item, label)` pairs) and passed with `df_valid` and `valid_examples` to
  `computeJaccard`. When the score beats Config.best_acc and `if_save` is true,
  the model's state dict is saved to Config.checkpoint with the current fold
  number substituted. Config.best_acc itself is not modified here.

  The model is in eval mode while predicting and is put back into training mode
  afterwards. Returns the validation score.
  """
  def getPredictions():
    start_logits = []
    end_logits = []
    for item,label in valid_loader:
        with torch.no_grad():
            input = {key:item[key].to(Config.device) for key in item.keys() if key in ["input_ids","attention_mask"]}

            outputs_start, outputs_end = model(input)
            start_logits.append(outputs_start.cpu().numpy().tolist())
            end_logits.append(outputs_end.cpu().numpy().tolist())
            del outputs_start, outputs_end
    return np.vstack(start_logits), np.vstack(end_logits)

  # Previously model.train() was called before the predictions were computed, so
  # inference ran in training mode. The switch back now happens afterwards.
  model.eval()
  try:
    start_logits, end_logits = getPredictions()
  finally:
    model.train()

  acc,df_res = computeJaccard(df_valid,valid_examples,start_logits, end_logits)
  print("Fold: {}\t||\tValidation Jaccard Score: -------------->: {}".format(Config.fold,acc))

  if acc > Config.best_acc and if_save:
    print("best model has been stored!")
    checkpoint_path = re.sub(r"fold_[0-9]","fold_"+str(Config.fold),Config.checkpoint)
    # Make sure the target directory exists before saving.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
      os.makedirs(checkpoint_dir,exist_ok=True)
    torch.save(model.state_dict(),checkpoint_path)
  return acc

def prediction(model,df_valid,valid_loader,valid_examples):
  """Predict answers for unlabelled data, print the result and return it.

  `valid_loader` has to yield plain item dicts (no labels). A placeholder
  "answer_text" column is written into `df_valid` (the caller's frame is
  modified) before it is handed to `computeJaccard`; only the second value
  returned by `computeJaccard` is used.

  The model is switched to eval mode while predicting and its previous mode is
  restored afterwards.
  """
  df_valid["answer_text"] = ['text']*df_valid.shape[0]

  def getPredictions():
    start_logits = []
    end_logits = []
    for item in valid_loader:
        with torch.no_grad():
            input = {key:item[key].to(Config.device) for key in item.keys() if key in ["input_ids","attention_mask"]}
            outputs_start, outputs_end = model(input)
            start_logits.append(outputs_start.cpu().numpy().tolist())
            end_logits.append(outputs_end.cpu().numpy().tolist())
            del outputs_start, outputs_end
    return np.vstack(start_logits), np.vstack(end_logits)

  # Inference must not run in training mode; remember and restore the mode so the
  # caller's model state is unchanged.
  was_training = model.training
  model.eval()
  try:
    start_logits, end_logits = getPredictions()
  finally:
    model.train(was_training)

  acc,df_res = computeJaccard(df_valid,valid_examples,start_logits, end_logits)
  print(df_res)
  return df_res



# Run

In [None]:
def checkpoint_call(path=None,is_checkpoint=False,cuda=True):
  """Create a MODEL, optionally loading weights from the checkpoint at `path`.

  When `is_checkpoint` is true the state dict stored at `path` is reconciled with
  the model before loading: parameters missing from the checkpoint keep the
  freshly created model's values and checkpoint entries the model does not know
  are dropped. `path` is ignored when `is_checkpoint` is false.

  Returns the model on Config.device when `cuda` is True, otherwise on the CPU.
  """
  model = MODEL()

  if is_checkpoint:
    # Load onto the CPU first so that a checkpoint saved from a GPU can also be
    # read on a CPU-only runtime; the model is moved to its device below.
    state_dict = torch.load(path,map_location="cpu")

    # Fill in parameters the checkpoint does not contain.
    for n,p in model.named_parameters():
      if n not in state_dict.keys():
        state_dict[n] = p

    # Drop entries the model does not have (key set computed once, not per entry).
    model_keys = model.state_dict().keys()
    drop_list = [n for n in state_dict.keys() if n not in model_keys]
    for i in drop_list:
      state_dict.pop(i)

    model.load_state_dict(state_dict)

  return model.to(Config.device) if cuda == True else model.cpu()

In [None]:
def model_training(model,train_loader,df_valid,valid_loader,valid_examples):
  """Seed, build optimizer and scheduler, train with QA_Trainer and return the model.

  The scheduler length is the number of optimizer updates: batches per epoch
  divided by Config.acc_gradient_steps (rounded up), times Config.n_epochs.
  Config.warm_frac of those steps are warm-up steps. With Config.fp16 the model
  and optimizer are wrapped by apex amp before the scheduler is created.
  """
  fix_all_seeds(Config.seed)
  optimizer = qa_optimizer(model)
  # Ceiling division keeps the step count an integer; the former true division
  # handed a fractional number of steps to the scheduler.
  steps_per_epoch = -(-len(train_loader)//Config.acc_gradient_steps)
  num_training_steps = steps_per_epoch*Config.n_epochs
  warm_steps = int(num_training_steps*Config.warm_frac)
  if Config.fp16:
    model,optimizer = amp.initialize(model,optimizer,opt_level=Config.fp16_opt_level)
  scler = qa_scheduler(optimizer,warm_steps,num_training_steps)
  trainer = QA_Trainer(model,optimizer,scler,train_loader,df_valid,valid_loader,valid_examples)
  # The trained model used to be discarded; return it to the caller.
  return trainer.fit()

In [None]:
def model_training_fold(model,train_loader,df_valid,valid_loader,valid_examples):
  """Train `model` on one fold with QA_Trainer and return the trained model.

  Seeds all generators with Config.seed, builds the optimizer and a scheduler
  whose length equals the number of optimizer updates (batches per epoch divided
  by Config.acc_gradient_steps, rounded up, times Config.n_epochs) with
  Config.warm_frac of them as warm-up, and enables apex amp when Config.fp16.
  """
  fix_all_seeds(Config.seed)
  optimizer = qa_optimizer(model)
  # Ceiling division keeps the step count an integer; the former true division
  # handed a fractional number of steps to the scheduler.
  steps_per_epoch = -(-len(train_loader)//Config.acc_gradient_steps)
  num_training_steps = steps_per_epoch*Config.n_epochs
  warm_steps = int(num_training_steps*Config.warm_frac)
  if Config.fp16:
    model,optimizer = amp.initialize(model,optimizer,opt_level=Config.fp16_opt_level)
  scler = qa_scheduler(optimizer,warm_steps,num_training_steps)
  trainer = QA_Trainer(model,optimizer,scler,train_loader,df_valid,valid_loader,valid_examples)
  # The trained model used to be discarded; return it to the caller.
  return trainer.fit()

In [None]:
def model_training_fold_HT(model,train_loader,df_valid,valid_loader,valid_examples,params):
  """Run one hyper-parameter trial with QA_Trainer_HT and return its score.

  `params` has to provide "seed" and "n_epochs". The scheduler spans
  `len(train_loader) * params["n_epochs"]` steps (the trainer updates once per
  batch), with Config.warm_frac of them as warm-up. Local references to the
  model, optimizer, scheduler and trainer are dropped before returning.
  """
  fix_all_seeds(params["seed"])
  optimizer = qa_optimizer(model)
  # The trainer loops for params["n_epochs"] epochs, so the schedule has to cover
  # that many epochs too (it used Config.n_epochs before, which could disagree).
  num_training_steps = len(train_loader)*params["n_epochs"]
  warm_steps = int(num_training_steps*Config.warm_frac)
  if Config.fp16:
    model,optimizer = amp.initialize(model,optimizer,opt_level=Config.fp16_opt_level)
  scler = qa_scheduler(optimizer,warm_steps,num_training_steps)
  trainer = QA_Trainer_HT(model,optimizer,scler,train_loader,df_valid,valid_loader,valid_examples,params)
  acc = trainer.fit()

  del model,scler,optimizer,trainer
  torch.cuda.empty_cache()
  return acc

In [None]:
from sklearn.model_selection import StratifiedKFold
def make_fold():
  """Build the fold-annotated training table and write it to "input.csv".

  The cleaned competition data is split into 5 folds stratified on its
  `language` column (shuffled, seeded with Config.seed); each row's `kfold` is
  the fold in which it is held out. The three external corpora are appended
  with kfold = -1, so they never match a fold index in range(5). A fresh
  1-based `id` column is assigned over the combined table.

  Returns:
    The combined DataFrame (the same data that is written to "input.csv").
  """
  competition_df = pd.read_csv('/content/drive/MyDrive/Kaggle/Hindi/input/cleaned_train.csv')
  mlqa_df = pd.read_csv('/content/drive/MyDrive/Kaggle/Hindi/input/mlqa_hindi.csv')
  xquad_df = pd.read_csv('/content/drive/MyDrive/Kaggle/Hindi/input/xquad.csv')
  tamil_df = pd.read_csv('/content/drive/MyDrive/Kaggle/Hindi/input/squad_translated_tamil.csv')

  # The translated Tamil file gets its language label set here.
  tamil_df["language"] = "tamil"
  external_df = pd.concat([mlqa_df, xquad_df, tamil_df], ignore_index=True)

  # Stratified fold assignment on the competition rows only.
  competition_df["kfold"] = -1
  splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=Config.seed)
  fold_splits = splitter.split(X=competition_df, y=competition_df['language'])
  for fold_id, (_, holdout_rows) in enumerate(fold_splits):
    competition_df.loc[holdout_rows, 'kfold'] = fold_id

  external_df["kfold"] = -1
  combined = pd.concat([competition_df, external_df], ignore_index=True)
  combined['id'] = list(np.arange(1, len(combined)+1))
  combined.to_csv("input.csv")
  return combined

In [None]:
def run(fold):
  """Train on every row outside `fold` and validate on the rows inside it.

  Reads "input.csv" (the file make_fold() writes), splits it on the `kfold`
  column, builds loaders and validation examples, loads the starting model via
  checkpoint_call(..., is_checkpoint=False) and hands everything to
  model_training_fold.

  Side effects: sets Config.fold, prints a progress line, and calls
  torch.cuda.empty_cache() before model loading and after training.

  Args:
    fold: value of `kfold` to hold out for validation.

  Returns:
    None.
  """
  Config.fold = fold

  folds_df = pd.read_csv("input.csv")
  valid_df = folds_df.loc[folds_df["kfold"] == fold, :]
  train_df = folds_df.loc[folds_df["kfold"] != fold, :]
  del folds_df

  train_df = text_preprocess(train_df)
  valid_df = text_preprocess(valid_df)

  print("Making data...")
  train_loader, valid_loader = make_loader(train_df, valid_df)
  valid_examples = make_examples(valid_df)

  torch.cuda.empty_cache()

  model = checkpoint_call(
      "/content/drive/MyDrive/Kaggle/Hindi/checkpoints/simple_adding_model/simple_adding_model.bin",
      is_checkpoint=False)
  model_training_fold(model, train_loader, valid_df, valid_loader, valid_examples)

  # Release everything this fold created before flushing the CUDA cache.
  del model, train_loader, valid_loader, train_df, valid_df, valid_examples
  torch.cuda.empty_cache()


# Hyperparameter Tuning


In [None]:
# train = pd.read_csv(base+"input/train.csv").sample(frac=0.1,random_state=3407)
# searcher = params_optim(train,100)

In [None]:
# %pdb off
# best_params = searcher.search()

# 5 Fold

In [None]:
# train = make_fold()

In [None]:
%pdb off
# Train one model per fold (0-4). Config.best_loss / Config.best_acc are reset
# before every fold; presumably the trainer updates them to track the best
# epoch of the current fold - TODO confirm against QA_Trainer.
# NOTE(review): run() reads "input.csv", which only make_fold() writes, and the
# cell that calls make_fold() is commented out - the file must already exist.
for fold in range(0,5):
  Config.best_loss = 1
  Config.best_acc = 0
  run(fold)

# Pure Validation

In [None]:
# %pdb off
# execfile("/content/drive/MyDrive/Kaggle/Hindi/models/validation.py")

# Evaluate each saved fold checkpoint (/content/output/fold_0.bin ... fold_4.bin)
# on the whole of train.csv.
# NOTE(review): the CSV is re-read and re-preprocessed on every iteration although
# it does not depend on `i`.
for i in range(0,5):
  train = pd.read_csv(base+"/input/train.csv")
  #valid = train[train["language"]=="hindi"]
  valid = text_preprocess(train)
  # NOTE(review): run() calls make_loader(train, valid) and unpacks two loaders;
  # confirm that make_loader also supports this single-argument form.
  valid_loader = make_loader(valid)
  valid_examples = make_examples(valid)

  model = checkpoint_call("/content/output/fold_{}.bin".format(i),is_checkpoint=True)
  # `acc` is overwritten on every iteration and is not printed by this cell.
  acc = validation(model,valid,valid_loader,valid_examples,if_save=False)

  # Drop the model reference before flushing the CUDA cache.
  del model
  torch.cuda.empty_cache()

In [None]:
# Disabled duplicate check on the final corpus (kept for reference).
# train = pd.read_csv(base+"/input/final_corpus.csv")
# train[["context","question","answer_text"]].duplicated().sum()

# Local Test

In [None]:
!kaggle datasets download -d charonwangg/deep-wiki-v2
!kaggle datasets download -d charonwangg/overfitting-5-fold-128
!kaggle datasets download -d kishalmandal/5foldsroberta


In [None]:
from tqdm import tqdm
def local_test(model,df_valid,valid_loader,valid_examples):
  """Run `model` over `valid_loader` in eval mode and return the stacked logits.

  Each element of `valid_loader` is unpacked as (item, label); only the
  "input_ids" and "attention_mask" entries of `item` are moved to
  Config.device and passed to the model as a single dict. The model is
  expected to return a (start_logits, end_logits) pair per batch.

  Args:
    model: callable module exposing .eval(); called as model(inputs_dict).
    df_valid: currently unused (only referenced by the disabled Jaccard code).
    valid_loader: iterable of (item, label) batches.
    valid_examples: currently unused (only referenced by the disabled Jaccard code).

  Returns:
    (start_logits_, end_logits_): two numpy arrays, the per-batch outputs
    stacked along the first axis with np.vstack.

  Raises:
    ValueError: from np.vstack when `valid_loader` yields no batches.
  """
  start_batches = []
  end_batches = []

  model.eval()
  with torch.no_grad():
    for item, _label in valid_loader:
      # Renamed from `input` so the builtin is no longer shadowed.
      model_inputs = {key: item[key].to(Config.device)
                      for key in item.keys()
                      if key in ("input_ids", "attention_mask")}

      outputs_start, outputs_end = model(model_inputs)
      # .tolist() is kept so the stacked arrays are built from Python floats,
      # exactly as the original helper did.
      start_batches.append(outputs_start.cpu().numpy().tolist())
      end_batches.append(outputs_end.cpu().numpy().tolist())
      del outputs_start, outputs_end

  # This stacking step was previously unreachable (it sat after the nested
  # helper's `return`), which made the final return raise NameError.
  start_logits_ = np.vstack(start_batches)
  end_logits_ = np.vstack(end_batches)

  #acc,df_res = computeJaccard(df_valid,valid_examples,start_logits, end_logits)
  #print("Fold: {}\t||\tValidation Jaccard Score: -------------->: {}".format(Config.fold,acc))

  return start_logits_, end_logits_


In [None]:
# Build the local-test inputs from the full train.csv: preprocessed frame,
# loader and examples. The language filter below is disabled, so every row is used.
train = pd.read_csv(base+"/input/train.csv")
#valid = train[train["language"]=="hindi"]
valid = text_preprocess(train)
# NOTE(review): run() calls make_loader(train, valid) and unpacks two loaders;
# confirm that make_loader also supports this single-argument form.
valid_loader = make_loader(valid)
valid_examples = make_examples(valid)

In [None]:
# Single-model local test: load the simple_adding_model checkpoint and collect
# its start/end logits over `valid_loader`.
# NOTE(review): `paths` and `weight` are assigned here but not used by this cell.
paths = []

# exp() of 16 scores; presumably per-model validation scores used as ensemble
# weights - TODO confirm.
weight = np.exp([0.7189407410715103,0.7009346291550066,0.7423611924357781,0.7452209457707667,0.7328501876639222\
          ,0.7409769800815581,0.7214328330609091,0.7293411617805697,0.8074233752981514,0.7191098087529867,\
          0.7982931852209235,0.7737983627682914,0.7776064350570635,0.7843168579166172,0.7358640175404093,\
                 0.7166820931307615])


model = checkpoint_call("/content/drive/MyDrive/Kaggle/Hindi/checkpoints/simple_adding_model/simple_adding_model.bin",
                              is_checkpoint=True)
start_logits_, end_logits_ = local_test(model,valid,valid_loader,valid_examples)

In [None]:
# Ensemble local test: rebuild the validation inputs, load nine checkpoints and
# pass them together with `weight` to local_test.
train = pd.read_csv(base+"/input/train.csv")
#valid = train[train["language"]=="hindi"]
valid = text_preprocess(train)
valid_loader = make_loader(valid)
valid_examples = make_examples(valid)

# exp() of 16 scores; presumably per-model ensemble weights - TODO confirm.
# NOTE(review): 16 weights are defined but only 9 models are loaded below.
weight = np.exp([0.7189407410715103,0.7009346291550066,0.7423611924357781,0.7452209457707667,0.7328501876639222\
          ,0.7409769800815581,0.7214328330609091,0.7293411617805697,0.8074233752981514,0.7191098087529867,\
          0.7982931852209235,0.7737983627682914,0.7776064350570635,0.7843168579166172,0.7358640175404093,\
                 0.7166820931307615])

# NOTE(review): every "/content/output/fold_{}.bin" below is passed without
# .format(...), so all eight calls receive the same literal path containing
# "{}". Fill in the intended fold indices before running.
models = []
models.append(checkpoint_call("/content/output/fold_{}.bin",
                              is_checkpoint=True))
models.append(checkpoint_call("/content/output/fold_{}.bin",
                              is_checkpoint=True))
models.append(checkpoint_call("/content/output/fold_{}.bin",
                              is_checkpoint=True))
models.append(checkpoint_call("/content/output/fold_{}.bin",
                              is_checkpoint=True))
models.append(checkpoint_call("/content/output/fold_{}.bin",
                              is_checkpoint=True))
models.append(checkpoint_call("/content/drive/MyDrive/Kaggle/Hindi/checkpoints/simple_adding_model/simple_adding_model.bin",
                              is_checkpoint=True))
models.append(checkpoint_call("/content/output/fold_{}.bin",
                              is_checkpoint=True))
models.append(checkpoint_call("/content/output/fold_{}.bin",
                              is_checkpoint=True))
models.append(checkpoint_call("/content/output/fold_{}.bin",
                              is_checkpoint=True))
# NOTE(review): local_test as defined earlier in this notebook takes four
# parameters (model, df_valid, valid_loader, valid_examples); this five-argument
# call needs an ensemble-aware variant that is not defined in this section.
acc = local_test(models,weight,valid,valid_loader,valid_examples)

torch.cuda.empty_cache()

# Non-necessary

In [None]:
# %pdb off
# #train_data,valid_data = train_test_split(train,train_size=0.9,stratify=train["language"])
# train_loader,valid_loader = make_loader(train,valid)
# del train,valid


In [None]:
# try:
#   del model
# except:
#   pass

# try:
#   del models
# except:
#   pass

# torch.cuda.empty_cache()

# model = checkpoint_call("/content/drive/MyDrive/Kaggle/Hindi/checkpoints/simple_adding_model/simple_adding_model.bin",is_checkpoint=True)
# model_training(model,train_loader,valid_loader)


In [None]:
# try:
#   del model
# except:
#   pass

# try:
#   del models
# except:
#   pass

# torch.cuda.empty_cache()
# models = xxl_checkpoint_call("/content/output/checkpoint-fold-0/pytorch_model.bin")


In [None]:
#model_training(models,train_loader,valid_loader)

In [None]:
# QA_Tester.evaluate(model,valid_loader)

In [None]:
# QA_Tester.evaluate(model,valid_loader)

In [None]:
# torch.save(model.state_dict(),"drive/MyDrive/checkpoint1.bin")


In [None]:
from google.colab import files

# Ask for a file upload through google.colab's `files` helper and report the
# name and byte length of every uploaded file (same step as the upload cell at
# the top of the notebook; the shell line further down expects kaggle.json).
uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))


# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets create -p "/content/output"

Starting upload for file fold_1.bin
100% 2.09G/2.09G [00:19<00:00, 117MB/s]
Upload successful: fold_1.bin (2GB)
Starting upload for file fold_0.bin
100% 2.09G/2.09G [00:19<00:00, 116MB/s]
Upload successful: fold_0.bin (2GB)
Starting upload for file fold_2.bin
100% 2.09G/2.09G [00:19<00:00, 115MB/s]
Upload successful: fold_2.bin (2GB)
Starting upload for file fold_3.bin
100% 2.09G/2.09G [00:19<00:00, 114MB/s]
Upload successful: fold_3.bin (2GB)
Starting upload for file fold_4.bin
100% 2.09G/2.09G [00:19<00:00, 117MB/s]
Upload successful: fold_4.bin (2GB)
Your private Dataset is being created. Please check progress at https://www.kaggle.com/charonwangg/Deep-Wiki-v2
