# Making the 0.792 model : 

### Making the model and Inferencing steps:

<ul>  
    <li>Using the datasets : chaii, mlqa, squad, tamil_xquad </li>
    <li>Training for 2 epochs : <a href="https://www.kaggle.com/kishalmandal/chaii-fit-2-epochs-mlqa-xquad-chaii/">chaii | FIT - 2 epochs | mlqa, xquad, chaii</a> </li>
    <li>Training for 7 epochs with tamil_xquad: <a href="https://www.kaggle.com/kishalmandal/chaii-fit-7-epochs-extra-tamil-data/">chaii | FIT - 7 epochs | Extra Tamil Data</a> </li>
    <li>Inferencing from 5 folds | fold-0 and fold-1 (7-epochs) | fold-2, fold-3 and fold-4 (2-epochs)| based on cross validation scores and a little bit of experimentation 😜: <a href="https://www.kaggle.com/kishalmandal/5-epochs-infer-combined-model-0-792/">5 epochs | INFER | combined model (0.792)</a></li>
    
    
</ul>

### References : 

<ul> 
    <li>For training: <a href="https://www.kaggle.com/rhtsingh/chaii-qa-5-fold-xlmroberta-torch-fit">chaii QA - 5 Fold XLMRoberta Torch | FIT</a> by <a href="https://www.kaggle.com/rhtsingh">torch</a></li>
    <li>For Inference: <a href="https://www.kaggle.com/rhtsingh/chaii-qa-5-fold-xlmroberta-torch-infer">chaii QA - 5 Fold XLMRoberta Torch | Infer</a> by <a href="https://www.kaggle.com/rhtsingh">torch</a></li>
    <li>For post processing outputs: <a href="https://www.kaggle.com/nbroad/chaii-qa-torch-5-fold-with-post-processing-765">chaii QA-Torch 5 fold with post-processing (.765)</a> by <a href="https://www.kaggle.com/nbroad">Nicholas Broad 🟢</a></li>
</ul>


### Important Notes:

<ul>
    <li>Someone experimented with max_seq_length and doc_stride and brought my previous 0.785 up to 0.792. I then ran some more experiments and saw that going above 400 decreases the score (~0.3-0.4), while going below 400, down to around 384, does not change the score by much (~0.2).</li>
    <li>Vigorous changes to the learning rate and decay rates reduce performance.</li>
    <li>Taking a weighted average based on the validation loss increased the score by ~0.01.</li>
</ul>

In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import gc
gc.enable()
import math
import json
import time
import random
import multiprocessing
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from sklearn import model_selection
from string import punctuation

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
import torch.optim as optim
from torch.utils.data import (
    Dataset, DataLoader,
    SequentialSampler, RandomSampler
)
from torch.utils.data.distributed import DistributedSampler

try:
    from apex import amp
    APEX_INSTALLED = True
except ImportError:
    APEX_INSTALLED = False

import transformers
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    logging,
    MODEL_FOR_QUESTION_ANSWERING_MAPPING,
)
logging.set_verbosity_warning()
logging.set_verbosity_error()

def fix_all_seeds(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU, the current CUDA
    device and all CUDA devices) and exports ``PYTHONHASHSEED``.

    Args:
        seed: integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    for seed_torch in (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_torch(seed)

def optimal_num_of_loader_workers():
    """Choose a DataLoader worker count from the visible CPU and GPU counts.

    Returns:
        With at least one GPU, four workers per GPU capped at the CPU count;
        without a GPU, the CPU count minus one.
    """
    cpu_total = multiprocessing.cpu_count()
    gpu_total = torch.cuda.device_count()
    if not gpu_total:
        return cpu_total - 1
    return min(cpu_total, gpu_total * 4)

# Report whether the optional `apex` import succeeded.
print(f"Apex AMP Installed :: {APEX_INSTALLED}")
# Config classes registered in the question-answering mapping, and the
# `model_type` string of each of them.
MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

2021-11-15 13:22:19.478667: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


Apex AMP Installed :: False


In [2]:
class Config:
    """Static settings for the question-answering model and its inference run.

    All values are plain class attributes, so both ``Config`` and
    ``Config()`` expose the same settings.
    """

    # --- model ---
    model_type = 'xlm_roberta'
    # Fold-0 checkpoint directory; it also supplies the config and tokenizer
    # files (earlier versions pointed at "../input/xlm-roberta-large-squad-v2").
    model_name_or_path = "../input/5foldsroberta/output/checkpoint-fold-0"
    config_name = "../input/5foldsroberta/output/checkpoint-fold-0"
    # Mixed precision is switched on only when the apex import succeeded.
    fp16 = bool(APEX_INSTALLED)
    fp16_opt_level = "O1"
    gradient_accumulation_steps = 2

    # --- tokenizer ---
    tokenizer_name = "../input/5foldsroberta/output/checkpoint-fold-0"
    max_seq_length = 400  # tokens per feature window (question + context)
    doc_stride = 135      # passed to the tokenizer as `stride`

    # --- train ---
    epochs = 1
    train_batch_size = 4
    eval_batch_size = 128

    # --- optimizer ---
    optimizer_type = 'AdamW'
    learning_rate = 1e-5
    weight_decay = 1e-2
    epsilon = 1e-8
    max_grad_norm = 1.0

    # --- scheduler ---
    decay_name = 'linear-warmup'
    warmup_ratio = 0.1

    # --- logging ---
    logging_steps = 10

    # --- evaluate ---
    output_dir = 'output'
    seed = 2021

In [3]:
class DatasetRetriever(Dataset):
    """Torch dataset over a list of pre-tokenized feature dictionaries.

    In ``'train'`` mode every returned field is a ``torch.long`` tensor
    (ids, mask, offsets and the start/end answer positions). In any other
    mode only ids and mask are tensors; offsets, sequence ids, example id,
    context and question are passed through unchanged.
    """

    def __init__(self, features, mode='train'):
        super().__init__()
        self.features = features
        self.mode = mode

    def __len__(self):
        return len(self.features)

    def __getitem__(self, item):
        feature = self.features[item]

        def as_long(key):
            # Convert one field of the current feature to an int64 tensor.
            return torch.tensor(feature[key], dtype=torch.long)

        sample = {
            'input_ids': as_long('input_ids'),
            'attention_mask': as_long('attention_mask'),
        }
        if self.mode == 'train':
            sample['offset_mapping'] = as_long('offset_mapping')
            sample['start_position'] = as_long('start_position')
            sample['end_position'] = as_long('end_position')
            return sample

        # Inference: keep the bookkeeping fields as plain Python objects.
        sample['offset_mapping'] = feature['offset_mapping']
        sample['sequence_ids'] = feature['sequence_ids']
        sample['id'] = feature['example_id']
        sample['context'] = feature['context']
        sample['question'] = feature['question']
        return sample

In [4]:
class Model(nn.Module):
    """Transformer backbone with a linear start/end span-prediction head.

    Args:
        modelname_or_path: checkpoint name or directory handed to
            ``AutoModel.from_pretrained``.
        config: model config; must provide ``hidden_size``,
            ``hidden_dropout_prob`` and ``initializer_range``.
    """

    def __init__(self, modelname_or_path, config):
        super().__init__()
        self.config = config
        self.xlm_roberta = AutoModel.from_pretrained(modelname_or_path, config=config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)
        # Registered but not applied in `forward`.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self._init_weights(self.qa_outputs)

    def _init_weights(self, module):
        """Initialise a linear layer: normal weights, zero bias."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(
        self,
        input_ids,
        attention_mask=None,
    ):
        """Return ``(start_logits, end_logits)``, one score per input token.

        Only the first backbone output (the per-token hidden states) is
        read, so a backbone that returns no pooled output works as well.
        """
        outputs = self.xlm_roberta(
            input_ids,
            attention_mask=attention_mask,
        )
        sequence_output = outputs[0]

        # Project every token to two scores, then separate the two channels.
        qa_logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = qa_logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [5]:
def make_model(args):
    """Build the config, tokenizer and QA model described by ``args``.

    Args:
        args: object exposing ``config_name``, ``tokenizer_name`` and
            ``model_name_or_path``.

    Returns:
        Tuple ``(config, tokenizer, model)``.
    """
    model_config = AutoConfig.from_pretrained(args.config_name)
    qa_tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
    qa_model = Model(args.model_name_or_path, config=model_config)
    return model_config, qa_tokenizer, qa_model

In [6]:
def prepare_test_features(args, example, tokenizer):
    """Tokenize one example into overlapping feature windows for inference.

    Args:
        args: object exposing ``max_seq_length`` and ``doc_stride``.
        example: mapping with ``id``, ``question`` and ``context`` entries.
            It is only read, never modified.
        tokenizer: callable tokenizer that supports overflowing tokens and
            offset mappings; its result must provide ``sequence_ids(i)``.

    Returns:
        A list with one dict per window, holding ``example_id``,
        ``context``, ``question`` (left-stripped), ``input_ids``,
        ``attention_mask``, ``offset_mapping`` and ``sequence_ids``.
    """
    # Strip into a local so the caller's row is left untouched.
    question = example["question"].lstrip()
    context = example["context"]

    tokenized_example = tokenizer(
        question,
        context,
        truncation="only_second",
        max_length=args.max_seq_length,
        stride=args.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    features = []
    for window, input_ids in enumerate(tokenized_example["input_ids"]):
        features.append({
            "example_id": example['id'],
            'context': context,
            'question': question,
            'input_ids': input_ids,
            'attention_mask': tokenized_example['attention_mask'][window],
            'offset_mapping': tokenized_example['offset_mapping'][window],
            # Positions with no sequence id (None) are mapped to 0.
            'sequence_ids': [
                0 if seq_id is None else seq_id
                for seq_id in tokenized_example.sequence_ids(window)
            ],
        })
    return features

In [7]:
args = Config()

# Competition test set. Swap in train.csv from the same folder to run the
# pipeline on labelled data instead.
test = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/test.csv')

# Collapse every run of whitespace into a single space.
for text_column in ('context', 'question'):
    test[text_column] = test[text_column].apply(lambda x: ' '.join(x.split()))

tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)

# One example can expand into several overlapping feature windows.
test_features = []
for _, row in test.iterrows():
    test_features.extend(prepare_test_features(args, row, tokenizer))

test_dataset = DatasetRetriever(test_features, mode='test')
test_dataloader = DataLoader(
    test_dataset,
    batch_size=args.eval_batch_size,
    # Sequential order keeps batch rows aligned with `test_features`.
    sampler=SequentialSampler(test_dataset),
    num_workers=optimal_num_of_loader_workers(),
    pin_memory=True,
    drop_last=False,
)

In [8]:
# Root folder of the five fold checkpoints.
# NOTE(review): not referenced by the other cells visible here, which spell
# the checkpoint paths out in full.
base_model = '../input/5foldsroberta/output/'

In [9]:
def get_predictions(checkpoint_path):
    """Run one fold checkpoint over ``test_dataloader`` and collect its logits.

    Requires a CUDA device. Reads the module-level ``test_dataloader``.

    Args:
        checkpoint_path: path to a ``state_dict`` file compatible with
            ``Model``.

    Returns:
        Tuple ``(start_logits, end_logits)`` of 2-D NumPy arrays with one
        row per feature, in dataloader order.
    """
    config, tokenizer, model = make_model(Config())
    model.cuda()
    # Load onto the CPU first so the checkpoint tensors do not occupy a
    # second copy of GPU memory; load_state_dict copies them into the model.
    model.load_state_dict(
        torch.load(checkpoint_path, map_location="cpu")
    )
    # Make inference mode explicit instead of relying on how the submodules
    # happened to be constructed.
    model.eval()

    start_logits = []
    end_logits = []
    with torch.no_grad():
        for batch in test_dataloader:
            outputs_start, outputs_end = model(batch['input_ids'].cuda(), batch['attention_mask'].cuda())
            start_logits.append(outputs_start.cpu().numpy().tolist())
            end_logits.append(outputs_end.cpu().numpy().tolist())
            del outputs_start, outputs_end
    # Drop the model before the next fold is loaded.
    del model, tokenizer, config
    gc.collect()
    return np.vstack(start_logits), np.vstack(end_logits)

In [10]:
# One checkpoint per cross-validation fold, in fold order.
fold_checkpoints = [
    '../input/5foldsroberta/output/checkpoint-fold-0/pytorch_model.bin',
    '../input/5foldsroberta/output/checkpoint-fold-1/pytorch_model.bin',
    '../input/5foldsroberta/output/checkpoint-fold-2/pytorch_model.bin',
    '../input/5foldsroberta/output/checkpoint-fold-3/pytorch_model.bin',
    '../input/5foldsroberta/output/checkpoint-fold-4/pytorch_model.bin',
]
fold_logits = [get_predictions(checkpoint) for checkpoint in fold_checkpoints]
(
    (start_logits1, end_logits1),
    (start_logits2, end_logits2),
    (start_logits3, end_logits3),
    (start_logits4, end_logits4),
    (start_logits5, end_logits5),
) = fold_logits

# Unweighted mean over the five folds, accumulated in fold order.
start_sum, end_sum = fold_logits[0]
for fold_start, fold_end in fold_logits[1:]:
    start_sum = start_sum + fold_start
    end_sum = end_sum + fold_end
start_logits = start_sum / 5
end_logits = end_sum / 5

In [11]:
# Softmax over each feature's token axis. After this step `start_logits` and
# `end_logits` hold probabilities (each row sums to 1), not raw logits.
normalizer = torch.nn.Softmax(dim=1)

start_logits, end_logits = (
    normalizer(torch.from_numpy(raw_scores)).numpy()
    for raw_scores in (start_logits, end_logits)
)

In [12]:
import collections

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30, n = 1, extension=20):
    """Turn per-feature start/end scores into the best answer spans per example.

    Args:
        examples: DataFrame with at least ``id`` and ``context`` columns.
            Rows are matched to features by position, so any index works.
        features: list of feature dicts with ``example_id``,
            ``offset_mapping`` and ``sequence_ids``; entry ``k`` corresponds
            to row ``k`` of both score arrays. The dicts are not modified.
        raw_predictions: pair ``(all_start_scores, all_end_scores)``, each
            indexable by feature index and yielding a 1-D score array.
        n_best_size: number of top start and top end positions combined per
            feature.
        max_answer_length: maximum span length in tokens.
        n: number of candidates kept per example.
        extension: number of context characters added on each side of the
            span to form ``required_text``.

    Returns:
        OrderedDict mapping example id to a list of at most ``n`` dicts
        with keys ``score``, ``text`` and ``required_text``, best first.
        An example without any valid span gets a single placeholder
        candidate with empty strings and score 0.0, so every list element
        has the same shape.
    """
    all_start_logits, all_end_logits = raw_predictions

    # Position of every example id, then the feature indices of each example.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    context_index = 1  # sequence id of context tokens (the question is 0)

    # Enumerate by position: `example_id_to_index` holds positions, which
    # differ from the index labels whenever the frame is not 0..n-1 indexed.
    for example_index, (_, example) in enumerate(examples.iterrows()):
        context = example["context"]
        valid_answers = []

        for feature_index in features_per_example[example_index]:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            sequence_ids = features[feature_index]["sequence_ids"]

            # Local copy of the offsets with every non-context token masked out.
            offset_mapping = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(features[feature_index]["offset_mapping"])
            ]

            # Indices of the n_best_size highest scores, best first.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip positions outside the feature or outside the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Skip spans that are reversed or longer than max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]

                    # Surrounding window of context, clamped to the string bounds.
                    re_start_char = max(0, start_char - extension)
                    re_end_char = min(len(context), end_char + extension)

                    valid_answers.append(
                        {
                            "score": (start_logits[start_index] + end_logits[end_index]) / 2,
                            "text": context[start_char: end_char],
                            "required_text": context[re_start_char: re_end_char],
                        }
                    )

        if valid_answers:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:n]
        else:
            # Same dict shape as a real candidate so callers can index it.
            best_answer = [{"score": 0.0, "text": "", "required_text": ""}]

        predictions[example["id"]] = best_answer

    return predictions

In [13]:
# Keep the 5 best candidate spans per question, each with 128 characters of
# surrounding context on either side for the reranker input.
fin_preds = postprocess_qa_predictions(test, test_features, (start_logits, end_logits),
                                       n=5, extension=128)

Post-processing 5 example predictions split into 63 features.


In [14]:
# fin_preds

In [15]:
# Candidate lists only (ids dropped), in the insertion order of `fin_preds`.
z = list(fin_preds.values())

In [16]:
# z[:1]

In [17]:
# Question and context of every test row as arrays, indexed by row position.
questions = test['question'].values
contexts = test['context'].values
# Ground-truth answers are only available when running on train.csv:
# answers = test['answer_text'].values

# Postprocess added here in this version

In [18]:
# Single characters stripped from the front of a prediction.
bad_starts = [".", ",", "(", ")", "-", "–",  ",", ";"]
# Suffixes stripped from the end of a prediction.
bad_endings = ["...", "-", "(", ")", "–", ",", ";"]

# Abbreviations (named AD / BC / km by their variables) that get a trailing
# period back when the context contains them with one.
tamil_ad = "கி.பி"
tamil_bc = "கி.மு"
tamil_km = "கி.மீ"
hindi_ad = "ई"
hindi_bc = "ई.पू"

def postprocess(pred,context):
    """Trim stray punctuation from a predicted answer span.

    Args:
        pred: predicted answer text.
        context: passage the answer was extracted from.

    Returns:
        ``pred`` without leading ``bad_starts`` and trailing ``bad_endings``
        characters. If the result ends in one of the abbreviations above
        and that text followed by ``"."`` occurs in ``context``, the period
        is appended. An empty ``pred`` is returned unchanged.
    """
    if pred == "":
        return pred

    leading = tuple(bad_starts)
    trailing = tuple(bad_endings)

    while pred.startswith(leading):
        pred = pred[1:]
    while pred.endswith(trailing):
        # "..." is the only multi-character ending.
        pred = pred[:-3] if pred.endswith("...") else pred[:-1]

    abbreviations = (tamil_ad, tamil_bc, tamil_km, hindi_ad, hindi_bc)
    if pred.endswith(abbreviations) and pred + "." in context:
        pred = pred + "."

    return pred

In [19]:
# Flatten the per-question candidate lists into parallel columns, one entry
# per (question, candidate) pair. `final_ans` stays empty because the test
# set has no ground-truth answers.
final_ans = []
final_ques = []
final_pre = []
final_req_text = []
pre_score =[]
for candidates, question, context in zip(tqdm(z), questions, contexts):
    for candidate in candidates:
        pred_text = postprocess(candidate['text'], context)
        final_ques.append(question)
        final_pre.append(pred_text)
        final_req_text.append(candidate['required_text'])
        pre_score.append(candidate['score'])

100%|██████████| 5/5 [00:00<00:00, 18347.79it/s]


In [20]:
# Empty frame; its columns are filled in a later cell.
final_df = pd.DataFrame()

In [21]:
test.head()

Unnamed: 0,id,context,question,language
0,22bff3dec,"ज्वाला गुट्टा (जन्म: 7 सितंबर 1983; वर्धा, महा...",ज्वाला गुट्टा की माँ का नाम क्या है,hindi
1,282758170,गूगल मानचित्र (Google Maps) (पूर्व में गूगल लो...,गूगल मैप्स कब लॉन्च किया गया था?,hindi
2,d60987e0e,गुस्ताव रॉबर्ट किरचॉफ़ (१२ मार्च १८२४ - १७ अक्...,गुस्ताव किरचॉफ का जन्म कब हुआ था?,hindi
3,f99c770dc,அலுமினியம் (ஆங்கிலம்: அலுமினியம்; வட அமெரிக்க ...,அலுமினியத்தின் அணு எண் என்ன?,tamil
4,40dec1964,"கூட்டுறவு இயக்க வரலாறு, இங்கிலாந்து நாட்டில் ம...",இந்தியாவில் பசுமை புரட்சியின் தந்தை என்று கருத...,tamil


In [22]:
# Attach the flattened candidate columns in their final column order.
# An `answer_text` column would come first, but only when running on
# labelled data.
for column_name, column_values in (
    ('question', final_ques),
    ('pred_text', final_pre),
    ('require_text', final_req_text),
    ('score', pre_score),
):
    final_df[column_name] = column_values
final_df.head()

Unnamed: 0,question,pred_text,require_text,score
0,ज्वाला गुट्टा की माँ का नाम क्या है,येलन,ाडी हैं। प्रारंभिक जीवन ज्वाला गुट्टा का जन्म ...,0.673885
1,ज्वाला गुट्टा की माँ का नाम क्या है,येलन चीन से हैं। उनकी मां येलन गुट्टा,ाडी हैं। प्रारंभिक जीवन ज्वाला गुट्टा का जन्म ...,0.430525
2,ज्वाला गुट्टा की माँ का नाम क्या है,येलन चीन,ाडी हैं। प्रारंभिक जीवन ज्वाला गुट्टा का जन्म ...,0.402283
3,ज्वाला गुट्टा की माँ का नाम क्या है,येलन चीन से हैं। उनकी मां येलन,ाडी हैं। प्रारंभिक जीवन ज्वाला गुट्टा का जन्म ...,0.355371
4,ज्वाला गुट्टा की माँ का नाम क्या है,येलन चीन से,ाडी हैं। प्रारंभिक जीवन ज्वाला गुट्टा का जन्म ...,0.349024


In [23]:
# Persist the first-stage candidates; the reranking pipeline reloads this file.
final_df.to_csv('1st_model_pred.csv',index=False)

# Reranking pipeline

In [24]:
import pandas as pd
# Read the text columns back as plain strings. By default pandas would turn
# an empty `pred_text` into NaN and parse an all-numeric text column as
# numbers; either case makes the string concatenation in `combine_text`
# raise a TypeError.
data = pd.read_csv(
    './1st_model_pred.csv',
    dtype={'question': str, 'pred_text': str, 'require_text': str},
    keep_default_na=False,
)
# print(len(data))
# data.head()

In [25]:
def combine_text(row):
    """Build the reranker input string for one candidate row.

    Layout: ``<s>question</s></s>require_text`` + three spaces +
    ``pred_text</s>``. The special tokens are written by hand.

    Args:
        row: mapping with string entries ``question``, ``require_text`` and
            ``pred_text``.

    Returns:
        The concatenated string.
    """
    pieces = (
        "<s>", row['question'],
        "</s></s>", row['require_text'],
        "   ", row['pred_text'],
        "</s>",
    )
    return "".join(pieces)

In [26]:
# Row-wise: build the reranker input string for every candidate.
data['excerpt'] = data.apply(combine_text,axis=1)
# data.head()

In [27]:
# data['excerpt'].values[0]

In [28]:
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold

import gc
gc.enable()

In [29]:
# !ls ../input/reranking-qa-model-train

In [30]:
# Settings for the reranking model.
NUM_FOLDS = 5  # number of reranker checkpoints averaged at inference
NUM_EPOCHS = 7  # not referenced by the inference cells visible in this notebook
BATCH_SIZE = 16  # batch size of the reranker DataLoader
MAX_LEN = 128  # padded/truncated token length of each reranker input
# NOTE(review): not referenced by the visible cells; presumably a training leftover.
EVAL_SCHEDULE = [(0.35, 16), (0.30, 8), (0.25, 4), (0.20, 2), (-1., 1)]
BASE_MODEL_PATH = "../input/reranking-qa-model-train/xlm-roberta-large"
ROBERTA_PATH = BASE_MODEL_PATH+"/pytorch_model.bin"  # backbone weights file
TOKENIZER_PATH = BASE_MODEL_PATH  # the tokenizer is loaded from the model folder
CONFIG_PATH = BASE_MODEL_PATH+"/config.json"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # falls back to CPU when no GPU is visible

In [31]:
def set_random_seed(random_seed):
    """Seed Python, NumPy and PyTorch and make cuDNN deterministic.

    Args:
        random_seed: integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(random_seed)
    random.seed(random_seed)
    np.random.seed(random_seed)

    for seed_torch in (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_torch(random_seed)

    torch.backends.cudnn.deterministic = True

In [32]:
# Alias, not a copy: columns added to `train_df` also appear on `data`.
train_df = data

In [33]:
# Rebinds the module-level `tokenizer` (until now the QA checkpoint
# tokenizer) to the reranker tokenizer; `LitDataset` reads this global.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# Dataset

In [34]:
class LitDataset(Dataset):
    """Dataset of reranker inputs tokenized once at construction time.

    Args:
        df: frame with ``excerpt``, ``question``, ``pred_text`` and
            ``require_text`` columns, plus ``target`` unless
            ``inference_only`` is set.
        inference_only: when true, no targets are read or returned.

    Items are ``(input_ids, attention_mask)`` for inference and
    ``(input_ids, attention_mask, target)`` otherwise.
    """

    def __init__(self, df, inference_only=False):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()
        self.question = df.question.tolist()
        self.pred_text = df.pred_text.tolist()
        self.require_text = df.require_text.tolist()

        if not inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)

        # `excerpt` already contains hand-written <s> / </s> markers, so the
        # tokenizer must not add its own special tokens.
        encode_options = dict(
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=False,
        )
        self.encoded = tokenizer.batch_encode_plus(self.text, **encode_options)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])

        if not self.inference_only:
            return (input_ids, attention_mask, self.target[index])
        return (input_ids, attention_mask)

# Rerank model

In [35]:
class LitModel(nn.Module):  
    """Reranker: transformer backbone plus a linear head on the first token.

    `forward` returns one unbounded score per input sequence. The cells that
    consume it store the result as `predicted_jac_score`.
    """

    def __init__(self):
        super().__init__()

        self.config = AutoConfig.from_pretrained(CONFIG_PATH)                     
        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=self.config)  

        # The four submodules below (cnn1, cnn2, layernorm, attention) are
        # registered but never called in `forward`.
        # NOTE(review): presumably they exist in the saved fold checkpoints;
        # removing them would change the state_dict keys, so confirm against
        # the checkpoints before deleting. The 768 input width is hard-coded
        # rather than taken from config.hidden_size.
        self.cnn1 = nn.Conv1d(768, 512, kernel_size=1)
        self.cnn2 = nn.Conv1d(512, MAX_LEN, kernel_size=1)
         
        self.layernorm = nn.LayerNorm(MAX_LEN)
            
        self.attention = nn.Sequential(            
            nn.Linear(MAX_LEN, MAX_LEN),            
            nn.Tanh(),  
            nn.Linear(MAX_LEN, 1),
            nn.Softmax(dim=1)
        )        

        # Regression head: hidden state -> single score, no output activation.
        self.regressor = nn.Sequential(      
            nn.Linear(self.config.hidden_size, 1),      
        )
        

    def forward(self, input_ids, attention_mask):
        """Score each sequence from the hidden state at position 0."""
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)   
        last_hidden_state = roberta_output[0]
        # Hidden state of the first token of every sequence.
        cls_embeddings = last_hidden_state[:, 0]
        logits = self.regressor(cls_embeddings) # regression head
        return logits  # one score per sequence

In [36]:
def eval_mse(model, data_loader):
    """Return the mean squared error of |model| over all samples of |data_loader|.

    The loader must yield ``(input_ids, attention_mask, target)`` batches.
    The summed squared error is divided by ``len(data_loader.dataset)``.
    """
    model.eval()
    squared_error_total = 0
    criterion = nn.MSELoss(reduction="sum")

    with torch.no_grad():
        for input_ids, attention_mask, target in data_loader:
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            target = target.to(DEVICE)

            scores = model(input_ids, attention_mask)
            squared_error_total += criterion(scores.flatten(), target).item()

    return squared_error_total / len(data_loader.dataset)

In [37]:
def predict(model, data_loader):
    """Return an np.array with the predictions of |model| on |data_loader|.

    The loader must yield ``(input_ids, attention_mask)`` batches; the
    result has one entry per dataset sample, in loader order.
    """
    model.eval()

    result = np.zeros(len(data_loader.dataset))
    filled = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            batch_scores = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
            batch_size = batch_scores.shape[0]

            # Write this batch into the next free slice of the output.
            result[filled : filled + batch_size] = batch_scores.flatten().to("cpu")
            filled += batch_size

    return result

# infer

In [38]:
# Tokenizes every candidate for the reranker. Rebinds `test_dataset`, which
# previously held the QA feature dataset.
# NOTE(review): the next cell builds the same dataset again, so this cell
# looks redundant.
test_dataset = LitDataset(train_df, inference_only=True)

In [39]:
# One row of reranker scores per fold checkpoint.
all_predictions = np.zeros((NUM_FOLDS, len(train_df)))

test_dataset = LitDataset(train_df, inference_only=True)
# shuffle=False keeps the scores aligned with the rows of `train_df`.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         drop_last=False, shuffle=False, num_workers=2)

for index in range(NUM_FOLDS):
    model_path = f"../input/reranking-qa-model-train/model_{index + 1}.pth"
    print(f"\nUsing {model_path}")

    model = LitModel()
    # Map the checkpoint onto DEVICE so that loading also works on the CPU
    # fallback, where a GPU-saved checkpoint would otherwise fail to load.
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)

    all_predictions[index] = predict(model, test_loader)

    # Drop the model before the next fold is loaded.
    del model
    gc.collect()


Using ../input/reranking-qa-model-train/model_1.pth

Using ../input/reranking-qa-model-train/model_2.pth

Using ../input/reranking-qa-model-train/model_3.pth

Using ../input/reranking-qa-model-train/model_4.pth

Using ../input/reranking-qa-model-train/model_5.pth


In [40]:
# Average the reranker scores over the folds (axis 0 is the fold axis).
predictions = all_predictions.mean(axis=0)

In [41]:
# Store the fold-averaged reranker score next to each candidate.
train_df['predicted_jac_score']= predictions

In [42]:
train_df.head()

Unnamed: 0,question,pred_text,require_text,score,excerpt,predicted_jac_score
0,ज्वाला गुट्टा की माँ का नाम क्या है,येलन,ाडी हैं। प्रारंभिक जीवन ज्वाला गुट्टा का जन्म ...,0.673885,<s>ज्वाला गुट्टा की माँ का नाम क्या है</s></s>...,0.44103
1,ज्वाला गुट्टा की माँ का नाम क्या है,येलन चीन से हैं। उनकी मां येलन गुट्टा,ाडी हैं। प्रारंभिक जीवन ज्वाला गुट्टा का जन्म ...,0.430525,<s>ज्वाला गुट्टा की माँ का नाम क्या है</s></s>...,0.200791
2,ज्वाला गुट्टा की माँ का नाम क्या है,येलन चीन,ाडी हैं। प्रारंभिक जीवन ज्वाला गुट्टा का जन्म ...,0.402283,<s>ज्वाला गुट्टा की माँ का नाम क्या है</s></s>...,0.536799
3,ज्वाला गुट्टा की माँ का नाम क्या है,येलन चीन से हैं। उनकी मां येलन,ाडी हैं। प्रारंभिक जीवन ज्वाला गुट्टा का जन्म ...,0.355371,<s>ज्वाला गुट्टा की माँ का नाम क्या है</s></s>...,0.172356
4,ज्वाला गुट्टा की माँ का नाम क्या है,येलन चीन से,ाडी हैं। प्रारंभिक जीवन ज्वाला गुट्टा का जन्म ...,0.349024,<s>ज्वाला गुट्टा की माँ का नाम क्या है</s></s>...,0.50174


# Combining with Reranking

In [43]:
# Parallel arrays over all candidates: answer text, first-stage span score
# and reranker score.
answers = train_df['pred_text'].values
score = train_df['score'].values
predicted_jac_score = train_df['predicted_jac_score'].values

In [44]:
# len(answers)

In [45]:
# Pick one answer per question: the candidate with the highest combined
# score (span score + 0.05 * reranker score) within each run of 5 rows.
# NOTE(review): this assumes every question contributed exactly 5
# consecutive candidates (n=5 in the first-stage post-processing). A
# question with fewer candidates would shift all later groups - confirm.
temp_dict = []
final_answer = []
tot_answers = len(answers)
for position, (text, span_score, rerank_score) in enumerate(
    zip(answers, score, predicted_jac_score)
):
    temp_dict.append({'text': text, 'score': span_score + 0.05 * rerank_score})
    if (position + 1) % 5:
        continue
    # Group complete: keep the best candidate (the first one wins a tie).
    best_candidate = max(temp_dict, key=lambda cand: cand["score"])
    final_answer.append(best_candidate['text'])
    temp_dict = []

In [46]:
final_answer[:5]

[' येलन',
 ' 20 अप्रैल 2010',
 '१२ मार्च १८२४',
 ' 13',
 'சுவாமிநாதன் மற்றும் வர்கீஸ் குரியன்']

In [47]:
# Final cleanup of the selected answers: collapse whitespace and strip ASCII
# punctuation from both ends.
# strip(punctuation) also removes the trailing "." that the earlier
# post-processing restored for the AD/BC/km abbreviations, so that rule is
# re-applied here against the matching context (row order of `test`).
abbreviations = (tamil_ad, tamil_bc, tamil_km, hindi_ad, hindi_bc)
submission = []
for p2, context in zip(final_answer, test["context"].values):
    p2 = " ".join(p2.split())
    p2 = p2.strip(punctuation)
    if p2.endswith(abbreviations) and p2 + "." in context:
        p2 = p2 + "."
    submission.append(p2)

In [48]:
test.head()

Unnamed: 0,id,context,question,language
0,22bff3dec,"ज्वाला गुट्टा (जन्म: 7 सितंबर 1983; वर्धा, महा...",ज्वाला गुट्टा की माँ का नाम क्या है,hindi
1,282758170,गूगल मानचित्र (Google Maps) (पूर्व में गूगल लो...,गूगल मैप्स कब लॉन्च किया गया था?,hindi
2,d60987e0e,गुस्ताव रॉबर्ट किरचॉफ़ (१२ मार्च १८२४ - १७ अक्...,गुस्ताव किरचॉफ का जन्म कब हुआ था?,hindi
3,f99c770dc,அலுமினியம் (ஆங்கிலம்: அலுமினியம்; வட அமெரிக்க ...,அலுமினியத்தின் அணு எண் என்ன?,tamil
4,40dec1964,"கூட்டுறவு இயக்க வரலாறு, இங்கிலாந்து நாட்டில் ம...",இந்தியாவில் பசுமை புரட்சியின் தந்தை என்று கருத...,tamil


In [49]:
# One cleaned answer per test row; the assignment requires `submission` to
# have exactly as many entries as `test` has rows.
test["PredictionString"] = submission
test.head()

Unnamed: 0,id,context,question,language,PredictionString
0,22bff3dec,"ज्वाला गुट्टा (जन्म: 7 सितंबर 1983; वर्धा, महा...",ज्वाला गुट्टा की माँ का नाम क्या है,hindi,येलन
1,282758170,गूगल मानचित्र (Google Maps) (पूर्व में गूगल लो...,गूगल मैप्स कब लॉन्च किया गया था?,hindi,20 अप्रैल 2010
2,d60987e0e,गुस्ताव रॉबर्ट किरचॉफ़ (१२ मार्च १८२४ - १७ अक्...,गुस्ताव किरचॉफ का जन्म कब हुआ था?,hindi,१२ मार्च १८२४
3,f99c770dc,அலுமினியம் (ஆங்கிலம்: அலுமினியம்; வட அமெரிக்க ...,அலுமினியத்தின் அணு எண் என்ன?,tamil,13
4,40dec1964,"கூட்டுறவு இயக்க வரலாறு, இங்கிலாந்து நாட்டில் ம...",இந்தியாவில் பசுமை புரட்சியின் தந்தை என்று கருத...,tamil,சுவாமிநாதன் மற்றும் வர்கீஸ் குரியன்


# Post processing(Not required as it is done earlier)

In [50]:
# bad_starts = [".", ",", "(", ")", "-", "–",  ",", ";"]
# bad_endings = ["...", "-", "(", ")", "–", ",", ";"]

# tamil_ad = "கி.பி"
# tamil_bc = "கி.மு"
# tamil_km = "கி.மீ"
# hindi_ad = "ई"
# hindi_bc = "ई.पू"

# cleaned_preds = []
# for pred, context in test[["PredictionString", "context"]].to_numpy():
#     if pred == "":
#         cleaned_preds.append(pred)
#         continue
#     while any([pred.startswith(y) for y in bad_starts]):
#         pred = pred[1:]
#     while any([pred.endswith(y) for y in bad_endings]):
#         if pred.endswith("..."):
#             pred = pred[:-3]
#         else:
#             pred = pred[:-1]
    
#     if any([pred.endswith(tamil_ad), pred.endswith(tamil_bc), pred.endswith(tamil_km), pred.endswith(hindi_ad), pred.endswith(hindi_bc)]) and pred+"." in context:
#         pred = pred+"."

#     cleaned_preds.append(pred)

# test["PredictionString"] = cleaned_preds

In [51]:
# Write the submission file with the `id` and `PredictionString` columns only.
test[['id','PredictionString']].to_csv('submission.csv',index=False) 

In [52]:
# test.head()