## Install and Import Dependencies

### Install Apex

In [1]:
# %%writefile setup.sh
# git clone https://github.com/NVIDIA/apex
# cd apex
# pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex
# rm -rf ./apex

In [2]:
# %%capture
# try:
#     from apex import amp
#     APEX_INSTALLED = True
# except ImportError:
#     !sh setup.sh
#     APEX_INSTALLED = True

# The Apex install/import attempt above is commented out, so the flag is
# hard-wired to False; Config.fp16 is derived from it.
APEX_INSTALLED = False

### Import Dependencies

In [3]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import gc

gc.enable()

import warnings

warnings.filterwarnings("ignore", category=UserWarning)

import collections
import multiprocessing
import random

import numpy as np
import pandas as pd

from string import punctuation

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, SequentialSampler


import transformers
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    logging,
    MODEL_FOR_QUESTION_ANSWERING_MAPPING,
)

# Only the last verbosity call takes effect, so request the error level directly
# instead of setting "warning" and immediately overriding it.
logging.set_verbosity_error()

def fix_all_seeds(seed):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU generator plus current and all CUDA devices), and exports
    ``PYTHONHASHSEED`` into the process environment.

    Requires ``random`` to be imported at module level alongside ``os``,
    ``numpy`` and ``torch``.

    Args:
        seed: Integer seed shared by all generators.
    """
    # Exported for child processes; the running interpreter has already
    # fixed its own hash seed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


# Hardware inventory; loadTestData() uses both counts to size the DataLoader
# worker pool.
CPUS_COUNT = multiprocessing.cpu_count()
GPUS_COUNT = torch.cuda.device_count()
print(f"Apex AMP Installed :: {APEX_INSTALLED}")

Apex AMP Installed :: False


## Define Config

In [4]:
class Config:
    """Static inference settings, read as class attributes throughout the notebook."""

    # model
    model_type = "xlm_roberta"
    # One checkpoint directory supplies the weights, the config and the tokenizer.
    model_name_or_path = "../input/hugging-face-hub-xlmrobertalargesquad2/xlm-roberta-large-squad2"
    config_name = model_name_or_path
    # Mixed precision follows the Apex availability flag.
    fp16 = bool(APEX_INSTALLED)
    fp16_opt_level = "O1"

    # tokenizer
    tokenizer_name = model_name_or_path
    max_seq_length = 400
    doc_stride = 135

    # test
    eval_batch_size = 128

In [5]:
# Directory holding the five-fold checkpoints.
# NOTE(review): not referenced by any visible cell — the ensemble cell spells
# the checkpoint paths out in full; confirm whether this can be removed.
base_model = "../input/5foldsroberta/output/"

## Data Processing

In [6]:
class DatasetRetriever(Dataset):
    """Map-style dataset over a list of pre-tokenized feature dicts.

    Each feature dict must provide ``input_ids``, ``attention_mask``,
    ``offset_mapping``, ``sequence_ids``, ``example_id``, ``context`` and
    ``question``.
    """

    def __init__(self, features):
        super().__init__()
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, item):
        """Return one sample; only the two model inputs are turned into tensors."""
        record = self.features[item]

        sample = {
            name: torch.tensor(record[name], dtype=torch.long)
            for name in ("input_ids", "attention_mask")
        }
        # Everything else is passed through untouched for post-processing.
        sample["offset_mapping"] = record["offset_mapping"]
        sample["sequence_ids"] = record["sequence_ids"]
        sample["id"] = record["example_id"]  # note the key rename
        sample["context"] = record["context"]
        sample["question"] = record["question"]
        return sample

## Define Model

In [7]:
class Model(nn.Module):
    """Span-extraction network: a pretrained encoder plus a two-logit linear head.

    Args:
        modelname_or_path: Location handed to ``AutoModel.from_pretrained``.
        config: Encoder configuration; ``hidden_size``, ``hidden_dropout_prob``
            and ``initializer_range`` are read from it.
    """

    def __init__(self, modelname_or_path, config):
        super().__init__()
        self.config = config
        self.xlm_roberta = AutoModel.from_pretrained(modelname_or_path, config=config)

        # One output feature for the span start, one for the span end.
        self.qa_outputs = nn.Linear(config.hidden_size, 2)
        # Registered on the module but not applied in forward().
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self._init_weights(self.qa_outputs)

    def _init_weights(self, module):
        """Re-initialise a linear layer: normal weights, zero bias; ignore other modules."""
        if not isinstance(module, nn.Linear):
            return
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
            module.bias.data.zero_()

    def forward(self, input_ids, attention_mask=None):
        """Return ``(start_logits, end_logits)``, each with the head's last axis removed."""
        encoded = self.xlm_roberta(input_ids, attention_mask=attention_mask)
        span_logits = self.qa_outputs(encoded.last_hidden_state)

        # Channel 0 scores span starts, channel 1 scores span ends.
        start_logits = span_logits[..., 0]
        end_logits = span_logits[..., 1]
        return start_logits, end_logits

In [8]:
# class Model(nn.Module):
#     def __init__(self, modelname_or_path, config):
#         super(Model, self).__init__()
#         self.config = config
#         self.xlm_roberta = AutoModel.from_pretrained(modelname_or_path, config=config)
#         self.qa_outputs = nn.Linear(config.hidden_size*8, 2)
#         self.dropout = nn.Dropout(config.hidden_dropout_prob)
#         self.fusion_head = FusionHead()
#         self._init_weights(self.qa_outputs)
        
#     def _init_weights(self, module):
#         if isinstance(module, nn.Linear):
#             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
#             if module.bias is not None:
#                 module.bias.data.zero_()

#     def forward(
#         self, 
#         input_ids, 
#         attention_mask=None, 
#         # token_type_ids=None
#     ):
#         outputs = self.xlm_roberta(
#             input_ids,
#             attention_mask=attention_mask,
#             output_hidden_states=True
#         )
#         #
#         last_4_output = outputs.hidden_states
#         last_4_layers = torch.cat([last_4_output[i] for i in [9,14,19,23]],dim=-1) # B*L*4H
#         last_4_layers = self.dropout(last_4_layers)
#         #sequence_output = outputs.last_hidden_state
#         fusion_output = self.fusion_head(last_4_layers)
#         fusion_output = torch.cat((last_4_layers,fusion_output),dim=-1)

#         fusion_output = self.dropout(fusion_output)
#         qa_logits = self.qa_outputs(fusion_output)
        
#         start_logits, end_logits = qa_logits.split(1, dim=-1)
#         start_logits = start_logits.squeeze(-1)
#         end_logits = end_logits.squeeze(-1)
    
#         return start_logits, end_logits

In [9]:
class FusionHead(nn.Module):
    """Re-weight each sequence position by a learned attention score.

    A two-layer scorer (linear -> tanh -> linear) produces one scalar per
    position; the scalars are softmax-normalised along dimension 1 and
    multiplied back onto the input.

    Args:
        hidden_size: Width of one encoder layer's output. Defaults to 1024.
        num_layers: Number of layer outputs concatenated on the last axis of
            the input. Defaults to 4, so the default input width is 4096.
        attn_size: Width of the intermediate scoring projection. Defaults to 512.
    """

    def __init__(self, hidden_size=1024, num_layers=4, attn_size=512):
        super().__init__()
        self.q = nn.Linear(hidden_size * num_layers, attn_size)
        self.v = nn.Linear(attn_size, 1)

    def forward(self, seq):
        """Scale ``seq`` by per-position attention weights.

        Args:
            seq: Tensor whose last axis has ``hidden_size * num_layers``
                features; the softmax runs over dimension 1.

        Returns:
            Tensor with the same shape as ``seq``.
        """
        # torch.tanh replaces the nn.functional.tanh alias.
        score = self.v(torch.tanh(self.q(seq)))
        # Normalise across dimension 1 so the weights along it sum to one.
        score = nn.functional.softmax(score, dim=1)
        return score * seq

In [10]:
def make_model(params):
    """Build the config, tokenizer and model described by ``params``.

    Args:
        params: Object exposing ``config_name``, ``tokenizer_name`` and
            ``model_name_or_path``.

    Returns:
        Tuple ``(config, tokenizer, model)``.
    """
    cfg = AutoConfig.from_pretrained(params.config_name)
    tok = AutoTokenizer.from_pretrained(params.tokenizer_name)
    return cfg, tok, Model(params.model_name_or_path, config=cfg)

## Model Test

### Prepare Test Features

In [11]:
def getTestFeatures(params, example, tokenizer):
    """Tokenize one example into one feature dict per overflow window.

    Args:
        params: Object exposing ``max_seq_length`` and ``doc_stride``.
        example: Mapping with ``"id"``, ``"question"`` and ``"context"``.
            It is not modified.
        tokenizer: Callable taking ``(question, context, **kwargs)`` whose
            result is indexable by ``"input_ids"``, ``"attention_mask"`` and
            ``"offset_mapping"`` and offers ``sequence_ids(window_index)``.

    Returns:
        List of dicts with keys ``example_id``, ``context``, ``question``,
        ``input_ids``, ``attention_mask``, ``offset_mapping`` and
        ``sequence_ids``.
    """
    # Strip into a local instead of writing back into the caller's example.
    question = example["question"].lstrip()
    context = example["context"]

    tokenized_example = tokenizer(
        question,
        context,
        truncation="only_second",
        max_length=params.max_seq_length,
        stride=params.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    features = []
    for window in range(len(tokenized_example["input_ids"])):
        features.append(
            {
                "example_id": example["id"],
                "context": context,
                "question": question,
                "input_ids": tokenized_example["input_ids"][window],
                "attention_mask": tokenized_example["attention_mask"][window],
                "offset_mapping": tokenized_example["offset_mapping"][window],
                # None entries become 0, i.e. indistinguishable from sequence 0.
                "sequence_ids": [
                    0 if seq_id is None else seq_id
                    for seq_id in tokenized_example.sequence_ids(window)
                ],
            }
        )

    return features

### Post-process

In [12]:
def getFeatureExampleIdex(features, examples):
    """Group feature positions by the position of the example they came from.

    Args:
        features: Sequence of dicts carrying an ``"example_id"`` entry.
        examples: Object whose ``["id"]`` is iterable over example ids.

    Returns:
        ``defaultdict(list)`` mapping the enumeration position of an id within
        ``examples["id"]`` to the list of feature positions for that example.
    """
    position_of = {}
    for position, example_id in enumerate(examples["id"]):
        position_of[example_id] = position

    grouped = collections.defaultdict(list)
    for feature_pos, feat in enumerate(features):
        grouped[position_of[feat["example_id"]]].append(feature_pos)

    return grouped


def getOffset(feature):
    """Return the feature's offsets with every non-sequence-1 position set to None.

    Args:
        feature: Mapping with parallel ``"offset_mapping"`` and
            ``"sequence_ids"`` sequences.

    Returns:
        List as long as ``feature["offset_mapping"]``; an entry is the original
        offset where the sequence id equals 1 and ``None`` elsewhere.
    """
    masked = []
    for pos, span in enumerate(feature["offset_mapping"]):
        keep = feature["sequence_ids"][pos] == 1
        masked.append(span if keep else None)
    return masked


def validateAnswer(start_index, end_index, offset_mapping, max_answer_length):
    """Tell whether ``(start_index, end_index)`` is an acceptable answer span.

    A span is rejected when either index is past the end of
    ``offset_mapping``, when either position holds ``None``, when the end
    precedes the start, or when the span covers more than
    ``max_answer_length`` positions.

    Returns:
        ``True`` if the span passes every check, otherwise ``False``.
    """
    limit = len(offset_mapping)
    # Range checks come first so the lookups below cannot run off the end.
    if start_index >= limit or end_index >= limit:
        return False
    if offset_mapping[start_index] is None or offset_mapping[end_index] is None:
        return False
    if end_index < start_index:
        return False
    return not (end_index - start_index + 1 > max_answer_length)

In [13]:
def postprocess(
    examples, features, raw_predictions, n_best_size=20, max_answer_length=30
):
    """Turn per-feature start/end logits into one answer string per example.

    For every feature of an example, the ``n_best_size`` highest start and end
    positions are paired; pairs accepted by ``validateAnswer`` become candidate
    answers scored by ``start_logit + end_logit``. The best-scoring candidate
    across all of the example's features wins.

    Args:
        examples: DataFrame with ``"id"`` and ``"context"`` columns. Any index
            is accepted; rows are matched to features by position.
        features: Sequence of feature dicts, aligned with ``raw_predictions``.
        raw_predictions: Pair ``(all_start_logits, all_end_logits)``, each
            indexable by feature position.
        n_best_size: Number of top start and top end positions to combine.
        max_answer_length: Maximum span length, counted in token positions.

    Returns:
        ``OrderedDict`` mapping example id to predicted answer text (``""``
        when no candidate span is valid).
    """
    features_per_example = getFeatureExampleIdex(features, examples)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions")

    all_start_logits, all_end_logits = raw_predictions

    # getFeatureExampleIdex keys its result by enumeration position of
    # examples["id"], so look features up by position rather than by the
    # DataFrame index label that iterrows() yields.
    for example_position, (_, example) in enumerate(examples.iterrows()):
        context = example["context"]
        valid_answers = []

        for fi in features_per_example[example_position]:
            start_logits = all_start_logits[fi]
            end_logits = all_end_logits[fi]

            # Positions outside sequence 1 are None and get rejected below.
            offset_mapping = getOffset(features[fi])

            start_indexes = np.argsort(start_logits)[
                -1 : -n_best_size - 1 : -1
            ].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    if not validateAnswer(
                        start_index, end_index, offset_mapping, max_answer_length
                    ):
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index]
                            + end_logits[end_index],
                            "text": context[start_char:end_char],
                        }
                    )

        # max() returns the first top-scoring candidate, the same element a
        # stable descending sort would place first.
        best_answer = (
            max(valid_answers, key=lambda answer: answer["score"])
            if valid_answers
            else {"text": "", "score": 0.0}
        )
        predictions[example["id"]] = best_answer["text"]

    return predictions

### Predict

In [14]:
def loadTestData():
    """Read the test CSV, tokenize every row and wrap the features in a DataLoader.

    Returns:
        A ``DataLoader`` that walks the features in order, in batches of
        ``Config.eval_batch_size``, keeping the final partial batch.
    """
    params = Config()

    test = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/test.csv")

    # Collapse every run of whitespace to a single space in both text columns.
    for column in ("context", "question"):
        test[column] = test[column].apply(lambda x: " ".join(x.split()))

    tokenizer = AutoTokenizer.from_pretrained(params.tokenizer_name)

    test_features = []
    for _, row in test.iterrows():
        test_features.extend(getTestFeatures(params, row, tokenizer))

    test_dataset = DatasetRetriever(test_features)

    # With GPUs: up to four workers per GPU, capped at the CPU count.
    # Without: every CPU but one.
    if GPUS_COUNT:
        worker_count = min(CPUS_COUNT, GPUS_COUNT * 4)
    else:
        worker_count = CPUS_COUNT - 1

    return DataLoader(
        test_dataset,
        batch_size=params.eval_batch_size,
        sampler=SequentialSampler(test_dataset),
        num_workers=worker_count,
        pin_memory=True,
        drop_last=False,
    )

In [15]:
def getPredictions(checkpoint_path):
    """Run one checkpoint over the test set and collect its span logits.

    Args:
        checkpoint_path: Path to a state dict saved for ``Model``.

    Returns:
        Tuple ``(start_logits, end_logits)`` of NumPy arrays, one row per test
        feature in DataLoader order.
    """
    # Use the GPU when present, but keep the notebook runnable without one.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    config, tokenizer, model = make_model(Config())
    # Deserialize onto the CPU so loading does not depend on the device the
    # checkpoint was saved from; the model is moved afterwards.
    model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))
    model.to(device)
    model.eval()

    start_logits = []
    end_logits = []
    with torch.no_grad():
        for batch in loadTestData():
            outputs_start, outputs_end = model(
                batch["input_ids"].to(device), batch["attention_mask"].to(device)
            )
            # Going through Python lists keeps the stacked result in NumPy's
            # default float dtype, as downstream averaging expects.
            start_logits.append(outputs_start.cpu().numpy().tolist())
            end_logits.append(outputs_end.cpu().numpy().tolist())
            del outputs_start, outputs_end

    # Drop the references before the next checkpoint is loaded.
    del model, tokenizer, config
    gc.collect()
    return np.vstack(start_logits), np.vstack(end_logits)

In [16]:
# Module-level copy of the test frame and its tokenized features; the
# post-processing cells read `test` and `test_features` directly.
test = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/test.csv")

# Collapse every run of whitespace to a single space in both text columns.
for column in ("context", "question"):
    test[column] = test[column].apply(lambda x: " ".join(x.split()))

tokenizer = AutoTokenizer.from_pretrained(Config().tokenizer_name)

test_features = []
for _, row in test.iterrows():
    test_features.extend(getTestFeatures(Config(), row, tokenizer))

In [17]:
# One row per ensemble member: (checkpoint path, raw weight). Keeping the two
# side by side means a checkpoint can never be paired with the wrong weight.
# NOTE(review): the raw weights look like per-model validation scores — confirm.
ENSEMBLE_MEMBERS = [
    ("../input/5foldsroberta/output/checkpoint-fold-0/pytorch_model.bin", 0.7189407410715103),
    ("../input/5foldsroberta/output/checkpoint-fold-1/pytorch_model.bin", 0.7009346291550066),
    ("../input/5foldsroberta/output/checkpoint-fold-2/pytorch_model.bin", 0.7423611924357781),
    ("../input/5foldsroberta/output/checkpoint-fold-3/pytorch_model.bin", 0.7452209457707667),
    ("../input/5foldsroberta/output/checkpoint-fold-4/pytorch_model.bin", 0.7328501876639222),
    ("../input/adding-fusion-1/simple_adding_model.bin", 0.7409769800815581),
    ("../input/self-hindi-v6/fold-0.bin", 0.7214328330609091),
    ("../input/self-hindi-v6/fold-1.bin", 0.7293411617805697),
    ("../input/overfitting-5-fold-128/fold-2.bin", 0.8074233752981514),
    ("../input/self-hindi-v6/fold-3.bin", 0.7191098087529867),
    ("../input/overfitting-5-fold-128/fold-3.bin", 0.7982931852209235),
    ("../input/deep-wiki-v2/fold_0.bin", 0.7737983627682914),
    ("../input/deep-wiki-v2/fold_1.bin", 0.7776064350570635),
    ("../input/deep-wiki-v2/fold_2.bin", 0.7843168579166172),
    ("../input/deep-wiki-v2/fold_3.bin", 0.7358640175404093),
    ("../input/deep-wiki-v2/fold_4.bin", 0.7166820931307615),
]

# Exponentiated raw weights; they are normalised when the logits are averaged.
weight = np.exp([raw_weight for _, raw_weight in ENSEMBLE_MEMBERS])

# start_logits[k] / end_logits[k] hold the predictions of ENSEMBLE_MEMBERS[k].
start_logits = []
end_logits = []
for checkpoint_path, _ in ENSEMBLE_MEMBERS:
    member_start, member_end = getPredictions(checkpoint_path)
    start_logits.append(member_start)
    end_logits.append(member_end)





In [18]:

# Weighted average of the ensemble's logits: each member contributes
# logits * weight / sum(weights).
weight_total = np.sum(weight)
start_logits_ = 0
end_logits_ = 0
for member_weight, member_start, member_end in zip(weight, start_logits, end_logits):
    start_logits_ = start_logits_ + member_start * member_weight / weight_total
    end_logits_ = end_logits_ + member_end * member_weight / weight_total


In [19]:
fin_preds = postprocess(test, test_features, (start_logits_, end_logits_))

# Collapse whitespace runs, then trim ASCII punctuation from both ends.
submission = [
    (example_id, " ".join(answer.split()).strip(punctuation))
    for example_id, answer in fin_preds.items()
]

sample = pd.DataFrame(submission, columns=["id", "PredictionString"])

# Attach each prediction to its test row so the cleanup step can see the context.
test_data = pd.merge(test, sample, on="id")

Post-processing 5 example predictions


In [20]:
bad_starts = [".", ",", "(", ")", "-", "–", ",", ";"]
bad_endings = ["...", "-", "(", ")", "–", ",", ";"]

tamil_ad = "கி.பி"
tamil_bc = "கி.மு"
tamil_km = "கி.மீ"
hindi_ad = "ई"
hindi_bc = "ई.पू"


def _clean_prediction(pred, context):
    """Trim unwanted edge characters from one answer and restore a lost trailing dot.

    Leading characters listed in ``bad_starts`` and trailing items listed in
    ``bad_endings`` are removed repeatedly. If the result ends with one of the
    Tamil/Hindi abbreviations above and that text followed by "." occurs in
    ``context``, the "." is appended.
    """
    if pred == "":
        return pred

    # Every entry in bad_starts is a single character, so drop one at a time.
    while pred.startswith(tuple(bad_starts)):
        pred = pred[1:]

    # "..." is the only multi-character ending; all others are one character.
    # Once this loop exits, pred ends with none of bad_endings.
    while pred.endswith(tuple(bad_endings)):
        pred = pred[:-3] if pred.endswith("...") else pred[:-1]

    abbreviations = (tamil_ad, tamil_bc, tamil_km, hindi_ad, hindi_bc)
    if pred.endswith(abbreviations) and pred + "." in context:
        pred = pred + "."

    return pred


cleaned_preds = [
    _clean_prediction(pred, context)
    for pred, context in zip(test_data["PredictionString"], test_data["context"])
]

test_data["PredictionString"] = cleaned_preds
test_data[["id", "PredictionString"]].to_csv("submission.csv", index=False)

In [21]:
# test_data[["id", "PredictionString"]]