In [1]:
# Install the `datasets` package from local wheel files only: `--no-index` disables
# the package index and `--find-links` points pip at the bundled wheels directory.
#!pip uninstall fsspec -qq -y
!pip install --no-index --find-links ../input/hf-datasets/wheels datasets -qq

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-cloud 0.1.13 requires tensorflow<3.0,>=1.15.0, which is not installed.[0m


In [2]:
import sys
sys.path.append("../input/tez-lib/")
import collections
import numpy as np
import transformers
import pandas as pd
from datasets import Dataset
from functools import partial
from tqdm import tqdm
import torch

from sklearn import metrics
import transformers
import torch
import torch.nn as nn
import numpy as np
import tez
from string import punctuation

In [3]:
class ChaiiModel(tez.Model):
    """Span-extraction QA model: a pretrained transformer plus one linear head.

    The head maps every token's hidden state to ``config.num_labels`` logits;
    ``forward`` unpacks them into exactly two tensors (start and end), so
    ``num_labels`` is presumably 2 for the checkpoints used here — TODO confirm.
    """

    def __init__(self, model_name, num_train_steps, steps_per_epoch, learning_rate):
        """Load config and backbone from ``model_name`` and attach the QA head.

        Args:
            model_name: Path or identifier passed to ``from_pretrained``.
            num_train_steps: Stored on the instance; not used in this class.
            steps_per_epoch: Stored on the instance; not used in this class.
            learning_rate: Stored on the instance; not used in this class.
        """
        super().__init__()
        self.model_name = model_name
        self.learning_rate = learning_rate
        self.num_train_steps = num_train_steps
        self.steps_per_epoch = steps_per_epoch
        self.step_scheduler_after = "batch"

        config = transformers.AutoConfig.from_pretrained(model_name)
        # Overrides applied on top of the stored config: no hidden dropout,
        # a smaller layer-norm epsilon, hidden states returned, no pooling layer.
        overrides = dict(
            output_hidden_states=True,
            hidden_dropout_prob=0.0,
            layer_norm_eps=1e-7,
            add_pooling_layer=False,
        )
        config.update(overrides)

        self.transformer = transformers.AutoModel.from_pretrained(model_name, config=config)
        self.output = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, ids, mask, token_type_ids=None, start_positions=None, end_positions=None):
        """Compute start/end logits for a batch.

        ``token_type_ids``, ``start_positions`` and ``end_positions`` are accepted
        but never read, so no loss is computed here.

        Returns:
            ``((start_logits, end_logits), 0, {})`` — the two logit tensors plus a
            constant ``0`` and an empty dict (presumably the loss/metrics slots
            expected by tez — verify against the library).
        """
        sequence_output = self.transformer(ids, mask)[0]
        logits = self.output(sequence_output)
        # One slice per output channel; drop the trailing singleton dimension.
        start_logits, end_logits = (
            part.squeeze(-1).contiguous() for part in logits.split(1, dim=-1)
        )
        return (start_logits, end_logits), 0, {}

In [4]:
class ChaiiDataset:
    """Map-style dataset exposing tokenized features as model-ready tensors.

    ``data`` is any indexable container whose items provide ``"input_ids"`` and
    ``"attention_mask"`` sequences.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        """Return ``{"ids", "mask"}`` as ``torch.long`` tensors for row ``item``."""
        # Index the container once: with a lazily materialised container (such as
        # the `datasets.Dataset` passed in by this notebook) every `data[item]`
        # builds the whole row again, so a second lookup doubles the per-sample cost.
        row = self.data[item]
        return {
            "ids": torch.tensor(row["input_ids"], dtype=torch.long),
            "mask": torch.tensor(row["attention_mask"], dtype=torch.long),
        }

In [5]:
def prepare_validation_features(examples, tokenizer, pad_on_right, max_length, doc_stride):
    """Tokenize a batch of question/context pairs into overlapping features.

    Args:
        examples: Batch mapping with ``"id"``, ``"question"`` and ``"context"``
            lists. ``examples["question"]`` is overwritten with left-stripped text.
        tokenizer: Callable tokenizer; must support overflowing tokens and
            offset mappings, and return an object exposing ``sequence_ids(i)``.
        pad_on_right: When truthy the question is the first sequence and only
            the context is truncated; otherwise the roles are swapped.
        max_length: Maximum feature length, also used for padding.
        doc_stride: Overlap passed to the tokenizer as ``stride``.

    Returns:
        The tokenizer output, with ``"overflow_to_sample_mapping"`` removed,
        an ``"example_id"`` entry added per feature, and every offset that does
        not belong to the context replaced by ``None``.
    """
    examples["question"] = [question.lstrip() for question in examples["question"]]

    if pad_on_right:
        first_key, second_key = "question", "context"
        truncation, context_index = "only_second", 1
    else:
        first_key, second_key = "context", "question"
        truncation, context_index = "only_first", 0

    tokenized_examples = tokenizer(
        examples[first_key],
        examples[second_key],
        truncation=truncation,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature index -> index of the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["example_id"] = []

    feature_count = len(tokenized_examples["input_ids"])
    for feature_idx in range(feature_count):
        sequence_ids = tokenized_examples.sequence_ids(feature_idx)
        parent_idx = sample_mapping[feature_idx]
        tokenized_examples["example_id"].append(examples["id"][parent_idx])

        # Keep offsets only for context tokens so answers can never be read
        # out of the question or special tokens.
        masked_offsets = []
        for token_idx, offsets in enumerate(tokenized_examples["offset_mapping"][feature_idx]):
            is_context = sequence_ids[token_idx] == context_index
            masked_offsets.append(offsets if is_context else None)
        tokenized_examples["offset_mapping"][feature_idx] = masked_offsets

    return tokenized_examples

In [6]:
def postprocess_qa_predictions(
    examples, tokenizer, features, raw_predictions, n_best_size=20, max_answer_length=30, squad_v2=False
):
    """Pick the best answer span per example from per-feature start/end scores.

    Args:
        examples: Collection supporting ``examples["id"]``, ``len`` and iteration
            over rows that expose ``"id"`` and ``"context"``.
        tokenizer: Only ``tokenizer.cls_token_id`` is read.
        features: Indexable/iterable features with ``"example_id"``,
            ``"offset_mapping"`` (``None`` for non-context tokens) and ``"input_ids"``.
        raw_predictions: Pair ``(all_start_scores, all_end_scores)`` indexed by feature.
        n_best_size: Number of top start and top end positions to combine.
        max_answer_length: Maximum span length in tokens.
        squad_v2: Accepted but never read.

    Returns:
        ``OrderedDict`` mapping example id to ``{"score", "text"}``; the score is
        the mean of the chosen start and end scores. Examples without any valid
        span get ``{"text": "", "score": 0.0}``.
    """
    all_start_logits, all_end_logits = raw_predictions

    # Group feature indices under the position of the example they came from.
    position_of_id = {}
    for position, example_id in enumerate(examples["id"]):
        position_of_id[example_id] = position
    features_per_example = collections.defaultdict(list)
    for feature_idx, feature in enumerate(features):
        features_per_example[position_of_id[feature["example_id"]]].append(feature_idx)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_position, example in enumerate(tqdm(examples)):
        context = example["context"]
        candidates = []
        # Tracked per example but never consulted afterwards (squad_v2 is unused).
        best_null_score = None

        for feature_idx in features_per_example[example_position]:
            feature = features[feature_idx]
            start_logits = all_start_logits[feature_idx]
            end_logits = all_end_logits[feature_idx]
            offset_mapping = feature["offset_mapping"]

            cls_position = feature["input_ids"].index(tokenizer.cls_token_id)
            null_score = start_logits[cls_position] + end_logits[cls_position]
            if best_null_score is None or best_null_score < null_score:
                best_null_score = null_score

            # Highest-scoring positions first, at most n_best_size of each.
            top_starts = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            top_ends = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            token_count = len(offset_mapping)

            for start_index in top_starts:
                for end_index in top_ends:
                    if not (start_index < token_count and end_index < token_count):
                        continue
                    # None offsets mark tokens outside the context.
                    if offset_mapping[start_index] is None or offset_mapping[end_index] is None:
                        continue
                    span_length = end_index - start_index + 1
                    if span_length < 1 or span_length > max_answer_length:
                        continue

                    first_char = offset_mapping[start_index][0]
                    last_char = offset_mapping[end_index][1]
                    candidates.append(
                        {
                            "score": (start_logits[start_index] + end_logits[end_index])/2,
                            "text": context[first_char:last_char],
                        }
                    )

        if candidates:
            best_answer = sorted(candidates, key=lambda cand: cand["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[example["id"]] = best_answer

    return predictions

In [7]:
# Tokenizer for model 1, loaded from the local "../input/xlmrob" directory.
# NOTE(review): the name `tokenizer` is rebound later in the Model 2 section.
tokenizer = transformers.AutoTokenizer.from_pretrained("../input/xlmrob")

In [8]:
# --- Model 1: tokenise the test set and accumulate fold predictions ---------
max_length = 384
doc_stride = 128
pad_on_right = tokenizer.padding_side == "right"

test_data = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/test.csv")
test_dataset = Dataset.from_pandas(test_data)

# Full features keep `example_id` / `offset_mapping` for post-processing.
test_features = test_dataset.map(
    partial(
        prepare_validation_features,
        tokenizer=tokenizer,
        pad_on_right=pad_on_right,
        max_length=max_length,
        doc_stride=doc_stride,
    ),
    batched=True,
    remove_columns=test_dataset.column_names,
)
# Slim copy without the post-processing columns; this is what the model consumes.
test_feats_small = test_features.map(
    lambda example: example, remove_columns=['example_id', 'offset_mapping']
)

# Running sums of start/end scores over folds (averaged in a later cell).
fin_start_logits = None
fin_end_logits = None

for fold_ in tqdm(range(10)):
    model = ChaiiModel(model_name="../input/xlmrob", num_train_steps=0, steps_per_epoch=0, learning_rate=0)
    model.load(f"../input/deepsetsquad2-v2/pytorch_model_f{fold_}.bin", weights_only=True)
    model.to("cuda")
    model.eval()

    data_loader = torch.utils.data.DataLoader(
        ChaiiDataset(test_feats_small),
        batch_size=32,
        num_workers=4,
        pin_memory=True,
        shuffle=False,
    )

    start_logits, end_logits = [], []
    with torch.no_grad():
        for b_idx, data in enumerate(data_loader):
            batch = {key: value.to("cuda") for key, value in data.items()}
            output, _, _ = model(**batch)
            start_logits.append(output[0].detach().cpu().numpy())
            end_logits.append(output[1].detach().cpu().numpy())

    # One row per feature, in dataset order (the loader does not shuffle).
    start_logits = np.vstack(start_logits)
    end_logits = np.vstack(end_logits)

    if fin_start_logits is None:
        fin_start_logits, fin_end_logits = start_logits, end_logits
    else:
        fin_start_logits += start_logits
        fin_end_logits += end_logits

    # Drop this fold's model before the next one is loaded.
    del model
    torch.cuda.empty_cache()

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))

  0%|          | 0/10 [00:00<?, ?it/s]




Some weights of the model checkpoint at ../input/xlmrob were not used when initializing XLMRobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 10%|█         | 1/10 [00:55<08:17, 55.28s/it]Some weights of the model checkpoint at ../input/xlmrob were not used when initializing XLMRobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification mod

In [9]:
# Turn the per-fold sums into means. The divisor has to match the number of folds
# iterated in the inference loop (`range(10)`).
# NOTE(review): the division is in place, so re-running this cell divides again.
fin_start_logits /= 10
fin_end_logits /= 10

In [10]:
# normalize: softmax over the token axis (dim=1) so each feature's start and end
# scores become distributions; results are converted back to NumPy arrays.
normalizer = torch.nn.Softmax(dim=1)

fin_start_logits = normalizer(torch.from_numpy(fin_start_logits)).numpy()
fin_end_logits = normalizer(torch.from_numpy(fin_end_logits)).numpy()

In [11]:
# Best answer per test example for model 1, keyed by example id.
fin_preds = postprocess_qa_predictions(test_dataset, tokenizer, test_features, (fin_start_logits, fin_end_logits))

  0%|          | 0/5 [00:00<?, ?it/s]

Post-processing 5 example predictions split into 67 features.


100%|██████████| 5/5 [00:00<00:00, 14.22it/s]


In [12]:
# Model 1 answers as a list of {"score", "text"} dicts, in the insertion order of
# `fin_preds`; the cell displays the first five.
final_list = list(fin_preds.values())
final_list[:5]

[{'score': 0.6914839744567871, 'text': ' येलन'},
 {'score': 0.21004313230514526, 'text': ' 28 नवम्बर 2007'},
 {'score': 0.9119396805763245, 'text': '१२ मार्च १८२४'},
 {'score': 0.975170910358429, 'text': ' 13'},
 {'score': 0.639376163482666, 'text': 'சுவாமிநாதன் மற்றும் வர்கீஸ் குரியன்'}]

# Model 2

In [13]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import gc
gc.enable()
import math
import json
import time
import random
import multiprocessing
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from sklearn import model_selection
from string import punctuation

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
import torch.optim as optim
from torch.utils.data import (
    Dataset, DataLoader,
    SequentialSampler, RandomSampler
)
from torch.utils.data.distributed import DistributedSampler

try:
    from apex import amp
    APEX_INSTALLED = True
except ImportError:
    APEX_INSTALLED = False

import transformers
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    logging,
    MODEL_FOR_QUESTION_ANSWERING_MAPPING,
)
# NOTE(review): the first call is immediately overridden by the second one, so the
# level left in effect for transformers logging is "error".
logging.set_verbosity_warning()
logging.set_verbosity_error()

def fix_all_seeds(seed):
    """Seed NumPy, ``random``, and PyTorch (CPU and CUDA) with ``seed``.

    Also writes ``str(seed)`` to the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def optimal_num_of_loader_workers():
    """Suggest a DataLoader worker count from the visible CPUs and GPUs.

    Returns ``min(cpus, 4 * gpus)`` when at least one GPU is reported by
    ``torch.cuda.device_count()``; otherwise ``cpus - 1``.
    """
    cpu_total = multiprocessing.cpu_count()
    gpu_total = torch.cuda.device_count()
    if not gpu_total:
        return cpu_total - 1
    return min(cpu_total, gpu_total * 4)

# Report whether the optional `apex` import above succeeded.
print(f"Apex AMP Installed :: {APEX_INSTALLED}")
# Config classes registered for question answering and their `model_type` strings.
# NOTE(review): neither name is referenced again in the visible notebook.
MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

Apex AMP Installed :: False


In [14]:
class Config:
    """Settings for model 2; read as class attributes (``Config()`` adds nothing)."""

    # model — config and tokenizer are loaded from the same checkpoint directory
    # as the weights (alternative kept from the original: "../input/xlm-roberta-large-squad-v2").
    model_type = 'xlm_roberta'
    model_name_or_path = "../input/5foldsroberta/output/checkpoint-fold-0"
    config_name = model_name_or_path
    fp16 = bool(APEX_INSTALLED)
    fp16_opt_level = "O1"
    gradient_accumulation_steps = 2

    # tokenizer
    tokenizer_name = model_name_or_path
    max_seq_length = 400
    doc_stride = 135

    # train
    epochs = 1
    train_batch_size = 4
    eval_batch_size = 128

    # optimizer
    optimizer_type = 'AdamW'
    learning_rate = 1e-5
    weight_decay = 1e-2
    epsilon = 1e-8
    max_grad_norm = 1.0

    # scheduler
    decay_name = 'linear-warmup'
    warmup_ratio = 0.1

    # logging
    logging_steps = 10

    # evaluate
    output_dir = 'output'
    seed = 2021

In [15]:
class DatasetRetriever(Dataset):
    """Wrap a list of feature dicts for use with a ``DataLoader``.

    In ``'train'`` mode every returned field is a ``torch.long`` tensor and
    includes the answer positions. In any other mode only ``input_ids`` and
    ``attention_mask`` are tensors; the remaining fields are passed through
    unchanged for post-processing.
    """

    def __init__(self, features, mode='train'):
        super().__init__()
        self.features = features
        self.mode = mode

    def __len__(self):
        return len(self.features)

    def __getitem__(self, item):
        feature = self.features[item]
        sample = {
            'input_ids': torch.tensor(feature['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(feature['attention_mask'], dtype=torch.long),
        }

        if self.mode == 'train':
            for key in ('offset_mapping', 'start_position', 'end_position'):
                sample[key] = torch.tensor(feature[key], dtype=torch.long)
            return sample

        # Inference: keep raw Python objects for the fields used after prediction.
        sample['offset_mapping'] = feature['offset_mapping']
        sample['sequence_ids'] = feature['sequence_ids']
        sample['id'] = feature['example_id']
        sample['context'] = feature['context']
        sample['question'] = feature['question']
        return sample

In [16]:
class Model(nn.Module):
    """Transformer backbone with a two-output linear head for span extraction."""

    def __init__(self, modelname_or_path, config):
        """Load the backbone from ``modelname_or_path`` and initialise the QA head.

        Args:
            modelname_or_path: Path or identifier passed to ``AutoModel.from_pretrained``.
            config: Model config; ``hidden_size``, ``hidden_dropout_prob`` and
                ``initializer_range`` are read from it.
        """
        super(Model, self).__init__()
        self.config = config
        self.xlm_roberta = AutoModel.from_pretrained(modelname_or_path, config=config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)
        # Kept as an attribute for compatibility; forward() does not apply it.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self._init_weights(self.qa_outputs)

    def _init_weights(self, module):
        """Normal-initialise a linear layer's weight and zero its bias."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(
        self,
        input_ids,
        attention_mask=None,
    ):
        """Return ``(start_logits, end_logits)``, one score per input token.

        Only the backbone's first output (the per-token hidden states) is used.
        The previous implementation also read ``outputs[1]`` into an unused
        variable, which raised ``IndexError`` for a backbone that returns no
        second output; that lookup has been removed.
        """
        outputs = self.xlm_roberta(
            input_ids,
            attention_mask=attention_mask,
        )
        sequence_output = outputs[0]

        # Dropout is intentionally not applied here (it was disabled in the original).
        qa_logits = self.qa_outputs(sequence_output)

        # Split the two output channels and drop the trailing singleton dimension.
        start_logits, end_logits = qa_logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [17]:
def make_model(args):
    """Build ``(config, tokenizer, model)`` from the paths stored on ``args``.

    Reads ``args.config_name``, ``args.tokenizer_name`` and
    ``args.model_name_or_path``; the loaded config is handed to ``Model``.
    """
    config = AutoConfig.from_pretrained(args.config_name)
    return (
        config,
        AutoTokenizer.from_pretrained(args.tokenizer_name),
        Model(args.model_name_or_path, config=config),
    )

In [18]:
def prepare_test_features(args, example, tokenizer):
    """Tokenize one question/context pair into a list of feature dicts.

    Args:
        args: Object providing ``max_seq_length`` and ``doc_stride``.
        example: Mapping with ``"id"``, ``"question"`` and ``"context"``;
            ``example["question"]`` is overwritten with left-stripped text.
        tokenizer: Callable tokenizer returning overflowing windows, offset
            mappings and a ``sequence_ids(i)`` accessor.

    Returns:
        One dict per window with ``example_id``, ``context``, ``question``,
        ``input_ids``, ``attention_mask``, ``offset_mapping`` and
        ``sequence_ids`` (with ``None`` entries replaced by ``0``).
    """
    example["question"] = example["question"].lstrip()

    tokenized_example = tokenizer(
        example["question"],
        example["context"],
        truncation="only_second",
        max_length=args.max_seq_length,
        stride=args.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    features = []
    for window_idx, input_ids in enumerate(tokenized_example["input_ids"]):
        # None sequence ids are mapped to 0, i.e. they are indistinguishable
        # from the first sequence (the question) afterwards.
        sequence_ids = [
            0 if seq_id is None else seq_id
            for seq_id in tokenized_example.sequence_ids(window_idx)
        ]
        features.append(
            {
                "example_id": example['id'],
                'context': example['context'],
                'question': example['question'],
                'input_ids': input_ids,
                'attention_mask': tokenized_example['attention_mask'][window_idx],
                'offset_mapping': tokenized_example['offset_mapping'][window_idx],
                'sequence_ids': sequence_ids,
            }
        )
    return features

In [19]:
# --- Model 2: load the test set, build features and the evaluation loader ----
args = Config()

test = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/test.csv')

# Collapse every run of whitespace in both text columns to a single space.
for text_column in ('context', 'question'):
    test[text_column] = test[text_column].apply(lambda x: ' '.join(x.split()))

tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)

# One feature per tokenizer window, concatenated in row order.
test_features = []
for row_label, row in test.iterrows():
    test_features.extend(prepare_test_features(args, row, tokenizer))

test_dataset = DatasetRetriever(test_features, mode='test')
test_dataloader = DataLoader(
    test_dataset,
    batch_size=args.eval_batch_size,
    sampler=SequentialSampler(test_dataset),
    num_workers=optimal_num_of_loader_workers(),
    pin_memory=True,
    drop_last=False,
)

In [20]:
# Directory prefix that get_predictions() prepends to each checkpoint path.
base_model = '../input/5foldsroberta/output/'

In [21]:
def get_predictions(checkpoint_path):
    """Run one fold checkpoint over ``test_dataloader`` and return its logits.

    Args:
        checkpoint_path: Path of the state dict, relative to ``base_model``.

    Returns:
        ``(start_logits, end_logits)``: two NumPy arrays with one row per
        feature, stacked in loader order.
    """
    config, tokenizer, model = make_model(Config())
    model.cuda()
    model.load_state_dict(torch.load(base_model + checkpoint_path))
    # Put the whole wrapper in inference mode explicitly, as the model 1 loop
    # does; without this the wrapper's own dropout module stays in training mode.
    model.eval()

    start_logits = []
    end_logits = []
    with torch.no_grad():
        for batch in test_dataloader:
            outputs_start, outputs_end = model(batch['input_ids'].cuda(), batch['attention_mask'].cuda())
            # Conversion through Python lists is kept so the stacked arrays have
            # the same dtype as before.
            start_logits.append(outputs_start.cpu().numpy().tolist())
            end_logits.append(outputs_end.cpu().numpy().tolist())
            del outputs_start, outputs_end

    # Release this fold's objects before the next checkpoint is loaded.
    del model, tokenizer, config
    gc.collect()
    return np.vstack(start_logits), np.vstack(end_logits)

In [22]:
# Run the five fold checkpoints in order and keep each fold's (start, end) pair.
fold_logits = [
    get_predictions(f'checkpoint-fold-{fold}/pytorch_model.bin') for fold in range(5)
]
(
    (start_logits1, end_logits1),
    (start_logits2, end_logits2),
    (start_logits3, end_logits3),
    (start_logits4, end_logits4),
    (start_logits5, end_logits5),
) = fold_logits

# Unweighted mean over the five folds.
start_logits = (start_logits1 + start_logits2 + start_logits3 + start_logits4 + start_logits5) / 5
end_logits = (end_logits1 + end_logits2 + end_logits3 + end_logits4 + end_logits5) / 5

In [23]:
# normalize
# Softmax over the token axis (dim=1), then back to NumPy, so model 2's scores
# are on the same probability scale as model 1's.
normalizer = torch.nn.Softmax(dim=1)

start_logits = normalizer(torch.from_numpy(start_logits)).numpy()
end_logits = normalizer(torch.from_numpy(end_logits)).numpy()

In [24]:
import collections

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """Pick the best answer span per example from per-feature start/end scores.

    Args:
        examples: ``pandas.DataFrame`` with ``"id"`` and ``"context"`` columns.
            Any index is accepted; rows are matched to features by position.
        features: Sequence of feature dicts with ``"example_id"``,
            ``"sequence_ids"`` and ``"offset_mapping"``. Not modified.
        raw_predictions: Pair ``(all_start_scores, all_end_scores)`` indexed by
            feature.
        n_best_size: Number of top start and top end positions to combine.
        max_answer_length: Maximum span length in tokens.

    Returns:
        ``OrderedDict`` mapping example id to ``{"score", "text"}``; the score is
        the mean of the chosen start and end scores. Examples without any valid
        span get ``{"text": "", "score": 0.0}``.
    """
    all_start_logits, all_end_logits = raw_predictions

    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # sequence_ids value that marks context tokens.
    context_index = 1

    # iterrows() yields index *labels*, while features_per_example is keyed by
    # row *position*; enumerate keeps the two aligned for any DataFrame index
    # (e.g. after filtering or slicing the test set).
    for example_index, (_, example) in enumerate(examples.iterrows()):
        feature_indices = features_per_example[example_index]
        valid_answers = []
        context = example["context"]

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            feature = features[feature_index]
            sequence_ids = feature["sequence_ids"]

            # Mask offsets of non-context tokens in a local list so the caller's
            # features are left untouched.
            offset_mapping = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(feature["offset_mapping"])
            ]

            # Highest-scoring positions first, at most n_best_size of each.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": (start_logits[start_index] + end_logits[end_index])/2,
                            "text": context[start_char: end_char],
                        }
                    )

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[example["id"]] = best_answer

    return predictions

In [25]:
# Best answer per test example for model 2, keyed by example id.
final_predictions_model2 = postprocess_qa_predictions(test, test_features, (start_logits, end_logits))

Post-processing 5 example predictions split into 63 features.


In [26]:
# Model 2 answers as a list of {"score", "text"} dicts, in the insertion order of
# `final_predictions_model2`; the cell displays the first five.
final_list_model2 = list(final_predictions_model2.values())
final_list_model2[:5]

[{'score': 0.6738852538461213, 'text': ' येलन'},
 {'score': 0.2364240767299597, 'text': ' 20 अप्रैल 2010'},
 {'score': 0.8744408926944801, 'text': '१२ मार्च १८२४'},
 {'score': 0.9539678124400571, 'text': ' 13'},
 {'score': 0.6116797119144188, 'text': 'சுவாமிநாதன் மற்றும் வர்கீஸ் குரியன்'}]

In [27]:
# final_list_model2[1][0]

In [28]:
# Display model 1's first five answers again, for comparison with model 2's.
final_list[:5]

[{'score': 0.6914839744567871, 'text': ' येलन'},
 {'score': 0.21004313230514526, 'text': ' 28 नवम्बर 2007'},
 {'score': 0.9119396805763245, 'text': '१२ मार्च १८२४'},
 {'score': 0.975170910358429, 'text': ' 13'},
 {'score': 0.639376163482666, 'text': 'சுவாமிநாதன் மற்றும் வர்கீஸ் குரியன்'}]

# Combine results

In [29]:
# Ensemble rule: trust model 2 unless its score is below `threshold` AND model 1
# is strictly more confident; only then take model 1's text.
# Both lists are matched by position, so they must share the same example order.
threshold = 0.20#0.10#0.50#0.05#0.15#0.25#0.80#0.65
total_ans = len(final_list)
final_predictions = []
for i in range(total_ans):
    primary = final_list_model2[i]
    fallback = final_list[i]
    use_fallback = primary['score'] < threshold and fallback['score'] > primary['score']
    chosen = fallback if use_fallback else primary
    final_predictions.append(chosen['text'])

In [30]:
# Display the first five ensembled answer strings.
final_predictions[:5]

[' येलन',
 ' 20 अप्रैल 2010',
 '१२ मार्च १८२४',
 ' 13',
 'சுவாமிநாதன் மற்றும் வர்கீஸ் குரியன்']

In [31]:
# Collapse whitespace runs to single spaces, then strip any leading/trailing
# characters that appear in `string.punctuation`.
submission = [" ".join(p2.split()).strip(punctuation) for p2 in final_predictions]

In [32]:
# sample = pd.DataFrame(submission, columns=["id", "PredictionString"])
# sample.head()

In [33]:
# Attach the normalised answers to the test frame; `submission` was built in the
# same order as the rows of `test`, so the assignment is positional.
test["PredictionString"] = submission
test.head()

Unnamed: 0,id,context,question,language,PredictionString
0,22bff3dec,"ज्वाला गुट्टा (जन्म: 7 सितंबर 1983; वर्धा, महा...",ज्वाला गुट्टा की माँ का नाम क्या है,hindi,येलन
1,282758170,गूगल मानचित्र (Google Maps) (पूर्व में गूगल लो...,गूगल मैप्स कब लॉन्च किया गया था?,hindi,20 अप्रैल 2010
2,d60987e0e,गुस्ताव रॉबर्ट किरचॉफ़ (१२ मार्च १८२४ - १७ अक्...,गुस्ताव किरचॉफ का जन्म कब हुआ था?,hindi,१२ मार्च १८२४
3,f99c770dc,அலுமினியம் (ஆங்கிலம்: அலுமினியம்; வட அமெரிக்க ...,அலுமினியத்தின் அணு எண் என்ன?,tamil,13
4,40dec1964,"கூட்டுறவு இயக்க வரலாறு, இங்கிலாந்து நாட்டில் ம...",இந்தியாவில் பசுமை புரட்சியின் தந்தை என்று கருத...,tamil,சுவாமிநாதன் மற்றும் வர்கீஸ் குரியன்


In [34]:
# test_data['PredictionString'] = sample['PredictionString']
# test_data.head()

In [35]:
# Characters/strings that must not begin or end a submitted answer.
bad_starts = [".", ",", "(", ")", "-", "–",  ",", ";"]
bad_endings = ["...", "-", "(", ")", "–", ",", ";"]

# Abbreviations that may need their trailing period restored (going by the
# variable names: Tamil AD / BC / km and Hindi AD / BC — confirm with a speaker).
tamil_ad = "கி.பி"
tamil_bc = "கி.மு"
tamil_km = "கி.மீ"
hindi_ad = "ई"
hindi_bc = "ई.पू"

leading_junk = tuple(bad_starts)
trailing_junk = tuple(bad_endings)
# NOTE(review): `hindi_ad` is a single character, so any answer ending in it is
# treated as a possible abbreviation.
period_suffixes = (tamil_ad, tamil_bc, tamil_km, hindi_ad, hindi_bc)

cleaned_preds = []
for pred, context in test[["PredictionString", "context"]].to_numpy():
    if pred != "":
        # Peel unwanted characters off the front, one at a time.
        while pred.startswith(leading_junk):
            pred = pred[1:]
        # Peel unwanted endings off the back; "..." is removed as a unit.
        while pred.endswith(trailing_junk):
            pred = pred[:-3] if pred.endswith("...") else pred[:-1]

        # Put the period back only when the context contains that exact form.
        if pred.endswith(period_suffixes) and pred + "." in context:
            pred = pred + "."

    cleaned_preds.append(pred)

test["PredictionString"] = cleaned_preds

In [36]:
# Preview the frame after answer clean-up.
test.head()

Unnamed: 0,id,context,question,language,PredictionString
0,22bff3dec,"ज्वाला गुट्टा (जन्म: 7 सितंबर 1983; वर्धा, महा...",ज्वाला गुट्टा की माँ का नाम क्या है,hindi,येलन
1,282758170,गूगल मानचित्र (Google Maps) (पूर्व में गूगल लो...,गूगल मैप्स कब लॉन्च किया गया था?,hindi,20 अप्रैल 2010
2,d60987e0e,गुस्ताव रॉबर्ट किरचॉफ़ (१२ मार्च १८२४ - १७ अक्...,गुस्ताव किरचॉफ का जन्म कब हुआ था?,hindi,१२ मार्च १८२४
3,f99c770dc,அலுமினியம் (ஆங்கிலம்: அலுமினியம்; வட அமெரிக்க ...,அலுமினியத்தின் அணு எண் என்ன?,tamil,13
4,40dec1964,"கூட்டுறவு இயக்க வரலாறு, இங்கிலாந்து நாட்டில் ம...",இந்தியாவில் பசுமை புரட்சியின் தந்தை என்று கருத...,tamil,சுவாமிநாதன் மற்றும் வர்கீஸ் குரியன்


In [37]:
# Write only the id and answer columns to submission.csv, without the index.
test[['id','PredictionString']].to_csv('submission.csv',index=False)
