# Baseline Model for Kaggel chaii competition.
## In this competition we are suppose to predict Answer (index) given the question and context document
## Data Language is Hindi and Tamil

In [5]:
import pandas as pd
import numpy as np

from datasets import Dataset
from transformers import (AutoTokenizer,AutoModelForQuestionAnswering,DataCollatorWithPadding)
from transformers import get_scheduler
from transformers import AdamW

import torch
from torch.utils.data import DataLoader

pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## path for kaggel notebook
# data_dir = '/kaggle/input/chaii-hindi-and-tamil-question-answering/'
# output_dir = '/kaggle/output/kaggle/working/'


data_dir = '/home/ubuntu/repo/chaii4deeplearningkaggler/data/'
output_dir = data_dir+'outputs/'

train_fn = 'train.csv'
test_fn='test.csv'
sub_fn = 'sample_submission.csv'

BATCH_SIZE= 8
## max length of padding and trunc
MAX_LENGTH = 512
lr = 5e-5
num_epochs = 20

In [6]:
df = pd.read_csv(data_dir+train_fn)
test_df = pd.read_csv(data_dir+test_fn)

In [7]:
example_id2idx = { eid:idx for idx,eid in enumerate(df['id'].values)}
example_idx2id = { idx:eid for idx,eid in enumerate(df['id'].values)}

# checking if give answer start (index) label is correct.

In [8]:
corp_lens = []
for idx in range(0,df.shape[0]):
    subdf = df.iloc[idx]
    strt = subdf['answer_start']
    end = strt+len(subdf['answer_text'])
    ans = subdf['context'][strt:end]
    
    corp_lens.append(len(subdf['context'].split()))
    if not ans==subdf['answer_text']:
        print(idx,ans,'****',subdf['answer_text'])
        
## This is very rough estimation of corpus length. It doesnt represent actual token lenght from tokenizer        
print(f'top 10 token len of the corpus in training dataset: {sorted(corp_lens,reverse=True)[:10]}')

top 10 token len of the corpus in training dataset: [10259, 9650, 8946, 8944, 8848, 8803, 8433, 8403, 8403, 7725]


In [9]:
## function to generate answer end using start index and answer
def add_end_index(row):
    strt = row['answer_start']
    end = strt+len(row['answer_text'])
    ans = row['context'][strt:end]
    assert(ans==row['answer_text'])
    return end
df['answer_end'] = df.apply(add_end_index,axis=1)
df.head()

Unnamed: 0,id,context,question,answer_text,answer_start,language,answer_end
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,56
1,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil,2368
2,29d154b56,சர் அலெக்ஸாண்டர் ஃபிளெமிங் (Sir Alexander Flem...,பென்சிலின் கண்டுபிடித்தவர் யார்?,சர் அலெக்ஸாண்டர் ஃபிளெமிங்,0,tamil,26
3,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,68,tamil,76
4,b29c82c22,சூரியக் குடும்பம் \nசூரியக் குடும்பம் (Solar S...,பூமியின் அருகில் உள்ள விண்மீன் எது?,சூரியனும்,585,tamil,594


# Preparing dataset

In [10]:
## random shuffle and splitting data set
## ratio is 85-15
df = df.sample(frac=1)

split_idx = int(0.85*df.shape[0])
train_df = df.iloc[:split_idx].reset_index(drop=True)
valid_df = df.iloc[split_idx:].reset_index(drop=True)

print(train_df.shape,valid_df.shape,df.shape)

(946, 7) (168, 7) (1114, 7)


In [11]:
## creating Dataset from pandas. This is hugging face dataset not pytorch
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset = Dataset.from_pandas(test_df)

# Initializing the model and tokenizer using pretrain weights. 

In [12]:
pretrain_path = 'ai4bharat/indic-bert'
# pretrain_path = '/kaggle/input/indicbert/indic-bert-v1'

tokenizer = AutoTokenizer.from_pretrained(pretrain_path)
model = AutoModelForQuestionAnswering.from_pretrained(pretrain_path)
model = model.to(device)
## data collator to prepare batch data
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertForQuestionAnswering: ['predictions.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight', 'predictions.decoder.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.bias', 'sop_classifier.classifier.weight', 'sop_classifier.classifier.bias', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly

In [13]:
all_vocab = tokenizer.get_vocab()
print(f'number of vocab our tokenizer have {len(all_vocab)}')
print(f'printing ... few **vocab** {list(all_vocab.keys())[:10]} and their **index** {list(all_vocab.values())[:10]}')

number of vocab our tokenizer have 200000
printing ... few **vocab** ['▁পুঁজিবাদ', 'ുന്തോറും', '▁spur', 'వులను', '▁পদক্ষেপের', '▁ઓફીસ', '▁பலத்த', '▁বাংলাদেশিদের', 'ിലാ', '▁பொறி'] and their **index** [158243, 160427, 62651, 173786, 118925, 86150, 27947, 48624, 96248, 70064]


In [14]:
## function to create token out of each question and context pair.
## context and question pair are concat together
def dataprep(data):
    data['question'] = [q.strip() for q in data['question']]
    data['context'] = [c.strip() for c in data['context']]
    
    data_tokenizer = tokenizer(data['context'],
                               data['question'],
                               truncation='only_first',
                               max_length=MAX_LENGTH)
    
    
    data_tokenizer["start_positions"] = [s for s in data['answer_start']]
    data_tokenizer["end_positions"] = [e for e in data['answer_end']]
    data_tokenizer['example_id'] = [example_id2idx[i] for i in data['id']]
    return data_tokenizer
    

In [15]:
## creating token using dataprep function
tokenized_train_ds = train_dataset.map(dataprep,batched=True,remove_columns=train_dataset.column_names)
tokenized_valid_ds = valid_dataset.map(dataprep,batched=True,remove_columns=valid_dataset.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [16]:
tokenized_train_ds.set_format("torch")
tokenized_valid_ds.set_format("torch")

In [17]:
tokenized_train_ds.data.to_pandas().head()

Unnamed: 0,attention_mask,end_positions,example_id,input_ids,start_positions,token_type_ids
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",57,534,"[2, 23269, 1134, 4761, 11611, 8430, 1134, 68, ...",44,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",124,396,"[2, 1209, 19080, 20, 68508, 154664, 216, 79460...",109,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",555,873,"[2, 3039, 9691, 5458, 25397, 1976, 1883, 130, ...",527,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1084,378,"[2, 1500, 10495, 1301, 6511, 1883, 1539, 494, ...",1075,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",8,725,"[2, 1883, 11339, 25272, 65990, 1546, 1134, 188...",5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [18]:
## Using pytorch DataLoader function 

train_dataloader = DataLoader(
    tokenized_train_ds, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_valid_ds, batch_size=BATCH_SIZE, collate_fn=data_collator
)

In [19]:
## learning rate decay scheduler

optimizer = AdamW(model.parameters(), lr=lr)

num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [20]:
from tqdm.auto import tqdm

def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def jacard_score_(data_dict):
    score = 0
    for data in data_dict:
        p,tp = list(data.values())[0]
        if p is not '' and tp is not '':
            score += jaccard(p,tp)
    return score
            
## extracting answer from logits
def postprocess(dataset,eid,slogits,elogits,spos,epos,idx2id,n_best_size=20,max_answer_length=30):
    predictions = {}
    for idx in range(len(slogits)):
        start_logits = slogits[idx]
        end_logits = elogits[idx]
        tstart = spos[idx]
        tend = epos[idx]
        ## getting from pandas dataframe using id
        data = dataset[dataset.id == idx2id[eid[idx].item()]]
        ## selecting top 20 logits
        start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
        end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
        valid_ans = []
        corp = data['context'].values[0]
        
        ## iterating over each index pair and discarding invalid ones
        for start_index in start_indexes:
            for end_index in end_indexes:
                
                if end_index < start_index or end_index-start_index+1 > max_answer_length:
                    continue
                valid_ans.append({'score':start_logits[start_index]+end_logits[end_index],
                 'text':corp[start_index:end_index]})
        # selecting best answer        
        if len(valid_ans) >0:
            best_answer = sorted(valid_ans, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": '', "score": 0.0}
        ## predicting answer with true answer
        
        predictions[data['id'].values[0]] = [best_answer['text'],corp[int(tstart):int(tend)]]        
    return [predictions]

## evaluation 
def model_eval(model,evaldataset_,evaldf,idx2id):
    model.eval()
    loss = 0
    preds = []
    n = len(evaldataset_)
    for batch in evaldataset_:
        batch_ = {k: v.to(device) for k, v in batch.items() if k != 'example_id'}
        with torch.no_grad():
            outputs = model(**batch_)
        loss+= outputs.loss.item()
        
        slogits = outputs.start_logits.cpu().detach().numpy()
        elogits = outputs.end_logits.cpu().detach().numpy()
        spos = batch['start_positions']
        epos = batch['end_positions']

        preds.extend(postprocess(evaldf,batch['example_id'],slogits,elogits,spos,epos,idx2id))
    jac_score = jacard_score_(preds)
    print(f'valid loss is {loss/n} and jaccard score is {jac_score/n}')

## training
def train(model,traindata,evaldata,progress_steps,epochs):
    progress_bar = tqdm(range(progress_steps))
    model.train()
    
    for epoch in range(epochs):
        epoch_loss = 0
        for batch in traindata:
            
            
            optimizer.zero_grad()
            
            batch_ = {k: v.to(device) for k, v in batch.items() if k != 'example_id'}
            outputs = model(**batch_)
            loss = outputs.loss
            loss.backward()
            
            optimizer.step()
            lr_scheduler.step()
            epoch_loss += loss.item()
            progress_bar.update(1)
        print(f'training loss for epoch {epoch} is {epoch_loss/len(traindata)}')
        model_eval(model,evaldata[0],evaldata[1],example_idx2id)

In [21]:
train(model,train_dataloader,[eval_dataloader,valid_df],num_training_steps,num_epochs)

  0%|          | 0/2380 [00:00<?, ?it/s]

training loss for epoch 0 is 5.948097802009903
valid loss is 5.945135411762056 and jaccard score is 0.08412698412698412


In [22]:
texample_id2idx = { eid:idx for idx,eid in enumerate(test_df['id'].values)}
texample_idx2id = { idx:eid for idx,eid in enumerate(test_df['id'].values)}

def testdataprep(data):
    
    data['question'] = [q.strip() for q in data['question']]
    data['context'] = [c.strip() for c in data['context']]
    
    data_tokenizer = tokenizer(data['context'],
                               data['question'],
                               truncation='only_first',
                               padding="max_length",
                               max_length=MAX_LENGTH)
    data_tokenizer['example_id'] = [texample_id2idx[i] for i in data['id']]
    return data_tokenizer


tokenized_test_ds = test_dataset.map(testdataprep,batched=True,remove_columns=test_dataset.column_names)
tokenized_test_ds.set_format("torch")
test_dataloader = DataLoader(tokenized_test_ds,collate_fn=data_collator)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [23]:
def predict(predset,idx2id):
    model.eval()
    preds = []
    for batch in predset:
        batch_ = {k: v.to(device) for k, v in batch.items() if k != 'example_id'}
        with torch.no_grad():
            outputs = model(**batch_)
        
        slogits = outputs.start_logits.cpu().detach().numpy()
        elogits = outputs.end_logits.cpu().detach().numpy()
        spos=epos =np.zeros(slogits.shape[0])
        preds.extend(postprocess(test_df,batch['example_id'],slogits,elogits,spos,epos,idx2id))
    return preds

predictions =predict(test_dataloader,texample_idx2id)  

In [24]:
def prep_sub(idx):
    for p in predictions:
        if idx in p:
            return p[idx][0]
test_df['PredictionString'] = test_df.id.apply(prep_sub)

In [25]:
test_df = test_df.drop(columns=['context','question','language'], axis=1)
test_df.to_csv('submission.csv', index=False)