In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!ls /content/drive/MyDrive/HASOC

best_xlmr_hi_model_0.1split_taska.pt
best_xlmr_hi_model_full_split_taska.pt
best_xlmr_hi_model_more_features_taska.pt
best_xlmr_hi_model_taskb.pt
best_xlmr_hi_model_taskb_with_emoji.pt
hi_test_pre_processed.csv
hi_test_pre_processed_nonorm.csv
hi_test_pre_processed_nonorm_tok.csv
hi_train_pre_processed.csv
hi_train_pre_processed_nonorm.csv
hi_train_pre_processed_nonorm_tok.csv


In [3]:
models_path = "/content/drive/MyDrive/HASOC"

In [4]:
!pip install wandb -qqq
import wandb

wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mmallika2011[0m (use `wandb login --relogin` to force relogin)


True

In [5]:
wandb.init(project="xlmr-hi-new", config={
    "learning_rate": 1e-5,
    "architecture": "xlmr",
    "dataset": "all",
})
config = wandb.config

In [6]:
%%capture
!pip install transformers
!pip install datasets
!pip install demoji
!pip install nltk
!pip install emoji
!pip install indic-nlp-library

In [7]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig, AutoModel, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from datasets import ClassLabel, Value
import torch
from tqdm.notebook import tqdm
import numpy as np
import seaborn as sns
import pickle
from collections import Counter
import sys, os
import demoji
import emoji
import string
import re

In [8]:
def remove_punctuation(text):
    return ''.join([c for c in text if c not in string.punctuation])

def remove_url(text):
    return re.sub(r'https?://\S+', ' ', text)

def remove_mention(text):
    return re.sub(r'@[A-Za-z0-9_]+', ' ', text)

def remove_hashtag(text):
    return re.sub(r'.*?(#\w+)|.+', ' ', text)  #TODO: check if using/removing the hashtag has any improvement

def remove_emoji(text):
    # return emoji.demojize(text)
    return demoji.replace(text, ' ')

def translate_to_hindi(text):                   #CHECK translation of sentences (since some are in english also)
    print(text)
    res = translator.translate(text).text
    print(res)
    return res

def normalize_hindi_script(text):
    from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
    factory=IndicNormalizerFactory()
    normalizer=factory.get_normalizer("hi",remove_nuktas=False)
    return normalizer.normalize(text)

def clean_row(text):
    return ' '.join(remove_punctuation(remove_url(remove_mention(remove_emoji(text)))).split()).lower()

def clean(df):
    df['clean_text'] = df.text.apply(clean_row)
    return df

In [9]:
test_df = pd.read_csv(os.path.join(models_path, "hi_test_pre_processed.csv"))
print(test_df.columns)
test_df.drop(['Unnamed: 0', '_id'], axis=1, inplace=True)
test_df.columns = ['id', 'text', 'frac_profane', 'polarity']
test_df.head()

Index(['Unnamed: 0', '_id', 'tweet_id', 'text', 'frac_profane', 'polarity'], dtype='object')


Unnamed: 0,id,text,frac_profane,polarity
0,hi_hasoc_2021_5,गधा तू हैइसलिए एक ही बक रहा है।,0.125,-0.2543
1,hi_hasoc_2021_7,वोडाफोन ने एक कुत्ता पाला था बहुत फेमस हुआ फिर...,0.066667,0.063371
2,hi_hasoc_2021_12,1818 घंटे दीमक ने जाकर 70 साल के मज़बूत पेड़ क...,0.0,-0.076036
3,hi_hasoc_2021_13,हमारे ग्राम पंचायत सिधांव जिला फतेहपुर उ प्रमे...,0.0,-0.738464
4,hi_hasoc_2021_15,यह मुझे चैन क्यों नहीं पड़ता एक ही शख़्स था जह...,0.0,-0.160516


In [10]:
master_df = pd.read_csv(os.path.join(models_path, "hi_train_pre_processed.csv"))
master_df.drop(['Unnamed: 0', 'Unnamed: 0.1', '_id', 'task_2', 'tweet_id'], axis=1, inplace=True)
master_df.head()

Unnamed: 0,text,task_1,frac_profane,polarity
0,hindus dying hinduslivesmatter kindattnpmnaren...,NOT,0.0,-0.530294
1,सब लोग इतने पैसे डोनेट ही कर रहे हैं फिर भी आम...,NOT,0.0,-0.468993
2,शेरए सिवान शहाबुद्दीन साहब से रिश्ता क्या لا إ...,NOT,0.0,0.170181
3,आसमानी किताब के नाजायज औलाद है।,HOF,0.0,-0.313701
4,इसे कहते हैं दोगला पंती जिस सपा की दम पर 0 से ...,NOT,0.0,-0.045605


In [11]:
from sklearn.model_selection import train_test_split

train_df, valid_df = train_test_split(master_df, test_size=0.07, stratify = master_df['task_1'])

train_df.columns = ['text', 'labels', 'frac_profane', 'polarity' ]
valid_df.columns = ['text', 'labels', 'frac_profane', 'polarity' ]

print(f"Train Shape: {train_df.shape}, Valid Shape: {valid_df.shape}")

train_df.to_csv('./train.csv',columns=['text', 'labels', 'frac_profane', 'polarity' ], index = False)
valid_df.to_csv('./valid.csv',columns=['text', 'labels', 'frac_profane', 'polarity' ], index = False)

display(train_df)
display(valid_df)


Train Shape: (4272, 4), Valid Shape: (322, 4)


Unnamed: 0,text,labels,frac_profane,polarity
1942,सरकारी स्कूल से पढ़कर कलाम और मोदी देश के राष्...,HOF,0.018519,-0.561061
1480,save bengali hindus from massacre by tmc lsiam...,NOT,0.000000,-0.508290
3131,आप चुप रहो,NOT,0.000000,-0.463395
3103,भाजपा का देशव्यापी धरना देख के तृणमूल के गुंडो...,NOT,0.000000,-0.803204
1303,tmc कि जीत पर वही लोग हंस रहे हैं जो पृथ्वीराज...,NOT,0.000000,-0.004308
...,...,...,...,...
2970,an inquiry also to be conducted on urbannaxal ...,NOT,0.000000,-0.423993
292,यह भेंस चोर दुसरो को अपना बाप बनाकर अपनी नाजाय...,HOF,0.000000,-0.590702
2767,modikavaccinejumla modiresign indiafightscoron...,NOT,0.000000,-0.304071
3505,बहनों भाइयों अपने देशी प्लेटफॉर्म कूkoo को डाउ...,NOT,0.000000,-0.183813


Unnamed: 0,text,labels,frac_profane,polarity
2151,तुम खुद जामुनार के नाजायज औलाद हो। मलेक्ष की स...,HOF,0.0,-0.236538
2827,दोस्तों आओ हम सब सर्वसम्मति से महान अर्थशास्त्...,NOT,0.0,0.206256
1187,एक जानवर को भी पेड़ से कितना लगाव हैं और एक मा...,NOT,0.0,-0.360580
578,देख को बंगाल की लोगों खुले आम धमकी मिल रही है ...,NOT,0.0,-0.691467
4197,जो लोग मुसलमानों के मसीहा बने बैठे थे एक एक कर...,HOF,0.0,-0.423814
...,...,...,...,...
1808,आज़म खान साहब का इलाज aiims में कराया जाए... श...,NOT,0.0,-0.320064
4063,धुंदमय आकाश का मौसम है मेरे देश मे और मौत का स...,NOT,0.0,-0.069700
4249,पहले दिया जलाया गया तभी अब लोग जल रहे है उसके ...,NOT,0.0,-0.368207
594,शहाबुद्दीन साहब आज़म खान जी के जूतों के नोख के...,HOF,0.0,-0.411545


In [12]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
TEST_BATCH_SIZE = 4
LEARNING_RATE = 2e-05
# tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large', truncation=True)
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', truncation=True)

In [13]:
labels = ClassLabel(num_classes = 2, names = ['HOF', 'NOT'])

In [14]:
class HindiData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, isTest = False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.features = dataframe[['frac_profane', 'polarity']].values
        self.max_len = max_len
        self.isTest = isTest
        if not self.isTest:
            self.targets = self.data.labels
        print(self.features)

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        if not self.isTest:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'targets': torch.tensor(labels.str2int(self.targets[index]), dtype=torch.float),
                'features': torch.tensor(self.features[index],dtype=torch.float)
            }
        else:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'features': torch.tensor(self.features[index],dtype=torch.float)
            }

In [15]:
train_data=train_df.reset_index(drop=True)
valid_data = valid_df.reset_index(drop=True)

print("TRAIN Dataset: {}".format(train_data.shape))
print("VALID Dataset: {}".format(valid_data.shape))

training_set = HindiData(train_data, tokenizer, MAX_LEN, False)
valid_set = HindiData(valid_data, tokenizer, MAX_LEN, False)

TRAIN Dataset: (4272, 4)
VALID Dataset: (322, 4)
[[ 0.01851852 -0.56106149]
 [ 0.         -0.50829006]
 [ 0.         -0.46339522]
 ...
 [ 0.         -0.30407143]
 [ 0.         -0.18381339]
 [ 0.         -0.53948902]]
[[ 0.00000000e+00 -2.36538097e-01]
 [ 0.00000000e+00  2.06256300e-01]
 [ 0.00000000e+00 -3.60580161e-01]
 [ 0.00000000e+00 -6.91467479e-01]
 [ 0.00000000e+00 -4.23813656e-01]
 [ 0.00000000e+00 -9.08162247e-01]
 [ 0.00000000e+00 -1.13751709e-01]
 [ 0.00000000e+00  2.06256300e-01]
 [ 0.00000000e+00 -2.67473042e-01]
 [ 0.00000000e+00 -1.18747652e-02]
 [ 0.00000000e+00 -3.77899840e-01]
 [ 0.00000000e+00  9.32231247e-02]
 [ 0.00000000e+00  2.28865042e-01]
 [ 0.00000000e+00 -7.54064046e-01]
 [ 1.00000000e-01 -7.48248581e-01]
 [ 0.00000000e+00 -5.67227602e-03]
 [ 0.00000000e+00 -1.38341427e-01]
 [ 0.00000000e+00 -4.69244316e-01]
 [ 0.00000000e+00  1.73982888e-01]
 [ 0.00000000e+00  4.61893380e-02]
 [ 0.00000000e+00  5.93748093e-02]
 [ 0.00000000e+00  2.52247632e-01]
 [ 0.00000000

In [16]:
test_data=test_df.reset_index(drop=True)
print("TEST Dataset: {}".format(test_data.shape))
testing_set = HindiData(test_data, tokenizer, MAX_LEN, True)

TEST Dataset: (1532, 4)
[[ 0.125      -0.25430013]
 [ 0.06666667  0.06337073]
 [ 0.         -0.07603595]
 ...
 [ 0.         -0.34955376]
 [ 0.         -0.17074753]
 [ 0.07142857 -0.55607095]]


In [17]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
valid_loader = DataLoader(valid_set, **valid_params)

In [18]:
test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

testing_loader = DataLoader(testing_set, **test_params)

In [19]:
from torch import nn

class CustomXLMRModel(torch.nn.Module):
    def __init__(self,num_labels=2):
        super(CustomXLMRModel, self).__init__()
        self.num_labels = num_labels
        self.xlmr = AutoModel.from_pretrained("xlm-roberta-base", num_labels = num_labels)
        # self.xlmr = AutoModel.from_pretrained("xlm-roberta-large", num_labels = num_labels)
        # self.xlmr = RobertaModel.from_pretrained("roberta-base")
        ### New layers:
        self.conv_layer = torch.nn.Conv1d(768, 1, kernel_size=3)
        
        # Fully-connected layer and Dropout
        self.fc = nn.Linear(14, 2)
        self.dropout = nn.Dropout(0.5)

    def forward(self, ids, mask, token_type_ids, features):
        outputs = self.xlmr(ids, attention_mask=mask, token_type_ids=token_type_ids, output_hidden_states=True)
        
        last_hidden_state, pooler_output, allhidden = outputs[0], outputs[1], outputs[2]
        
        # print(len(allhidden))
        # print(allhidden[0].size())
        # print(allhidden[1].size())

        feature_vec = []

        for i, hstate in enumerate(allhidden[1:]):
            
            # print("For hidden state:", i+1)
            # print("original shape", hstate.size())

            hstate = hstate.permute(0, 2, 1)
            # print("transposed shape", hstate.size())

            conv_out = F.relu(self.conv_layer(hstate))
            conv_out = conv_out.to(device)

            # print("shape after conv:", conv_out.size())
            
            max_out = F.max_pool1d(conv_out, kernel_size=conv_out.shape[2])
            # print("shape after max pool:", max_out.size())

            feature_vec.append(max_out)

        result = torch.cat(feature_vec, dim=1).squeeze(dim=2)
        #print("result shape:", result.size())
        #print("feature shape:", features.size())
        result = torch.cat((result,features),1)
        result = result.to(device)
        logits = self.fc(self.dropout(result))
        logits = logits.to(device)

        return logits

In [20]:
torch.cuda.empty_cache()
# model = AutoModelForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels = 2)
model = CustomXLMRModel()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CustomXLMRModel(
  (xlmr): XLMRobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(

In [21]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
# loss_function = torch.nn.BCELoss()
# loss_function = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=1e-5)
# optimizer = torch.optim.Adadelta(model.parameters(),
#                                lr=2e-3,
#                                rho=0.95)

In [22]:
device

'cuda'

In [23]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [24]:
from sklearn.metrics import (accuracy_score, confusion_matrix,
                             precision_recall_fscore_support)

In [25]:
best_val_loss = 0
best_val_acc = 0
best_val_f1 = 0

def train(epoch):

    #------------------------------------------------#
    #                TRAIN BLOCK                     #
    #------------------------------------------------#
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    y_pred = []
    y_true = []
    y_pred_val = []
    y_true_val = []
    model.train()


    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        features = data['features'].to(device, dtype = torch.float32)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids, features)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # big_idx = outputs.data
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        y_pred.extend(big_idx.tolist())
        y_true.extend(targets.cpu().numpy())
        
        if _%10==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            # print(f"Training Loss per 10 steps: {loss_step}")
            # print(f"Training Accuracy per 10 steps: {accu_step}")
            log_dict = {
                "train_loss_steps": loss_step,
                "train_acc_steps": accu_step,
            }
            wandb.log(log_dict)

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    #------------------------------------------------#
    #                VALID BLOCK                     #
    #------------------------------------------------#

    val_loss = 0
    n_correct_val = 0
    nb_val_steps = 0
    nb_val_examples = 0
    model.eval()

    for _,data in tqdm(enumerate(valid_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        features = data['features'].to(device, dtype = torch.float32)

        outputs = model(ids, mask, token_type_ids, features)
        vloss = loss_function(outputs, targets)
        val_loss += vloss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct_val += calcuate_accuracy(big_idx, targets)

        nb_val_steps += 1
        nb_val_examples+=targets.size(0)
        
        if _%10==0:
            val_loss_step = val_loss/nb_val_steps
            val_accu_step = (n_correct_val*100)/nb_val_examples 
            # print(f"Training Loss per 10 steps: {loss_step}")
            # print(f"Training Accuracy per 10 steps: {accu_step}")
            log_dict = {
                "val_loss_steps": val_loss_step,
                "val_acc_steps": val_accu_step,
            }
            wandb.log(log_dict)

        y_pred_val.extend(big_idx.tolist())
        y_true_val.extend(targets.cpu().numpy())


    #------------------------------------------------#
    #                LOG BLOCK                       #
    #------------------------------------------------#

    print(f'The Total Train Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    print(f'The Total Val Accuracy for Epoch {epoch}: {(n_correct_val*100)/nb_val_examples}')
    epoch_loss_val = tr_loss/nb_val_steps
    epoch_accu_val = (n_correct_val*100)/nb_val_examples
    print(f"Val Loss Epoch: {epoch_loss_val}")
    print(f"Val Accuracy Epoch: {epoch_accu_val}")

    log_dict = {
        "Epoch": epoch,
        "Train Loss": epoch_loss,
        "Train Acc": epoch_accu,
        "Valid Loss": epoch_loss_val,
        "Valid Acc": epoch_accu_val
    }
    wandb.log(log_dict)

    print("--------------------------------------------------")

    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', labels=np.unique(y_pred))
    epoch_accu = accuracy_score(y_true, y_pred)
    print(f"Train Stats -- Accu: {epoch_accu}, Prec: {precision}, Recall: {recall}, F1: {f1}")

    precision, recall, val_f1, _ = precision_recall_fscore_support(y_true_val, y_pred_val, average='macro', labels=np.unique(y_pred_val))
    epoch_accu = accuracy_score(y_true_val, y_pred_val)
    print(f"Val Stats   -- Accu: {epoch_accu}, Prec: {precision}, Recall: {recall}, F1: {val_f1}")

    print("\n")

    #------------------------------------------------#
    #                SAVE  BLOCK                     #
    #------------------------------------------------#

    global best_val_loss
    global best_val_acc
    global best_val_f1

    if epoch == 0:
        torch.save(model, os.path.join(models_path, 'best_xlmr_hi_model_more_features_taska.pt'))
        best_val_loss = epoch_loss_val
        best_val_acc = epoch_accu_val
        best_val_f1 = val_f1

    elif val_f1 >= best_val_f1:
        best_val_f1 = val_f1
        torch.save(model, os.path.join(models_path, 'best_xlmr_hi_model_more_features_taska.pt'))

    # elif epoch_accu_val > best_val_acc:
    #     best_val_acc = epoch_accu_val
    #     torch.save(model, os.path.join(models_path, 'best_xlmr_hi_model_more_features_taska.pt'))

    # elif epoch_loss_val < best_val_loss:
    #     best_val_loss = epoch_loss_val
    #     torch.save(model, os.path.join(models_path, 'best_xlmr_hi_model_more_features_taska.pt'))

    print("Best f1 so far", best_val_f1)
    print("Best loss so far", best_val_loss)
    # print("Best Accu so far", best_val_acc)

    return 

In [26]:
EPOCHS = 15
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


KeyboardInterrupt: ignored

# Run model on test set and prepare submission

In [20]:
BEST_MODEL_PATH_1 = os.path.join(models_path, "best_xlmr_hi_model_more_features_taska.pt")
BEST_MODEL_PATH_2 = os.path.join(models_path, "best_xlmr_hi_model_0.1split_taska.pt")
BEST_MODEL_PATH_3 = os.path.join(models_path, "best_xlmr_hi_model_full_split_taska.pt")

In [21]:
testing_model1 = torch.load(BEST_MODEL_PATH_1)
testing_model2 = torch.load(BEST_MODEL_PATH_2)
testing_model3 = torch.load(BEST_MODEL_PATH_3)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
testing_model1.to(device)
testing_model2.to(device)
testing_model3.to(device)

testing_model1.eval()
testing_model2.eval()
testing_model3.eval()

CustomXLMRModel(
  (xlmr): XLMRobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(

In [22]:
device

'cuda'

In [36]:
with torch.no_grad():

    preds1 = []
    preds11 = []
    preds2 = []
    preds22 = []
    preds3 = []
    preds33 = []
    og_preds = []

    for _,data in tqdm(enumerate(testing_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        features = data['features'].to(device, dtype = torch.float32)
        
        outputs1 = testing_model1(ids, mask, token_type_ids, features)
        outputs2 = testing_model2(ids, mask, token_type_ids, features)
        outputs3 = testing_model3(ids, mask, token_type_ids, features)
        
        big_val1, big_idx1 = torch.max(outputs1.data, dim=1)
        big_val1 = big_val1.cpu().tolist()
        big_idx1 = big_idx1.cpu().tolist()
        # preds1 += big_idx1.cpu().tolist()
        # preds11 += big_val1.cpu().tolist()

        big_val2, big_idx2 = torch.max(outputs2.data, dim=1)
        big_val2 = big_val2.cpu().tolist()
        big_idx2 = big_idx2.cpu().tolist()
        # preds2 += big_idx2.cpu().tolist()
        # preds22 += big_val2.cpu().tolist()

        big_val3, big_idx3 = torch.max(outputs3.data, dim=1)
        big_val3 = big_val3.cpu().tolist()
        big_idx3 = big_idx3.cpu().tolist()
        # preds3 += big_idx3.cpu().tolist()
        # preds33 += big_val3.cpu().tolist()

        for i in range(len(big_idx1)):

            if big_val1[i] > big_val2[i] and big_val1[i] > big_val3[i]:
                og_preds.append(big_idx1[i])

            elif big_val2[i] > big_val3[i] and big_val2[i] > big_val1[i]:
                og_preds.append(big_idx2[i])

            elif big_val3[i] > big_val2[i] and big_val3[i] > big_val1[i]:
                og_preds.append(big_idx3[i])

        

0it [00:00, ?it/s]



In [37]:
print(len(preds1))
print(len(preds11))
print(len(preds2))
print(len(preds22))
print(len(preds3))
print(len(preds33))
print(len(og_preds))

0
0
0
0
0
0
1532


In [38]:
preds_labels1 = [labels.int2str(p) for p in preds1]
print(len(preds_labels1))

preds_labels2 = [labels.int2str(p) for p in preds2]
print(len(preds_labels2))

preds_labels3 = [labels.int2str(p) for p in preds3]
print(len(preds_labels3))

og_preds_labels = [labels.int2str(p) for p in og_preds]
print(len(og_preds_labels))

0
0
0
1532


In [39]:
test_df_submit = test_df.copy()
test_df_submit.head()

Unnamed: 0,id,text,frac_profane,polarity
0,hi_hasoc_2021_5,गधा तू हैइसलिए एक ही बक रहा है।,0.125,-0.2543
1,hi_hasoc_2021_7,वोडाफोन ने एक कुत्ता पाला था बहुत फेमस हुआ फिर...,0.066667,0.063371
2,hi_hasoc_2021_12,1818 घंटे दीमक ने जाकर 70 साल के मज़बूत पेड़ क...,0.0,-0.076036
3,hi_hasoc_2021_13,हमारे ग्राम पंचायत सिधांव जिला फतेहपुर उ प्रमे...,0.0,-0.738464
4,hi_hasoc_2021_15,यह मुझे चैन क्यों नहीं पड़ता एक ही शख़्स था जह...,0.0,-0.160516


In [40]:
# test_df_submit['label1'] = preds_labels1
# test_df_submit['label2'] = preds_labels2
# test_df_submit['label3'] = preds_labels3
# test_df_submit['label11'] = preds11
# test_df_submit['label22'] = preds22
# test_df_submit['label33'] = preds33
test_df_submit['label'] = og_preds_labels
test_df_submit.drop(['text', 'frac_profane', 'polarity'], axis=1, inplace=True)
test_df_submit.head()

Unnamed: 0,id,label
0,hi_hasoc_2021_5,HOF
1,hi_hasoc_2021_7,HOF
2,hi_hasoc_2021_12,NOT
3,hi_hasoc_2021_13,NOT
4,hi_hasoc_2021_15,NOT


In [41]:
test_df_submit.label.value_counts()

NOT    1157
HOF     375
Name: label, dtype: int64

In [35]:
test_df_submit.to_csv("submission_with_extra_features30Aug.csv", index=False)

In [36]:
test_df_submit.head()

Unnamed: 0,id,label
0,hi_hasoc_2021_5,HOF
1,hi_hasoc_2021_7,HOF
2,hi_hasoc_2021_12,NOT
3,hi_hasoc_2021_13,HOF
4,hi_hasoc_2021_15,HOF


In [37]:
old_df = pd.read_csv("submission_without_emojis__new_29aug.csv")

In [39]:
old_df.label.value_counts()

NOT    996
HOF    536
Name: label, dtype: int64

In [40]:
test_df_submit[test_df_submit['id']=='hi_hasoc_2021_6047']

Unnamed: 0,id,label
1512,hi_hasoc_2021_6047,HOF
