In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!ls /content/drive/MyDrive/HASOC

best_xlmr_hi_model_0.1split_taska.pt  best_xlmr_hi_model_taskb_with_emoji.pt
best_xlmr_hi_model_taskb.pt


In [3]:
models_path = "/content/drive/MyDrive/HASOC"

In [4]:
!pip install wandb -qqq
import wandb

wandb.login()

[K     |████████████████████████████████| 1.7 MB 4.3 MB/s 
[K     |████████████████████████████████| 97 kB 5.8 MB/s 
[K     |████████████████████████████████| 170 kB 45.4 MB/s 
[K     |████████████████████████████████| 133 kB 44.1 MB/s 
[K     |████████████████████████████████| 63 kB 1.7 MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
wandb.init(project="xlmr-hi-new", config={
    "learning_rate": 1e-5,
    "architecture": "xlmr",
    "dataset": "all",
})
config = wandb.config

[34m[1mwandb[0m: Currently logged in as: [33mmallika2011[0m (use `wandb login --relogin` to force relogin)


In [6]:
%%capture
!pip install transformers
!pip install datasets
!pip install demoji
!pip install nltk
!pip install emoji
!pip install indic-nlp-library

In [7]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig, AutoModel, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from datasets import ClassLabel, Value
import torch
from tqdm.notebook import tqdm
import numpy as np
import seaborn as sns
import pickle
from collections import Counter
import sys, os
import demoji
import emoji
import string
import re

In [8]:
def remove_punctuation(text):
    return ''.join([c for c in text if c not in string.punctuation])

def remove_url(text):
    return re.sub(r'https?://\S+', ' ', text)

def remove_mention(text):
    return re.sub(r'@[A-Za-z0-9_]+', ' ', text)

def remove_hashtag(text):
    return re.sub(r'.*?(#\w+)|.+', ' ', text)  #TODO: check if using/removing the hashtag has any improvement

def remove_emoji(text):
    # return emoji.demojize(text)
    return demoji.replace(text, ' ')

def translate_to_hindi(text):                   #CHECK translation of sentences (since some are in english also)
    print(text)
    res = translator.translate(text).text
    print(res)
    return res

def normalize_hindi_script(text):
    from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
    factory=IndicNormalizerFactory()
    normalizer=factory.get_normalizer("hi",remove_nuktas=False)
    return normalizer.normalize(text)

def clean_row(text):
    return ' '.join(remove_punctuation(remove_url(remove_mention(remove_emoji(text)))).split()).lower()

def clean(df):
    df['clean_text'] = df.text.apply(clean_row)
    return df

In [9]:
test_df = pd.read_csv("hi_Hasoc2021_test_task1.csv")
test_df = clean(test_df)
print(test_df.columns)
test_df = test_df.drop(['text', '_id'], axis=1)
test_df.columns = ['id', 'text']

test_df.head()

Index(['_id', 'tweet_id', 'text', 'clean_text'], dtype='object')


Unnamed: 0,id,text
0,hi_hasoc_2021_5,गधा तू हैइसलिए एक ही बक रहा है।
1,hi_hasoc_2021_7,वोडाफोन ने एक कुत्ता पाला था बहुत फेमस हुआ फिर...
2,hi_hasoc_2021_12,1818 घंटे दीमक ने जाकर 70 साल के मज़बूत पेड़ क...
3,hi_hasoc_2021_13,हमारे ग्राम पंचायत सिधांव जिला फतेहपुर उ प्रमे...
4,hi_hasoc_2021_15,यह मुझे चैन क्यों नहीं पड़ता एक ही शख़्स था जह...


In [10]:
master_df = pd.read_csv("./hi_Hasoc2021_train.csv", index_col=0)
master_df = master_df.drop(['_id', 'tweet_id', 'task_2'], axis=1)
master_df = clean(master_df)
master_df = master_df.drop(['text'], axis=1)

master_df.head()

Unnamed: 0,task_1,clean_text
998,NOT,hindus dying hinduslivesmatter kindattnpmnaren...
4049,NOT,सब लोग इतने पैसे डोनेट ही कर रहे हैं फिर भी आम...
1757,NOT,शेरए सिवान शहाबुद्दीन साहब से रिश्ता क्या لا إ...
5175,HOF,आसमानी किताब के नाजायज औलाद है।
1825,NOT,इसे कहते हैं दोगला पंती जिस सपा की दम पर 0 से ...


In [11]:
from sklearn.model_selection import train_test_split

train_df, valid_df = train_test_split(master_df, test_size=0.01, stratify = master_df['task_1'])

train_df.columns = ['labels', 'text']
valid_df.columns = ['labels', 'text']

print(f"Train Shape: {train_df.shape}, Valid Shape: {valid_df.shape}")

train_df.to_csv('./train.csv',columns=['labels', 'text'], index = False)
valid_df.to_csv('./valid.csv',columns=['labels', 'text'], index = False)

display(train_df)
display(valid_df)


Train Shape: (4548, 2), Valid Shape: (46, 2)


Unnamed: 0,labels,text
5475,HOF,ऑक्सीजन की कमी हुई मोदी जी ने आपूर्ति की।रेमेड...
4526,HOF,एम डी बोले क्लासिकल तबला रेडियो पर जाकिर हुसैन...
1541,NOT,prime minister narendra modi ji 𝗣𝗿𝗼𝗺𝗶𝘀𝗲𝗱 ache ...
2019,NOT,मोदी देश छोड़ो resignpmmodi modikavaccinejumla...
3228,NOT,modikavaccinejumla modiresign indiafightscoron...
...,...,...
1285,NOT,आज कश्मीर बेहतर है बंगाल से वेस्ट बंगाल हल्दीय...
3447,NOT,सीवान खत्म हो गयी तेरी पहचान जिसके नाम से तू ज...
2203,NOT,भीक नहीं लिया दोस्त से एहसान लिया seems entire...
1732,NOT,बंगाल विजय करने के चक्कर में पूरा देश हारेंगे ...


Unnamed: 0,labels,text
3323,NOT,महिला दिवस पर अंदर का जानवर मर जाता हैजैसे 26ज...
269,NOT,lets face it 𝗖𝗼𝘃𝗶𝗱𝟭𝟵 day 𝟰𝟮𝟳 𝟯𝟵𝟮𝟰𝟴𝟴 fresh indi...
2984,NOT,पुरुष जब प्रेम में छले गए तो रोगी बने स्त्रियो...
1638,NOT,टीकाकरण अवश्य करायें vaccination covidvaccine ...
1528,NOT,मुंबई पोलिसांचा वर्सोवा येथील हॉटेलमध्ये सुरू ...
4946,NOT,सब याद रखा जायेगा justiceforshahabuddin
5823,HOF,so then according to your tweet i think people...
4334,NOT,bjp it cell ममताखूनीहै isupportmodiji राष्ट्रप...
3873,NOT,क्या से क्या हो गया कहा से कहा आ गये हम modika...
3058,NOT,ना जानें कितने चिरागों को मिल गई शोहरत एक आफता...


In [12]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
TEST_BATCH_SIZE = 8
LEARNING_RATE = 1e-05
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', truncation=True)

In [13]:
labels = ClassLabel(num_classes = 2, names = ['HOF', 'NOT'])

In [14]:
class HindiData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, isTest = False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.max_len = max_len
        self.isTest = isTest
        if not self.isTest:
            self.targets = self.data.labels

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        if not self.isTest:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'targets': torch.tensor(labels.str2int(self.targets[index]), dtype=torch.float)
            }
        else:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            }

In [15]:
train_data=train_df.reset_index(drop=True)
valid_data = valid_df.reset_index(drop=True)

print("TRAIN Dataset: {}".format(train_data.shape))
print("VALID Dataset: {}".format(valid_data.shape))

training_set = HindiData(train_data, tokenizer, MAX_LEN, False)
valid_set = HindiData(valid_data, tokenizer, MAX_LEN, False)

TRAIN Dataset: (4548, 2)
VALID Dataset: (46, 2)


In [16]:
test_data=test_df.reset_index(drop=True)
print("TEST Dataset: {}".format(test_data.shape))
testing_set = HindiData(test_data, tokenizer, MAX_LEN, True)

TEST Dataset: (1532, 2)


In [17]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
valid_loader = DataLoader(valid_set, **valid_params)

In [18]:
test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

testing_loader = DataLoader(testing_set, **test_params)

In [19]:
from torch import nn

class CustomXLMRModel(torch.nn.Module):
    def __init__(self,num_labels=2):
        super(CustomXLMRModel, self).__init__()
        self.num_labels = num_labels
        self.xlmr = AutoModel.from_pretrained("xlm-roberta-base", num_labels = num_labels)
        # self.xlmr = RobertaModel.from_pretrained("roberta-base")
        ### New layers:
        self.conv_layer = torch.nn.Conv1d(768, 1, kernel_size=3)
        
        # Fully-connected layer and Dropout
        self.fc = nn.Linear(12, 2)
        self.dropout = nn.Dropout(0.7)

    def forward(self, ids, mask, token_type_ids):
        outputs = self.xlmr(ids, attention_mask=mask, token_type_ids=token_type_ids, output_hidden_states=True)
        
        last_hidden_state, pooler_output, allhidden = outputs[0], outputs[1], outputs[2]
        
        # print(len(allhidden))
        # print(allhidden[0].size())
        # print(allhidden[1].size())

        feature_vec = []

        for i, hstate in enumerate(allhidden[1:]):
            
            # print("For hidden state:", i+1)
            # print("original shape", hstate.size())

            hstate = hstate.permute(0, 2, 1)
            # print("transposed shape", hstate.size())

            conv_out = F.relu(self.conv_layer(hstate))
            conv_out = conv_out.to(device)

            # print("shape after conv:", conv_out.size())
            
            max_out = F.max_pool1d(conv_out, kernel_size=conv_out.shape[2])
            # print("shape after max pool:", max_out.size())

            feature_vec.append(max_out)

        result = torch.cat(feature_vec, dim=1).squeeze(dim=2)
        result = result.to(device)
        # print("result shape:", result.size())

        logits = self.fc(self.dropout(result))
        logits = logits.to(device)

        return logits


In [20]:
torch.cuda.empty_cache()
# model = AutoModelForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels = 2)
model = CustomXLMRModel()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CustomXLMRModel(
  (xlmr): XLMRobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(

In [21]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
# loss_function = torch.nn.BCELoss()
# loss_function = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=1e-5)
# optimizer = torch.optim.Adadelta(model.parameters(),
#                                lr=2e-5,
#                                rho=0.95)

In [22]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [23]:
from sklearn.metrics import (accuracy_score, confusion_matrix,
                             precision_recall_fscore_support)

In [24]:
best_val_loss = 0
best_val_acc = 0

def train(epoch):

    #------------------------------------------------#
    #                TRAIN BLOCK                     #
    #------------------------------------------------#
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    y_pred = []
    y_true = []
    y_pred_val = []
    y_true_val = []
    model.train()


    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # big_idx = outputs.data
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        y_pred.extend(big_idx.tolist())
        y_true.extend(targets.cpu().numpy())
        
        if _%10==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            # print(f"Training Loss per 10 steps: {loss_step}")
            # print(f"Training Accuracy per 10 steps: {accu_step}")
            log_dict = {
                "train_loss_steps": loss_step,
                "train_acc_steps": accu_step,
            }
            wandb.log(log_dict)

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    #------------------------------------------------#
    #                VALID BLOCK                     #
    #------------------------------------------------#

    val_loss = 0
    n_correct_val = 0
    nb_val_steps = 0
    nb_val_examples = 0
    model.eval()

    for _,data in tqdm(enumerate(valid_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        vloss = loss_function(outputs, targets)
        val_loss += vloss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct_val += calcuate_accuracy(big_idx, targets)

        nb_val_steps += 1
        nb_val_examples+=targets.size(0)
        
        if _%10==0:
            val_loss_step = val_loss/nb_val_steps
            val_accu_step = (n_correct_val*100)/nb_val_examples 
            # print(f"Training Loss per 10 steps: {loss_step}")
            # print(f"Training Accuracy per 10 steps: {accu_step}")
            log_dict = {
                "val_loss_steps": val_loss_step,
                "val_acc_steps": val_accu_step,
            }
            wandb.log(log_dict)

        y_pred_val.extend(big_idx.tolist())
        y_true_val.extend(targets.cpu().numpy())


    #------------------------------------------------#
    #                LOG BLOCK                       #
    #------------------------------------------------#

    print(f'The Total Train Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    print(f'The Total Val Accuracy for Epoch {epoch}: {(n_correct_val*100)/nb_val_examples}')
    epoch_loss_val = tr_loss/nb_val_steps
    epoch_accu_val = (n_correct_val*100)/nb_val_examples
    print(f"Val Loss Epoch: {epoch_loss_val}")
    print(f"Val Accuracy Epoch: {epoch_accu_val}")

    log_dict = {
        "Epoch": epoch,
        "Train Loss": epoch_loss,
        "Train Acc": epoch_accu,
        "Valid Loss": epoch_loss_val,
        "Valid Acc": epoch_accu_val
    }
    wandb.log(log_dict)

    print("--------------------------------------------------")

    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    epoch_accu = accuracy_score(y_true, y_pred)
    print(f"Train Stats -- Accu: {epoch_accu}, Prec: {precision}, Recall: {recall}, F1: {f1}")

    precision, recall, f1, _ = precision_recall_fscore_support(y_true_val, y_pred_val, average='macro')
    epoch_accu = accuracy_score(y_true_val, y_pred_val)
    print(f"Val Stats   -- Accu: {epoch_accu}, Prec: {precision}, Recall: {recall}, F1: {f1}")

    print("\n")

    #------------------------------------------------#
    #                SAVE  BLOCK                     #
    #------------------------------------------------#

    global best_val_loss
    global best_val_acc

    if epoch == 0:
        torch.save(model, os.path.join(models_path, 'best_xlmr_hi_model_full_split_taska.pt'))
        best_val_loss = epoch_loss_val
        best_val_acc = epoch_accu_val

    elif epoch_accu_val > best_val_acc:
        best_val_acc = epoch_accu_val
        torch.save(model, os.path.join(models_path, 'best_xlmr_hi_model_full_split_taska.pt'))

    # elif epoch_loss_val < best_val_loss:
    #     best_val_loss = epoch_loss_val
    #     torch.save(model, os.path.join(models_path, 'best_xlmr_hi_model.pt'))

    print("Best loss so far", best_val_loss)
    print("Best Accu so far", best_val_acc)

    return 

In [None]:
EPOCHS = 10
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


0it [00:00, ?it/s]

The Total Train Accuracy for Epoch 0: 64.9736147757256
Training Loss Epoch: 0.6504276500121152
Training Accuracy Epoch: 64.9736147757256
The Total Val Accuracy for Epoch 0: 69.56521739130434
Val Loss Epoch: 30.841111071407795
Val Accuracy Epoch: 69.56521739130434
--------------------------------------------------
Train Stats -- Accu: 0.649736147757256, Prec: 0.49656032229965164, Recall: 0.49857704337179914, F1: 0.46319853906292896
Val Stats   -- Accu: 0.6956521739130435, Prec: 0.34782608695652173, Recall: 0.5, F1: 0.41025641025641024




  _warn_prf(average, modifier, msg_start, len(result))


Best loss so far 30.841111071407795
Best Accu so far 69.56521739130434


0it [00:00, ?it/s]

0it [00:00, ?it/s]

The Total Train Accuracy for Epoch 1: 69.10729991204926
Training Loss Epoch: 0.581579778494022
Training Accuracy Epoch: 69.10729991204926
The Total Val Accuracy for Epoch 1: 71.73913043478261
Val Loss Epoch: 27.576574496924877
Val Accuracy Epoch: 71.73913043478261
--------------------------------------------------
Train Stats -- Accu: 0.6910729991204925, Prec: 0.6269522133374144, Recall: 0.5149464499394264, F1: 0.45000522026550965
Val Stats   -- Accu: 0.717391304347826, Prec: 0.8555555555555556, Recall: 0.5357142857142857, F1: 0.4822510822510822


Best loss so far 30.841111071407795
Best Accu so far 71.73913043478261


0it [00:00, ?it/s]

0it [00:00, ?it/s]

The Total Train Accuracy for Epoch 2: 70.51451187335093
Training Loss Epoch: 0.5731475195849088
Training Accuracy Epoch: 70.51451187335093
The Total Val Accuracy for Epoch 2: 71.73913043478261
Val Loss Epoch: 27.176744886984427
Val Accuracy Epoch: 71.73913043478261
--------------------------------------------------
Train Stats -- Accu: 0.7051451187335093, Prec: 0.6942594740995993, Recall: 0.540578587948652, F1: 0.4995627746402563
Val Stats   -- Accu: 0.717391304347826, Prec: 0.8555555555555556, Recall: 0.5357142857142857, F1: 0.4822510822510822


Best loss so far 30.841111071407795
Best Accu so far 71.73913043478261


0it [00:00, ?it/s]

0it [00:00, ?it/s]

The Total Train Accuracy for Epoch 3: 71.50395778364116
Training Loss Epoch: 0.5695518662462336
Training Accuracy Epoch: 71.50395778364116
The Total Val Accuracy for Epoch 3: 76.08695652173913
Val Loss Epoch: 27.006250991175573
Val Accuracy Epoch: 76.08695652173913
--------------------------------------------------
Train Stats -- Accu: 0.7150395778364116, Prec: 0.6875159699683047, Recall: 0.5699143996318962, F1: 0.5545289071363855
Val Stats   -- Accu: 0.7608695652173914, Prec: 0.7417582417582418, Recall: 0.6473214285714286, F1: 0.6606304493628438


Best loss so far 30.841111071407795
Best Accu so far 76.08695652173913


0it [00:00, ?it/s]

0it [00:00, ?it/s]

The Total Train Accuracy for Epoch 4: 73.54881266490766
Training Loss Epoch: 0.5440398510709169
Training Accuracy Epoch: 73.54881266490766
The Total Val Accuracy for Epoch 4: 71.73913043478261
Val Loss Epoch: 25.796556271612644
Val Accuracy Epoch: 71.73913043478261
--------------------------------------------------
Train Stats -- Accu: 0.7354881266490765, Prec: 0.7192159414798096, Recall: 0.6078832202603078, F1: 0.6095431087360745
Val Stats   -- Accu: 0.717391304347826, Prec: 0.8555555555555556, Recall: 0.5357142857142857, F1: 0.4822510822510822


Best loss so far 30.841111071407795
Best Accu so far 76.08695652173913


0it [00:00, ?it/s]

0it [00:00, ?it/s]

The Total Train Accuracy for Epoch 5: 73.30694810905892
Training Loss Epoch: 0.5234886474108654
Training Accuracy Epoch: 73.30694810905892
The Total Val Accuracy for Epoch 5: 76.08695652173913
Val Loss Epoch: 24.822086698065203
Val Accuracy Epoch: 76.08695652173913
--------------------------------------------------
Train Stats -- Accu: 0.7330694810905892, Prec: 0.7015759493596043, Recall: 0.6151760418968161, F1: 0.6203698890662838
Val Stats   -- Accu: 0.7608695652173914, Prec: 0.7252252252252251, Recall: 0.6674107142857143, F1: 0.681159420289855


Best loss so far 30.841111071407795
Best Accu so far 76.08695652173913


0it [00:00, ?it/s]

0it [00:00, ?it/s]

The Total Train Accuracy for Epoch 6: 76.12137203166228
Training Loss Epoch: 0.5041106013558242
Training Accuracy Epoch: 76.12137203166228
The Total Val Accuracy for Epoch 6: 78.26086956521739
Val Loss Epoch: 23.903244347621996
Val Accuracy Epoch: 78.26086956521739
--------------------------------------------------
Train Stats -- Accu: 0.7612137203166227, Prec: 0.74138721061847, Recall: 0.6612410533122255, F1: 0.6752433343432813
Val Stats   -- Accu: 0.782608695652174, Prec: 0.7697368421052632, Recall: 0.6830357142857143, F1: 0.7012987012987013


Best loss so far 30.841111071407795
Best Accu so far 78.26086956521739


0it [00:00, ?it/s]

0it [00:00, ?it/s]

The Total Train Accuracy for Epoch 7: 77.3306948109059
Training Loss Epoch: 0.4904322481872956
Training Accuracy Epoch: 77.3306948109059
The Total Val Accuracy for Epoch 7: 84.78260869565217
Val Loss Epoch: 23.25466243488093
Val Accuracy Epoch: 84.78260869565217
--------------------------------------------------
Train Stats -- Accu: 0.773306948109059, Prec: 0.7553816790028453, Recall: 0.681776290407475, F1: 0.6978311645838401
Val Stats   -- Accu: 0.8478260869565217, Prec: 0.8633633633633633, Recall: 0.7700892857142857, F1: 0.7971014492753623


Best loss so far 30.841111071407795
Best Accu so far 84.78260869565217


0it [00:00, ?it/s]

0it [00:00, ?it/s]

The Total Train Accuracy for Epoch 8: 77.88038698328936
Training Loss Epoch: 0.4735681692200511
Training Accuracy Epoch: 77.88038698328936
The Total Val Accuracy for Epoch 8: 86.95652173913044
Val Loss Epoch: 22.455024023850758
Val Accuracy Epoch: 86.95652173913044
--------------------------------------------------
Train Stats -- Accu: 0.7788038698328936, Prec: 0.7663779773201631, Recall: 0.6863488730197018, F1: 0.7035009953962922
Val Stats   -- Accu: 0.8695652173913043, Prec: 0.857843137254902, Recall: 0.8258928571428572, F1: 0.8391608391608392


Best loss so far 30.841111071407795
Best Accu so far 86.95652173913044


0it [00:00, ?it/s]

In [None]:
EPOCHS = 5
for epoch in range(EPOCHS):
    train(epoch)

# Run model on test set and prepare submission

In [18]:
BEST_MODEL_PATH = os.path.join(models_path, "best_xlmr_hi_model_0.1split_taska.pt")

In [19]:
testing_model = torch.load(BEST_MODEL_PATH)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
testing_model.to(device)
testing_model.eval()

CustomXLMRModel(
  (xlmr): XLMRobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(

In [20]:
with torch.no_grad():

    preds = []

    for _,data in tqdm(enumerate(testing_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)

        outputs = testing_model(ids, mask, token_type_ids)
        big_val, big_idx = torch.max(outputs.data, dim=1)
        preds += big_idx.cpu().tolist()

0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


In [21]:
print(len(preds))

1532


In [22]:
preds_labels = [labels.int2str(p) for p in preds]
print(len(preds_labels))

1532


In [23]:
test_df_submit = test_df.copy()
test_df_submit.head()

Unnamed: 0,id,text
0,hi_hasoc_2021_5,गधा तू हैइसलिए एक ही बक रहा है।
1,hi_hasoc_2021_7,वोडाफोन ने एक कुत्ता पाला था बहुत फेमस हुआ फिर...
2,hi_hasoc_2021_12,1818 घंटे दीमक ने जाकर 70 साल के मज़बूत पेड़ क...
3,hi_hasoc_2021_13,हमारे ग्राम पंचायत सिधांव जिला फतेहपुर उ प्रमे...
4,hi_hasoc_2021_15,यह मुझे चैन क्यों नहीं पड़ता एक ही शख़्स था जह...


In [24]:
test_df_submit['label'] = preds_labels
test_df_submit.drop(['text'], axis=1, inplace=True)
test_df_submit.head()

Unnamed: 0,id,label
0,hi_hasoc_2021_5,HOF
1,hi_hasoc_2021_7,HOF
2,hi_hasoc_2021_12,NOT
3,hi_hasoc_2021_13,NOT
4,hi_hasoc_2021_15,NOT


In [25]:
test_df_submit.label.value_counts()

NOT    1125
HOF     407
Name: label, dtype: int64

In [26]:
test_df_submit.to_csv("submission_without_emojis_29aug.csv", index=False)