In [None]:
import gc
import sys
# The sys.path line below is only needed when importing my external framework from ./src.
# In this notebook all helper functions are defined inline instead, so it stays disabled.
# sys.path.append("./src")
import json
import math
import os
from abc import ABC, abstractmethod
from collections import OrderedDict
from random import randint, sample
from typing import Iterable, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from pandas import DataFrame

from matplotlib.figure import Figure
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.utils import column_or_1d
from torch import Tensor, nn
from torch.nn import functional as F
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader, IterableDataset
from tqdm.auto import tqdm
import transformers
import datasets

In [None]:
class Config:
    """Run-level settings shared by the cells of this notebook."""

    def __init__(self):
        # Seed handed to set_random_seed and to train_test_split; index into
        # AVAILABLE_PRETRAINED_MODELS selecting the backbone.
        self.SEED, self.MODEL_INDEX = 1996, 0
        # DataLoader batch size; number of passes over the training set.
        self.BS, self.EPOCH = 8, 10


# Single shared configuration instance used by the rest of the notebook.
config = Config()

In [None]:
def replacer(sentence):
    """Swap the XML-style entity tags in ``sentence`` for single characters.

    ``<e1>``/``</e1>`` become ``[``/``]`` and ``<e2>``/``</e2>`` become
    ``{``/``}``. Text without any of these tags is returned unchanged.
    """
    tag_to_char = (
        ("<e1>", "["),
        ("</e1>", "]"),
        ("<e2>", "{"),
        ("</e2>", "}"),
    )
    for tag, char in tag_to_char:
        sentence = sentence.replace(tag, char)
    return sentence

def set_random_seed(seed, deterministic=False):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (CPU and all CUDA devices).

    Args:
        seed (int): Seed to be used.
        deterministic (bool): Whether to set the deterministic option for
            CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
            to True and `torch.backends.cudnn.benchmark` to False.
            Default: False.
    """
    # Local import: the notebook only imports `randint`/`sample` from `random`,
    # so the module object itself is not available at file level.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    if deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

# Seed the random number generators once, using the seed from the shared config.
set_random_seed(config.SEED)

In [None]:
import torch
import numpy as np
from torch import nn
import time
import datetime
from tqdm import tqdm
from sklearn import metrics
import torch.multiprocessing as mp

class AverageMeter(object):
    """Tracks the latest value and a running ``sum / count`` average.

    Note that ``update`` adds ``val`` to the sum as-is (it is NOT multiplied
    by ``n``), so ``val`` is expected to already be a total over ``n`` items
    for ``avg`` to be a per-item mean.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value covering ``n`` items.

        Args:
            val: Value to add to the running sum.
            n: Number of items ``val`` accounts for. Default: 1.
        """
        self.val = val
        self.count += n
        self.sum += val
        self.avg = self.sum / self.count

class F1Meter(object):
    """Running micro-averaged F1 over every sample seen since the last reset."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget all accumulated labels and predictions and zero the score."""
        # Start from empty buffers. Seeding them with dummy samples would add
        # fake "correct" predictions to every score that is reported.
        self.y_true = np.array([], dtype=np.int64)
        self.y_pred = np.array([], dtype=np.int64)
        self.score = 0

    def update(self, y_true, y_pred):
        """Append one batch and recompute the score over everything seen so far.

        Args:
            y_true: 1-D tensor of class ids for the batch.
            y_pred: 2-D tensor of per-class scores (logits or probabilities),
                classes along dim 1. Only the arg-max per row is used.
        """
        y_true = y_true.detach().cpu().numpy()
        # Arg-max is unaffected by softmax, so the raw scores are used directly.
        y_pred = y_pred.detach().argmax(dim=1).cpu().numpy()
        self.y_true = np.hstack((self.y_true, y_true))
        self.y_pred = np.hstack((self.y_pred, y_pred))
        self.score = metrics.f1_score(self.y_true, self.y_pred, average="micro")

    @property
    def avg(self):
        """Score computed by the most recent ``update`` (0 right after a reset)."""
        return self.score

class RocAucMeter(object):
    """Running ROC-AUC for a binary problem, using the score of class 1."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget all accumulated samples and zero the score."""
        # One neutral sample per class (score 0.5) is kept in the buffers so
        # that both classes are always present when the AUC is computed.
        self.y_true = np.array([0, 1])
        self.y_pred = np.array([0.5, 0.5])
        self.score = 0

    def update(self, y_true, y_pred):
        """Append one batch and recompute the ROC-AUC over everything seen.

        Args:
            y_true: 1-D tensor of binary labels for the batch.
            y_pred: 2-D tensor of per-class scores; column 1 is taken as the
                positive-class score.
        """
        y_true = y_true.cpu().numpy()
        y_pred = y_pred.data.cpu().numpy()[:, 1]
        self.y_true = np.hstack((self.y_true, y_true))
        self.y_pred = np.hstack((self.y_pred, y_pred))
        # Previously this stored `y_true / len(y_true)` (an array), which is
        # not a ROC-AUC; compute the actual metric instead.
        self.score = metrics.roc_auc_score(self.y_true, self.y_pred)

    @property
    def avg(self):
        """ROC-AUC computed by the most recent ``update`` (0 right after a reset)."""
        return self.score

def train_loop_fn(
    data_loader,
    model,
    optimizer,
    device,
    scheduler=None,
    epoch=None,
    wandb=None,
    scaler=None,
    validation_dl=None,
):
    """Train ``model`` for one pass over ``data_loader``.

    Each batch must be a dict with ``"input_ids"`` and ``"labels"`` tensors.
    The model is called as ``model(input_ids=...)`` and must return per-class
    logits; the loss is cross-entropy computed on those logits.

    Args:
        data_loader: Iterable of training batches.
        model: Module to train; left in eval mode when the function returns.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Optional LR scheduler, stepped once per batch.
        epoch: Value shown as the progress-bar description.
        wandb: Unused; kept for interface compatibility.
        scaler: Optional ``torch.cuda.amp.GradScaler``. When given, the forward
            pass runs under autocast and gradients are clipped to norm 0.1.
        validation_dl: Optional loader evaluated every 100000 batches.
    """
    model.train()
    losses = AverageMeter()
    f1score = F1Meter()
    accuracy = AverageMeter()
    progress = tqdm(data_loader, leave=True, disable=False)
    progress.set_description(f"{epoch}")
    for bi, d in enumerate(progress):

        ids = d["input_ids"].to(device, dtype=torch.long)
        # Class ids must be integer typed for cross_entropy.
        targets = d["labels"].to(device, dtype=torch.long)
        batch_size = ids.size(0)

        optimizer.zero_grad()

        if scaler is not None:
            with torch.cuda.amp.autocast():
                logits = model(input_ids=ids)
                # cross_entropy applies log-softmax itself, so it must be fed
                # raw logits; feeding probabilities flattens the gradients.
                loss = torch.nn.functional.cross_entropy(logits.float(), targets)
            scaler.scale(loss).backward()
            # Unscale first so clipping sees the true gradient magnitudes.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)
            scaler.step(optimizer)
            scaler.update()
        else:
            logits = model(input_ids=ids)
            loss = torch.nn.functional.cross_entropy(logits.float(), targets)
            loss.backward()
            optimizer.step()

        logits = logits.detach()
        n_correct = (logits.argmax(dim=1) == targets).sum().item()
        # AverageMeter adds `val` to its sum unscaled, so pass batch totals:
        # number of correct predictions, and mean loss times batch size.
        accuracy.update(n_correct, batch_size)
        f1score.update(targets, logits)
        losses.update(loss.item() * batch_size, batch_size)

        if bi % 100000 == 0 and bi != 0:
            f1score.reset()
            if validation_dl is not None:
                o, t = eval_loop_fn(validation_dl, model, device)
                y_true = np.array(t)
                y_pred = o
                # Multiclass labels need an explicit averaging mode.
                val_f1 = metrics.f1_score(y_true, y_pred, average="micro")
                print(f"epoch={epoch}, F1 = {val_f1}")
                # eval_loop_fn switched the model to eval mode; switch back.
                model.train()

        if scheduler is not None:
            scheduler.step()

        progress.set_postfix(
            {
                "Loss": f"{losses.avg:<8.4f}",
                "ACC": f"{accuracy.avg:<8.4f}",
                "F1": f"{f1score.avg:<8.4f}",
            }
        )

    model.eval()


def eval_loop_fn(data_loader, model, device):
    """Run ``model`` over ``data_loader`` and collect predictions and labels.

    The model is put in eval mode and gradients are disabled. Each batch must
    be a dict with ``"input_ids"`` and ``"labels"`` tensors.

    Returns:
        Tuple ``(predictions, targets)``: a list of arg-max class indices and
        a list of the labels (converted to float), in loader order.
    """
    model.eval()
    predictions, references = [], []
    with torch.no_grad():
        for batch in tqdm(data_loader, leave=False, disable=False):
            input_ids = batch["input_ids"].to(device, dtype=torch.long)
            labels = batch["labels"].to(device, dtype=torch.float)

            probabilities = model(input_ids=input_ids).softmax(dim=1)

            references.extend(labels.cpu().detach().numpy().tolist())
            predictions.extend(
                probabilities.cpu().detach().numpy().argmax(axis=1).tolist()
            )

    return predictions, references


def pred_loop_fn(data_loader, model, device):
    """Predict a class index for every sample in ``data_loader``.

    Each batch must be a dict with an ``"input_ids"`` tensor and an ``"ids"``
    tensor holding the sample identifiers.

    Returns:
        Tuple ``(predictions, ids)``: a list of arg-max class indices and the
        list of matching sample identifiers, in loader order.
    """
    # Inference must not depend on the mode the caller left the model in:
    # with dropout active, predictions would be noisy.
    model.eval()
    fin_ids = []
    fin_outputs = []
    with torch.no_grad():
        pbar = tqdm(data_loader, leave=False, disable=False)
        for d in pbar:
            ids = d["input_ids"]
            idx = d["ids"]

            ids = ids.to(device, dtype=torch.long)

            logits = model(input_ids=ids)
            outputs = logits.softmax(dim=1)

            outputs_np = outputs.cpu().detach().numpy().argmax(axis=1).tolist()

            fin_ids.extend(idx.cpu().detach().numpy().tolist())
            fin_outputs.extend(outputs_np)

    return fin_outputs, fin_ids

In [None]:
# Mount Google Drive at /content/gdrive/ (requires the google.colab package).
from google.colab import drive
drive.mount('/content/gdrive/')

# Change into the Kaggle folder on the mounted drive and keep the resulting
# working directory in `path`.
%cd /content/gdrive/My Drive/Kaggle
path = %pwd

In [None]:
# NOTE(review): ROOT_DIR is not referenced by any other cell visible in this notebook.
ROOT_DIR = os.path.abspath("./datathon")

# Read the two tab-separated training files (paths are relative to the
# current working directory) and stack them into a single frame.
df = [
    pd.read_csv("train.tsv", delimiter="\t"),
    pd.read_csv("en.tsv", delimiter="\t"),
]

df = pd.concat(df)

# Replace the <e1>/<e2> entity tags with bracket/brace markers, then save a
# copy of the processed frame.
df.Sentence = df.Sentence.apply(lambda x: replacer(x))
df.to_csv("processsed.csv", sep="\t")
# Shuffle all rows. No random_state is passed here, so the order depends on
# the state of the random generator pandas falls back to.
df = df.sample(frac=1)
# Number of distinct relation labels present in the data; IndoML reads this
# global to size its classifier.
NO_LABELS = len(df.Relation.value_counts())
print("NO_LABELS: ", NO_LABELS)
df.reset_index(drop=True, inplace=True)
# Hold out 10% of the rows for validation, seeded from the shared config.
train_df, valid_df = train_test_split(
    df, test_size=0.1, shuffle=True, random_state=config.SEED
)
train_df.reset_index(drop=True, inplace=True)
valid_df.reset_index(drop=True, inplace=True)
# print(train_df.tail())
print(train_df.shape)
print(valid_df.shape)
# NOTE(review): this path looks like a Kaggle input directory, while the
# training files above are read from the working directory — confirm it
# resolves in the environment this notebook runs in.
test_df = pd.read_csv("../input/indore-datathon-2021/valid.tsv", delimiter="\t")
test_df.Sentence = test_df.Sentence.apply(lambda x: replacer(x))
train_df.tail()

NO_LABELS:  25
(12051, 4)
(1340, 4)


Unnamed: 0,Relation,Sentence,NER1,NER2
12046,director,[ভয়ংকর সুন্দর] {অনিমেষ আইচ} পরিচালিত ২০১৭ সাল...,WORK_OF_ART,PERSON
12047,parent_organization,{The Walt Disney Company} acquired the parent ...,ORG,ORG
12048,subsidiary,"RV ""Pelagia"" is a research vessel in the servi...",ORG,ORG
12049,child,{హర్ష్ వర్ధన్ కపూర్} భారతీయ నటుడు. ప్రముఖ కపూర...,PERSON,PERSON
12050,position_held,बता दें कि [जसवंत सिंह] भारतीय राजनीति के उन थ...,PERSON,OTHER


In [None]:
# --- Subject & object markup ---
# These are the same characters that replacer() substitutes for the
# <e1>/</e1> (subject) and <e2>/</e2> (object) tags.
SUB_START_CHAR = "["
SUB_END_CHAR = "]"
OBJ_START_CHAR = "{"
OBJ_END_CHAR = "}"

In [None]:
# --- BERT variants ---
# See https://huggingface.co/transformers/pretrained_models.html for the full list
# The backbone is selected by position via config.MODEL_INDEX; the trailing
# number on each line is that index.
AVAILABLE_PRETRAINED_MODELS = [
    "xlm-roberta-base",  # 0
    "xlm-roberta-large",  # 1
    "bert-base-uncased",  # 2
    "bert-base-multilingual-cased",  # 3
    "gpt2",  # 4
    "distilroberta-base",  # 5
    "roberta-base",  # 6
    "albert-base-v1",  # 7
    "albert-base-v2",  # 8
    "bert-large-uncased",  # 9
]

In [None]:
class IndoML(nn.Module):
    """Relation classifier: pretrained encoder + mean pooling + linear head.

    The number of output classes is read from the module-level ``NO_LABELS``.
    """

    def __init__(self, name) -> None:
        """Load the pretrained encoder ``name`` and build the classification head.

        Args:
            name: Hugging Face model identifier passed to
                ``AutoModel.from_pretrained``.
        """
        super(IndoML, self).__init__()
        self.num_labels = NO_LABELS
        self.roberta = transformers.AutoModel.from_pretrained(
            name,
            output_hidden_states=False,
            num_labels=NO_LABELS,
            cache_dir="/tmp/cache/",
        )
        # Read the encoder width from its config rather than from
        # `pooler.dense`, which not every backbone exposes.
        self.outlayer = self.roberta.config.hidden_size
        self.dropout = nn.Dropout(p=0.4)
        self.ln = nn.LayerNorm(self.outlayer)
        self.classifier = nn.Linear(self.outlayer, self.num_labels)

    def forward(self, input_ids=None, attention_mask=None):
        """Return per-class logits for a batch of token-id sequences.

        Args:
            input_ids: Integer tensor of token ids, one row per sequence.
            attention_mask: Optional tensor of the same shape, 1 for real
                tokens and 0 for padding. When omitted it is derived from the
                encoder config's ``pad_token_id`` (assumed to match the
                tokenizer's padding id); if the config has no padding id,
                every position is treated as a real token.
        """
        if attention_mask is None:
            pad_token_id = self.roberta.config.pad_token_id
            if pad_token_id is None:
                attention_mask = torch.ones_like(input_ids)
            else:
                attention_mask = (input_ids != pad_token_id).long()

        out = self.roberta(input_ids, attention_mask=attention_mask)
        hidden = out.last_hidden_state

        # Mean over real tokens only, so the amount of padding does not
        # change the pooled representation.
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        x = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

        x = self.ln(x)
        x = self.dropout(x)

        logits = self.classifier(x)
        return logits

In [None]:
# Relation labels listed in class-id order: the position of a label in this
# list is the integer id used as the training target.
ID2LAB = [
    "director",
    "child",
    "spouse",
    "sport",
    "father",
    "award_received",
    "mother",
    "position_held",
    "sibling",
    "original_language_of_film_or_TV_show",
    "employer",
    "occupation",
    "parent_organization",
    "capital",
    "discoverer_or_inventor",
    "founded_by",
    "participant",
    "tributary",
    "subsidiary",
    "winner",
    "capital_of",
    "place_of_birth",
    "place_of_death",
    "student_of",
    "nominated_for",
]

# Inverse lookup: label name -> class id.
LAB2ID = {label: index for index, label in enumerate(ID2LAB)}

In [None]:
class IndoMLDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding tokenized sentences and their label ids."""

    def __init__(self, tokenizer, X, y=None, max_length=196) -> None:
        """Store the data and load the tokenizer.

        Args:
            tokenizer: Hugging Face model identifier passed to
                ``AutoTokenizer.from_pretrained``.
            X: pandas Series of sentences (cast to ``str``).
            y: Optional pandas Series of relation names (keys of ``LAB2ID``).
                When omitted, every item gets the placeholder label 0.
            max_length: Length every sequence is padded/truncated to.
                Default: 196.
        """
        self.label2id = LAB2ID
        self.X = X.astype(str)
        self.y = y
        self.max_length = max_length
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            tokenizer, cache_dir="/tmp/cache/"
        )

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``{"input_ids": 1-D LongTensor, "labels": scalar LongTensor}``."""
        txt = self.X.iloc[idx]
        # The tokenizer returns a batch of one; flatten it to a single row.
        tokenized = self.tokenizer(
            txt,
            return_attention_mask=False,
            return_token_type_ids=False,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )["input_ids"].view(-1)
        if self.y is not None:
            lab = torch.tensor(self.label2id[self.y.iloc[idx]], dtype=torch.long)
        else:
            lab = torch.tensor(0, dtype=torch.long)
        return {
            "input_ids": tokenized,
            "labels": lab,
        }

class IndoMLTestDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding tokenized sentences and their sample ids."""

    def __init__(self, tokenizer, X, idx, max_length=196) -> None:
        """Store the data and load the tokenizer.

        Args:
            tokenizer: Hugging Face model identifier passed to
                ``AutoTokenizer.from_pretrained``.
            X: pandas Series of sentences (cast to ``str``).
            idx: pandas Series of sample identifiers, aligned with ``X``.
            max_length: Length every sequence is padded/truncated to.
                Default: 196.
        """
        self.label2id = LAB2ID
        self.X = X.astype(str)
        # Despite the attribute name, this holds sample identifiers, not labels.
        self.y = idx
        self.max_length = max_length
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            tokenizer, cache_dir="/tmp/cache/"
        )

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``{"input_ids": 1-D LongTensor, "ids": sample identifier}``."""
        txt = self.X.iloc[idx]
        # The tokenizer returns a batch of one; flatten it to a single row.
        tokenized = self.tokenizer(
            txt,
            return_attention_mask=False,
            return_token_type_ids=False,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )["input_ids"].view(-1)

        return {
            "input_ids": tokenized,
            "ids": self.y.iloc[idx],
        }

In [None]:
# Build the train/validation/test datasets. Each one loads its own tokenizer
# for the backbone selected by config.MODEL_INDEX; the test set carries sample
# ids (test_df.Id) instead of labels.
train_ds = IndoMLDataset(AVAILABLE_PRETRAINED_MODELS[config.MODEL_INDEX], train_df.Sentence, train_df.Relation)
valid_ds = IndoMLDataset(AVAILABLE_PRETRAINED_MODELS[config.MODEL_INDEX], valid_df.Sentence, valid_df.Relation)
test_ds = IndoMLTestDataset(AVAILABLE_PRETRAINED_MODELS[config.MODEL_INDEX], test_df.Sentence, test_df.Id)

Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

In [None]:
# Wrap the datasets in loaders. Only the training loader shuffles; validation
# and test keep dataset order so predictions line up with their rows/ids.
train_dl = torch.utils.data.DataLoader(
    train_ds, batch_size=config.BS, shuffle=True, num_workers=4
)
valid_dl = torch.utils.data.DataLoader(
    valid_ds, batch_size=config.BS, shuffle=False, num_workers=4
)
test_dl = torch.utils.data.DataLoader(
    test_ds, batch_size=config.BS, shuffle=False, num_workers=4
)

  cpuset_checked))


In [None]:
# Use the GPU when one is available, then instantiate the classifier on top
# of the backbone selected by config.MODEL_INDEX and move it to that device.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Starting run ...")
model = IndoML(name=AVAILABLE_PRETRAINED_MODELS[config.MODEL_INDEX])
model.to(device)

Starting run ...


Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


IndoML(
  (roberta): XLMRobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,)

In [None]:
# Sanity check: show the classification head and the dataset sizes.
print(model.classifier)
print("done loading model")
print(f"Training on {len(train_ds)} samples")
print(f"Evaluating on {len(valid_ds)} samples")

Linear(in_features=768, out_features=25, bias=True)
done loading model
Training on 12051 samples
Evaluating on 1340 samples


In [None]:
# Optimizer steps per epoch. The training loader keeps the final partial
# batch, so round up: flooring would make the cosine schedule shorter than
# the number of scheduler steps actually taken.
num_train_steps = math.ceil(len(train_ds) / int(config.BS))
# Two parameter groups: a small learning rate for the pretrained encoder and
# a much larger one for the freshly initialised head (everything whose
# parameter name does not contain "roberta").
optimizer = torch.optim.AdamW(
    [
        {"params": model.roberta.parameters(), "lr": float(5e-6)},
        {
            "params": [
                param
                for name, param in model.named_parameters()
                if "roberta" not in name
            ],
            "lr": float(1e-3),
        },
    ],
    lr=float(1e-3),
    weight_decay=0,
)

# Cosine decay without warm-up, spanning every optimizer step of the run
# (the training loop steps the scheduler once per batch).
scheduler = transformers.get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_train_steps * int(config.EPOCH)
)

# Gradient scaler for mixed-precision training.
scaler = torch.cuda.amp.GradScaler(enabled=True)

In [None]:
# Create the output directory for per-epoch submissions. Unlike `!mkdir`,
# this does not fail when the cell is re-run and the directory already exists.
os.makedirs("indoout", exist_ok=True)

In [None]:
# Main loop: for every epoch, train, score the validation split, then predict
# the test split and write a submission file for that epoch.
for epoch in range(config.EPOCH):
    train_loop_fn(
        train_dl,
        model,
        optimizer,
        device,
        scheduler=scheduler,
        epoch=epoch,
        scaler=scaler,
    )
    o, t = eval_loop_fn(valid_dl, model, device)
    y_true = np.array(t)
    y_pred = o
    # Despite its name, `auc` holds plain accuracy here.
    auc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="micro")

    print(f"epoch={epoch}, acc = {auc}, F1 = {f1}")
    preds, ids = pred_loop_fn(test_dl, model, device)
    result_df = pd.DataFrame()
    result_df["Id"] = ids
    result_df["Relation"] = preds
    # Map predicted class ids back to relation names.
    result_df.Relation = result_df.Relation.apply(lambda x: ID2LAB[x])
    result_df.to_csv(f"./indoout/submission_{epoch}.csv", index=None)
    print(f"File saved in ./indoout/submission_{epoch}.csv")

0: 100%|██████████| 1507/1507 [06:16<00:00,  4.00it/s, Loss=0.3621, ACC=0.3930, F1=0.3931]
                                                 

epoch=0, acc = 0.5276119402985074, F1 = 0.5276119402985074


                                                 

File saved in ./indoout/submission_0.csv


1: 100%|██████████| 1507/1507 [06:16<00:00,  4.00it/s, Loss=0.3368, ACC=0.5934, F1=0.5935]
                                                 

epoch=1, acc = 0.641044776119403, F1 = 0.641044776119403


                                                 

File saved in ./indoout/submission_1.csv


2: 100%|██████████| 1507/1507 [06:16<00:00,  4.00it/s, Loss=0.3234, ACC=0.6995, F1=0.6996]
                                                 

epoch=2, acc = 0.7276119402985075, F1 = 0.7276119402985076


                                                 

File saved in ./indoout/submission_2.csv


3: 100%|██████████| 1507/1507 [06:17<00:00,  3.99it/s, Loss=0.3166, ACC=0.7540, F1=0.7541]
                                                 

epoch=3, acc = 0.7626865671641792, F1 = 0.7626865671641793


                                                 

File saved in ./indoout/submission_3.csv


4: 100%|██████████| 1507/1507 [06:17<00:00,  4.00it/s, Loss=0.3119, ACC=0.7912, F1=0.7913]
                                                 

epoch=4, acc = 0.8082089552238806, F1 = 0.8082089552238807


                                                 

File saved in ./indoout/submission_4.csv


5: 100%|██████████| 1507/1507 [06:16<00:00,  4.01it/s, Loss=0.3057, ACC=0.8416, F1=0.8416]
                                                 

epoch=5, acc = 0.8544776119402985, F1 = 0.8544776119402986


                                                 

File saved in ./indoout/submission_5.csv


6: 100%|██████████| 1507/1507 [06:15<00:00,  4.02it/s, Loss=0.3024, ACC=0.8679, F1=0.8679]
                                                 

epoch=6, acc = 0.8522388059701492, F1 = 0.8522388059701492


                                                 

File saved in ./indoout/submission_6.csv


7: 100%|██████████| 1507/1507 [06:15<00:00,  4.02it/s, Loss=0.3008, ACC=0.8807, F1=0.8807]
                                                 

epoch=7, acc = 0.8634328358208955, F1 = 0.8634328358208955


                                                 

File saved in ./indoout/submission_7.csv


8: 100%|██████████| 1507/1507 [06:15<00:00,  4.01it/s, Loss=0.3000, ACC=0.8871, F1=0.8872]
                                                 

epoch=8, acc = 0.8656716417910447, F1 = 0.8656716417910447


                                                 

File saved in ./indoout/submission_8.csv


9: 100%|██████████| 1507/1507 [06:15<00:00,  4.01it/s, Loss=0.2998, ACC=0.8891, F1=0.8892]
                                                 

epoch=9, acc = 0.8686567164179104, F1 = 0.8686567164179104


                                                 

File saved in ./indoout/submission_9.csv




In [None]:
# Write the final submission. `result_df` is whatever the last iteration of
# the epoch loop produced, i.e. the last epoch's predictions, not necessarily
# those of the best-scoring epoch.
result_df.to_csv(f"./submission.csv", index=None)