In [2]:
!pip install -q --upgrade wandb transformers datasets ml_collections

In [3]:
!pip install sentencepiece

In [4]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel

import numpy as np
import os
import pandas as pd
import wandb
import random

def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and, if present, every CUDA device).

    Args:
        seed: Integer seed handed unchanged to each library's seeding function.
    """
    # Same order as always: stdlib, NumPy, then torch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed(42)

In [5]:
# !cat classification_public_datasets/iitp+-movie-reviews/readme.md

In [6]:
# df = pd.read_csv("classification_public_datasets/bbc-articles/hi/hi-test.csv", names=["label", "review"])
# df["label"].unique()

In [7]:
class getDataset(Dataset):
    """Map-style dataset backed by a CSV file with `text` and `labels` columns.

    Each item is a `(text, label_id)` pair, where `label_id` is looked up in
    `label_to_int` from the value stored in the row's `labels` column.
    """

    def __init__(self, path, label_to_int):
        """Read the CSV at `path` into memory and print its row count.

        Args:
            path: Location of the CSV file; passed directly to `pd.read_csv`.
            label_to_int: Mapping from the values found in the `labels`
                column to integer class ids.
        """
        super().__init__()
        self.path = path
        frame = pd.read_csv(self.path)
        self.df = frame
        self.label_to_int = label_to_int
        print(len(frame))

    def __len__(self):
        """Number of rows in the loaded CSV."""
        n_rows = len(self.df)
        return n_rows

    def __getitem__(self, idx):
        """Return `(text, label_id)` for the row whose index label is `idx`."""
        text_column = self.df["text"]
        text = text_column[idx]
        label_column = self.df["labels"]
        raw_label = label_column[idx]
        return text, self.label_to_int[raw_label]

In [8]:
# Difficulty label (as stored in the CSV `labels` column) -> integer class id.
category_mapping = {
    name: class_id for class_id, name in enumerate(("easy", "medium", "hard"))
}



In [9]:
from ml_collections import ConfigDict

# Run configuration shared by the cells below.
cfg = ConfigDict(
    {
        "epochs": 10,       # number of passes over the training loader
        "max_length": 256,  # tokenizer padding / truncation length
        "batch_size": 32,   # batch size for every DataLoader
    }
)

In [10]:
# Pretrained checkpoint name; passed as `model_alias` when the Classifier is built.
global_model_alias = "roberta-base"

In [11]:
class Classifier(nn.Module):
    """Text classifier: a pretrained transformer encoder plus a linear head.

    The module tokenizes raw strings itself, so `forward` takes the batch
    exactly as the DataLoader yields it: a `(texts, labels)` pair.

    Args:
        label_to_int: Mapping of class name -> class id; only its size is
            used, to set the number of output logits.
        model_alias: Name or path accepted by `from_pretrained`. Defaults to
            `DEFAULT_MODEL_ALIAS` when None.
        cfg: Optional config object. If it has a `max_length` attribute, that
            value is used as the tokenizer padding/truncation length;
            otherwise `DEFAULT_MAX_LENGTH` is used.
    """

    DEFAULT_MODEL_ALIAS = "roberta-base"
    DEFAULT_MAX_LENGTH = 256

    def __init__(self, label_to_int, model_alias=None, cfg=None):
        super().__init__()
        if model_alias is None:
            model_alias = self.DEFAULT_MODEL_ALIAS
        self.backbone = AutoModel.from_pretrained(model_alias)
        self.tokenizer = AutoTokenizer.from_pretrained(model_alias)

        # Size the head from the loaded encoder instead of assuming 768, so
        # other checkpoints work too; 768 stays as the fallback.
        hidden_size = getattr(self.backbone.config, "hidden_size", 768)
        self.lin = nn.Linear(hidden_size, len(label_to_int))
        # Kept for backward compatibility; `forward` uses the device the
        # weights actually live on rather than this construction-time guess.
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.cfg = cfg

    def _max_length(self):
        """Tokenizer length from this instance's cfg, or the class default."""
        configured = getattr(self.cfg, "max_length", None) if self.cfg is not None else None
        return self.DEFAULT_MAX_LENGTH if configured is None else configured

    def forward(self, batch):
        """Return class logits for a batch.

        Args:
            batch: Sequence whose first element is an iterable of strings;
                any further elements (e.g. labels) are ignored here.

        Returns:
            Output of the linear head applied to the encoder's
            `pooler_output`, one row of logits per input string.
        """
        # Follow the module's real placement so the inputs always end up on
        # the same device as the weights.
        device = self.lin.weight.device
        tokenized = self.tokenizer(text=list(batch[0]),
                                   return_attention_mask=True,
                                   max_length=self._max_length(),
                                   padding='max_length',
                                   truncation=True,
                                   return_tensors="pt")
        tokenized = {
            k: v.to(device) for k, v in tokenized.items()
        }
        x = self.backbone(**tokenized)
        x = self.lin(x.pooler_output)
        return x

In [12]:
# One Dataset per split; string labels are mapped to ints via category_mapping.
train_ds = getDataset("../input/sih-sanfoundary-questions/difficulty/difficulty/diff_m_train.csv", category_mapping)
val_ds = getDataset("../input/sih-sanfoundary-questions/difficulty/difficulty/diff_m_val.csv", category_mapping)
test_ds = getDataset("../input/sih-sanfoundary-questions/difficulty/difficulty/diff_m_test.csv", category_mapping)

# Wrap each Dataset in a DataLoader. The *_ds names are rebound to the loaders
# because the training and evaluation code iterates over these names.
# Only the training loader shuffles and drops its ragged final batch. The
# evaluation loaders keep every sample, so reported metrics cover the whole
# split and a split smaller than one batch still yields a batch.
train_ds = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, drop_last=True)
val_ds = DataLoader(val_ds, batch_size=cfg.batch_size, shuffle=False, drop_last=False)
test_ds = DataLoader(test_ds, batch_size=cfg.batch_size, shuffle=False, drop_last=False)

# Model, device placement, optimizer and loss used by the training loop.
model = Classifier(category_mapping, cfg=cfg, model_alias=global_model_alias)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-6)
loss_fn = nn.CrossEntropyLoss()

def accuracy(true, pred):
    """Percentage (0-100) of positions at which `pred` equals `true`.

    Args:
        true: Array-like of ground-truth class ids.
        pred: Array-like of predicted class ids, aligned with `true`.

    Returns:
        The fraction of equal positions multiplied by 100.
    """
    truth = np.array(true)
    guesses = np.array(pred)
    hits = (truth == guesses).astype(np.float32)
    fraction_correct = np.sum(hits) / len(truth)
    return fraction_correct * 100

# Lowest mean validation loss seen so far. Starting at +inf means the first
# epoch always writes a checkpoint; afterwards only improvements do.
best_loss = float("inf")
for epoch in range(cfg.epochs):
    print(f"##### Epoch {epoch}")
    train_loss = []
    train_preds = []
    train_gts = []

    val_loss = []
    val_preds = []
    val_gts = []

    # ---- training pass ----
    model.train()
    for batch in train_ds:
        # batch is (texts, labels); the model tokenizes batch[0] itself.
        outputs = model(batch)

        loss = loss_fn(outputs, batch[1].to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())
        train_preds.append(outputs.argmax(dim=-1).cpu())
        train_gts.append(batch[1].cpu())

    # ---- validation pass (no gradients, eval-mode layers) ----
    model.eval()
    with torch.no_grad():
        for batch in val_ds:
            outputs = model(batch)

            loss = loss_fn(outputs, batch[1].to(device))

            val_loss.append(loss.item())
            val_preds.append(outputs.argmax(dim=-1).cpu())
            val_gts.append(batch[1].cpu())

    train_gts = torch.concat(train_gts, dim=0)
    train_preds = torch.concat(train_preds, dim=0)

    val_gts = torch.concat(val_gts, dim=0)
    val_preds = torch.concat(val_preds, dim=0)

    train_acc = accuracy(train_gts, train_preds)
    val_acc = accuracy(val_gts, val_preds)

    # Mean of the per-batch losses for this epoch.
    epoch_train_loss = float(np.mean(train_loss))
    epoch_val_loss = float(np.mean(val_loss))

    print("Train loss: ", epoch_train_loss)
    print("Train acc: ", train_acc)
    print("Val loss: ", epoch_val_loss)
    print("Val acc: ", val_acc)
    #torch.save(model.state_dict(), f"./sih_diff_{epoch}.pt")
    if epoch_val_loss < best_loss:
        # Record the new best so later, worse epochs cannot overwrite the
        # checkpoint: the file on disk always matches `best_loss`.
        best_loss = epoch_val_loss
        torch.save(model.state_dict(), "./cat_best.pt")



In [14]:
from sklearn.metrics import classification_report

# Restore the checkpoint written by the training loop. map_location makes the
# load work even when the weights were saved from a different device than the
# one in use now (e.g. trained on GPU, evaluated on CPU).
model.load_state_dict(torch.load("./cat_best.pt", map_location=device))

# print(f"##### Epoch {epoch}")

# NOTE: the val_* names are kept from the original cell, but in this cell they
# hold results for the TEST split (the loop below iterates over test_ds).
val_loss = []
val_preds = []
val_gts = []


model.eval()
with torch.no_grad():
    for batch in test_ds:
        outputs = model(batch)

        loss = loss_fn(outputs, batch[1].to(device))

        val_loss.append(loss.item())
        val_preds.append(outputs.argmax(dim=-1).cpu())
        val_gts.append(batch[1].cpu())



val_gts = torch.concat(val_gts, dim=0)
val_preds = torch.concat(val_preds, dim=0)

val_acc = accuracy(val_gts, val_preds)



# These are test-split metrics, so label them as such.
print("Test loss: ", float(np.mean(val_loss)))
print("Test acc: ", val_acc)
# Passing labels + target_names shows class names instead of bare ints and
# keeps the report well-formed when a class is absent from this split.
print(classification_report(
    val_gts.numpy(),
    val_preds.numpy(),
    labels=list(category_mapping.values()),
    target_names=list(category_mapping.keys()),
))