# Login and data download

In [None]:
!pip install aicrowd-cli
API_KEY = ""
!aicrowd login --api-key $API_KEY

!mkdir data
!aicrowd dataset download --challenge research-paper-classification -j 3 -o data

# Utility functions and constants

In [None]:
def set_seed(value=23):
    """Seed every random number generator this notebook relies on.

    Applies ``value`` to Python's ``random`` module, NumPy's global RNG and
    PyTorch (``manual_seed`` plus ``cuda.manual_seed_all``).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(value)

In [None]:
def text_preprocessing(text):
    """Normalize a raw text string by lower-casing it."""
    return text.lower()

In [None]:
def bert_preprocessing(data):
    """Tokenize texts into fixed-length BERT input tensors.

    Relies on the notebook-level ``tokenizer`` and ``MAX_LEN``.

    Args:
        data: Iterable of raw text strings.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of tensors with one row per
        text; each row is padded or truncated to ``MAX_LEN`` tokens.
    """
    input_ids = []
    attention_masks = []

    for text in data:
        encoded_text = tokenizer.encode_plus(
            text=text_preprocessing(text),
            add_special_tokens=True,
            max_length=MAX_LEN,
            # Replaces the deprecated `pad_to_max_length=True` flag.
            padding="max_length",
            return_attention_mask=True,
            truncation=True,
        )

        input_ids.append(encoded_text["input_ids"])
        attention_masks.append(encoded_text["attention_mask"])

    return torch.tensor(input_ids), torch.tensor(attention_masks)

In [None]:
# Imported here so this cell runs on its own: the notebook's import cell
# appears further down in the file.
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Lowercase alias for code that refers to the device as `device`.
device = DEVICE
# Token length every text is padded/truncated to in bert_preprocessing.
MAX_LEN = 200
# Maximum gradient norm handed to clip_grad_norm_ in the training loop.
CLIP = 1.0

# Import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import time

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer
from transformers import BertModel

# Data preprocessing

In [None]:
# Load the challenge CSV files that were downloaded into ./data.
train_dataset = pd.read_csv("./data/train.csv")
# NOTE(review): the first validation row is dropped here; confirm this is intended.
validation_dataset = pd.read_csv("./data/val.csv").iloc[1:]
test_data = pd.read_csv("./data/test.csv")

In [None]:
# Pull the raw texts (features) and labels (targets) out of each frame.
X_train, y_train = train_dataset["text"].values, train_dataset["label"].values
X_val, y_val = validation_dataset["text"].values, validation_dataset["label"].values

In [None]:
# Tokenizer for the same "bert-base-uncased" checkpoint that BertClassifier loads.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)

In [None]:
# Token-id encode every training and validation text.
# NOTE(review): `encoded_data` is not read anywhere else in the visible part
# of this notebook; presumably it was meant for inspecting token lengths.
all_data = np.concatenate((X_train, X_val))
encoded_data = [
    tokenizer.encode(sent, add_special_tokens=True)
    for sent in all_data
]

In [None]:
# Token ids, attention masks and label tensors for the training split...
train_inputs, train_masks = bert_preprocessing(X_train)
train_labels = torch.tensor(y_train)

# ...and for the validation split.
val_inputs, val_masks = bert_preprocessing(X_val)
val_labels = torch.tensor(y_val)

# Define model

In [None]:
class BertClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward classification head.

    Args:
        freeze_bert: When True, ``requires_grad`` is switched off for every
            BERT parameter so only the classification head is trained.
        in_features: Width of the encoder output fed into the head.
        hid_dim: Width of the head's hidden layer.
        out_features: Number of logits produced per example.
        dropout: Dropout probability used inside the head.
    """

    def __init__(self, freeze_bert=False, *, in_features=768, hid_dim=16,
                 out_features=4, dropout=0.5):
        super().__init__()

        self.bert = BertModel.from_pretrained("bert-base-uncased")

        # Attribute names `bert` and `classifier` are part of the saved
        # state_dict keys, so they must not be renamed.
        self.classifier = nn.Sequential(
            nn.Linear(in_features, hid_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hid_dim, out_features),
        )

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return the head's logits for a batch of tokenized inputs.

        Args:
            input_ids: Token-id tensor passed to BERT as ``input_ids``.
            attention_mask: Mask tensor passed to BERT as ``attention_mask``.

        Returns:
            Output of the classification head applied to the first-position
            vector of BERT's first output.
        """
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        # First output, first sequence position; presumably the [CLS]
        # embedding of the last hidden state, since inputs are encoded with
        # add_special_tokens=True.
        last_hid_state_cls = outputs[0][:, 0, :]

        return self.classifier(last_hid_state_cls)
    
def initialize_model(epochs=4):
    """Create the classifier, its optimizer and a linear LR schedule.

    Relies on the notebook-level ``train_dataloader`` and ``DEVICE``.

    Args:
        epochs: Number of epochs the learning-rate schedule spans. It should
            equal the number of epochs later passed to ``train``, because
            the schedule length is ``len(train_dataloader) * epochs`` steps.

    Returns:
        Tuple ``(bert_classifier, optimizer, scheduler)``.
    """
    bert_classifier = BertClassifier(freeze_bert=False)
    # The notebook's device constant is the upper-case DEVICE.
    bert_classifier.to(DEVICE)

    optimizer = AdamW(bert_classifier.parameters(), lr=5e-5, eps=1e-8)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    return bert_classifier, optimizer, scheduler

In [None]:
# Cross-entropy over the classifier logits, averaged over the batch.
loss_fn = nn.CrossEntropyLoss(reduction="mean")

# Loop functions

In [None]:
def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Fine-tune ``model`` and optionally checkpoint on validation loss.

    Relies on the notebook-level ``loss_fn``, ``optimizer``, ``scheduler``,
    ``DEVICE``, ``CLIP`` and ``evaluate``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_dataloader: Loader yielding ``(input_ids, attention_mask,
            labels)`` batches.
        val_dataloader: Loader handed to ``evaluate`` after each epoch when
            ``evaluation`` is truthy.
        epochs: Number of passes over ``train_dataloader``.
        evaluation: When truthy, evaluate after every epoch and save the
            state dict to ``research_paper_bert.pt`` whenever the validation
            loss improves.
    """
    best_val_loss = np.inf
    header = (
        f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | "
        f"{'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}"
    )
    last_step = len(train_dataloader) - 1

    for epoch_i in range(epochs):
        print(header)

        # evaluate() switches the model to eval mode, so training mode has to
        # be re-enabled at the start of every epoch, not just once.
        model.train()

        # Per-epoch bookkeeping: reset so the reported average loss and
        # elapsed time describe this epoch only.
        t0_epoch = t0_batch = time.time()
        total_loss, batch_loss, batch_counts = 0, 0, 0

        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(DEVICE) for t in batch)

            model.zero_grad()

            logits = model(b_input_ids, b_attn_mask)

            loss = loss_fn(logits, b_labels)
            loss_value = loss.item()
            batch_loss += loss_value
            total_loss += loss_value

            loss.backward()

            # Cap the gradient norm at CLIP before the optimizer update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)

            optimizer.step()
            scheduler.step()

            # Report every 100 steps and on the final step of the epoch.
            if (step % 100 == 0 and step != 0) or step == last_step:
                time_elapsed = time.time() - t0_batch

                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        avg_train_loss = total_loss / len(train_dataloader)

        if evaluation:
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(model.state_dict(), "research_paper_bert.pt")

            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
        

In [None]:
def evaluate(model, val_dataloder):
    """Compute mean loss and accuracy of ``model`` over a validation loader.

    Relies on the notebook-level ``loss_fn`` and ``DEVICE``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        val_dataloder: Loader yielding ``(input_ids, attention_mask,
            labels)`` batches. The misspelled parameter name is kept so
            existing callers keep working. When ``None``, the notebook-level
            ``val_dataloader`` is used instead.

    Returns:
        Tuple ``(val_loss, val_accuracy)``: the mean of the per-batch losses
        and the mean of the per-batch accuracies (in percent).
    """
    # Honour the argument; fall back to the global loader only for None.
    loader = val_dataloader if val_dataloder is None else val_dataloder

    model.eval()

    val_accuracy = []
    val_loss = []

    with torch.no_grad():
        for batch in loader:
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(DEVICE) for t in batch)

            logits = model(b_input_ids, b_attn_mask)

            loss = loss_fn(logits, b_labels)
            val_loss.append(loss.item())

            preds = torch.argmax(logits, dim=1).flatten()

            accuracy = (preds == b_labels).cpu().numpy().mean() * 100
            val_accuracy.append(accuracy)

    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [None]:
def predict(model, test_dataloder):
    """Return softmax class probabilities for every example in a loader.

    Relies on the notebook-level ``DEVICE``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        test_dataloder: Loader whose batches start with ``(input_ids,
            attention_mask)``; any further elements are ignored. The
            misspelled parameter name is kept so existing callers keep
            working.

    Returns:
        NumPy array of probabilities with one row per example, obtained by
        applying softmax over dimension 1 of the concatenated logits.
    """
    model.eval()

    all_logits = []

    with torch.no_grad():
        # Iterate the loader that was passed in, not a notebook global.
        for batch in test_dataloder:
            b_input_ids, b_attn_mask = tuple(t.to(DEVICE) for t in batch)[:2]
            all_logits.append(model(b_input_ids, b_attn_mask))

    all_logits = torch.cat(all_logits, dim=0)

    probs = F.softmax(all_logits, dim=1).cpu().numpy()

    return probs

# Train model

In [None]:
batch_size = 32
# A single epoch count drives both the LR schedule and the training loop.
# The schedule was previously built for 2 epochs while the loop ran 10; a
# linear schedule decays the learning rate to zero at its last step, so the
# extra epochs would train with a zero learning rate. Raise EPOCHS to train
# for longer.
EPOCHS = 2

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = RandomSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Train + validation combined; built here but not used by the run below.
full_train_data = torch.utils.data.ConcatDataset([train_data, val_data])
full_train_sampler = RandomSampler(full_train_data)
full_train_dataloader = DataLoader(full_train_data, sampler=full_train_sampler, batch_size=batch_size)

set_seed(23)

bert_classifier, optimizer, scheduler = initialize_model(epochs=EPOCHS)
# Pass the validation loader explicitly, since evaluation=True needs it.
train(bert_classifier, train_dataloader, val_dataloader, epochs=EPOCHS, evaluation=True)
evaluate(bert_classifier, val_dataloader)

In [None]:
# Restore the checkpoint written by train() when validation loss improved.
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
bert_classifier.load_state_dict(
    torch.load("research_paper_bert.pt", map_location=DEVICE)
)
loss, accuracy = evaluate(bert_classifier, val_dataloader)
print(f"Best model loss: {loss}")
print(f"Best model accuracy: {accuracy}")

# Prediction

In [None]:
# Tokenize the test texts with the same preprocessing as the other splits.
test_inputs, test_masks = bert_preprocessing(test_data["text"])

# A sequential sampler keeps predictions in the row order of test_data.
test_dataset = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(
    test_dataset,
    sampler=test_sampler,
    batch_size=32,
)

In [None]:
# Class probabilities -> index of the most probable class per row.
probs = predict(bert_classifier, test_dataloader)
preds = probs.argmax(axis=1)
test_data["label"] = preds

# Make submission

In [None]:
import os

# exist_ok keeps the cell re-runnable; a plain `mkdir` fails once the
# folder already exists.
os.makedirs("assets", exist_ok=True)
test_data.to_csv(os.path.join("assets", "submission.csv"), index=False)

In [None]:
# Submit the notebook, with the assets/ folder attached, to the challenge.
!aicrowd notebook submit -c research-paper-classification -a assets --no-verify