# Competetive submission

In [1]:
from copy import deepcopy
import os
os.environ["TRANSFORMERS_CACHE"]='/home/burouj/work/cache'
from time import time
from typing import Tuple

import pandas as pd
import torch
import torch.nn as nn
from torch.nn import Module
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from transformers import BertTokenizer, BertModel

torch.manual_seed(7)

<torch._C.Generator at 0x7f353b7b7a70>

## Preprocessing

In [2]:
class MultimodalDataset(Dataset):
    def __init__(self, path_x, path_y) -> None:
        super().__init__()
        self.df = pd.read_csv(path_x).drop(["Id"], axis=1)
        self.labels = pd.read_csv(path_y).drop(["Id"], axis=1)
        return
    
    def __len__(self) -> int:
        return self.labels.shape[0]

    def __getitem__(self, index) -> Tuple[str, str, int]:
        text = self.df["Title"].iloc[index]
        image = self.df["Cover_image_name"].iloc[index]
        label = self.labels["Genre"].iloc[index]
        return (text, image, label)

In [3]:
pipeline_label = lambda x: int(x)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
pipeline_text = lambda x: tokenizer(x, return_tensors="pt", truncation=True, padding=True)

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    list_label = list()
    text = list(map(lambda x:x[0], batch))
    tokenized=pipeline_text(text)
    for (__, __, label) in batch:
        list_label.append(pipeline_label(label))
    list_label = torch.tensor(list_label, dtype=torch.long)
    return tokenized['input_ids'].to(device), tokenized['attention_mask'].to(device), list_label.to(device)

## Model

In [11]:
class Book_genre(Module):
    def __init__(self) -> None:
        super().__init__()
        # Text
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        for param in self.bert.parameters():
            param.requires_grad = False
        # NN
        self.fc1 = nn.Linear(768, 128)
        self.fc2 = nn.Linear(128, 30)
        return

    def forward(self, text_input_ids, text_attention):
        x = self.bert(input_ids=text_input_ids, attention_mask=text_attention)
        x = x.last_hidden_state[:, 0, :] # extract the cls embedding.and
        x = self.fc1(x).tanh()
        x = self.fc2(x)
        return x

In [12]:
def evaluate(model, dataloader):
    model.eval()
    acc, count = 0, 0
    list_predictions = list()
    with torch.no_grad():
        #todo: images
        for texts_input_ids, texts_attention, labels in dataloader:
            predictions = model(texts_input_ids, texts_attention)
            acc += (predictions.argmax(1) == labels).sum().item()
            count += labels.size(0)
            list_predictions.append(predictions.argmax(1))
    return acc/count, torch.cat(list_predictions)

In [16]:
def train(model, dataloader_train, dataloader_val, optimizer, criterion, max_epochs:int =10):
    count = 0
    log_interval = 100
    PATIENCE = 10
    patience = 0
    train_acc = 0
    best_val_acc = 0
    best_model = None

    for epoch in range(1, max_epochs+1):
        for index, (texts_input_ids, texts_attention, labels) in enumerate(dataloader_train):
            model.train()
            optimizer.zero_grad()
            predictions = model(texts_input_ids, texts_attention)
            loss = criterion(predictions, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            optimizer.step()
            train_acc += (predictions.argmax(1) == labels).sum().item()
            count += labels.size(0)
            if index % log_interval == 0:
                val_acc, __ = evaluate(model, dataloader_val)
                print(
                    f"Epoch: {epoch}/{max_epochs:2d}"
                    f" | Batch: {index:4d}/{len(dataloader_train)}"
                    f" | Train Acc: {100 * train_acc / count:4.5f} %"
                    f" | Val Acc: {100 * val_acc:4.5f} %"
                )
                if val_acc >= best_val_acc:
                    best_val_acc = val_acc
                    patience = 0
                    best_model = deepcopy(model)
                else:
                    patience += 1
                if patience == PATIENCE:
                    print("[STOP] Patience expired.")
                    return best_model
        print("#" * 75)
    print("[STOP] Maximum epochs reached.")
    return best_model

## Train

In [17]:
EPOCHS = 50
LR = 1e-4
BATCH_SIZE = 64

model = Book_genre().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

dataset_train = MultimodalDataset(
    path_x="../data/train_x.csv",
    path_y="../data/train_y.csv"
)
val_split_idx = int(0.95 * len(dataset_train))
split_train_, split_val_ = random_split(dataset_train, [val_split_idx, len(dataset_train) - val_split_idx])
dataloader_train = DataLoader(
    dataset=split_train_,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_batch
)
dataloader_val = DataLoader(
    dataset=split_val_,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_batch
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
best_model = train(
    model=model,
    dataloader_train=dataloader_train,
    dataloader_val=dataloader_val,
    optimizer=optimizer,
    criterion=criterion,
    max_epochs=EPOCHS
)
torch.save(best_model, f"../output/bert_{int(time())}.pt")

## Test

In [19]:
dataset_test = MultimodalDataset(
    path_x="../data/non_comp_test_x.csv",
    path_y="../data/non_comp_test_y.csv"
)

dataloader_test = DataLoader(
    dataset=dataset_test,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_batch
)
acc_test, predictions_test = evaluate(model=model, dataloader=dataloader_test)
print(f"\nTest accuracy: {100 * acc_test:.4f} %")

# Save output to disk.
df = pd.DataFrame({"Id": torch.Tensor(range(predictions_test.size(0))), "Genre": predictions_test.to("cpu")})
df.to_csv("non_comp_test_pred_y.csv", index=False)


Test accuracy: 50.1579 %
