In [1]:
import pandas as pd
import numpy as np 

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm

In [2]:
# Load the training split. The path is relative to the notebook's working
# directory and replaces the unfilled template placeholder that was here.
train_data = pd.read_csv('train.csv')
# Show only the first rows so the cell output stays small.
train_data.head()

In [3]:
# Load the test split. The path is relative to the notebook's working
# directory and replaces the unfilled template placeholder that was here.
test_data = pd.read_csv('test.csv')
# Show only the first rows so the cell output stays small.
test_data.head()

In [4]:
# Training split, read from the notebook's working directory and displayed
# as the cell output.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
train_data

In [5]:
# Test split, read from the notebook's working directory and displayed as
# the cell output.
test_data = pd.read_csv(filepath_or_buffer='test.csv')
test_data

In [6]:
# Genre columns used as the multi-label targets. The order matters: it fixes
# the column order of the target matrix and of the submission frame.
label_columns = [
    'Thriller',
    'Classics',
    'Romance',
    'Mystery',
    'Science',
    'Literature',
    'Fantasy',
    'Historical',
    'Fiction',
]
label_columns

In [7]:
class BookDataset(Dataset):
    """Torch dataset that tokenizes a book's title and description with labels.

    Args:
        dataframe: pandas DataFrame with 'title' and 'description' columns
            plus one column per label.
        tokenizer: object exposing ``encode_plus``; the mapping it returns
            must provide 'input_ids' and 'attention_mask'.
        max_len: length every token sequence is padded/truncated to.
        label_cols: names of the target columns. When omitted, the
            notebook-level ``label_columns`` list is used.
    """

    def __init__(self, dataframe, tokenizer, max_len, label_cols=None):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len
        self.title = dataframe['title']
        self.description = dataframe['description']
        # The global is only looked up when no explicit columns are passed.
        if label_cols is None:
            label_cols = label_columns
        self.targets = self.data[list(label_cols)].values

    def __len__(self):
        return len(self.description)

    def __getitem__(self, index):
        """Return the 'ids', 'mask' and 'targets' tensors for row ``index``.

        ``index`` is a position (0 .. len-1), not a DataFrame index label.
        """
        # Samplers hand out positions, so use .iloc: plain ``series[index]``
        # is a label lookup and breaks (or returns another row) as soon as
        # the frame does not have a default 0..n-1 index.
        # str() also turns a missing value into the literal text 'nan'.
        title = str(self.title.iloc[index])
        description = str(self.description.iloc[index])
        text = title + " " + description
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            # self.targets is a positional NumPy array, so it already lines
            # up with the positional lookups above.
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [8]:
# modeling
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=len(label_columns),
    # Predictions are thresholded per label with a sigmoid further down, i.e.
    # this is a multi-label task. State that explicitly instead of relying on
    # the model inferring the loss from the dtype of the labels it receives.
    problem_type='multi_label_classification',
)

In [9]:
# Run on the GPU when CUDA is usable, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

In [10]:
# Optimisation settings: number of passes over the training loader and the
# learning rate handed to the optimizer.
EPOCHS = 3
LEARNING_RATE = 2e-5

# Tokenisation length and loader batch sizes.
MAX_LEN = 256
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 32

In [11]:
# Hold out 10% of the training frame for validation.
full_dataset = BookDataset(train_data, tokenizer, MAX_LEN)
train_size = int(0.9 * len(full_dataset))
valid_size = len(full_dataset) - train_size
# Seed the split so the same rows land in the validation set on every run;
# otherwise the reported validation score is not comparable between runs.
train_dataset, valid_dataset = torch.utils.data.random_split(
    full_dataset,
    [train_size, valid_size],
    generator=torch.Generator().manual_seed(42),
)
# Display the split sizes rather than the uninformative Subset repr.
len(train_dataset), len(valid_dataset)

In [12]:
# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=1,
)
train_loader

In [13]:
def train_model(model, train_loader, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: called with ``input_ids``, ``attention_mask`` and ``labels``;
            the returned object must expose ``.loss``.
        train_loader: iterable of batches holding 'ids', 'mask' and 'targets'
            tensors.
        optimizer: optimizer stepped once per batch.
        device: device every batch tensor is moved to before the forward pass.

    Returns:
        None. The model is updated in place.
    """
    model.train()
    progress = tqdm(train_loader, total=len(train_loader))
    for step_batch in progress:
        input_ids = step_batch['ids'].to(device)
        attention_mask = step_batch['mask'].to(device)
        labels = step_batch['targets'].to(device)

        # Passing labels makes the model compute the loss itself.
        result = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        batch_loss = result.loss

        # Clear stale gradients, back-propagate, then apply the update.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

def evaluate_model(model, data_loader, device):
    """Collect sigmoid probabilities and targets for a labelled loader.

    Args:
        model: called with ``input_ids`` and ``attention_mask``; the returned
            object must expose ``.logits``.
        data_loader: iterable of batches holding 'ids', 'mask' and 'targets'
            tensors.
        device: device the batch tensors are moved to.

    Returns:
        ``(outputs, targets)``: two lists of per-example lists, in loader
        order. Outputs come first.
    """
    model.eval()
    collected_probs = []
    collected_labels = []
    # No gradients are needed while scoring.
    with torch.no_grad():
        for chunk in tqdm(data_loader, total=len(data_loader)):
            input_ids = chunk['ids'].to(device)
            attention_mask = chunk['mask'].to(device)
            labels = chunk['targets'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            collected_labels += labels.cpu().detach().numpy().tolist()
            collected_probs += torch.sigmoid(logits).cpu().detach().numpy().tolist()
    return collected_probs, collected_labels

In [14]:
# The AdamW class shipped with transformers is deprecated in favour of the
# PyTorch optimizer. eps and weight_decay are pinned to the values the
# transformers class used by default, so the update rule stays as close as
# possible to the previous setup (PyTorch's own defaults differ).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    train_model(model, train_loader, optimizer, device)

    # Score the held-out split after every epoch; a label counts as predicted
    # when its sigmoid probability reaches 0.5.
    outputs, targets = evaluate_model(model, valid_loader, device)
    outputs = np.array(outputs) >= 0.5
    # Despite its name this is the micro-averaged F1, not an accuracy. The
    # name is kept because a later cell prints `accuracy`.
    accuracy = f1_score(targets, outputs, average='micro')
    print(f'Validation F1-Score = {accuracy}')

In [15]:
# evaluate_model reads batch['targets'], so this path only works when the test
# frame carries the label columns. Check first: building the labelled dataset
# from a frame without those columns raises a KeyError and stops Run All.
if set(label_columns).issubset(test_data.columns):
    test_dataset = BookDataset(test_data, tokenizer, MAX_LEN)
    test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=1)

    # A named throwaway keeps IPython's `_` (last output) untouched.
    test_outputs, _test_targets = evaluate_model(model, test_loader, device)
    test_outputs = np.array(test_outputs) >= 0.5
else:
    print('test_data has no label columns; skipping labelled evaluation of the test split.')

In [16]:
class BookDataset(Dataset):
    """Torch dataset that tokenizes a book's title and description.

    Args:
        dataframe: pandas DataFrame with 'title' and 'description' columns
            and, unless ``is_test`` is set, one column per label.
        tokenizer: object exposing ``encode_plus``; the mapping it returns
            must provide 'input_ids' and 'attention_mask'.
        max_len: length every token sequence is padded/truncated to.
        is_test: when True no targets are read and items carry only 'ids'
            and 'mask'.
        label_cols: names of the target columns. When omitted, the
            notebook-level ``label_columns`` list is used. Ignored when
            ``is_test`` is True.
    """

    def __init__(self, dataframe, tokenizer, max_len, is_test=False, label_cols=None):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len
        self.title = dataframe['title']
        self.description = dataframe['description']
        self.is_test = is_test
        if not self.is_test:
            # The global is only looked up when no explicit columns are passed.
            if label_cols is None:
                label_cols = label_columns
            self.targets = self.data[list(label_cols)].values

    def __len__(self):
        return len(self.description)

    def __getitem__(self, index):
        """Return the tensors for row ``index`` (a position, not an index label)."""
        # Samplers hand out positions, so use .iloc: plain ``series[index]``
        # is a label lookup and breaks (or returns another row) as soon as
        # the frame does not have a default 0..n-1 index.
        # str() also turns a missing value into the literal text 'nan'.
        title = str(self.title.iloc[index])
        description = str(self.description.iloc[index])
        text = title + " " + description
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        item = {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }
        if not self.is_test:
            # self.targets is a positional NumPy array, so it already lines
            # up with the positional lookups above.
            item['targets'] = torch.tensor(self.targets[index], dtype=torch.float)
        return item

In [17]:
# Unlabelled test split: is_test=True makes the dataset skip the targets.
test_dataset = BookDataset(
    test_data,
    tokenizer,
    MAX_LEN,
    is_test=True,
)
# No shuffling, so predictions come back in the row order of test_data.
test_loader = DataLoader(
    test_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=1,
)

def predict_test(model, data_loader, device):
    """Return sigmoid probabilities for every example of an unlabelled loader.

    Args:
        model: called with ``input_ids`` and ``attention_mask``; the returned
            object must expose ``.logits``.
        data_loader: iterable of batches holding 'ids' and 'mask' tensors.
        device: device the batch tensors are moved to.

    Returns:
        A list with one list of per-label probabilities per example, in
        loader order.
    """
    model.eval()
    probabilities = []
    # No gradients are needed for inference.
    with torch.no_grad():
        for chunk in tqdm(data_loader, total=len(data_loader)):
            input_ids = chunk['ids'].to(device)
            attention_mask = chunk['mask'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            probabilities += torch.sigmoid(logits).cpu().detach().numpy().tolist()
    return probabilities

# Predict the test split and binarise: a label is assigned when its
# probability is at least 0.5.
test_outputs = np.array(predict_test(model, test_loader, device)) >= 0.5

In [18]:
# evaluate model
# Re-prints the micro-averaged F1 that the training loop stored in `accuracy`,
# i.e. the score from the validation pass of the last epoch. Nothing is
# recomputed here; classification_report and confusion_matrix are imported
# but not used in the visible cells.
from sklearn.metrics import  f1_score, classification_report, confusion_matrix
print(f'Validation F1-Score = {accuracy}')

In [19]:
# predict test samples
# One boolean column per genre (in label_columns order), one row per test
# example, written without the index column.
submission = pd.DataFrame(data=test_outputs, columns=label_columns)
submission.to_csv(path_or_buf='submission.csv', index=False)

In [20]:
import zipfile
import joblib
import os

def compress(file_names, archive_name="result.zip"):
    """Write the given files into a DEFLATE-compressed zip archive.

    Args:
        file_names: paths relative to the current working directory; each
            file is stored in the archive under that same relative name.
        archive_name: path of the zip file to create (an existing file is
            overwritten). Defaults to "result.zip".

    Raises:
        FileNotFoundError: if one of the listed files does not exist.
    """
    print("File Paths:")
    print(file_names)
    # Setting the compression on the archive applies it to every member.
    with zipfile.ZipFile(archive_name, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
        for file_name in file_names:
            zf.write('./' + file_name, file_name)
            
# Rewrite the CSV, then bundle it with the notebook file into the archive.
submission.to_csv(path_or_buf='submission.csv', index=False)
file_names = [
    'Babel.ipynb',
    'submission.csv',
]
compress(file_names)