In [None]:
# Mount Google Drive so the data and checkpoint paths under /content/drive
# used later in this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import AutoTokenizer, RobertaModel, AdamW;

import torch;
import torch.nn as nn;
import torch.nn.functional as F;
import torch.optim as optim;
from torch.utils.data import Dataset, DataLoader;

In [None]:
import os;
import os.path;

In [None]:
import numpy as np;
import matplotlib.pyplot as plt;

import statistics;

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [None]:
"""
    Using a RoBERTa model for a downstream classification task.
"""

class ROBERTaClassifier(nn.Module):
    """RoBERTa encoder followed by dropout and a linear classification head.

    Args:
        model_name: Checkpoint name or path passed to
            ``RobertaModel.from_pretrained``.
        num_classes: Number of output logits produced by the head.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        # Attribute names (and their order) define the state_dict keys, so
        # they are kept exactly as ``bert`` / ``dropout`` / ``fc``.
        self.bert = RobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p=0.2)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        The encoder's ``pooler_output`` is passed through dropout and then
        through the linear head; no softmax is applied.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(encoded.pooler_output)
        return self.fc(pooled)

In [None]:
# Hugging Face checkpoint used for both the tokenizer and the encoder.
roberta_model_name = 'FacebookAI/roberta-base';
# Width of the classifier's output layer (one logit per class).
num_classes = 3;
# Every example is truncated/padded to exactly this many tokens.
max_length = 512;
# Batch size for the training and validation loaders.
batch_size = 16;
# NOTE: `epochs` is reassigned to 5 at the top of the training cell, so this
# value is not the one the training loop actually uses.
epochs = 3;
# Learning rate handed to the optimizer.
learning_rate = 1e-5;

In [None]:
# Tokenizer for the same checkpoint as the encoder. The Dataset classes in this
# notebook read this module-level name inside __getitem__.
tokenizer = AutoTokenizer.from_pretrained(roberta_model_name);

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Line-aligned text/label dataset that tokenizes examples on access.

    Line ``i`` of ``path_x`` is the input text of example ``i`` and line ``i``
    of ``path_y`` is its integer class id. Lines are stored exactly as
    returned by ``readlines()`` (trailing newline included).

    This is a map-style dataset, so it derives from
    ``torch.utils.data.Dataset`` rather than ``nn.Module`` (it has no
    parameters and is never used as a layer).

    Args:
        path_x: Path to the text file, one example per line.
        path_y: Path to the label file, one integer label per line.
        max_length: Token length every example is truncated/padded to.

    Raises:
        FileNotFoundError: If ``path_x`` or ``path_y`` does not exist.
        ValueError: If the two files have a different number of lines.
    """

    def __init__(self, path_x, path_y, max_length):
        super().__init__()
        self.max_length = max_length

        for path in (path_x, path_y):
            if not os.path.exists(path):
                raise FileNotFoundError(path)

        # Explicit encoding so reading does not depend on the platform locale.
        with open(path_x, 'r', encoding='utf-8') as f:
            self.x = f.readlines()

        with open(path_y, 'r', encoding='utf-8') as f:
            self.y = f.readlines()

        if len(self.x) != len(self.y):
            raise ValueError("x and y must have same length")

    def __len__(self):
        """Return the number of examples (lines in ``path_x``)."""
        return len(self.x)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return it with its label.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors of
            length ``max_length`` and a scalar ``label`` tensor.
        """
        # `tokenizer` is the module-level tokenizer created earlier in the notebook.
        encoded = tokenizer(
            self.x[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' adds a leading batch axis of 1; flatten removes it.
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'label': torch.tensor(int(self.y[idx])),
        }

In [None]:
# Training and validation splits: texts come from the *_cmb_T_seq.txt files,
# labels from the matching y_*.txt files.
train_dataset = Dataset('/content/drive/MyDrive/DATA/trainset_cmb_T_seq.txt',
                        '/content/drive/MyDrive/DATA/y_train.txt', max_length)

val_dataset = Dataset('/content/drive/MyDrive/DATA/valset_cmb_T_seq.txt',
                      '/content/drive/MyDrive/DATA/y_val.txt', max_length)

# Only the training data is shuffled. Validation is read in file order so
# evaluation is deterministic from run to run; shuffling gives no benefit there.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Build the classifier and move it to the selected device before creating the
# optimizer, so the optimizer holds references to the on-device parameters.
model = ROBERTaClassifier(roberta_model_name, num_classes)
model = model.to(device)
# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are set explicitly to the values the transformers version used
# by default (torch's own defaults are eps=1e-8, weight_decay=0.01).
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# Expects raw logits and integer class ids.
criterion = nn.CrossEntropyLoss()

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Directory prefix (note the trailing slash) that the checkpoint file name is
# appended to when saving and when reloading the model weights.
store_best_path = "/content/drive/MyDrive/PARAM/";

In [None]:
epochs = 5

# Per-epoch history of the mean loss / accuracy on each split.
train_loss = []
val_loss = []
train_acc = []
val_acc = []

# Make sure the checkpoint directory exists up front, so a missing folder does
# not surface only after a full epoch of training.
checkpoint_path = store_best_path + 'roberta_BERTSUM_seq_1.pt'
checkpoint_dir = os.path.dirname(checkpoint_path)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

for epoch in range(epochs):

    # Per-batch metrics for this epoch, plus the batch sizes used to weight them.
    train_loss_epoch = []
    val_loss_epoch = []
    train_acc_epoch = []
    val_acc_epoch = []
    train_batch_sizes = []
    val_batch_sizes = []

    # ---- training pass ----
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        y = batch['label'].to(device)

        output = model(input_ids, attention_mask)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

        train_loss_epoch.append(loss.item())

        accuracy = torch.argmax(output, dim=-1).view(-1) == y.view(-1)
        train_acc_epoch.append(accuracy.float().mean().item())
        train_batch_sizes.append(y.size(0))

    # ---- validation pass (no gradients, dropout disabled) ----
    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            y = batch['label'].to(device)

            output = model(input_ids, attention_mask)
            loss = criterion(output, y)

            val_loss_epoch.append(loss.item())

            accuracy = torch.argmax(output, dim=-1).view(-1) == y.view(-1)
            val_acc_epoch.append(accuracy.float().mean().item())
            val_batch_sizes.append(y.size(0))

    # Each per-batch value is already a mean over its batch, so weight by batch
    # size: a smaller final batch must not count as much as a full one.
    epoch_train_loss = float(np.average(train_loss_epoch, weights=train_batch_sizes))
    epoch_val_loss = float(np.average(val_loss_epoch, weights=val_batch_sizes))
    epoch_train_acc = float(np.average(train_acc_epoch, weights=train_batch_sizes))
    epoch_val_acc = float(np.average(val_acc_epoch, weights=val_batch_sizes))

    print(f"Epoch[{epoch}][{epochs}] : Training Loss :{epoch_train_loss}, Validation Loss :{epoch_val_loss}, "
          f"Training Accuracy :{epoch_train_acc}, Validation Accuracy :{epoch_val_acc}")

    # Keep only the weights of the epoch with the lowest validation loss so far.
    # `val_loss` does not yet contain the current epoch at this point.
    if len(val_loss) == 0 or epoch_val_loss < min(val_loss):
        torch.save(model.state_dict(), checkpoint_path)

    train_loss.append(epoch_train_loss)
    val_loss.append(epoch_val_loss)
    train_acc.append(epoch_train_acc)
    val_acc.append(epoch_val_acc)

Epoch[0][5] : Training Loss :1.0200989082455636, Validation Loss :0.9919055378437043, Training Accuracy :0.42625, Validation Accuracy :0.44625
Epoch[1][5] : Training Loss :0.913211000263691, Validation Loss :0.8699929225444794, Training Accuracy :0.5475, Validation Accuracy :0.525
Epoch[2][5] : Training Loss :0.792764657586813, Validation Loss :0.7043631619215012, Training Accuracy :0.64, Validation Accuracy :0.6975
Epoch[3][5] : Training Loss :0.6154741793870926, Validation Loss :0.6596883335709571, Training Accuracy :0.7478125, Validation Accuracy :0.7475
Epoch[4][5] : Training Loss :0.4559628729522228, Validation Loss :0.7530637812614441, Training Accuracy :0.83125, Validation Accuracy :0.7225


In [None]:
# Restore the checkpoint written by the training loop, i.e. the weights from
# the epoch with the lowest validation loss. The trailing semicolon keeps the
# cell from echoing the call's return value.
model.load_state_dict(torch.load(store_best_path + 'roberta_BERTSUM_seq_1.pt',map_location=device));

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Unlabelled, line-per-example text dataset that tokenizes on access.

    NOTE: this reuses the name ``Dataset``, so from this cell on it replaces
    the labelled dataset class defined earlier in the notebook.

    This is a map-style dataset, so it derives from
    ``torch.utils.data.Dataset`` rather than ``nn.Module`` (it has no
    parameters and is never used as a layer).

    Args:
        path_x: Path to the text file, one example per line.
        max_length: Token length every example is truncated/padded to.

    Raises:
        FileNotFoundError: If ``path_x`` does not exist.
    """

    def __init__(self, path_x, max_length):
        super().__init__()
        self.max_length = max_length

        if not os.path.exists(path_x):
            raise FileNotFoundError(path_x)

        # Explicit encoding so reading does not depend on the platform locale.
        # Lines are kept exactly as returned by readlines().
        with open(path_x, 'r', encoding='utf-8') as f:
            self.x = f.readlines()

    def __len__(self):
        """Return the number of examples (lines in ``path_x``)."""
        return len(self.x)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors of
            length ``max_length``.
        """
        # `tokenizer` is the module-level tokenizer created earlier in the notebook.
        encoded = tokenizer(
            self.x[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' adds a leading batch axis of 1; flatten removes it.
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
        }

In [None]:
# Unlabelled test split. The loader uses DataLoader's default (unshuffled)
# order, so predictions stay aligned with the lines of the input file.
test_dataset = Dataset(
    path_x='/content/drive/MyDrive/DATA/testset_cmb_T_seq.txt',
    max_length=max_length,
)

test_dataloader = DataLoader(dataset=test_dataset, batch_size=8)

In [None]:
# Collect one tensor of predicted class ids per test batch.
test = []
model.eval()  # disable dropout; set once instead of on every iteration
with torch.no_grad():  # inference only: do not build the autograd graph
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        output = model(input_ids, attention_mask)
        y_pred = torch.argmax(output, dim=-1)
        # Store results on the CPU so accumulated predictions do not hold GPU memory.
        test.append(y_pred.cpu())

In [None]:
# Flatten the per-batch prediction tensors into a single list of Python ints,
# preserving batch order and the order within each batch.
tmp = [prediction.item() for predictions in test for prediction in predictions]

In [None]:
# Class id -> label name; a name's position in the tuple is its class id.
id_to_label = dict(enumerate(("passage", "phrase", "multi")))

In [None]:
# Translate every predicted class id into its label name.
test_pred_label = [id_to_label[class_id] for class_id in tmp]

In [None]:
import pandas as pd;

# Submission table: a sequential row id next to the predicted label.
data = dict(
    id=range(len(test_pred_label)),
    spoilerType=test_pred_label,
)

df = pd.DataFrame(data)
print(df)

      id spoilerType
0      0     passage
1      1     passage
2      2      phrase
3      3      phrase
4      4      phrase
..   ...         ...
395  395     passage
396  396      phrase
397  397       multi
398  398     passage
399  399      phrase

[400 rows x 2 columns]


In [None]:
# Write the submission table to Drive; index=False leaves the DataFrame index
# out of the file so only the "id" and "spoilerType" columns are written.
df.to_csv("/content/drive/MyDrive/DATA/roberta_BERTSUM_SEQ_submission_T_5.csv",index=False);