In [2]:
import sys
sys.path.append('../')
import pandas as pd

import torch
import torch.nn as nn
from torch import cuda
from torch.utils.data import Dataset, DataLoader

from transformers import DistilBertTokenizer, DistilBertModel

from config import settings

### Prepare

In [7]:
# Prefer the second GPU when the host has one, otherwise fall back to the
# first GPU, and finally to the CPU. Hard-coding 'cuda:1' whenever CUDA is
# available fails with an invalid-device error on single-GPU machines.
if not cuda.is_available():
    device = 'cpu'
elif cuda.device_count() > 1:
    device = 'cuda:1'
else:
    device = 'cuda:0'

MAX_LEN = 150  # token length every question is padded / truncated to
BATCH_SIZE = 64
EPOCHS = 1
LEARNING_RATE = 1e-05
DISTIL_BERT_CHECKPOINT = 'distilbert-base-uncased'
RUN_NAME = 'ROS'
TEST_PATH = '../data/processed/test.csv'
TRAIN_PATH = '../data/ros/train.csv'

# Tokenizer shared by the train and test datasets.
tokenizer = DistilBertTokenizer.from_pretrained(DISTIL_BERT_CHECKPOINT)

### Dataset and dataloader

In [8]:
class QuoraDataset(Dataset):
    """Map-style dataset that tokenizes one question per item.

    The CSV at ``file_path`` must provide a ``question_text`` column (the
    text handed to the tokenizer) and a ``target`` column (the label).
    """

    def __init__(self, file_path, tokenizer, max_len):
        """Read the CSV and remember the tokenizer and the fixed token length."""
        self._dataset = pd.read_csv(file_path, low_memory=False)
        self._tokenizer = tokenizer
        self._max_len = max_len

    def __len__(self):
        """Number of rows in the underlying CSV."""
        return len(self._dataset)

    def __getitem__(self, index):
        """Return the encoded question and label for the row at ``index``.

        The text is passed to the tokenizer as a one-element list, so ``ids``
        and ``mask`` are whatever the tokenizer returns for a batch of one;
        ``target`` is a scalar ``torch.long`` tensor.
        """
        row = self._dataset.iloc[index]

        encoded = self._tokenizer(
            [row["question_text"]],
            truncation=True,
            return_tensors="pt",
            max_length=self._max_len,
            padding='max_length',
        )
        label = torch.tensor(row["target"], dtype=torch.long)

        return {
            "ids": encoded["input_ids"],
            "mask": encoded["attention_mask"],
            "target": label,
        }

In [9]:
train_dataset = QuoraDataset(TRAIN_PATH, tokenizer, MAX_LEN)
test_dataset = QuoraDataset(TEST_PATH, tokenizer, MAX_LEN)

# Shuffle only the training data. Evaluation metrics do not depend on sample
# order, so shuffling the test set just adds work and makes batch-level
# results impossible to reproduce between runs.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

### DistilBert Model

In [10]:
class DistilBertModelClass(nn.Module):
    """DistilBERT encoder followed by a single linear classification head.

    Args:
        checkpoint: Name or path of the pretrained DistilBERT weights.
            Defaults to ``"distilbert-base-uncased"``.
        num_classes: Number of output classes (size of the logit vector).
    """

    def __init__(self, checkpoint="distilbert-base-uncased", num_classes=2):
        super().__init__()
        self.distil_bert = DistilBertModel.from_pretrained(checkpoint)
        # Take the head's input width from the encoder config instead of
        # hard-coding 768, so other DistilBERT checkpoints also work.
        self.linear1 = nn.Linear(self.distil_bert.config.dim, num_classes)
        # Retained only so existing attribute access keeps working; it is no
        # longer applied in forward().
        self.sigmoid = nn.Sigmoid()

    def forward(self, ids, mask):
        """Return raw class logits.

        Args:
            ids: Token ids, one row per example.
            mask: Attention mask with the same layout as ``ids``.

        Returns:
            A tensor with one row of ``num_classes`` unnormalized logits per
            example. No sigmoid/softmax is applied because
            ``CrossEntropyLoss`` expects unnormalized logits; the argmax over
            classes is unaffected.
        """
        bert_out = self.distil_bert(input_ids=ids, attention_mask=mask)
        # Pool the first position (the [CLS] token). Inputs are padded to a
        # fixed length, so the last position is usually padding and carries
        # no information about the sentence.
        pooled = bert_out.last_hidden_state[:, 0, :]
        return self.linear1(pooled)

# Build the classifier and place it on the selected device in one step
# (nn.Module.to returns the module itself).
model = DistilBertModelClass().to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Training setup

In [11]:
# Loss function and optimizer used by the training loop below
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [12]:
from collections import defaultdict

def accuracy(model, loader, device=None):
    """Compute per-class accuracy of ``model`` over ``loader``.

    Args:
        model: Module called as ``model(ids, mask)`` that returns one row of
            class scores per example.
        loader: Iterable of batches, each a mapping with ``'ids'``,
            ``'mask'`` and ``'target'`` tensors. ``ids`` / ``mask`` may carry
            a singleton axis at position 1, which is removed.
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU if it has none).

    Returns:
        ``(correct_pred, class_acc)``: the number of correct predictions per
        class name, and the accuracy in percent for every class that occurs
        in ``loader`` (including classes with zero correct predictions).

    The model's train/eval mode is restored before returning, so calling
    this in the middle of training does not leave dropout disabled.
    """
    class_names = {0: 'Sincere', 1: 'Insincere'}
    correct_pred = defaultdict(int)
    total_pred = defaultdict(int)

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs in loader:
                ids = inputs['ids'].squeeze(1).to(device)
                mask = inputs['mask'].squeeze(1).to(device)
                targets = inputs['target'].to(device)

                # Flatten to (batch, classes) without dropping the batch
                # axis; a bare squeeze() breaks on a final batch of size one.
                output = model(ids, mask).reshape(targets.size(0), -1)
                predictions = output.argmax(dim=1)

                # One host transfer per batch instead of one per sample.
                for target, prediction in zip(targets.tolist(), predictions.tolist()):
                    label = class_names[target]
                    total_pred[label] += 1
                    if target == prediction:
                        correct_pred[label] += 1
    finally:
        model.train(was_training)

    # Iterate over the totals so a class with no correct prediction is
    # reported as 0% instead of being silently omitted.
    class_acc = {
        label: 100 * float(correct_pred.get(label, 0)) / total
        for label, total in total_pred.items()
    }

    return correct_pred, class_acc
        
# Per-class accuracy on the test loader, evaluated in this cell before the
# training loop is defined or run (presumably a pre-fine-tuning baseline).
accuracy(model, test_loader)

In [13]:
def train(epoch=1):
    """Run one pass over ``train_loader``, updating ``model`` in place.

    Args:
        epoch: Label printed in the progress messages. It does not control
            how many passes are made; each call performs exactly one.

    Prints the batch loss every 10 steps and the per-class accuracy on
    ``test_loader`` every 1,000 steps (skipping step 0).
    """
    model.train()
    num_batches = len(train_loader)

    for idx, inputs in enumerate(train_loader):
        ids = inputs['ids'].squeeze(1).to(device)
        mask = inputs['mask'].squeeze(1).to(device)
        target = inputs['target'].to(device)

        # Flatten to (batch, classes) without dropping the batch axis; a bare
        # squeeze() makes the loss fail on a final batch of size one.
        output = model(ids, mask).reshape(target.size(0), -1)

        optimizer.zero_grad()

        batch_loss = loss(output, target)
        batch_loss.backward()

        optimizer.step()

        if idx % 10 == 0:
            print(f'Epoch: {epoch}, {idx}/{num_batches}, Loss:  {batch_loss.item()}')

        if idx % 1_000 == 0 and idx > 1:
            _, class_acc = accuracy(model, test_loader)
            print(f'Validation Accuracy: {class_acc}')
            # Evaluation switches the model to eval mode; switch back so
            # dropout stays active for the remaining training steps.
            model.train()

### Run training

In [None]:
# track training and results...
import neptune.new as neptune

run = neptune.init(
    project=settings.project,
    api_token=settings.api_token
)

# Close the Neptune run even when training raises or is interrupted, so the
# tracking session is not left open.
# NOTE(review): nothing in this cell writes metrics to `run`; confirm whether
# the loss / accuracy values were meant to be logged.
try:
    train(epoch=EPOCHS)
finally:
    run.stop()