In [1]:
import sys
sys.path.append('../')
from pathlib import Path
from datetime import datetime

import pandas as pd

import torch
import torch.nn as nn
from torch import cuda
from torch.utils.data import Dataset, DataLoader

from transformers import DistilBertTokenizer, DistilBertModel

from config import settings

### Prepare

In [2]:
device = 'cuda:1' if cuda.is_available() else 'cpu'

MAX_LEN = 150
BATCH_SIZE = 64
EPOCHS = 1
LEARNING_RATE = 1e-05
DISTIL_BERT_CHECKPOINT = 'distilbert-base-uncased'
RUN_NAME = 'ROS'
TEST_PATH = '../data/processed/quick_test.csv'
TRAIN_PATH = '../data/ros/train.csv'
MODEL_SAVE = '../models/'

tokenizer = DistilBertTokenizer.from_pretrained(DISTIL_BERT_CHECKPOINT)

### Dataset and dataloader

In [3]:
class QuoraDataset(Dataset):

    def __init__(self, file_path, tokenizer, max_len):
        self._dataset = pd.read_csv(file_path, low_memory=False)
        self._tokenizer = tokenizer 
        self._max_len = max_len

    def __getitem__(self, index):
        text = self._dataset.iloc[index]["question_text"]
        inputs = self._tokenizer(
            [text],
            truncation=True, 
            return_tensors="pt",
            max_length=self._max_len,
            padding='max_length'
        )

        return {
            "ids": inputs["input_ids"],
            "mask": inputs["attention_mask"],
            "target": torch.tensor(self._dataset.iloc[index]["target"], dtype=torch.long)
        }

    def __len__(self):
        return len(self._dataset)

In [4]:
train_dataset = QuoraDataset(TRAIN_PATH, tokenizer, MAX_LEN)
test_dataset = QuoraDataset(TEST_PATH, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

### DistilBert Model

In [5]:
class DistilBertModelClass(nn.Module):

    def __init__(self):
        super(DistilBertModelClass, self).__init__()
        self.distil_bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.linear1 = nn.Linear(768, 2)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, ids, mask):
        bert_out = self.distil_bert(ids, mask)
        x = bert_out.last_hidden_state[:, -1, :] # get bert last hidden state
        x = self.linear1(x)
        x = self.sigmoid(x)
        return x

model = DistilBertModelClass()
model.to(device);

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Training

In [6]:
# Creating the loss function and optimizer
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE) 

In [7]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from collections import defaultdict

def accuracy(model, loader):
    model.eval()

    with torch.no_grad():
        y_pred = []
        y_true = []

        classname = {0: 'Sincere', 1: 'Insincere'}
        correct_pred = defaultdict(lambda: 0)
        total_pred = defaultdict(lambda: 0)

        for inputs in loader:
            ids = inputs['ids'].squeeze(1).to(device)
            mask = inputs['mask'].squeeze(1).to(device)
            targets = inputs['target'].to(device)

            output = model(ids, mask).squeeze()

            _, predictions = torch.max(output, 1)
            
            y_pred += list(predictions.to('cpu'))
            y_true += list(targets.to('cpu'))

            for target, prediction in zip(targets, predictions):
                if target.item() == prediction.item():
                    correct_pred[classname[target.item()]] += 1
                total_pred[classname[prediction.item()]] += 1

        results = {
            'accuracy': accuracy_score(y_true, y_pred),
            'f1': f1_score(y_true, y_pred),
            'roc_auc': roc_auc_score(y_true, y_pred)
        }

        for classname, correct_count in correct_pred.items():
            results['precision_' + classname] = 100 * float(correct_count) / total_pred[classname]

        return results

results = accuracy(model, test_loader)
results

{'accuracy': 0.20977393617021275,
 'f1': 0.26613152207471447,
 'roc_auc': 0.5047354954236274,
 'precision_Insincere': 15.52593659942363,
 'precision_Sincere': 86.20689655172414}

In [8]:
def train(epoch=1):
    model.train()

    for idx, inputs in enumerate(train_loader):
        
        ids = inputs['ids'].squeeze(1).to(device)
        mask = inputs['mask'].squeeze(1).to(device)
        target = inputs['target'].to(device)

        output = model(ids, mask).squeeze()

        optimizer.zero_grad()

        l = loss(output, target)
        l.backward()

        optimizer.step()

        # Log Loss
        run["train/loss"].log(l.item())

        if idx % 10 == 0:
            print(f'Epoch: {epoch}, {idx}/{len(train_loader)}, Loss:  {l.item()}')

        if idx % 20 == 0:
            results = accuracy(model, test_loader) 
            run["train/accuracy"] = results['accuracy']
            run["train/f1"] = results['f1']
            run["train/roc_auc"] = results['roc_auc']
            run["train/precision_Sincere"] = results['precision_Sincere']
            run["train/precision_Insincere"] = results['precision_Insincere']
            print(results)
            print("Saving model...")
            torch.save(model.state_dict(), Path(MODEL_SAVE) / f'ftbert_{idx}_{datetime.now()}' )

### Training

In [9]:
# track training and results...
import neptune.new as neptune

run = neptune.init(
    project=settings.project,
    api_token=settings.api_token,
    name='RandomOversampling'
)  

train(epoch=EPOCHS)

run.stop()

https://app.neptune.ai/demenezes/Mestrado-RI/e/MES-6
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Epoch: 1, 0/13497, Loss:  0.6846345067024231
{'accuracy': 0.1761968085106383, 'f1': 0.267297457125961, 'roc_auc': 0.5034451153534436, 'precision_Insincere': 15.484755053100377, 'precision_Sincere': 87.64044943820225}
Saving model...
Epoch: 1, 10/13497, Loss:  0.6750589609146118
Epoch: 1, 20/13497, Loss:  0.6509659886360168
Epoch: 1, 30/13497, Loss:  0.6095486879348755
Epoch: 1, 40/13497, Loss:  0.5514026880264282
Epoch: 1, 50/13497, Loss:  0.49052292108535767
Epoch: 1, 60/13497, Loss:  0.476421594619751
Epoch: 1, 70/13497, Loss:  0.4465118944644928
Epoch: 1, 80/13497, Loss:  0.4685976207256317
Epoch: 1, 90/13497, Loss:  0.42306268215179443
Epoch: 1, 100/13497, Loss:  0.456206351518631
Epoch: 1, 110/13497, Loss:  0.48231

KeyboardInterrupt: 