In [1]:
import os
import re
import numpy as np 
from sklearn.metrics import accuracy_score

import transformers
from transformers import BertTokenizer, BertModel
# from transformers import RobertaTokenizer, RobertaModel

import torch
from torch import cuda
from tqdm import tqdm

device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [2]:
X = np.array(["it's going up", "you ruined my day","this is a good investment", "wow, numbers go brrrrrrr", "lol","going up","going down","going flat","it's a bear","it's a bull","butterflies are cool","git good","why do I care","get rekt nerd", "you're so bad at this","stocks tanking","I'm not too confident this'll go up","big numbers", "it'll go up", "outlook good"])
y = np.array([0,2,0,0,1,0,2,1,2,0,1,1,1,1,1,2,2,0,0,0])

In [3]:
class MultiLabelDataset(torch.utils.data.Dataset):

    def __init__(self, text, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = text
        self.targets = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets':self.targets[index].clone().detach().long()
        }

In [4]:
class BERTClass(torch.nn.Module):
# class RobertaClass(torch.nn.Module):
    def __init__(self, NUM_OUT):
        super(BERTClass, self).__init__()
        # super(RobertaClass, self).__init__()
                   
        self.l1 = BertModel.from_pretrained("bert-base-uncased")
        # self.l1 = RobertaModel.from_pretrained("FacebookAI/roberta-base")
#         self.pre_classifier = torch.nn.Linear(768, 256)
        self.classifier = torch.nn.Linear(768, NUM_OUT)
#         self.dropout = torch.nn.Dropout(0.5)
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
#         pooler = self.pre_classifier(pooler)
#         pooler = torch.nn.Tanh()(pooler)
#         pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        output = self.softmax(output)
        return output

In [5]:
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets)

def train(model, training_loader, optimizer):
    model.train()
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss
    
def validation(model, testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for data in tqdm(testing_loader):
            targets = data['targets']
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            outputs = torch.sigmoid(outputs).cpu().detach()
            fin_outputs.extend(outputs)
            fin_targets.extend(targets)
    return torch.stack(fin_outputs), torch.stack(fin_targets)

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [7]:
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 10
NUM_OUT = 3
LEARNING_RATE = 2e-05

training_data = MultiLabelDataset(X, torch.from_numpy(y), tokenizer, MAX_LEN)
test_data = MultiLabelDataset(X, torch.from_numpy(y), tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }    

training_loader = torch.utils.data.DataLoader(training_data, **train_params)
testing_loader = torch.utils.data.DataLoader(test_data, **test_params)

In [9]:
model = BERTClass(NUM_OUT)
# model = RobertaClass(NUM_OUT)
model.to(device)    

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    loss = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')  
    guess, targs = validation(model, testing_loader)
    guesses = torch.max(guess, dim=1)
    targets = targs
    print('accuracy on test set {}'.format(accuracy_score(guesses.indices, targets)))

100%|██████████| 2/2 [00:00<00:00,  3.43it/s]


Epoch: 0, Loss:  1.1297011375427246


100%|██████████| 2/2 [00:00<00:00,  9.54it/s]


accuracy on test set 0.5


100%|██████████| 2/2 [00:00<00:00,  3.66it/s]


Epoch: 1, Loss:  1.0926964282989502


100%|██████████| 2/2 [00:00<00:00,  9.62it/s]


accuracy on test set 0.7


100%|██████████| 2/2 [00:00<00:00,  3.62it/s]


Epoch: 2, Loss:  1.0523526668548584


100%|██████████| 2/2 [00:00<00:00,  9.63it/s]


accuracy on test set 0.7


100%|██████████| 2/2 [00:00<00:00,  3.68it/s]


Epoch: 3, Loss:  1.1119499206542969


100%|██████████| 2/2 [00:00<00:00,  9.63it/s]


accuracy on test set 0.6


100%|██████████| 2/2 [00:00<00:00,  3.63it/s]


Epoch: 4, Loss:  1.034719705581665


100%|██████████| 2/2 [00:00<00:00,  9.52it/s]


accuracy on test set 0.7


100%|██████████| 2/2 [00:00<00:00,  3.61it/s]


Epoch: 5, Loss:  0.9648408889770508


100%|██████████| 2/2 [00:00<00:00,  9.50it/s]


accuracy on test set 0.8


100%|██████████| 2/2 [00:00<00:00,  3.64it/s]


Epoch: 6, Loss:  0.9302015900611877


100%|██████████| 2/2 [00:00<00:00,  9.43it/s]


accuracy on test set 0.85


100%|██████████| 2/2 [00:00<00:00,  3.69it/s]


Epoch: 7, Loss:  0.9043044447898865


100%|██████████| 2/2 [00:00<00:00,  9.60it/s]


accuracy on test set 0.95


100%|██████████| 2/2 [00:00<00:00,  3.59it/s]


Epoch: 8, Loss:  0.7661095261573792


100%|██████████| 2/2 [00:00<00:00,  9.47it/s]


accuracy on test set 1.0


100%|██████████| 2/2 [00:00<00:00,  3.59it/s]


Epoch: 9, Loss:  0.6622636318206787


100%|██████████| 2/2 [00:00<00:00,  9.57it/s]

accuracy on test set 1.0



