In [16]:

import numpy as np
import pandas as pd
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch

In [17]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.labels = dataframe['class']
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        label = self.labels[index]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [34]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 1, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
criterion = nn.CrossEntropyLoss()  
optimizer = optim.AdamW(model.parameters(), lr=2e-4)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
df = pd.read_csv("trainb.csv", delimiter="|")
testDf = pd.read_csv("testb.csv", delimiter="|")

In [36]:
train_dataset = CustomDataset(df,tokenizer,64)
test_dataset = CustomDataset(testDf,tokenizer,64)

In [37]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [64]:
def train(model, optimizer, train_loader, criterion, batches):
    model.train()
    total_loss = 0

    for i,batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        token_type_ids = batch['token_type_ids']
        labels = batch['labels']
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        logits = outputs.logits
        labels=labels.to(torch.float)
        loss = criterion(logits.squeeze(), labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if i == batches:break
    print(f'Training loss: {total_loss/batches}')

In [72]:
def evaluate(model, test_loader, criterion, batches):
    model.eval()
    total_loss = 0
    total_acc = 0

    with torch.no_grad():
        for i,batch in enumerate(test_loader):
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            token_type_ids = batch['token_type_ids']
            labels = batch['labels']
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            logits = outputs.logits
            labels=labels.to(torch.float)
            loss = criterion(logits.squeeze(), labels)
            total_loss += loss.item()
            predictions = torch.argmax(logits, dim=1)
            print(predictions == labels)
            print(predictions)
            total_acc += (predictions == labels).sum().item()
            if i == batches:break

    print(f'Test loss: {total_loss/batches} Test acc: {total_acc/(batches*16)}%')

In [73]:
for epoch in range(10):
    train(model, optimizer, train_loader, criterion,10)
    evaluate(model, test_loader, criterion,10)

Training loss: 385.6047302246094
tensor([False,  True, False,  True, False,  True,  True, False, False,  True,
        False,  True, False,  True, False,  True])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([False,  True,  True, False,  True, False, False,  True,  True, False,
         True, False, False,  True,  True, False])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([False,  True, False,  True, False,  True,  True, False,  True, False,
        False,  True,  True, False,  True, False])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([ True, False, False,  True,  True, False, False,  True,  True, False,
        False,  True, False,  True, False,  True])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([ True, False, False,  True,  True, False, False,  True,  True, False,
        False,  True,  True, False,  True, False])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([ True, False, False,  True, F