In [6]:
import time
import pandas as pd
import requests
import torch
import torch.nn.functional as F 
import torchtext
import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder

In [3]:
# Reproducibility: fix the global PyTorch seed and ask cuDNN to use
# deterministic algorithms.
RANDOM_SEED = 123
torch.backends.cudnn.deterministic = True
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f5164484b30>

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

# Number of passes over the training loader performed by the training loop.
NUM_EPOCHS = 3

In [7]:
# Load the dataset and replace the "Sentiment" column with integer class
# codes. The fitted encoder is kept in `le` so codes can be mapped back to
# the original labels later.
df = pd.read_csv("data.csv")
le = LabelEncoder()
le.fit(df["Sentiment"])
df["Sentiment"] = le.transform(df["Sentiment"])

In [9]:
# Positional (unshuffled) split of the frame into train / validation / test.
# Rows [0, TRAIN_END) are training, [TRAIN_END, VALID_END) are validation,
# and everything from VALID_END onwards is the test set.
TRAIN_END = 4500
VALID_END = 5500

sentences = df["Sentence"].values
sentiments = df["Sentiment"].values

train_texts, train_labels = sentences[:TRAIN_END], sentiments[:TRAIN_END]
valid_texts, valid_labels = (sentences[TRAIN_END:VALID_END],
                             sentiments[TRAIN_END:VALID_END])
test_texts, test_labels = sentences[VALID_END:], sentiments[VALID_END:]

In [11]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


def _encode(texts):
    """Tokenize an array of sentences with truncation and padding enabled."""
    return tokenizer(list(texts), truncation=True, padding=True)


# Each split is tokenized independently, in train / valid / test order.
train_encodings = _encode(train_texts)
valid_encodings = _encode(valid_texts)
test_encodings = _encode(test_texts)

In [13]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from feature name (the notebook later reads
            ``input_ids`` and ``attention_mask``) to a per-example indexable
            sequence of values.
        labels: Indexable sequence of labels, aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict contains every key of ``encodings`` plus a ``"labels"``
        entry holding the label for that example.
        """
        item = {key: torch.tensor(val[idx])
                for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Wrap each split's encodings and labels in a dataset.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (valid_encodings, valid_labels),
        (test_encodings, test_labels),
    )
)

# Only the training loader shuffles; validation and test keep dataset order.
BATCH_SIZE = 16
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [15]:
# Pretrained DistilBERT with a 3-way classification head, moved to the
# target device and put in training mode.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
).to(DEVICE)
model.train()

# Adam over all model parameters.
optim = torch.optim.Adam(params=model.parameters(), lr=5e-5)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier

In [16]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of ``model`` on ``data_loader`` in percent.

    Args:
        model: Callable invoked as ``model(input_ids, attention_mask=...)``
            whose result exposes the class scores under ``['logits']``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A zero-dimensional float tensor holding
        ``100 * correct / total`` over all batches.

    Raises:
        ValueError: If ``data_loader`` yields no examples, since accuracy is
            undefined in that case.
    """
    # Score in eval mode even if the caller forgot model.eval(), then put the
    # model back into whatever mode it was in. Plain callables without a
    # `training` attribute are left untouched.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()

    correct_pred, num_examples = 0, 0
    try:
        with torch.no_grad():
            for batch in data_loader:

                ### Prepare data
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs['logits']

                # Predicted class = index of the largest logit along dim 1.
                predicted_labels = torch.argmax(logits, 1)

                num_examples += labels.size(0)
                correct_pred += (predicted_labels == labels).sum()
    finally:
        if was_training:
            model.train()

    if num_examples == 0:
        raise ValueError("compute_accuracy() received a data_loader with no examples")

    return correct_pred.float() / num_examples * 100

In [17]:
# Fine-tuning loop: one optimisation pass over `train_loader` per epoch,
# followed by train/validation accuracy, then a final test-set evaluation.
start_time = time.time()

for epoch in range(NUM_EPOCHS):

    model.train()

    for batch_idx, batch in enumerate(train_loader):

        ### Move the batch to the target device
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        ### Forward pass (passing labels makes the model return a loss)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']
        logits = outputs['logits']

        ### Backward pass and parameter update
        optim.zero_grad()
        loss.backward()
        optim.step()

        ### Progress log on every 250th batch (including batch 0)
        if batch_idx % 250 == 0:
            print (f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
                   f'Batch {batch_idx:04d}/{len(train_loader):04d} | '
                   f'Loss: {loss:.4f}')

    ### End-of-epoch evaluation with gradients disabled
    model.eval()

    with torch.no_grad():
        train_acc = compute_accuracy(model, train_loader, DEVICE)
        valid_acc = compute_accuracy(model, valid_loader, DEVICE)
        print(f'training accuracy: '
              f'{train_acc:.2f}%'
              f'\nvalid accuracy: '
              f'{valid_acc:.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 0001/0003 | Batch 0000/0282 | Loss: 1.0880
Epoch: 0001/0003 | Batch 0250/0282 | Loss: 0.4611
training accuracy: 84.67%
valid accuracy: 81.70%
Time elapsed: 1.42 min
Epoch: 0002/0003 | Batch 0000/0282 | Loss: 0.2145
Epoch: 0002/0003 | Batch 0250/0282 | Loss: 0.3090
training accuracy: 88.98%
valid accuracy: 81.60%
Time elapsed: 2.81 min
Epoch: 0003/0003 | Batch 0000/0282 | Loss: 0.2596
Epoch: 0003/0003 | Batch 0250/0282 | Loss: 0.1652
training accuracy: 90.00%
valid accuracy: 78.70%
Time elapsed: 4.19 min
Total Training Time: 4.19 min
Test accuracy: 78.65%
