In [25]:
# !pip install transformers
# !pip install accelerate

In [1]:
import numpy as np
import pandas as pd
import time
import torch
# from icecream import ic
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from transformers.optimization import get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments

In [2]:
# Colab-only step: mount Google Drive so files under /content/drive are
# readable from this runtime (the dataset is loaded from that mount below).
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Pick the compute device once; everything below moves tensors/models to it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [4]:
# Location of the movie-review CSV on the mounted Drive.
DATA_PATH = "/content/drive/MyDrive/Transformers/Sentiment_Analysis/Movies_Dataset/movie_data.csv"

# Load the full dataset and preview the first rows as the cell output.
reviews = pd.read_csv(DATA_PATH)
reviews.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [5]:
# Sanity-check the size of the loaded frame as (rows, columns).
reviews.shape

(50000, 2)

#### Train, Validation, Test split

In [6]:
# Order-based split boundaries: rows [0, 35000) are train,
# [35000, 40000) are validation, and the remainder is test.
TRAIN_END, VAL_END = 35000, 40000

# Positional column access: column 0 is the review text, column 1 the label.
review_col = reviews.iloc[:, 0]
sentiment_col = reviews.iloc[:, 1]

# Texts become plain Python lists (what the tokenizer is fed below);
# labels stay as NumPy arrays.
train_text = review_col.iloc[:TRAIN_END].tolist()
train_label = sentiment_col.iloc[:TRAIN_END].values

val_text = review_col.iloc[TRAIN_END:VAL_END].tolist()
val_label = sentiment_col.iloc[TRAIN_END:VAL_END].values

test_text = review_col.iloc[VAL_END:].tolist()
test_label = sentiment_col.iloc[VAL_END:].values

#### Using the HuggingFace DistilBERT model and its tokenizer

In [7]:
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and
# the classification model, so the two stay in sync.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

In [8]:
# Load the fast DistilBERT tokenizer that belongs to the chosen checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

In [9]:
# Shared tokenizer settings for every split: pad to the longest sequence of the
# call and truncate anything longer than 512 tokens.
tokenizer_options = {"padding": True, "truncation": True, "max_length": 512}

train_embedding = tokenizer(train_text, **tokenizer_options)
val_embedding = tokenizer(val_text, **tokenizer_options)
test_embedding = tokenizer(test_text, **tokenizer_options)

In [13]:
# example_embedding = tokenizer(train_text[0:2], padding = True, truncation = True, max_length = 512)
# for i, j in example_embedding.items():
#     print(i)
#     print(j[0])

#### Creating a map-style dataset using the Torch Dataset class

In [10]:
class ReviewsDataset(Dataset):
    """Map-style dataset pairing tokenizer output with one label per sample.

    Args:
        embedding: Mapping from field name to a per-sample indexable sequence
            (here, the tokenizer output for a split).
        label: Indexable sequence of labels, one per sample; its length
            defines the dataset length.
    """

    def __init__(self, embedding, label):
        self.embedding = embedding
        self.label = label

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.label)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, plus a ``"label"`` entry."""
        sample = {}
        for field, column in self.embedding.items():
            sample[field] = torch.tensor(column[idx])
        sample["label"] = torch.tensor(self.label[idx])
        return sample

In [11]:
# Wrap each split's encodings and labels in a ReviewsDataset.
train_dataset, val_dataset, test_dataset = (
    ReviewsDataset(encodings, targets)
    for encodings, targets in (
        (train_embedding, train_label),
        (val_embedding, val_label),
        (test_embedding, test_label),
    )
)

#### Batching using Torch DataLoader

In [12]:
# Mini-batch size shared by all three loaders.
BATCH_SIZE = 16

# Training batches are reshuffled each epoch; the trailing partial batch is
# dropped so every optimisation step sees a full batch.
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True, drop_last = True)

# Evaluation loaders must visit every sample exactly once, in a fixed order:
# no shuffling, and keep the last partial batch. With drop_last=True the
# validation loader silently discarded 5000 % 16 = 8 reviews.
val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle = False, drop_last = False)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle = False, drop_last = False)

In [None]:
# x = next(iter(train_loader))
# x["input_ids"].shape

In [13]:
# Epoch count used to compute the LR scheduler's total number of training steps.
epochs = 3

In [14]:
# Load the classifier from the checkpoint and move it to the selected device
# in one step (``Module.to`` returns the module itself).
model = DistilBertForSequenceClassification.from_pretrained(model_name).to(device)

# Switch to training mode; as the cell's last expression this also displays
# the model architecture.
model.train()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

#### Using AdamW with Scheduler

In [15]:
# AdamW over all model parameters with learning rate 5e-5 and weight decay 0.01.
# NOTE(review): this optimizer is not passed to the Trainer constructed later
# in the notebook, so it appears to serve only the commented-out manual
# training loop — confirm before relying on these hyper-parameters.
optim = torch.optim.AdamW(model.parameters(), lr = 5e-5, weight_decay=0.01)

In [16]:
# One scheduler step per training batch, for every epoch.
total_training_steps = len(train_loader) * epochs

# Linear warm-up for the first 500 steps, then linear decay over the rest.
scheduler = get_linear_schedule_with_warmup(
    optimizer = optim,
    num_warmup_steps = 500,
    num_training_steps = total_training_steps,
)

In [24]:
def check_accuracy(model, data_loader, device):
    """Compute classification accuracy (in percent) of ``model`` over ``data_loader``.

    The model is switched to evaluation mode for the duration of the call
    (so dropout does not perturb predictions) and its previous train/eval
    mode is restored afterwards, even if an error occurs.

    Args:
        model: A ``torch.nn.Module`` that accepts ``(input_ids, attention_mask=...)``
            and returns an object indexable by ``"logits"``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A 0-dim float tensor holding the accuracy as a percentage.

    Raises:
        ValueError: If ``data_loader`` yields no samples (accuracy is undefined).
    """
    was_training = model.training
    model.eval()

    correct, total = 0, 0
    try:
        with torch.no_grad():
            for batch in data_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["label"].to(device)

                # Labels are not passed to the model: only the logits are
                # needed here, so the loss computation is skipped.
                logits = model(input_ids, attention_mask = attention_mask)["logits"]

                _, predicted_labels = torch.max(logits, 1)
                correct += (predicted_labels == labels).sum()
                total += labels.size(0)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    if total == 0:
        raise ValueError("data_loader yielded no samples; accuracy is undefined")

    return (correct.float() / total) * 100

#### Training the PyTorch way

In [18]:
# NOTE: manual PyTorch training loop, kept commented out for reference only.
# The notebook trains with the HuggingFace Trainer instead. The reference code
# below has been corrected to use the names defined in this notebook
# (`epochs`, `check_accuracy`, `val_loader`) and to read the loss from the
# model output rather than from the model object.

# start_time = time.time()

# for epoch in range(epochs):

#     model.train()
#     for batch_idx, batch in enumerate(train_loader):

#         embeddings = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["label"].to(device)

#         outputs = model(embeddings, attention_mask = attention_mask, labels = labels)
#         loss = outputs["loss"]

#         optim.zero_grad()
#         loss.backward()
#         optim.step()
#         scheduler.step()

#         if batch_idx % 250 == 0:
#             print(f"Epoch number: {epoch+1:04d}/{epochs:04d} | "
#                   f"Batch: {batch_idx:04d}/{len(train_loader):04d} | "
#                   f"Loss: {loss:.4f}")

#     # Evaluate once per epoch (not once per batch), then resume training mode
#     # at the top of the next epoch.
#     model.eval()
#     with torch.set_grad_enabled(False):
#         print(f'training accuracy: '
#               f'{check_accuracy(model, train_loader, device):.2f}%'
#               f'\nvalid accuracy: '
#               f'{check_accuracy(model, val_loader, device):.2f}%')

#     print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

# print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
# print(f'Test accuracy: {check_accuracy(model, test_loader, device):.2f}%')

#### Training using HuggingFace trainer

In [19]:
# Hyper-parameters for the HuggingFace Trainer run.
# `num_train_epochs` reuses the notebook-level `epochs` value so the Trainer
# and the manually built LR scheduler cannot drift apart; warm-up steps and
# weight decay mirror the values used for the manual optimizer/scheduler.
trainer_args = TrainingArguments(
    output_dir = "DistilBERT3_Result",   # checkpoints and trainer state
    num_train_epochs = epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="logs",
    logging_steps=10                     # log the training loss every 10 steps
)

In [20]:
# Bundle the model, training arguments and datasets into a HuggingFace Trainer.
# NOTE(review): no `optimizers=` argument is given, so the `optim`/`scheduler`
# objects created earlier are not used here — presumably the Trainer builds
# its own from `trainer_args`; confirm.
# NOTE(review): `trainer_args` sets no evaluation schedule, so `eval_dataset`
# is presumably only consumed by an explicit `trainer.evaluate()` call — confirm.
trainer = Trainer(
    model=model,
    args=trainer_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [21]:
# Run fine-tuning; the returned TrainOutput (global step, mean training loss,
# runtime metrics) is displayed as the cell result.
trainer.train()

Step,Training Loss
10,0.5472
20,0.4012
30,0.5687
40,0.3635
50,0.2994
60,0.3894
70,0.3418
80,0.2867
90,0.2716
100,0.3215


TrainOutput(global_step=6564, training_loss=0.1455269756565954, metrics={'train_runtime': 5029.8029, 'train_samples_per_second': 20.876, 'train_steps_per_second': 1.305, 'total_flos': 1.390907685888e+16, 'train_loss': 0.1455269756565954, 'epoch': 3.0})

In [25]:
# Put the fine-tuned model in evaluation mode on the evaluation device.
model.eval()
model.to(device)

# Score the held-out test split and report it as a percentage.
test_accuracy = check_accuracy(model, test_loader, device)
print(f'Test accuracy: {test_accuracy:.2f}%')

Test accuracy: 93.67%


In [None]:
# Persist the fine-tuned model under the "DistilBERT3_Finetuned" directory.
trainer.save_model("DistilBERT3_Finetuned")

### An improvement of 3.6% over the non-fine-tuned DistilBERT model