<a href="https://colab.research.google.com/github/2017soft/MovieReview/blob/master/transformer_distill_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!cp 'drive/My Drive/utils.py' .

In [3]:
from utils import read_data_csv

In [4]:
df = read_data_csv('dataset.csv')
print(df.head())

                                              REVIEW  SENTIMENT
0  A series of escapades demonstrating the adage ...          1
1  A series of escapades demonstrating the adage ...          2
2                                           A series          2
3                                                  A          2
4                                             series          2


In [5]:
!pip install transformers
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer



In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Define some hyperparameters

In [7]:
MAX_LEN = 64
TRAIN_BATCH_SIZE = 500
VALID_BATCH_SIZE = 500
EPOCHS = 10
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

# Preprocess the text data

In [8]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        review = str(self.data.REVIEW[index])
        review = " ".join(review.split())
        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.SENTIMENT[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [9]:
random_seed = 0
train_size = 0.95
train_dataset=df.sample(frac=train_size,random_state=random_seed)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (156060, 2)
TRAIN Dataset: (148257, 2)
TEST Dataset: (7803, 2)


In [10]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# Define the modified DistillBERT model for this sentiment classification task

In [11]:
class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [12]:
model = DistillBERTClass()
model.to(device)

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

# Create the loss function and the optimizer

In [13]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

# Import the training and validation function for tuning the model

In [15]:
from utils import train, valid

# Start training and validating the model

In [17]:
from datetime import datetime

In [18]:
for epoch in range(EPOCHS):
  train(model, training_loader, loss_function, optimizer, epoch, device)
  valid(model, testing_loader, loss_function, epoch, device)
  if epoch % 3 == 0:
    torch.save(model.state_dict(), f"drive/My Drive/my_distillbert_model_{datetime.now()}.pt")

There are 297 batches in the training set




Training Loss after step 0: 1.616094708442688
Training Accuracy after step 0: 11.8
Training Loss after step 50: 1.3399755370383168
Training Accuracy after step 50: 47.227450980392156
Training Loss after step 100: 1.2791901177699023
Training Accuracy after step 100: 49.073267326732676
Training Loss after step 150: 1.2547213873326384
Training Accuracy after step 150: 49.87152317880795
Training Loss after step 200: 1.2415462726384252
Training Accuracy after step 200: 50.24676616915423
Training Loss after step 250: 1.2293505103464621
Training Accuracy after step 250: 50.70677290836653
The Total Accuracy for Epoch 0: 50.92643180423184
Training Loss Epoch 0: 1.220914981983326
Training Accuracy Epoch 0: 50.92643180423184
There are 16 batches in the test set
Validation Loss after 0 steps: 1.165771722793579
Validation Accuracy after 0 steps: 51.6
Validation Loss after 10 steps: 1.1380802176215432
Validation Accuracy after 10 steps: 53.472727272727276
Validation Loss after Epoch 0: 1.13589645177

# Save the model for further training and inference later

In [19]:
model_path = f"drive/My Drive/my_distillbert_model_{datetime.now()}.pt"
torch.save(model.state_dict(), model_path)

# Retrieve the saved model

In [20]:
saved_model = DistillBERTClass()
saved_model.load_state_dict(torch.load(model_path))
saved_model.to(device)


DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [21]:
saved_model_loss_function = torch.nn.CrossEntropyLoss()
saved_model_optimizer = torch.optim.Adam(params =  saved_model.parameters(), lr=LEARNING_RATE)

# Train the saved model for more iterations

In [22]:
for epoch in range(2):
    train(saved_model, training_loader, saved_model_loss_function, saved_model_optimizer, epoch, device)
    valid(saved_model, testing_loader, saved_model_loss_function, epoch, device)

There are 297 batches in the training set




Training Loss after step 0: 0.6865410804748535
Training Accuracy after step 0: 72.6
Training Loss after step 50: 0.7018031279246012
Training Accuracy after step 50: 71.70196078431373
Training Loss after step 100: 0.6931082883683761
Training Accuracy after step 100: 71.97821782178218
Training Loss after step 150: 0.6909897820839029
Training Accuracy after step 150: 71.9841059602649
Training Loss after step 200: 0.6909190923420351
Training Accuracy after step 200: 71.9820895522388
Training Loss after step 250: 0.6910970702589271
Training Accuracy after step 250: 71.9617529880478
The Total Accuracy for Epoch 0: 71.9156599688379
Training Loss Epoch 0: 0.6913075441061848
Training Accuracy Epoch 0: 71.9156599688379
There are 16 batches in the test set
Validation Loss after 0 steps: 0.7640259861946106
Validation Accuracy after 0 steps: 69.8
Validation Loss after 10 steps: 0.8115146105939691
Validation Accuracy after 10 steps: 67.67272727272727
Validation Loss after Epoch 0: 0.8194257318973541

### Part of the code in this notebook is modified from the tutorial here https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb