<a href="https://colab.research.google.com/github/2017soft/MovieReview/blob/master/transformer_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!cp 'drive/My Drive/utils.py' .

In [3]:
from utils import read_data_csv

In [4]:
df = read_data_csv('dataset.csv')
print(df.head())

                                              REVIEW  SENTIMENT
0  A series of escapades demonstrating the adage ...          1
1  A series of escapades demonstrating the adage ...          2
2                                           A series          2
3                                                  A          2
4                                             series          2


In [5]:
!pip install transformers
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |▏                               | 10kB 22.0MB/s eta 0:00:01[K     |▍                               | 20kB 17.7MB/s eta 0:00:01[K     |▋                               | 30kB 15.7MB/s eta 0:00:01[K     |▉                               | 40kB 14.8MB/s eta 0:00:01[K     |█                               | 51kB 11.7MB/s eta 0:00:01[K     |█▎                              | 61kB 12.0MB/s eta 0:00:01[K     |█▌                              | 71kB 11.8MB/s eta 0:00:01[K     |█▊                              | 81kB 11.9MB/s eta 0:00:01[K     |██                              | 92kB 11.7MB/s eta 0:00:01[K     |██▏                             | 102kB 11.7MB/s eta 0:00:01[K     |██▍                             | 112kB 11.7MB/s eta 0:00:01[K     |██▋                             | 

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Define some hyperparameters

In [7]:
MAX_LEN = 64
TRAIN_BATCH_SIZE = 100
VALID_BATCH_SIZE = 100
EPOCHS = 10
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




# Preprocess the text data

In [8]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        review = str(self.data.REVIEW[index])
        review = " ".join(review.split())
        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.SENTIMENT[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [9]:
random_seed = 0
train_size = 0.95
train_dataset=df.sample(frac=train_size,random_state=random_seed)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (156060, 2)
TRAIN Dataset: (148257, 2)
TEST Dataset: (7803, 2)


In [10]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# Define the modified BERT model for this sentiment classification task

In [11]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained("bert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [12]:
model = BERTClass()
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

# Create the loss function and the optimizer

In [13]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

# Import the training and validation function for tuning the model

In [16]:
from utils import train, valid

In [17]:
from datetime import datetime

In [18]:
for epoch in range(EPOCHS):
  train(model, training_loader, loss_function, optimizer, epoch, device, report_steps=200)
  valid(model, testing_loader, loss_function, epoch, device, report_steps=20)
  if epoch % 3 == 0:
    torch.save(model.state_dict(), f"drive/My Drive/my_bert_model_{datetime.now()}.pt")

There are 1483 batches in the training set




Training Loss after step 0: 1.6711827516555786
Training Accuracy after step 0: 14.0
Training Loss after step 200: 1.2587491683105925
Training Accuracy after step 200: 49.60696517412935
Training Loss after step 400: 1.2309422665403371
Training Accuracy after step 400: 50.44887780548628
Training Loss after step 600: 1.2190131068031325
Training Accuracy after step 600: 50.99001663893511
Training Loss after step 800: 1.2094431297014119
Training Accuracy after step 800: 51.1398252184769
Training Loss after step 1000: 1.197708324893014
Training Accuracy after step 1000: 51.57142857142857
Training Loss after step 1200: 1.182866516061667
Training Accuracy after step 1200: 52.210657785179016
Training Loss after step 1400: 1.1662412259427246
Training Accuracy after step 1400: 52.91720199857245
The Total Accuracy for Epoch 0: 53.15769238551974
Training Loss Epoch 0: 1.159979079835574
Training Accuracy Epoch 0: 53.15769238551974
There are 79 batches in the test set
Validation Loss after 0 steps: 0

### Part of the code in this notebook is modified from the tutorial here https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb