In [None]:
import transformers
from transformers import BertTokenizer, BertModel

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset

In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive so files kept there can be read from this Colab runtime.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Tokenizer for the "bert-base-uncased" checkpoint -- the same checkpoint name BERT_sentiment loads its encoder from.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Load the IMDB reviews CSV from Google Drive.
# NOTE(review): '...path/' looks like a placeholder -- replace it with the real location under the mounted drive.
# Later cells read the 'review' and 'sentiment' columns of this frame.
df = pd.read_csv('...path/IMDB Dataset.csv')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set; random_state is fixed so the split is the same on every run.
training_set, test_set = train_test_split(df, random_state = 101, test_size = 0.2)

In [None]:
import re

In [None]:
# Remove HTML tags
# Compiled once at definition time instead of on every call.
_HTML_TAG_PATTERN = re.compile('<.*?>')

def remove_html_tags(text, replacement=''):
    """Strip HTML-style tags (anything matching ``<...>``) from ``text``.

    Args:
        text: The string to clean.
        replacement: What each tag is replaced with. The default ``''`` deletes
            the tag outright (the original behaviour). Pass ``' '`` to keep
            words apart when they are separated only by a tag such as
            ``<br />``. The value is used as an ``re.sub`` replacement string,
            so backslash escapes in it are interpreted.

    Returns:
        ``text`` with every non-overlapping ``<...>`` span substituted. The
        match is non-greedy and does not cross newlines.
    """
    return _HTML_TAG_PATTERN.sub(replacement, text)

def clean_text(text):
    """Normalise a review: lower-case it, then strip HTML tags from the result."""
    return remove_html_tags(text.lower())

In [None]:
# Clean the review text of both splits.
# `assign` returns a new DataFrame rather than writing a column into the frame
# handed back by train_test_split, which avoids pandas' SettingWithCopyWarning.
training_set = training_set.assign(review=training_set['review'].apply(clean_text))
test_set = test_set.assign(review=test_set['review'].apply(clean_text))

In [None]:
def get_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(device_name)

def to_device(data,device):
    """Move ``data`` onto ``device``.

    ``data`` is either a single object with a ``.to`` method or a dict of such
    objects; for a dict, a new dict with the same keys is returned. Every
    transfer is requested with ``non_blocking=True``.
    """
    if not isinstance(data, dict):
        return data.to(device, non_blocking=True)
    moved = {}
    for key, value in data.items():
        moved[key] = value.to(device=device, non_blocking=True)
    return moved

class ToDeviceLoader:
    """Wrap an iterable of batches so each batch is moved to ``device`` when yielded.

    Attributes:
        data: The wrapped iterable of batches (a ``DataLoader`` in this notebook).
        device: The device every batch is moved to via ``to_device``.
    """
    def __init__(self,data,device):
        self.data = data
        self.device = device

    @property
    def dataset(self):
        """The ``dataset`` of the wrapped loader.

        Lets ``len(loader.dataset)`` keep working after wrapping. Raises
        ``AttributeError`` if the wrapped object has no ``dataset`` attribute.
        """
        return self.data.dataset

    def __iter__(self):
        """Yield the wrapped iterable's batches one at a time, each moved to ``self.device``."""
        for batch in self.data:
            yield to_device(batch,self.device)

    def __len__(self):
        """Number of batches, as reported by the wrapped iterable."""
        return len(self.data)

In [None]:
def load_data (dataframe):
    """Split a reviews frame into parallel lists of texts and integer labels.

    Reads the 'review' column as the texts and maps the 'sentiment' column to
    1 for the exact value 'positive' and 0 for anything else.

    Returns:
        A ``(texts, labels)`` tuple of two lists of equal length.
    """
    texts = [review for review in dataframe['review']]
    labels = [int(sentiment == 'positive') for sentiment in dataframe['sentiment']]
    return texts, labels

In [None]:
# Turn each split into a list of review strings plus a parallel list of 0/1 labels.
train_text, train_label = load_data(training_set)
test_text, test_label = load_data(test_set)

In [None]:
class text_dataset():
    """Map-style dataset that tokenises one review per item.

    Args:
        texts: Sequence of review strings.
        labels: Sequence of integer class labels, parallel to ``texts``.
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` here).
        max_length: Fixed token length every example is padded or truncated
            to. Defaults to 150, the value previously hard-coded.
    """
    def __init__(self, texts, labels, tokenizer, max_length=150):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples (length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenised example at position ``idx``.

        Returns:
            A dict with 1-D tensors 'input_ids' and 'attention_mask' (the
            tokenizer's batch dimension is flattened away) and a scalar
            'label' tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=True,
            padding="max_length",
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt',
        )
        return {'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'label': torch.tensor(label)}


In [None]:
def dataloader(batch_size):
    """Build the train and test ``DataLoader`` pair from the notebook-level splits.

    Uses the globals ``train_text``/``train_label``, ``test_text``/``test_label``
    and ``tokenizer``. Only the training loader shuffles its examples.

    Returns:
        ``(train_loader, test_loader)``, both batching ``batch_size`` examples.
    """
    train_loader = DataLoader(
        text_dataset(train_text, train_label, tokenizer),
        batch_size=batch_size,
        shuffle=True,
    )
    test_loader = DataLoader(
        text_dataset(test_text, test_label, tokenizer),
        batch_size=batch_size,
    )
    return train_loader, test_loader

In [None]:
import torch.nn as nn

In [None]:
class BERT_sentiment(nn.Module):
    """BERT encoder ("bert-base-uncased") followed by a linear classification head.

    ``forward`` returns raw class scores (logits), not probabilities.
    ``nn.CrossEntropyLoss`` applies log-softmax internally, so feeding it
    already-softmaxed outputs normalises twice and flattens the gradients.
    ``argmax`` over logits gives the same class as ``argmax`` over
    probabilities, so prediction code is unaffected.

    Args:
        num_class: Number of output classes of the linear head.
    """
    def __init__(self, num_class):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.fc = nn.Linear(self.bert.config.hidden_size, num_class)
        # Not applied in forward(); kept so callers can still get probabilities
        # via ``model.softmax(logits)``. It has no parameters, so saved
        # state_dicts are unaffected.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape (batch, num_class) for the given token ids and mask."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.fc(pooled_output)

In [None]:
# Two output classes: index 1 = positive, index 0 = negative (the encoding produced by load_data).
model = BERT_sentiment(2)

In [None]:
def count_parameters(model):
    """Return the total element count of the model's parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

In [None]:
# Cross-entropy loss over the class scores. NOTE(review): nn.CrossEntropyLoss applies log-softmax
# itself, so it expects unnormalised logits from the model -- confirm the model's forward() does not normalise.
criterion = nn.CrossEntropyLoss()
# Adam over every parameter of the model (BERT encoder and linear head) with learning rate 2e-5.
optimizer = torch.optim.Adam(model.parameters(), lr = 2e-5)

In [None]:
# CUDA device when available, otherwise CPU.
device = get_device()

In [None]:
# Build loaders with batches of 16 examples, then wrap them so each batch is moved to `device` as it is yielded.
train_loader, test_loader = dataloader(16)
train_loader = ToDeviceLoader(train_loader, device)
test_loader = ToDeviceLoader(test_loader, device)

In [None]:
def acc(pred, label):
    """Count the predictions that equal their labels.

    ``pred`` is squeezed and rounded to the nearest integer value, then
    compared element-wise with the squeezed ``label``.

    Returns:
        The number of matching positions as a Python number (a count, not a ratio).
    """
    rounded = pred.squeeze().round()
    matches = rounded.eq(label.squeeze())
    return matches.sum().item()

In [None]:
# Move the model to the same device the wrapped loaders send their batches to.
model.to(device)

In [None]:
# training loop:
# Each epoch runs one optimisation pass over train_loader followed by one
# evaluation pass over test_loader, then records mean loss and accuracy for both.
num_epoch = 10
epoch_tr_loss,epoch_vl_loss = [],[]
epoch_tr_acc,epoch_vl_acc = [],[]
for epoch in range(num_epoch):
    # ---- training pass ----
    train_losses = []
    train_acc = 0.0
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        output1 = model(input_ids, attention_mask)
        loss = criterion(output1, labels)
        train_losses.append(loss.item())
        loss.backward()
        optimizer.step()
        # Accuracy bookkeeping needs no gradients; detach keeps it out of the graph.
        pred1 = torch.argmax(output1.detach(), dim = -1)
        accuracy = acc(pred1,labels)
        train_acc += accuracy

    # ---- validation pass ----
    val_loss =[]
    val_acc = 0.0
    model.eval()
    # no_grad: evaluation never calls backward(), so skipping autograd graph
    # construction saves memory and time without changing the outputs.
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['label']
            output = model(input_ids, attention_mask)
            loss = criterion(output, labels)
            val_loss.append(loss.item())
            pred = torch.argmax(output, dim = -1)
            accuracy = acc(pred, labels)
            val_acc += accuracy

    # ---- epoch summary ----
    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_loss)
    # Accuracy = correct predictions / number of rows in the corresponding split.
    epoch_train_acc = train_acc/(len(training_set))
    epoch_val_acc = val_acc/(len(test_set))
    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)
    print(f'Epoch {epoch+1}')
    print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')
    print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')
    print(25*'==')

In [None]:
import matplotlib.pyplot as plt

In [None]:

# Plot the per-epoch training/validation curves side by side:
# accuracy in the left panel, loss in the right panel.
fig = plt.figure(figsize = (20, 6))
panels = (
    ("Accuracy", ((epoch_tr_acc, 'Train Acc'), (epoch_vl_acc, 'Validation Acc'))),
    ("Loss", ((epoch_tr_loss, 'Train loss'), (epoch_vl_loss, 'Validation loss'))),
)
for position, (panel_title, curves) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    for values, curve_label in curves:
        plt.plot(values, label=curve_label)
    plt.title(panel_title)
    plt.legend()
    plt.grid()

plt.show()

# Inference with an example

In [None]:
# Example review used to try the model on a single input.
text = 'I really hate this film, it is the worst film I have ever seen'


In [None]:
# Tokenise the example with the same settings text_dataset uses for training data.
# truncation=True was missing: without it an input longer than max_length is not
# cut down to the 150-token length the training examples have.
encode = tokenizer.encode_plus(
    text,
    add_special_tokens= True,
    max_length= 150,
    return_token_type_ids= True,
    padding= "max_length",
    truncation= True,
    return_attention_mask= True,
    return_tensors= 'pt',
)

In [None]:
# Classify the encoded example.
# eval() puts the model in inference mode (dropout off) even if this cell is run
# on its own; no_grad() skips autograd bookkeeping, which inference does not need.
model.eval()
encode = encode.to(device)
input_ids = encode['input_ids']
attention_mask = encode['attention_mask']
with torch.no_grad():
    output = model(input_ids, attention_mask)
# Index of the highest-scoring class for the single example in the batch.
pred = torch.argmax(output, dim = -1)


In [None]:
# Class index 1 is reported as 'positive'; any other index as 'negative'.
print('positive' if pred == 1 else 'negative')

In [None]:
# Save only the model's state_dict (its parameter tensors), not the module object itself.
# NOTE(review): 'path' is presumably a placeholder -- replace it with the real destination file.
torch.save(model.state_dict(), 'path')