In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [4]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/04/58/3d789b98923da6485f376be1e04d59ad7003a63bdb2b04b5eea7e02857e5/transformers-2.5.0-py3-none-any.whl (481kB)
[K     |████████████████████████████████| 491kB 3.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 16.6MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 20.1MB/s 
Collecting tokenizers==0.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/7e/1d/ea7e2c628942e686595736f73678348272120d026b7acd54fe43e5211bb1/tokenizers-0.5.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K    

In [5]:
import json 
import dask 
import dask.bag as db
import pandas as pd
import numpy as np
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
# from pytorch_pretrained_bert import BertTokenizer
from transformers import BertForQuestionAnswering, AlbertConfig, AlbertModel, RobertaModel, RobertaTokenizer
from sklearn.model_selection import train_test_split
from dask import delayed 

In [6]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

HBox(children=(IntProgress(value=0, description='Downloading', max=898823, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_wid…




In [0]:
train_path = '/content/drive/My Drive/IMDB/IMDB Dataset.csv'

In [8]:
data = pd.read_csv(train_path)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [0]:
def tokenize_text(text, max_seq_length):
    tokens_text = tokenizer.tokenize(text)
    ids_text = tokenizer.convert_tokens_to_ids(tokens_text)
    padding = [0] * max_seq_length - len(ids_text)

    ids_text += padding

    return ids_text

In [0]:
class IMDBData(Dataset):
    def __init__(self, path):
        self.path = path 
        self.tokens = None
        self.masks = None
        self.segment_ids = None
        self.sentiments = None

    def download_data(self):
        data = pd.read_csv(self.path)
        token_ids = []
        sentiments = []
        masks = []

        for i in tqdm(range(int(data.shape[0] / 1))):
            text = tokenizer.tokenize(data.iloc[i, 0])
            # print(text)
            if len(text) <= 512:
                if data.iloc[i, 1] == "positive":
                    sentiment = 1
                elif data.iloc[i, 1] == "negative":
                    sentiment = 0

                mask = [1 for i in range(len(text))]
                tokens = tokenizer.convert_tokens_to_ids(text)
                # print(tokens)
                
                padding = [0] * (512 - len(text))

                tokens += padding
                mask += padding

                token_ids.append(tokens)
                sentiments.append(sentiment)
                masks.append(mask)

        self.tokens = torch.tensor(token_ids, dtype=torch.int64)
        self.sentiments = torch.tensor(sentiments, dtype=torch.float32)
        self.masks = torch.tensor(masks)
        self.segment_ids = torch.tensor(np.zeros(self.masks.shape), dtype=torch.int64)

    def __getitem__(self, index):
        return self.tokens[index, :], self.segment_ids[index, :], self.masks[index, :], self.sentiments[index]

    def __len__(self):
      return self.tokens.shape[0]


In [11]:
a = IMDBData(train_path)
a.download_data()

100%|██████████| 50000/50000 [01:08<00:00, 730.05it/s]


In [0]:
train_data, test_data = train_test_split(a, test_size=0.1)

In [0]:
class SA(nn.Module):
    def __init__(self, config, n1, n2, n3, n4, dropout_prob):
        super(SA, self).__init__()
        self.bert = RobertaModel.from_pretrained('roberta-large')
        self.dropout = nn.Dropout(dropout_prob)
        self.linear1 = nn.Linear(n1, n2)
        self.linear2 = nn.Linear(n2, n3)
        self.linear3 = nn.Linear(n3, n4)
        self.sigmoid = nn.Sigmoid()
        self.celu = nn.CELU()
        nn.init.xavier_normal_(self.linear1.weight)
        nn.init.xavier_normal_(self.linear2.weight)
        nn.init.xavier_normal_(self.linear3.weight)

    def forward(self, input_token_ids, segment_ids, mask):

        encoded_layers, pooled_output = self.bert(input_token_ids, mask, segment_ids)
        
        # print(encoded_layers.shape, pooled_output.shape)
        med1 = self.linear1(pooled_output)
        med1 = self.dropout(med1)
        output1 = self.celu(med1)

        med2 = self.linear2(output1)
        output2 = self.celu(med2)

        med3 = self.linear3(output2)
        output3 = self.sigmoid(med3)

        return output3
    
    def freeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = False
    
    def unfreeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = True

In [0]:
config = AlbertConfig()

In [15]:
model = SA(config, 1024, 300, 50, 1, 0.3)# .cuda()
model.freeze_bert_encoder()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0)
data_loader = DataLoader(train_data, batch_size=128)
test_loader = DataLoader(test_data, batch_size=128)
criterion = nn.BCELoss().cuda()

HBox(children=(IntProgress(value=0, description='Downloading', max=525, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=1425941629, style=ProgressStyle(description…




In [0]:
model.load_state_dict(torch.load("/content/drive/My Drive/IMDB/model.pth"))
model = model.cuda()
model.eval()

In [0]:

def train_model(model, criterion, data_loader, test_loader, optimizer, num_epochs=3):
    for epoch in range(1, num_epochs+1):
        model.train()
        loss_accum = 0
        loss_accum_val = 0
        
        for i_step, (input_token_ids, segment_ids, mask, sentiment) in tqdm(enumerate(data_loader)):
            input_token_ids = input_token_ids.cuda()
            segment_ids = segment_ids.cuda()
            mask = mask.cuda()
            sentiment = sentiment.reshape(-1, 1).cuda()

            # print("Making prediction")
            prediction1 = model(input_token_ids, segment_ids, mask)
            # print("prediction made")

            # print(prediction1.shape, sentiment.shape)

            loss_value = criterion(prediction1, sentiment)
            # print("Loss value counted")

            optimizer.zero_grad()
            loss_value.backward()
            optimizer.step()
            # print("Backward done")

            
            loss_accum += loss_value.item()

        for j_step, (input_token_ids_val, segment_ids_val, mask_val, sentiment_val) in enumerate(test_loader):
            input_token_ids_val = input_token_ids_val.cuda()
            segment_ids_val = segment_ids_val.cuda()
            mask_val = mask_val.cuda()
            sentiment_val = sentiment_val.reshape(-1, 1).cuda()

            val_prediction = model(input_token_ids_val, segment_ids_val, mask_val).cuda()
            # val_loss = criterion(val_prediction, sentiment_val)

            val_prediction = torch.tensor(val_prediction > 0.5, dtype=torch.int64).cuda()
            accuracy = torch.tensor(val_prediction == sentiment_val, dtype=torch.float32).mean()

            loss_accum_val += accuracy.item()

        average_loss = loss_accum / (i_step+1)
        average_loss_val = loss_accum_val / (j_step+1)
        print(f"Epoch:{epoch}, loss: {average_loss}, mean validation accuracy: {average_loss_val}")
        torch.save(model.state_dict(), "/content/drive/My Drive/IMDB/model.pth")

In [20]:
train_model(model, criterion, data_loader, test_loader, optimizer, num_epochs=3)

306it [35:06,  5.73s/it]


Epoch:1, loss: 0.4120093377587063, mean validation accuracy: 0.8256893385859096


306it [35:06,  5.73s/it]


Epoch:2, loss: 0.40654150050839566, mean validation accuracy: 0.8270986518439125


306it [35:06,  5.73s/it]


Epoch:3, loss: 0.3913433503091725, mean validation accuracy: 0.8190257356447332


In [19]:
model.load_state_dict(torch.load("/content/drive/My Drive/IMDB/model.pth"))
model = model.cuda()
model.eval()

SA(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-05, 