In [1]:
import os
import torch
import random
import numpy as np
from time import time

SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

import amazonDataset
import twitterDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using:', device)

Using: cuda


In [2]:
amazon_data_dir = '/home/abhishek/Downloads/datasets/kaggle/SentimentAnalysis/amazon/'
twitter_dataset_path = '/home/abhishek/Downloads/datasets/kaggle/SentimentAnalysis/twitter/training.1600000.processed.noemoticon.csv'

In [3]:
amazon_datasets = amazonDataset.AmazonDataset(os.path.join(amazon_data_dir, 'train.ft.txt')), \
                 amazonDataset.AmazonDataset(os.path.join(amazon_data_dir, 'test.ft.txt')), 

twitter_dataset = twitterDataset.ReadDF(twitter_dataset_path)

In [4]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

TOKEN = {
    "INIT": tokenizer.cls_token,
    "EOS": tokenizer.sep_token,
    "PAD": tokenizer.pad_token,
    "UNK": tokenizer.unk_token
}

TOKEN_IDX = {
    "INIT": tokenizer.cls_token_id,
    "EOS": tokenizer.sep_token_id,
    "PAD": tokenizer.pad_token_id,
    "UNK": tokenizer.unk_token_id
}
print(TOKEN, TOKEN_IDX)

max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
print(max_input_length)

{'INIT': '[CLS]', 'EOS': '[SEP]', 'PAD': '[PAD]', 'UNK': '[UNK]'} {'INIT': 101, 'EOS': 102, 'PAD': 0, 'UNK': 100}
512


In [5]:
def TextToToken(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length-2]
    indexed = [TOKEN_IDX['INIT']] + tokenizer.convert_tokens_to_ids(tokens) + [TOKEN_IDX['EOS']]
    return indexed

def BatchTextToTensor(sentences):
    tokens = list(map(TextToToken, sentences))
    max_length = max([len(x) for x in tokens])
    token = map(lambda x: x + [TOKEN_IDX['PAD']]*(max_length-len(x)), list(tokens))
    return torch.LongTensor(list(token))

In [6]:
DATASET = torch.utils.data.ConcatDataset([*amazon_datasets, twitter_dataset])

TRAIN_TEST_RATIO = 0.8
BATCH_SIZE = 64

TRAIN_LEN = int(len(DATASET)*0.8)
TEST_LEN = len(DATASET) - TRAIN_LEN

In [7]:
train_data, test_data = torch.utils.data.random_split(DATASET, [TRAIN_LEN, TEST_LEN])

train_loader = torch.utils.data.DataLoader(train_data, BATCH_SIZE, shuffle=False, num_workers=4)
test_loader = torch.utils.data.DataLoader(test_data, BATCH_SIZE, shuffle=False, num_workers=4)

len(train_loader), len(test_loader)

(120000, 30000)

In [8]:
# bert, word2vec, glove

In [9]:
# ***************** IMDB Dataset ******************** #

# tokens = tokenizer.tokenize('Hello worLD how ArE YOU ? ')
# indexes = tokenizer.convert_tokens_to_ids(tokens)
# tokens, indexes


# from torchtext.legacy import data
# TEXT = data.Field(
#     batch_first=True, use_vocab=False, tokenize=Tokenize, 
#     preprocessing=tokenizer.convert_tokens_to_ids,
#     init_token=TOKEN_IDX['INIT'],
#     eos_token=TOKEN_IDX['EOS'],
#     pad_token=TOKEN_IDX['PAD'],
#     unk_token=TOKEN_IDX['UNK']
# )
# LABEL = data.LabelField(dtype=torch.float)

# from torchtext.legacy import datasets
# train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# LABEL.build_vocab(train_data)

# BATCH_SIZE = (64,64)
# train_iterator, test_iterator = data.BucketIterator.splits(
#     (train_data, test_data),
#     batch_sizes=BATCH_SIZE,
#     device=device
# )

In [10]:
import torch.nn as nn

In [11]:
from transformers import BertTokenizer, BertModel

bert = BertModel.from_pretrained('bert-base-uncased').to(device)
for name, param in bert.named_parameters():
    param.requires_grad = False

print(bert.config.to_dict()['hidden_size'])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


768


In [12]:
class BertGruSentiment(nn.Module):
    def __init__(self, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        embedding_dim = bert.config.to_dict()['hidden_size']
        self.rnn = nn.GRU(
            embedding_dim, hidden_dim, 
            num_layers=n_layers, bidirectional=True, 
            batch_first=True, dropout=dropout
        )
        self.out = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        with torch.no_grad():
            embedded = bert(text)[0]
        
        _, hidden = self.rnn(embedded)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        output = self.out(hidden)
        return output
    
    def param_count(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
    def named_params(self):
        return [x[0] for x in self.named_parameters() if x[1].requires_grad]

In [13]:
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
DROPOUT = 0.5
model = BertGruSentiment(HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT).to(device)

In [14]:
import torch.optim as optim
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)

In [15]:
def accuracy(preds, actual):
    rounded = torch.round(torch.sigmoid(preds))
    correct = (rounded == actual)
    return correct.sum() / len(correct)

In [16]:
def Train(epoch, print_every=100):
    epoch_loss = 0
    epoch_acc = 0
    start_time = time()
    
    model.train()
    for idx, batch in enumerate(train_loader, 1):
        optimizer.zero_grad()
        batch[0] = BatchTextToTensor(batch[0]).to(device)
        batch[1] = batch[1].float().to(device)
        
        predictions = model(batch[0]).squeeze(1)
        
        loss = criterion(predictions, batch[1])
        acc = accuracy(predictions, batch[1])
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
        if idx%print_every == 0:
            print('\t[{}] [{}/{}], Loss: {:.3f}, Accuracy: {:.2f}, Time: {:.1f} minutes'.format(
                epoch, idx, len(train_loader), loss.item(), acc.item(), (time()-start_time)/60
            ))
    epoch_acc /= len(train_loader)
    epoch_loss /= len(train_loader)
    print('Train Epoch: {} | Loss: {:.4f}, Accuracy: {:.2f} | Time: {:.1f} minutes'.format(
        epoch, epoch_loss, epoch_acc, (time()-start_time)/60
    ))

In [17]:
def Test(epoch, print_every=100):
    epoch_loss = 0
    epoch_acc = 0
    start_time = time()
    
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(test_loader, 1):
            batch[0] = BatchTextToTensor(batch[0]).to(device)
            batch[1] = batch[1].float().to(device)
            
            predictions = model(batch[0]).squeeze(1)
            loss = criterion(predictions, batch[1])
            acc = accuracy(predictions, batch[1])

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
            if idx%print_every == 0:
                print('\t[{}] [{}/{}] Testing Completed, Time: {:.1f} minutes'.format(
                    epoch, idx, len(test_loader), (time()-start_time)/60
                ))
    
    epoch_acc /= len(test_loader)
    epoch_loss /= len(test_loader)
    print('Test Epoch: {} | Loss: {:.4f}, Accuracy: {:.2f} | Time: {:.1f} minutes'.format(
        epoch, epoch_loss, epoch_acc, (time()-start_time)/60
    ))

In [18]:
for epoch in range(1):
    Train(epoch, 1000)
    Test(epoch, 1000)
    torch.save(model.state_dict(), f'model_{epoch}.pt')

	[0] [1000/120000], Loss: 0.583, Accuracy: 0.70, Time: 3.4 minutes
	[0] [2000/120000], Loss: 0.436, Accuracy: 0.78, Time: 6.8 minutes
	[0] [3000/120000], Loss: 0.509, Accuracy: 0.75, Time: 10.1 minutes
	[0] [4000/120000], Loss: 0.402, Accuracy: 0.80, Time: 13.5 minutes
	[0] [5000/120000], Loss: 0.504, Accuracy: 0.78, Time: 16.9 minutes
	[0] [6000/120000], Loss: 0.535, Accuracy: 0.72, Time: 20.3 minutes


Process Process-3:
Traceback (most recent call last):
  File "/home/abhishek/anaconda3/envs/abhienv/lib/python3.9/multiprocessing/process.py", line 318, in _bootstrap
    util._exit_function()
  File "/home/abhishek/anaconda3/envs/abhienv/lib/python3.9/multiprocessing/util.py", line 360, in _exit_function
    _run_finalizers()
  File "/home/abhishek/anaconda3/envs/abhienv/lib/python3.9/multiprocessing/util.py", line 300, in _run_finalizers
    finalizer()
  File "/home/abhishek/anaconda3/envs/abhienv/lib/python3.9/multiprocessing/util.py", line 224, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/home/abhishek/anaconda3/envs/abhienv/lib/python3.9/multiprocessing/queues.py", line 201, in _finalize_join
    thread.join()
  File "/home/abhishek/anaconda3/envs/abhienv/lib/python3.9/threading.py", line 1053, in join
    self._wait_for_tstate_lock()
  File "/home/abhishek/anaconda3/envs/abhienv/lib/python3.9/threading.py", line 1069, in _wait_for_tstate_lock
    el

Traceback (most recent call last):
  File "/home/abhishek/anaconda3/envs/abhienv/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_5720/1419694417.py", line 2, in <module>
    Train(epoch, 1000)
  File "/tmp/ipykernel_5720/2242097262.py", line 20, in Train
    epoch_loss += loss.item()
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/abhishek/anaconda3/envs/abhienv/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2064, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/abhishek/anaconda3/envs/abhienv/lib/python3.9/site-packages/IPython/core/ultratb.py", line 1101, in get

TypeError: object of type 'NoneType' has no len()

In [None]:
# model.load_state_dict(torch.load('model_2.pt'))
# Test('test', 100)

In [None]:
# def predict_sentiment(sentence):
#     model.eval()
#     tokens = tokenizer.tokenize(sentence)
#     tokens = tokens[:max_input_length-2]
#     indexed = [TOKEN_IDX['INIT']] + tokenizer.convert_tokens_to_ids(tokens) + [TOKEN_IDX['EOS']]
#     tensor = torch.LongTensor(indexed).to(device)
#     tensor = tensor.unsqueeze(0)
#     prediction = torch.sigmoid(model(tensor))
#     return prediction.item()

In [None]:
# predict_sentiment("This film is great")