In [1]:
import torch
import pandas as pd

# nlp library of Pytorch
from torchtext.legacy import data as dt
import numpy as np
import torchtext

import warnings as wrn
wrn.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

# Directory holding the Twitter sentiment files. The default is the original
# cluster location; set TWITTER_DATA_DIR to run the notebook somewhere else.
DATA_DIR = os.environ.get(
    "TWITTER_DATA_DIR", "/cluster/home/abkumar/dataset/twitter-datasets"
)

# Raw inputs (read line by line below) and the combined CSV that is written
# and later re-read by the dataset loader.
file_path_pos = os.path.join(DATA_DIR, "train_pos.txt")
file_path_neg = os.path.join(DATA_DIR, "train_neg.txt")
file_path_all = os.path.join(DATA_DIR, "train_all.csv")

In [3]:
def _read_tweets(path):
    """Read ``path`` line by line and return the lines with placeholder tags removed.

    The literal ``<user>`` and ``<url>`` markers are deleted from every line;
    everything else, including each line's trailing newline, is kept as read.

    Args:
        path: location of a text file with one entry per line.

    Returns:
        list[str]: one cleaned string per input line, in file order.
    """
    # Explicit UTF-8 so decoding does not depend on the machine's locale
    # (the combined CSV is written as UTF-8 as well).
    with open(path, encoding="utf-8") as f:
        # NOTE(review): deleting a tag leaves its surrounding spaces behind,
        # which presumably explains the ' ' tokens seen later - TODO confirm.
        return [line.replace('<user>', '').replace('<url>', '') for line in f]


data_pos = _read_tweets(file_path_pos)
data_neg = _read_tweets(file_path_neg)



In [4]:
# Positive tweets come first, negatives after; the label list follows the same
# order, using +1 for positive and -1 for negative.
data = [*data_pos, *data_neg]
data_labels = [1 for _ in data_pos] + [-1 for _ in data_neg]
print(len(data))

200000


In [5]:
# Two-column frame: the tweet text and its label (+1 / -1), aligned by position.
df = pd.DataFrame({'text': data, 'label': data_labels})
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    200000 non-null  object
 1   label   200000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.1+ MB
None


In [6]:
# Write to the shared path constant (same location as before) so that this
# export and the later dataset load cannot drift apart.
df.to_csv(file_path_all, sep=',', encoding='utf-8', index=False)

In [3]:
import spacy
from spacy.tokenizer import Tokenizer

# Only the vocabulary of the loaded pipeline is handed to a bare Tokenizer;
# no prefix/suffix/infix rules are configured here.
nlp = spacy.load('en_core_web_lg')
tokenizer = Tokenizer(nlp.vocab)


def spacy_tokenize(x):
    """Return the text of every token the module-level ``tokenizer`` yields for ``x``."""
    pieces = []
    for tok in tokenizer(x):
        pieces.append(tok.text)
    return pieces


In [4]:
# Text column: tokenized with spacy_tokenize; batches are batch-first and
# include_lengths=True is set (batch.text is later unpacked as (text, lengths)).
TEXT = dt.Field(
    tokenize=spacy_tokenize,
    include_lengths=True,
    batch_first=True,
)
# Label column: float dtype, batch-first.
LABEL = dt.LabelField(
    batch_first=True,
    dtype=torch.float,
)

In [5]:
# (name, Field) pairs handed to the dataset loader; listed in the same order
# as the CSV columns written earlier (text, label).
fields = [
    ('text', TEXT),
    ("label", LABEL),
]

In [6]:
# Load the combined CSV, skipping its header row.
training_data = dt.TabularDataset(
    path=file_path_all,
    format="csv",
    skip_header=True,
    fields=fields,
)

# Peek at the attributes of the first parsed example.
print(vars(training_data.examples[0]))

{'text': [' ', 'i', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', '.', 'only', 'justin', 'and', 'god', 'knows', 'about', 'that', ',', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', '#believe', '15'], 'label': '1'}


In [7]:
import random

# train and validation splitting
# random.seed() returns None, so it must not be passed as `random_state`
# directly: that hands None to split() and only stays reproducible through the
# side effect of seeding the global RNG. Seed first, then pass the resulting
# generator state explicitly.
random.seed(2022)
train_data, valid_data = training_data.split(split_ratio=0.75,
                                             random_state=random.getstate())

In [8]:
# Build the token -> integer vocabularies from the training split only.
# min_freq=5 is the frequency threshold applied to text tokens.
TEXT.build_vocab(train_data, min_freq=5)
LABEL.build_vocab(train_data)

In [14]:
# Report how many entries each vocabulary ended up with.
print(f"Size of text vocab: {len(TEXT.vocab)}")
print(f"Size of label vocab: {len(LABEL.vocab)}")


Size of text vocab: 17676
Size of label vocab: 2


In [9]:
# Ten most frequent tokens counted while building the text vocabulary.
# NOTE(review): in the saved output the top entry is the whitespace token ' ' -
# presumably a by-product of deleting <user>/<url> from the raw tweets; consider
# normalising whitespace when loading. TODO confirm.
TEXT.vocab.freqs.most_common(10)

[(' ', 75790),
 ('!', 62129),
 ('i', 60373),
 ('the', 45513),
 (',', 44949),
 ('.', 44937),
 ('to', 41818),
 ('you', 35803),
 ('(', 35200),
 ('a', 31076)]

In [10]:
# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
BATCH_SIZE = 64

# Iterators that hand out batches of the train / validation splits on demand.
# BucketIterator groups samples of similar length into the same batch, which
# reduces the number of padding tokens needed.


def _text_length(example):
    """Sort key for bucketing: number of tokens in the example's text."""
    return len(example.text)


train_iterator, validation_iterator = dt.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    # Sort key is how to sort the samples
    sort_key=_text_length,
    # Keep every batch ordered by the sort key; the model packs sequences with
    # pack_padded_sequence's default settings, which expect sorted lengths.
    sort_within_batch=True,
    device=device,
)

In [12]:
import torch.nn as nn


class LSTMNet(nn.Module):
    """Embedding -> stacked (optionally bidirectional) LSTM -> linear -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector (the LSTM input size).
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of outputs of the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout value passed to ``nn.LSTM``.
    """

    def __init__(self,vocab_size,embedding_dim,hidden_dim,output_dim,n_layers,bidirectional,dropout):

        super(LSTMNet,self).__init__()

        # Embedding layer converts integer sequences to vector sequences
        self.embedding = nn.Embedding(vocab_size,embedding_dim)

        # LSTM layer processes the vector sequences
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers = n_layers,
                            bidirectional = bidirectional,
                            dropout = dropout,
                            batch_first = True
                           )

        # The classifier sees one final hidden state per direction, so its
        # input width depends on `bidirectional` instead of always being 2x.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions,output_dim)
        # Prediction activation function
        self.sigmoid = nn.Sigmoid()


    def forward(self,text,text_lengths):
        """Return sigmoid outputs of shape (batch, output_dim).

        Args:
            text: integer token ids, batch-first (batch, seq_len).
            text_lengths: unpadded length of every sequence in ``text``; with
                the default packing settings used here the batch must be
                ordered by decreasing length.
        """
        embedded = self.embedding(text)

        # Packing keeps the LSTM from processing padding positions.
        # Lengths are moved to the CPU before being handed to the packer.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(),batch_first=True)

        _, (hidden_state, _) = self.lstm(packed_embedded)

        if self.lstm.bidirectional:
            # Last layer's final forward and backward hidden states.
            hidden = torch.cat((hidden_state[-2,:,:], hidden_state[-1,:,:]), dim = 1)
        else:
            # Single direction: only the last layer's final hidden state.
            hidden = hidden_state[-1,:,:]

        dense_outputs = self.fc(hidden)

        # Final activation function
        return self.sigmoid(dense_outputs)

In [13]:
# Hyperparameters handed to the LSTMNet constructor.
SIZE_OF_VOCAB = len(TEXT.vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 200              # embedding vector size / LSTM input size
NUM_HIDDEN_NODES = 128           # LSTM hidden size
NUM_OUTPUT_NODES = 1             # single output unit
NUM_LAYERS = 2                   # stacked LSTM layers
BIDIRECTION = True               # run the LSTM in both directions
DROPOUT = 0.3                    # dropout value passed to nn.LSTM

In [14]:
# Instantiate the network from the hyperparameter constants; keyword arguments
# make the mapping onto LSTMNet's constructor explicit.
model = LSTMNet(
    vocab_size=SIZE_OF_VOCAB,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=NUM_HIDDEN_NODES,
    output_dim=NUM_OUTPUT_NODES,
    n_layers=NUM_LAYERS,
    bidirectional=BIDIRECTION,
    dropout=DROPOUT,
)

In [15]:
import torch.optim as optim
# Move the network to the selected device before handing its parameters to Adam.
model = model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)
# Binary cross-entropy on probabilities (the model already applies a sigmoid).
criterion = nn.BCELoss().to(device)

In [25]:
# Display the module summary.
# NOTE(review): the saved output below was produced at execution count 25 and
# shows Embedding(17676, 100) / LSTM(100, 64, dropout=0.2), which does not match
# the hyperparameters currently set (EMBEDDING_DIM = 200, NUM_HIDDEN_NODES = 128,
# DROPOUT = 0.3) - presumably left over from an earlier configuration; re-run
# the notebook top to bottom to refresh it.
model

LSTMNet(
  (embedding): Embedding(17676, 100)
  (lstm): LSTM(100, 64, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [16]:
def binary_accuracy(preds, y):
    """Fraction of predictions that equal ``y`` after rounding.

    ``preds`` is rounded with ``torch.round`` and compared element-wise with
    ``y``; the number of matches is divided by ``len`` of the comparison
    result (the size of its first dimension).
    """
    hits = torch.eq(torch.round(preds), y).to(torch.float)
    return hits.sum() / len(hits)

In [17]:
def train(model,iterator,optimizer,criterion):
    """Run one training pass over ``iterator`` and update ``model`` in place.

    Args:
        model: called as ``model(text, text_lengths)``; its output is expected
            to have a size-1 second dimension (as with a single output unit).
        iterator: yields batches exposing ``batch.text`` as a
            ``(text, text_lengths)`` pair and ``batch.label``.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss called as ``criterion(predictions, batch.label)``.

    Returns:
        tuple[float, float]: loss and accuracy, each averaged over the number
        of batches (``len(iterator)``).
    """
    epoch_loss = 0.0
    epoch_acc = 0.0

    model.train()

    for batch in iterator:

        # Reset gradients accumulated by the previous step.
        optimizer.zero_grad()

        text,text_lengths = batch.text

        # Drop only the trailing size-1 output dimension. A bare .squeeze()
        # would also drop the batch dimension when a batch holds a single
        # example, leaving a 0-d tensor that no longer matches batch.label.
        predictions = model(text,text_lengths).squeeze(1)

        # computing loss / backward propagation
        loss = criterion(predictions,batch.label)
        loss.backward()

        # accuracy
        acc = binary_accuracy(predictions,batch.label)

        # updating params
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    # Means of the per-batch loss and accuracy.
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [18]:
def evaluate(model,iterator,criterion):
    """Score ``model`` on ``iterator`` without updating any parameters.

    Args:
        model: called as ``model(text, text_lengths)``; its output is expected
            to have a size-1 second dimension (as with a single output unit).
        iterator: yields batches exposing ``batch.text`` as a
            ``(text, text_lengths)`` pair and ``batch.label``.
        criterion: loss called as ``criterion(predictions, batch.label)``.

    Returns:
        tuple[float, float]: loss and accuracy, each averaged over the number
        of batches (``len(iterator)``).
    """
    epoch_loss = 0.0
    epoch_acc = 0.0

    # Evaluation mode (deactivates dropout).
    model.eval()

    # No gradient tracking is needed while evaluating.
    with torch.no_grad():
        for batch in iterator:
            text,text_lengths = batch.text

            # Drop only the trailing size-1 output dimension. A bare
            # .squeeze() would also drop the batch dimension for a batch of
            # one example and no longer match batch.label.
            predictions = model(text,text_lengths).squeeze(1)

            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


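# Hyperparameters recorded for the training run directly below (apparently the smaller configuration):
SIZE_OF_VOCAB = len(TEXT.vocab)
EMBEDDING_DIM = 100
NUM_HIDDEN_NODES = 64
NUM_OUTPUT_NODES = 1
NUM_LAYERS = 2
BIDIRECTION = True
DROPOUT = 0.2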

In [35]:
EPOCH_NUMBER = 10
for epoch in range(1, EPOCH_NUMBER + 1):

    # One pass over the training split, then a pass over the validation split.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, validation_iterator, criterion)

    # Showing statistics: two report lines followed by an empty line.
    print(
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%',
        '',
        sep='\n',
    )

	Train Loss: 0.417 | Train Acc: 79.80%
	 Val. Loss: 0.404 |  Val. Acc: 80.56%

	Train Loss: 0.390 | Train Acc: 81.57%
	 Val. Loss: 0.388 |  Val. Acc: 81.55%

	Train Loss: 0.371 | Train Acc: 82.65%
	 Val. Loss: 0.379 |  Val. Acc: 82.09%

	Train Loss: 0.356 | Train Acc: 83.62%
	 Val. Loss: 0.374 |  Val. Acc: 82.49%

	Train Loss: 0.343 | Train Acc: 84.34%
	 Val. Loss: 0.367 |  Val. Acc: 82.96%

	Train Loss: 0.332 | Train Acc: 84.96%
	 Val. Loss: 0.366 |  Val. Acc: 83.20%

	Train Loss: 0.322 | Train Acc: 85.46%
	 Val. Loss: 0.364 |  Val. Acc: 83.26%

	Train Loss: 0.313 | Train Acc: 86.00%
	 Val. Loss: 0.364 |  Val. Acc: 83.53%

	Train Loss: 0.304 | Train Acc: 86.51%
	 Val. Loss: 0.363 |  Val. Acc: 83.56%

	Train Loss: 0.296 | Train Acc: 87.03%
	 Val. Loss: 0.364 |  Val. Acc: 83.62%



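# Hyperparameters recorded for the training run directly below (apparently the larger configuration):
SIZE_OF_VOCAB = len(TEXT.vocab)
EMBEDDING_DIM = 200
NUM_HIDDEN_NODES = 128
NUM_OUTPUT_NODES = 1
NUM_LAYERS = 2
BIDIRECTION = True
DROPOUT = 0.3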

In [19]:
EPOCH_NUMBER = 10
for epoch in range(1, EPOCH_NUMBER + 1):

    # One pass over the training split, then a pass over the validation split.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, validation_iterator, criterion)

    # Showing statistics: epoch header, two report lines, then an empty line.
    print("Epoch -- ", epoch)
    print(
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%',
        '',
        sep='\n',
    )

Epoch --  1
	Train Loss: 0.463 | Train Acc: 76.74%
	 Val. Loss: 0.402 |  Val. Acc: 80.94%

Epoch --  2
	Train Loss: 0.382 | Train Acc: 82.03%
	 Val. Loss: 0.374 |  Val. Acc: 82.65%

Epoch --  3
	Train Loss: 0.352 | Train Acc: 83.82%
	 Val. Loss: 0.367 |  Val. Acc: 82.99%

Epoch --  4
	Train Loss: 0.329 | Train Acc: 85.11%
	 Val. Loss: 0.360 |  Val. Acc: 83.62%

Epoch --  5
	Train Loss: 0.310 | Train Acc: 86.18%
	 Val. Loss: 0.361 |  Val. Acc: 83.48%

Epoch --  6
	Train Loss: 0.293 | Train Acc: 87.15%
	 Val. Loss: 0.371 |  Val. Acc: 83.87%

Epoch --  7
	Train Loss: 0.278 | Train Acc: 88.00%
	 Val. Loss: 0.368 |  Val. Acc: 83.51%

Epoch --  8
	Train Loss: 0.261 | Train Acc: 88.82%
	 Val. Loss: 0.374 |  Val. Acc: 83.79%

Epoch --  9
	Train Loss: 0.245 | Train Acc: 89.59%
	 Val. Loss: 0.387 |  Val. Acc: 83.60%

Epoch --  10
	Train Loss: 0.230 | Train Acc: 90.32%
	 Val. Loss: 0.391 |  Val. Acc: 83.94%

