# Artificial Intelligence (CS6364-002) 
## Homework 8 - GRU Part
## Submission by - Adithya Sundararajan Iyer (asi200000)


Revise the code in the tutorial “Sentiment Analysis using LSTM (Step-by-Step Tutorial)” to replace the LSTM with the GRU. The following article provides you an example of GPU for sentiment analysis on a dataset of Amazon reviews.
https://www.kaggle.com/code/dijiswiki/lstm-gru-sentiment-analysis-on-amazon-review

In [1]:
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from string import punctuation
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
data_frame = pd.read_csv("training.1600000.processed.noemoticon.csv",encoding='latin-1')
# data_frame = data_frame.sample(frac=0.25)

In [20]:
data_frame.columns = ['labels','time','date','qurey','username','tweet']
data_frame = data_frame.drop(columns =['time','date','qurey','username'], axis=1)
data_frame.head()

Unnamed: 0,labels,tweet
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [21]:
data_frame.drop(data_frame[data_frame['labels'] == 0].sample(frac=0.70).index)
data_frame = data_frame.drop(data_frame[data_frame['labels'] == 4].sample(frac=0.70).index)
print(data_frame.shape)

(1039999, 2)


In [22]:
data_frame.head()

Unnamed: 0,labels,tweet
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [23]:
def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    rem = [w for w in words if not w in stop_words]
    return rem

def preprocessing(text):
    words = text.split()
    words = [word for word in words if word.isalpha()]
    rem = remove_stopwords(words)
    return ' '.join(rem)

In [24]:
data_frame['tweet'] = data_frame['tweet'].apply(preprocessing)
data_frame.head()

Unnamed: 0,labels,tweet
0,0,upset update Facebook texting might cry result...
1,0,I dived many times Managed save The rest go bo...
2,0,whole body feels itchy like fire
3,0,behaving I see
4,0,whole crew


In [25]:
count = Counter((' ').join(data_frame['tweet']).split())
total_words = len(count)
sorted_words = count.most_common(total_words)
vocab_to_int = {w:i for i, (w,c) in enumerate(sorted_words)}

tweet_arr = []
for tweets in data_frame['tweet']:
    tweet = [vocab_to_int[w] for w in tweets.split()]
    tweet_arr.append(tweet)

print(tweet_arr[:3])

[[513, 438, 1418, 2043, 169, 380, 2580, 1194, 22], [0, 60398, 163, 315, 6576, 725, 33, 379, 3, 48755], [249, 629, 310, 2506, 2, 1377]]


In [26]:
target = [1 if labels == 4 else 0 for labels in data_frame['labels']]
target_arr = np.array(target)

In [27]:
def padding_tweets(tweets, sequence):
    total_features = np.zeros((len(tweets), sequence), dtype = int)
    for i, tweet in enumerate(tweets):
        tweet_len = len(tweet)
        if tweet_len <= sequence:
            zeroes = list(np.zeros(sequence-tweet_len))
            new = zeroes+tweet
        elif tweet_len > sequence:
            new = tweet[0:sequence]
        total_features[i,:] = np.array(new)
    return total_features


In [28]:
features = padding_tweets(tweet_arr, 25)
len_feat = len(features)

print(len_feat)
print(type(features))

split_frac = 0.8
train_x = features[:int(split_frac*len_feat)]
train_y = target_arr[:int(split_frac*len_feat)]
remaining_x = features[int(split_frac*len_feat):]
remaining_y = target_arr[int(split_frac*len_feat):]
valid_x = remaining_x[:int(len(remaining_x)*0.5)]
valid_y = remaining_y[:int(len(remaining_y)*0.5)]
test_x = remaining_x[int(len(remaining_x)*0.5):]
test_y = remaining_y[int(len(remaining_y)*0.5):]

1039999
<class 'numpy.ndarray'>


In [29]:
batch_size = 50

train_data = TensorDataset(torch.from_numpy(np.array(train_x)), torch.from_numpy(np.array(train_y)))
valid_data = TensorDataset(torch.from_numpy(np.array(valid_x)), torch.from_numpy(np.array(valid_y)))
test_data = TensorDataset(torch.from_numpy(np.array(test_x)), torch.from_numpy(np.array(test_y)))
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

print('Sample input size: ', sample_x.size()) 
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size())

Sample input size:  torch.Size([50, 25])
Sample input: 
 tensor([[    0,     0,     0,  ...,    11,   187,    48],
        [    0,     0,     0,  ...,    40,     1,    69],
        [    0,     0,     0,  ...,   642,  3587, 48476],
        ...,
        [    0,     0,     0,  ...,   151,   218,    65],
        [    0,     0,     0,  ...,   228,  5865,  1056],
        [    0,     0,     0,  ...,    60,  8125,  9249]])

Sample label size:  torch.Size([50])


In [30]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [31]:
class GRUNetwork_Q2(nn.Module):
    
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.GRU(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        batch_size = x.size(0)
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] 
        return sig_out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()
        return hidden


In [32]:
vocabulary_size = len(vocab_to_int)+1 
output_size = 1
embedding_dims = 400
hidden_dims = 256
num_layers = 2
gru_net = GRUNetwork_Q2(vocabulary_size, output_size, embedding_dims, hidden_dims, num_layers).to(device)
print(gru_net)

GRUNetwork_Q2(
  (embedding): Embedding(205990, 400)
  (lstm): GRU(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [35]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(gru_net.parameters(), lr=0.001)
epochs = 3 
count = 0
epoch_print = 100
clip=5

gru_net.train()

for epoch in range(epochs):
    h = gru_net.init_hidden(batch_size)
    train_num_correct = 0
    for inputs, labels in train_loader:
        count += 1
        if count>(5000*(epoch+1)):
            break
        h = h.data
        gru_net.zero_grad()
        inputs = inputs.type(torch.LongTensor)
        inputs = inputs.to(device)
        labels = labels.to(device)
        output, h = gru_net(inputs, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        pred = torch.round(output.squeeze()) 
        train_correct_tensor = pred.eq(labels.float().view_as(pred)).cpu()
        train_correct = np.squeeze(train_correct_tensor.numpy())
        train_num_correct += np.sum(train_correct)
        nn.utils.clip_grad_norm_(gru_net.parameters(), clip)
        optimizer.step()
        if count % epoch_print == 0:
            val_h = gru_net.init_hidden(batch_size)
            val_losses = []
            gru_net.eval()
            val_num_correct = 0
            for inputs, labels in valid_loader:
                val_h = val_h.data
                inputs = inputs.type(torch.LongTensor)
                inputs = inputs.to(device)
                labels = labels.to(device)
                output, val_h = gru_net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())
                val_losses.append(val_loss.item())
                pred = torch.round(output.squeeze()) 
                val_correct_tensor = pred.eq(labels.float().view_as(pred)).cpu()
                val_correct = np.squeeze(val_correct_tensor.numpy())
                val_num_correct += np.sum(val_correct)
            gru_net.train()
            val_acc = val_num_correct/len(valid_loader.dataset)
            print("Epoch: {}/{} \t".format(epoch+1, epochs),
                  "Step: {} \t".format(count),
                  "Loss: {:.6f} \t".format(loss.item()),
                  "Val Loss: {:.6f} \t".format(np.mean(val_losses)),
                  "Val Accuracy: {:.6f} \t".format(val_acc))
    train_acc = train_num_correct/len(train_loader.dataset)
    print("Train Accuracy: {:.6f}".format(train_acc))


Epoch: 1/3 	 Step: 100 	 Loss: 0.079683 	 Val Loss: 2.687428 	 Val Accuracy: 0.077865 	
Epoch: 1/3 	 Step: 200 	 Loss: 0.138392 	 Val Loss: 2.466555 	 Val Accuracy: 0.089125 	
Epoch: 1/3 	 Step: 300 	 Loss: 0.246004 	 Val Loss: 2.657064 	 Val Accuracy: 0.067212 	
Epoch: 1/3 	 Step: 400 	 Loss: 0.199285 	 Val Loss: 2.766932 	 Val Accuracy: 0.075702 	
Epoch: 1/3 	 Step: 500 	 Loss: 0.060889 	 Val Loss: 2.663726 	 Val Accuracy: 0.036375 	
Epoch: 1/3 	 Step: 600 	 Loss: 0.239222 	 Val Loss: 2.576798 	 Val Accuracy: 0.086413 	
Epoch: 1/3 	 Step: 700 	 Loss: 0.304842 	 Val Loss: 2.686845 	 Val Accuracy: 0.114846 	
Epoch: 1/3 	 Step: 800 	 Loss: 0.113358 	 Val Loss: 2.744933 	 Val Accuracy: 0.069433 	
Epoch: 1/3 	 Step: 900 	 Loss: 0.235670 	 Val Loss: 2.698611 	 Val Accuracy: 0.039010 	
Epoch: 1/3 	 Step: 1000 	 Loss: 0.109358 	 Val Loss: 2.906211 	 Val Accuracy: 0.056731 	
Epoch: 1/3 	 Step: 1100 	 Loss: 0.079935 	 Val Loss: 2.954846 	 Val Accuracy: 0.043798 	
Epoch: 1/3 	 Step: 1200 	 Loss

In [36]:
test_losses = [] 
num_correct = 0
h = gru_net.init_hidden(batch_size)
gru_net.eval()
for inputs, labels in test_loader:
    h = h.data
    inputs = inputs.type(torch.LongTensor)
    inputs = inputs.to(device)
    labels = labels.to(device)
    output, h = gru_net(inputs, h)
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    pred = torch.round(output.squeeze())
    correct_tensor = pred.eq(labels.float().view_as(pred)).cpu()
    correct = np.squeeze(correct_tensor.numpy())
    num_correct += np.sum(correct)
print("Test Data loss is : {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct/len(test_loader.dataset)
print("Test Data accuracy is : {:.3f}".format(test_acc))

Test Data loss is : 2.504
Test Data accuracy is : 0.097


In [43]:
def tweet_tokenize(test_tweet):
    test_tweet = test_tweet.lower()
    test_tweet = test_tweet.replace('[^\w\s]','')
    test_words = test_tweet.split()
    test_ints = []
    test_ints.append([vocab_to_int[word] for word in test_words if word in vocab_to_int])
    return test_ints

def predict_sentiment(gru_net, test_tweet, sequence_length = 25):    
    gru_net.eval()
    test_ints = tweet_tokenize(test_tweet)
    seq_length = sequence_length
    features = padding_tweets(test_ints, seq_length)
    feature_tensor = torch.from_numpy(features).to(device)
    batch_size = feature_tensor.size(0)
    h = gru_net.init_hidden(batch_size)
    output, h = gru_net(feature_tensor, h)
    pred = torch.round(output.squeeze()) 
    print("Test Tweet is: ",test_tweet)
    print('Predicted value is : {:.6f}'.format(1-output.item()))
    print("\n")
    if(pred.item()==0):
        print("Positive tweet detected !!")
    else:
        print("Negative tweet detected !!")

In [44]:
test_tweet = 'Ronaldo will win this Euro Cup for Portugal.'
predict_sentiment(gru_net, test_tweet)
print("\n")
test_tweet = 'India is playing very well in this tournament.'
predict_sentiment(gru_net, test_tweet)
print("\n")
test_tweet = 'The new Marvel move was very disappointing.'
predict_sentiment(gru_net, test_tweet)

Test Tweet is:  Ronaldo will win this Euro Cup for Portugal.
Predicted value is : 0.925413


Positive tweet detected !!


Test Tweet is:  India is playing very well in this tournament.
Predicted value is : 0.944563


Positive tweet detected !!


Test Tweet is:  The new Marvel move was very disappointing.
Predicted value is : 0.480344


Negative tweet detected !!
