In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

from sklearn.feature_extraction.text import CountVectorizer
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [None]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [None]:
# Read the three CSV splits in one pass; each becomes its own DataFrame.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/Train.csv", "/content/Valid.csv", "/content/Test.csv")
)

In [None]:
# Stack the three splits into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it each split keeps its own row labels, so the
# combined frame would contain duplicate index values.
dataset = pd.concat([train, valid, test], ignore_index=True)

In [None]:
# (rows, columns) of the combined frame.
dataset.shape

(50000, 2)

Process the Text

In [None]:
class Sequences(Dataset):
    """Dataset of fixed-length token-index sequences paired with labels.

    A ``CountVectorizer`` (English stop words removed, ``min_df=0.015``) is fit
    on ``data.text`` to build the vocabulary, and a ``'<PAD>'`` entry is added
    with the next free index. Every text is encoded to vocabulary indices
    (tokens outside the vocabulary are dropped), truncated to ``max_seq_len``
    and right-padded with the ``'<PAD>'`` index. Rows whose encoding is empty
    are discarded together with their labels.

    Args:
        data: Object exposing ``text`` and ``label`` attributes that support
            ``.tolist()`` (the notebook passes a pandas DataFrame).
        max_seq_len: Length of every stored sequence.

    Raises:
        ValueError: If no text produces at least one in-vocabulary token.
    """

    def __init__(self, data, max_seq_len):
        self.max_seq_len = max_seq_len
        texts = data.text.tolist()
        labels = data.label.tolist()

        vectorizer = CountVectorizer(stop_words='english', min_df=0.015)
        vectorizer.fit(texts)

        # Work on a copy so adding <PAD> does not alter the fitted vectorizer.
        self.token2idx = dict(vectorizer.vocabulary_)
        self.token2idx['<PAD>'] = max(self.token2idx.values()) + 1

        # The analyzer applies the vectorizer's own preprocessing and
        # tokenisation, so encoding matches how the vocabulary was built.
        self._analyzer = vectorizer.build_analyzer()

        encoded = (self.encode(text)[:max_seq_len] for text in texts)
        kept = [(sequence, label) for sequence, label in zip(encoded, labels)
                if sequence]
        if not kept:
            # zip(*[]) would otherwise fail with an opaque unpacking error.
            raise ValueError("no text contains any in-vocabulary token")

        sequences, self.labels = zip(*kept)
        self.sequences = [self.pad(sequence) for sequence in sequences]

    def encode(self, text):
        """Return the vocabulary indices of ``text``, skipping unknown tokens."""
        return [self.token2idx[token] for token in self._analyzer(text)
                if token in self.token2idx]

    def pad(self, sequence):
        """Right-pad ``sequence`` with the ``'<PAD>'`` index up to ``max_seq_len``.

        A sequence that is already ``max_seq_len`` or longer comes back with
        its contents unchanged (no truncation happens here). A new list is
        always returned.
        """
        missing = self.max_seq_len - len(sequence)
        return sequence + missing * [self.token2idx['<PAD>']]

    def __getitem__(self, i):
        """Return ``(padded_sequence, label)`` for item ``i``."""
        assert len(self.sequences[i]) == self.max_seq_len
        return self.sequences[i], self.labels[i]

    def __len__(self):
        """Number of retained (non-empty) sequences."""
        return len(self.sequences)


In [None]:
# Build the encoded dataset; every review is cut or padded to 240 token indices.
data = Sequences(data=dataset, max_seq_len=240)

In [None]:
# Vocabulary size, counting the added '<PAD>' entry.
len(data.token2idx)

1046

In [None]:
def collate(batch):
    """Turn a list of ``(sequence, label)`` items into two batch tensors.

    Args:
        batch: Iterable of items whose first element is a list of token
            indices and whose second element is the label.

    Returns:
        Tuple ``(inputs, target)``: a ``LongTensor`` built from the sequences
        and a ``FloatTensor`` built from the labels.
    """
    token_rows = []
    labels = []
    for sample in batch:
        token_rows.append(sample[0])
        labels.append(sample[1])
    return torch.LongTensor(token_rows), torch.FloatTensor(labels)

batch_size = 2048
# shuffle=True draws a new sample order every epoch. The dataset is built from
# files concatenated one after another, so without shuffling every epoch would
# replay the same file-ordered batches.
train_loader = DataLoader(data, batch_size=batch_size, shuffle=True, collate_fn=collate)

In [None]:
class Model(nn.Module):
    """GRU-based binary classifier over batches of token-index sequences.

    Pipeline: embedding lookup -> GRU -> linear layer that emits one raw logit
    per sequence. No sigmoid is applied here, so pair the output with
    ``nn.BCEWithLogitsLoss`` or apply ``torch.sigmoid`` yourself.

    Args:
        vocab_size: Number of rows in the embedding table.
        batch_size: Default batch size used by ``init_hidden`` when it is
            called without an argument.
        embedding_dimension: Size of each token embedding.
        hidden_size: Number of GRU hidden units.
        n_layers: Number of stacked GRU layers.
        device: Stored as ``self.device`` for backward compatibility. The
            hidden state is created on whatever device the weights live on.
    """

    def __init__(self,
                 vocab_size, batch_size,
                 embedding_dimension = 100,
                 hidden_size = 240, n_layers = 1,
                 device = "cpu"):
        super(Model, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.device = device
        self.batch_size = batch_size

        self.encoder = nn.Embedding(vocab_size, embedding_dimension)
        self.rnn = nn.GRU(
            embedding_dimension, hidden_size,
            num_layers = n_layers,
            batch_first = True
        )
        self.decoder = nn.Linear(hidden_size, 1)

    def init_hidden(self, batch_size=None):
        """Return an all-zero initial hidden state.

        Zeros (rather than random values) make the forward pass deterministic,
        so the same input always yields the same prediction.

        Args:
            batch_size: Number of sequences in the batch; defaults to
                ``self.batch_size``.

        Returns:
            Tensor of shape ``(n_layers, batch_size, hidden_size)`` on the same
            device and with the same dtype as the embedding weights.
        """
        if batch_size is None:
            batch_size = self.batch_size
        weight = self.encoder.weight
        return torch.zeros(self.n_layers, batch_size, self.hidden_size,
                           device=weight.device, dtype=weight.dtype)

    def forward(self, inputs):
        """Compute one logit per input sequence.

        Args:
            inputs: Integer tensor of token indices, shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch,)`` holding raw logits.
        """
        # Follow the actual batch size so a smaller final batch still works.
        batch_size = inputs.size(0)
        self.batch_size = batch_size

        encoded = self.encoder(inputs)
        _, hidden = self.rnn(encoded, self.init_hidden(batch_size))
        # hidden[-1] is the top layer's state after the final time step, with
        # shape (batch, hidden_size). Indexing the GRU output as [:, :, -1]
        # would instead pick a single hidden feature across all time steps.
        # NOTE(review): if inputs are right-padded, this final state has also
        # consumed the padding positions.
        logits = self.decoder(hidden[-1])
        # Drop only the trailing feature axis so a batch of one stays 1-D.
        return logits.squeeze(-1)

In [None]:
# Instantiate the classifier and move its parameters to the selected device
# in one expression (``Module.to`` returns the module itself).
net = Model(
    vocab_size=len(data.token2idx),
    batch_size=batch_size,
    hidden_size=240,
    device=device,
).to(device)
net

Model(
  (encoder): Embedding(1046, 100)
  (rnn): GRU(100, 240, batch_first=True)
  (decoder): Linear(in_features=240, out_features=1, bias=True)
)

In [None]:
# Binary cross-entropy computed on raw logits (the sigmoid is folded into the loss).
criterion = nn.BCEWithLogitsLoss()
# Optimise only the parameters that require gradients.
trainable_params = [param for param in net.parameters() if param.requires_grad]
optimizer = optim.Adam(trainable_params, lr=0.001)

In [None]:
# Train for 10 epochs, recording the mean batch loss of each epoch.
net.train()
train_losses = []
for epoch in range(1, 11):
    batch_losses = []
    for inputs, target in train_loader:
        inputs = inputs.to(device)
        target = target.to(device)

        net.zero_grad()
        loss = criterion(net(inputs), target)
        loss.backward()
        # Cap the global gradient norm at 3 before the optimiser step.
        nn.utils.clip_grad_norm_(net.parameters(), 3)
        optimizer.step()
        batch_losses.append(loss.item())

    epoch_loss = sum(batch_losses) / len(batch_losses)
    train_losses.append(epoch_loss)

    print(f"Epoch {epoch}\t Train Loss:{epoch_loss:.3f}")

Epoch 1	 Train Loss:0.666
Epoch 2	 Train Loss:0.521
Epoch 3	 Train Loss:0.433
Epoch 4	 Train Loss:0.367
Epoch 5	 Train Loss:0.330
Epoch 6	 Train Loss:0.309
Epoch 7	 Train Loss:0.294
Epoch 8	 Train Loss:0.283
Epoch 9	 Train Loss:0.272
Epoch 10	 Train Loss:0.262


In [None]:
def predict_sentiment(text):
    """Print the model's positive-sentiment probability for ``text``.

    The text is encoded with the dataset's vocabulary, cut to
    ``data.max_seq_len`` tokens and padded, mirroring how the training
    sequences were built, then passed through ``net``. The sigmoid of the
    logit is printed with a "Positive"/"Negative" verdict (threshold 0.5).

    Relies on the notebook globals ``net``, ``data`` and ``device``, and
    leaves ``net`` in evaluation mode.

    Args:
        text: Raw review text.

    Returns:
        None. The result is printed.
    """
    net.eval()
    with torch.no_grad():
        # Truncate before padding: pad() never shortens, so without this a
        # long review would reach the model with more than max_seq_len steps.
        token_ids = data.encode(text)[:data.max_seq_len]
        test_vector = torch.LongTensor([data.pad(token_ids)]).to(device)

        output = net(test_vector)
        prediction = torch.sigmoid(output).item()

        if prediction > 0.5:
            print(f'{prediction:0.3}: Positive sentiment')
        else:
            print(f'{prediction:0.3}: Negative sentiment')

In [None]:
# Free-form review text used as a manual check of the trained model.
test_text = '''
Really good I like him as he was the main character who fight with Voldemort very greatly. I m an asian and India it so popular and I wish to meet u three
Ur performance and that quidditch match was so excited and also that ghost who guards that jail
And also serious black 
I also wish that producer has to make more films of them show dumbuledore, harmony, Ronald, serious black, Voldemort, Mrs. Weasley, Malfoy, and also HARRY POTTER.
💝💝💖💖💖💖
In ur first part that kingdom and that style you came here and the post letter that has been giving in your home again and again by an owl and that magic when haegret make that magical by giving tale that was so interesting and also that u r famous for ur that sign when Voldemort kill ur parent but u saved and Voldemort gave u some power of him that was very scared and shocking that Voldemort a simple boy who taught by dumbledore bitten him easily and I couldn't understand why another school helps them?
please give a reply to my question 
I m your big fan. I saw ur picture again and again



I REQUEST TO FORWARD THIS MESSAGE TO EMMA WATSON, RUPERT GRINT, AND DANIEL REDCLIFFE 
'''

# Prints the predicted probability and the sentiment verdict for the text above.
predict_sentiment(test_text)

0.604: Positive sentiment


In [None]:
# Second free-form review text used as a manual check of the trained model.
test_text = '''
I'm going to write the honest and sincere comments and suggestions for indian audiance who watch most bollywood films. 
Guys first of all the entire Race Frenchie that is race 1, and 2 both are ripped off from Hollywood flick check out in Google and IMBD that means they are entirely copy pasted the movie even the songs composed by copycat Pritam Chakraborty. Song's we're copied from Korean album my Sasi girl. 
Now Race 3 what is new .... Nothing 😂 there is nothing in this movie which makes you heartwarming, eye catching or any sort of connection with characters in the movie. This movie is also not worth watching for free on television because your valuable time will be wasted and that is equal to loosing MONEY. 
When this movie got first premiered on television on I suppose on Zee cinema. I watched this for nearly 10 minutes and I felt what the heck I'm doing during break i just browsing through channels I came across Hollywood Bean movie. This movie really saved my day and got rid of Race 3. 
Why people are still praising salman khan why he's now getting aged and he should pass on the battle and let New face come to bollywood but unfortunately it's bollywood Full of nepotism it will never improve it's nepotism, favouritism strategy. Hence request you All please don't waste your time and money on these worthless star's. 
There are wonderful... Incredible astonishing amazing Hollywood flicks collection you can watch instead of this kind of trash
'''

# Prints the predicted probability and the sentiment verdict for the text above.
predict_sentiment(test_text)

0.0316: Negative sentiment
