In [1]:
#Data Modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from string import punctuation

#Pytorch
import torch
from torch import nn,optim
from torch.utils.data import TensorDataset,DataLoader

In [2]:
# Reding the txt files
with open('Data/reviews.txt','r') as f:
    reviews = f.read()
with open('Data/labels.txt','r') as f:
    labels = f.read()

In [3]:
print(reviews[:200])
print('......')
print(labels[:62])

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  
......
positive
negative
positive
negative
positive
negative
positive


In [4]:
reviews = reviews.lower()
labels = labels.lower()

### Data Pre-processing

In [5]:
#Removing Puntuations
reviews = ''.join([c for c in reviews if c not in punctuation])

print('\n' in reviews)

#creating a list of reviews
reviews_split = reviews.split('\n')
reviews= ''.join(reviews_split)

print('\n' in reviews)

# Creating a list of all the words in the file.
words = reviews.split()
words[:10]

True
False


['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the']

Encoding the words

In [6]:
from collections import Counter
# Getting the words in the most frequent order
c= Counter(words)
vocab = sorted(c, key=c.get,reverse=True)
#Creatng a vocab_to_int dictionary for future reference
vocab_to_int = {word:i for i,word in enumerate(vocab,1)}
print('Unique Words:',len(vocab_to_int))

#Creating a list of indices for each word in each review
review_ints = []
for review in reviews_split:
    review_ints.append([vocab_to_int[word] for word in review.split()])


Unique Words: 74072


In [7]:
#Encoding the labels
labels = labels.split('\n')
encoded_labels = [0 if i =='negative' else 1 for i in labels]
encoded_labels[:10]

[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

In [8]:
print(len(review_ints))
print(len(encoded_labels))

25001
25001


In [9]:
# Removing Zero length reviews
print(f'no. of reviews before the processing:{len(review_ints)}')

zero_len_idx = [i for i,review in enumerate(review_ints) if len(review) == 0]
review_ints = [review_ints[i] for i in range(len(review_ints)) if i not in zero_len_idx]
encoded_labels = [encoded_labels[i] for i in range(len(encoded_labels)) if i not in zero_len_idx]

print(f'no. of reviews after the processing:{len(review_ints)}')

no. of reviews before the processing:25001
no. of reviews after the processing:25000


In [10]:
#Padding the reviews so that, all reviews have same lengths.
def pad_features(reviews_ints, seq_length=200):
    features= np.zeros((len(reviews_ints),seq_length), dtype=int)
    for i,review in enumerate(reviews_ints):
        features[i,-len(review):] = np.array(review)[:seq_length]
    
    # for review in review_ints:   
    #     if len(review)<=seq_length:
    #         temp = [0]*seq_length
    #         temp[seq_length-len(review):] = review
    #         features.append(temp)
    #     elif len(review) > seq_length:
    #         features.append(review[:seq_length])

    return features


In [11]:
test_review_ints = [[1,2,3],[1,2,3,4,5,6,7,8,9,10]]
temp_seq_len = 5
temp_features = pad_features(test_review_ints,temp_seq_len)
print([i for i in temp_features])

[array([0, 0, 1, 2, 3]), array([1, 2, 3, 4, 5])]


In [12]:
seq_len=250

features = pad_features(reviews_ints=review_ints,seq_length=seq_len)
print(f'{len(review_ints)} and {type(review_ints)}')
print(f'{len(features)} and {type(features)}')

25000 and <class 'list'>
25000 and <class 'numpy.ndarray'>


Splitting the data

In [13]:
encoded_labels = np.array(encoded_labels)

In [14]:
split_frac =0.8
split_idx = int(len(features)*0.8)
train_x,rem_x = features[:split_idx],features[split_idx:]
train_y,rem_y = encoded_labels[:split_idx],encoded_labels[split_idx:]

test_idx = int(len(rem_x)*0.5)
valid_x, test_x = features[:test_idx],features[test_idx:]
valid_y,test_y = encoded_labels[:test_idx],encoded_labels[test_idx:]


In [15]:
# Creating TensorDataset
train_data = TensorDataset(torch.from_numpy(train_x),torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(valid_x),torch.from_numpy(valid_y))
test_data = TensorDataset(torch.from_numpy(test_x),torch.from_numpy(test_y))

#Creating Dataloader
train_loader = DataLoader(train_data,batch_size=64,shuffle=True,drop_last=True)
valid_loader = DataLoader(valid_data,batch_size=64,shuffle=True, drop_last=True)
test_loader = DataLoader(test_data,batch_size=64,shuffle=True, drop_last=True)


In [16]:
dataiter = iter(valid_loader)
sample_x,sample_y = dataiter.next()

print(f'Sample feature shape:{sample_x.shape}')
print(f'Sample label shape:{sample_y.shape}')

Sample feature shape:torch.Size([64, 250])
Sample label shape:torch.Size([64])


### Defining the model

In [17]:
class Sentiment_analysis(nn.Module, ):
    def __init__(self,n_vocab,embed_dim,hidden_size,num_layers,output_size,drop_prob = 0.5):
        super().__init__()

        self.hidden_dim = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size

        self.embed = nn.Embedding(n_vocab,embedding_dim=embed_dim)
        
        self.lstm = nn.LSTM(
            embed_dim, hidden_size, num_layers, 
            dropout=drop_prob, batch_first = True
            )
        
        self.fc = nn.Linear(hidden_size,output_size)
    
    def forward(self,x,hidden):
        batch_size = x.size(0)
        embedding = self.embed(x)
        lstm_out, hidden = self.lstm(embedding,hidden)
        out = torch.sigmoid(self.fc(lstm_out))
        
        out = out.view(batch_size,-1)
        out = out[:,-1]

        return out,hidden

    def init_hidden(self,batch_size):
        weight = next(self.parameters()).data
        
        hidden = (weight.new(self.num_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.num_layers, batch_size, self.hidden_dim).zero_().cuda())
        
        return hidden

In [18]:
vocab_size = len(vocab_to_int)+1
output_size = 1
embedding_dim = 500
hidden_dim = 256
n_layers = 2

model = Sentiment_analysis(
    n_vocab = vocab_size,embed_dim = embedding_dim,
    hidden_size = hidden_dim,num_layers= n_layers,output_size = output_size
    )

model.cuda()
model

Sentiment_analysis(
  (embed): Embedding(74073, 500)
  (lstm): LSTM(500, 256, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

### Training the model

In [19]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(),lr=0.01)

In [20]:
epochs = 5
counter = 0
clip = 5
print_every = 100

for epoch in range(epochs):
    h = model.init_hidden(batch_size=64)

    for input,label in train_loader:
        counter+=1

        model.train()
        input,label = input.cuda(),label.cuda()
        label = label.float()
        h = tuple([each.data for each in h]) # Removing past references

        optimizer.zero_grad()
        output,h = model(input,h)

        loss = criterion(output.squeeze(),label)
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        if counter%print_every == 0:
            val_h = model.init_hidden(batch_size=64)
            val_losses = []
            model.eval()
            
            for val_inputs,val_label in valid_loader:
                
                val_inputs,val_label = val_inputs.cuda(),val_label.cuda()
                val_label = val_label.float()
                val_h = tuple([each.data for each in val_h]) # Removing past references
                try:
                    val_output,val_h = model(val_inputs,val_h)
                    

                    val_loss = criterion(val_output.squeeze(),val_label)
                    val_losses.append(val_loss.item())
                except:
                    print(f'Validation shape{val_output.shape}')
                    print(f'Validation shape{val_h.shape}')


            model.train()
            print(f'Epoch:{epoch+1}   Step:{counter}    Loss:{loss.item():.4f}   val_loss:{np.mean(val_losses):.4f}')



Epoch:1   Step:100    Loss:0.6153   val_loss:0.5130
Epoch:1   Step:200    Loss:0.4251   val_loss:0.3951
Epoch:1   Step:300    Loss:0.5561   val_loss:0.4794
Epoch:2   Step:400    Loss:0.3258   val_loss:0.3109
Epoch:2   Step:500    Loss:0.4104   val_loss:0.2452
Epoch:2   Step:600    Loss:0.2872   val_loss:0.2127
Epoch:3   Step:700    Loss:0.1314   val_loss:0.1653
Epoch:3   Step:800    Loss:0.3239   val_loss:0.1666
Epoch:3   Step:900    Loss:0.2300   val_loss:0.1647
Epoch:4   Step:1000    Loss:0.1760   val_loss:0.1223
Epoch:4   Step:1100    Loss:0.0921   val_loss:0.1484
Epoch:4   Step:1200    Loss:0.1410   val_loss:0.1462
Epoch:5   Step:1300    Loss:0.1160   val_loss:0.0852
Epoch:5   Step:1400    Loss:0.1161   val_loss:0.1114
Epoch:5   Step:1500    Loss:0.1599   val_loss:0.0994


### Testing the model

In [21]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0
train_on_gpu=True
# init hidden state
h = model.init_hidden(batch_size = 64)

model.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()

    # get predicted outputs
    output, h = model(inputs, h)

    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer

    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.211
Test accuracy: 0.925


In [22]:
torch.save(model.state_dict(),'model.pth')

In [25]:
vocab_size = len(vocab_to_int)+1
output_size = 1
embedding_dim = 500
hidden_dim = 256
n_layers = 2

model = Sentiment_analysis(
    n_vocab = vocab_size,embed_dim = embedding_dim,
    hidden_size = hidden_dim,num_layers= n_layers,output_size = output_size
    )

model.load_state_dict(torch.load('model.pth'))
model.cuda()

model

Sentiment_analysis(
  (embed): Embedding(74073, 500)
  (lstm): LSTM(500, 256, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

## Prediction of some reviews

In [28]:
test_review_pos = 'What a wonderful experience this was! I was on the edge of my seat when I watched this movie. I still recall all the scenes. When I watched the intermission, I got goosebumps. I would definitely watch this movie again'
test_review_neg = 'This is the worst movie that anyone could possibly make. I still cannot understand why the producer invested so much in this movie. The recently released low budget movie was better than this one. I probably would never watch this one again'

In [29]:
from string import punctuation

def tokenize_review(test_review):
    test_review = test_review.lower() # lowercase
    # get rid of punctuation
    test_text = ''.join([c for c in test_review if c not in punctuation])

    # splitting by spaces
    test_words = test_text.split()

    # tokens
    test_ints = []
    test_ints.append([vocab_to_int[word] for word in test_words])

    return test_ints

def predict(net, test_review, sequence_length=200):

    net.eval()

    # tokenize review
    test_ints = tokenize_review(test_review)

    # pad tokenized sequence
    seq_length=sequence_length
    features = pad_features(test_ints, seq_length)

    # convert to tensor to pass into your model
    feature_tensor = torch.from_numpy(features)

    batch_size = feature_tensor.size(0)

    # initialize hidden state
    h = net.init_hidden(batch_size)

    if(train_on_gpu):
        feature_tensor = feature_tensor.cuda()

    # get the output from the model
    output, h = net(feature_tensor, h)

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze()) 
    # printing output value, before rounding
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))

    # print custom response
    if(pred.item()==1):
        print("Positive review detected!")
    else:
        print("Negative review detected.")

In [30]:
predict(model,test_review_pos,250)

Prediction value, pre-rounding: 0.977527
Positive review detected!


In [31]:
predict(model,test_review_neg,250)

Prediction value, pre-rounding: 0.004393
Negative review detected.
