In [148]:
import pandas as pd
import numpy as np
from collections import Counter

# dataset is accessible at https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt (https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_02.tsv.gz)
df = pd.read_csv('amazon_reviews_us_Digital_Software_v1_00.tsv', sep='\t', header=None, on_bad_lines='skip')

print(df.head())

            0            1               2           3               4   \
0  marketplace  customer_id       review_id  product_id  product_parent   
1           US     17747349  R2EI7QLPK4LF7U  B00U7LCE6A       106182406   
2           US     10956619  R1W5OMFK1Q3I3O  B00HRJMOM4       162269768   
3           US     13132245   RPZWSYWRP92GI  B00P31G9PQ       831433899   
4           US     35717248  R2WQWM04XHD9US  B00FGDEPDY       991059534   

                                           5                 6            7   \
0                               product_title  product_category  star_rating   
1                    CCleaner Free [Download]  Digital_Software            4   
2          ResumeMaker Professional Deluxe 18  Digital_Software            3   
3                   Amazon Drive Desktop [PC]  Digital_Software            1   
4  Norton Internet Security 1 User 3 Licenses  Digital_Software            5   

              8            9     10                 11              

  exec(code_obj, self.user_global_ns, self.user_ns)


In [149]:
df.shape
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101837 entries, 0 to 101836
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       101837 non-null  object
 1   1       101837 non-null  object
 2   2       101837 non-null  object
 3   3       101837 non-null  object
 4   4       101837 non-null  object
 5   5       101837 non-null  object
 6   6       101837 non-null  object
 7   7       101837 non-null  object
 8   8       101837 non-null  object
 9   9       101837 non-null  object
 10  10      101837 non-null  object
 11  11      101837 non-null  object
 12  12      101837 non-null  object
 13  13      101837 non-null  object
 14  14      101832 non-null  object
dtypes: object(15)
memory usage: 11.7+ MB


In [150]:
df.columns = ['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', 'star_rating', 
            'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']

print(df.head())

   marketplace  customer_id       review_id  product_id  product_parent  \
0  marketplace  customer_id       review_id  product_id  product_parent   
1           US     17747349  R2EI7QLPK4LF7U  B00U7LCE6A       106182406   
2           US     10956619  R1W5OMFK1Q3I3O  B00HRJMOM4       162269768   
3           US     13132245   RPZWSYWRP92GI  B00P31G9PQ       831433899   
4           US     35717248  R2WQWM04XHD9US  B00FGDEPDY       991059534   

                                product_title  product_category  star_rating  \
0                               product_title  product_category  star_rating   
1                    CCleaner Free [Download]  Digital_Software            4   
2          ResumeMaker Professional Deluxe 18  Digital_Software            3   
3                   Amazon Drive Desktop [PC]  Digital_Software            1   
4  Norton Internet Security 1 User 3 Licenses  Digital_Software            5   

   helpful_votes  total_votes  vine  verified_purchase     review_he

In [151]:
print(df['review_body'][1:10])
print(df['star_rating'][1:10])  

1                                       So far so good
2                        Needs a little more work.....
3                                       Please cancel.
4                                   Works as Expected!
5    I've had Webroot for a few years. It expired a...
6    EXCELLENT software !!!!!  Don't need to do any...
7    The variations created by Win10 induced this p...
8    Horrible!  Would not upgrade previous version ...
9                                      Waste of time .
Name: review_body, dtype: object
1    4
2    3
3    1
4    5
5    4
6    5
7    1
8    1
9    1
Name: star_rating, dtype: object


## Sentiment analysis

In [152]:
all_reviews = df['review_body'][1:]
all_ratings = df['star_rating'][1:]
all_ratings = all_ratings.to_list()
all_ratings = [int(i) for i in all_ratings]

In [153]:
for i, x in enumerate(all_ratings):
    if x >= 3: all_ratings[i] = 1
    else: all_ratings[i] = 0

In [154]:
print(all_ratings[1:10])

[1, 0, 1, 1, 1, 0, 0, 0, 1]


In [155]:
reviews = list()

for review in all_reviews:
    reviews.append(review)

print(reviews[:10])

['So far so good', 'Needs a little more work.....', 'Please cancel.', 'Works as Expected!', "I've had Webroot for a few years. It expired and I decided to purchase a renewal on Amazon. I went through hell trying to uninstall the expired version in order to install the new.  I called Webroot and had their representative remote into my computer at his request. He was clueless as a bad joke and consumed 29 minutes and 57 seconds of my time forever.  He initially told me it wasn't compatible with Windows 10, but I finally managed to convince him that it is indeed compatible with Windows 10 as it was working on my computer before it expired and also I showed him a review on Amazon to convince him that it works on Windows 10. Finally, he offered to connect me with a senior consultant for over 100 dollars. I declined and told him I'd fix the issue myself. This guy was less helpful than a severed limb.  After spending some time on Google, the issue is now fixed. Webroot should just get rid of 

In [156]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = list()
ps = PorterStemmer()
sw = set(stopwords.words('english'))
for i in range(0, len(reviews)):
    # get review and remove non alpha chars
    review = re.sub('[^a-zA-Z]', ' ', reviews[i])
    # to lower-case
    review = review.lower()
    # split into tokens, apply stemming and remove stop words
    review = ' '.join([ps.stem(w) for w in review.split() if w not in sw])
    corpus.append(review)

print(corpus[:10])


['far good', 'need littl work', 'pleas cancel', 'work expect', 'webroot year expir decid purchas renew amazon went hell tri uninstal expir version order instal new call webroot repres remot comput request clueless bad joke consum minut second time forev initi told compat window final manag convinc inde compat window work comput expir also show review amazon convinc work window final offer connect senior consult dollar declin told fix issu guy less help sever limb spend time googl issu fix webroot get rid custom servic pay googl index much help info dedic custom servic offer far softwar think scan fast slow comput hope like peopl includ expert effect remov prevent malwar year ago extens research found among best year ago thing fluid malwar kingdom experienc issu instal advic bother uninstal old version rather launch webroot click account right side copi past product key area say enter new keycod click activ save numer pain hour tri get thing work', 'excel softwar need anyth set automat 

In [157]:
all_text = " ".join(corpus)
all_words = all_text.split()

print(all_words[:10])

['far', 'good', 'need', 'littl', 'work', 'pleas', 'cancel', 'work', 'expect', 'webroot']


In [158]:
from collections import Counter 
# Count all the words using Counter Method
count_words = Counter(all_words)
total_words=len(all_words)
sorted_words=count_words.most_common(total_words)
print(sorted_words[:10])

[('br', 81695), ('use', 70096), ('year', 44067), ('softwar', 37571), ('work', 34598), ('product', 34327), ('download', 30187), ('version', 29762), ('program', 29477), ('tax', 28695)]


In [159]:
vocab_to_int={w:i+1 for i,(w,c) in enumerate(sorted_words)}
print(vocab_to_int)

{'br': 1, 'use': 2, 'year': 3, 'softwar': 4, 'work': 5, 'product': 6, 'download': 7, 'version': 8, 'program': 9, 'tax': 10, 'get': 11, 'quicken': 12, 'time': 13, 'would': 14, 'like': 15, 'file': 16, 'one': 17, 'comput': 18, 'need': 19, 'instal': 20, 'easi': 21, 'problem': 22, 'tri': 23, 'great': 24, 'updat': 25, 'good': 26, 'purchas': 27, 'upgrad': 28, 'account': 29, 'new': 30, 'go': 31, 'amazon': 32, 'make': 33, 'support': 34, 'free': 35, 'turbotax': 36, 'buy': 37, 'intuit': 38, 'want': 39, 'well': 40, 'help': 41, 'window': 42, 'return': 43, 'price': 44, 'user': 45, 'state': 46, 'even': 47, 'money': 48, 'norton': 49, 'much': 50, 'mani': 51, 'back': 52, 'could': 53, 'featur': 54, 'also': 55, 'sinc': 56, 'custom': 57, 'issu': 58, 'recommend': 59, 'way': 60, 'still': 61, 'never': 62, 'realli': 63, 'find': 64, 'thing': 65, 'turbo': 66, 'review': 67, 'better': 68, 'run': 69, 'busi': 70, 'mac': 71, 'look': 72, 'everi': 73, 'avast': 74, 'r': 75, 'block': 76, 'first': 77, 'pc': 78, 'chang': 7

In [160]:
encoded_reviews=list()
for review in corpus:
  encoded_review=list()
  for word in review.split():
    if word not in vocab_to_int.keys():
      #if word is not available in vocab_to_int put 0 in that place
      encoded_review.append(0)
    else:
      encoded_review.append(vocab_to_int[word])
  encoded_reviews.append(encoded_review)

print(corpus[1])
print(encoded_reviews[1])

need littl work
[19, 162, 5]


In [161]:
sequence_length=250
#features=np.zeros((len(encoded_reviews), sequence_length), dtype=int)
features = []
for i, review in enumerate(encoded_reviews):
  review_len=len(review)
  if (review_len<=sequence_length):
    zeros=list(np.zeros(sequence_length-review_len))
    new=zeros+review
  else:
    new=review[:sequence_length]
  features.append(new)

print(features[0])
#features[i,:]=np.array(new)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [162]:
labels = all_ratings

In [163]:
from sklearn.model_selection import train_test_split 

X_train, X_rem, y_train, y_rem = train_test_split(features, labels, train_size=0.8)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

In [164]:
import torch
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [165]:
from torch.utils.data import DataLoader, TensorDataset

#create Tensor Dataset
train_data=TensorDataset(torch.tensor(X_train).to(torch.int64), torch.tensor(y_train).to(torch.int64))
val_data=TensorDataset(torch.FloatTensor(X_valid).to(torch.int64), torch.FloatTensor(y_valid).to(torch.int64))
test_data=TensorDataset(torch.FloatTensor(X_test).to(torch.int64), torch.FloatTensor(y_test).to(torch.int64))

#dataloader
batch_size=50
train_loader=DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader=DataLoader(val_data, batch_size=batch_size, shuffle=True)
test_loader=DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [166]:
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 250])
Sample input: 
 tensor([[  0,   0,   0,  ..., 618,   2,   8],
        [  0,   0,   0,  ...,  21,   2, 588],
        [  0,   0,   0,  ...,   1, 256, 434],
        ...,
        [  0,   0,   0,  ..., 145, 492, 348],
        [  0,   0,   0,  ...,  74, 158, 320],
        [  0,   0,   0,  ...,  45, 288, 130]])
Sample label size:  torch.Size([50])
Sample label: 
 tensor([1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
        1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
        1, 0])


In [341]:
import torch.nn as nn
class SentimentRNN(nn.Module):
    def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5):
        super(SentimentRNN,self).__init__()
 
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
 
        self.no_layers = no_layers
        self.vocab_size = vocab_size
    
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,
                           num_layers=no_layers, batch_first=True)
        
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
    
        # linear and sigmoid layer
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()
        
    def forward(self,x,hidden):
        batch_size = x.size(0)
        # embeddings and lstm_out
        embeds = self.embedding(x)  # shape: B x S x Feature   since batch = True
        #print(embeds.shape)  #[50, 500, 1000]
        lstm_out, hidden = self.lstm(embeds, hidden)
        
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim) 
        
        # dropout and fully connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)

        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)

        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
        
        
        
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        
        hidden = (h0,c0)
        return hidden


In [342]:
no_layers = 2
vocab_size = len(vocab_to_int) + 1 #extra 1 for padding
embedding_dim = 64
output_dim = 1
hidden_dim = 256


model = SentimentRNN(no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5)

#moving to gpu
model.to(device)

print(model)

SentimentRNN(
  (embedding): Embedding(28008, 64)
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


## Training

In [169]:
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# function to predict accuracy
def acc(pred,label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()

In [335]:
import time
clip = 5
epochs = 5 
def train_model(model, train_loader, val_loader, weights_dir, num_epochs=10):
    start=time.time()

    for epoch in range(num_epochs):
        model.train()
        # initialize hidden state 
        h = model.init_hidden(batch_size)
        for i, (inputs, labels) in enumerate(train_loader):

            if inputs.size(0) != batch_size:  # Check if batch size is not 50
                continue  # Skip the batch and move to the next iteration

            inputs, labels = inputs.to(device), labels.to(device)   
            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])
            
            model.zero_grad()
            output,h = model(inputs,h)
            
            # calculate the loss and perform backprop
            loss = criterion(output.squeeze(), labels.float())
            loss.backward()

            # calculating accuracy
            accuracy = acc(output,labels)/len(labels)

            #`clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()


            elapsed = time.time() - start
            # Show progress every 20 batches 
            if not i % 20:
                print(f'epoch: {epoch}, time: {elapsed:.3f}s, loss: {loss.item()}, train accuracy: {accuracy}')
    
        
            
        val_h = model.init_hidden(batch_size)
        model.eval()
        for i, (inputs, labels) in enumerate(val_loader):
                if inputs.size(0) != batch_size:  # Check if batch size is not 50
                    continue  # Skip the batch and move to the next iteration

                val_h = tuple([each.data for each in val_h])

                inputs, labels = inputs.to(device), labels.to(device)

                output, val_h = model(inputs, val_h)
                
                accuracy = acc(output,labels)/len(labels)

                print(f'Accuracy on the test set: {accuracy:.3f}')

        if epoch%2==0:
            torch.save(model.state_dict(), f"./{weights_dir}/epoch-{epoch}_accuracy-{accuracy}.pth")
            

In [174]:
import os
import shutil
# delete the directory of weights if it already exists and then create it
if os.path.exists(os.path.join('weights2')):
    shutil.rmtree(os.path.join('weights2'))
!mkdir weights2

In [175]:
train_model(model, train_loader, val_loader, 'weights2', num_epochs=5)

epoch: 0, time: 0.311s, loss: 0.6646605730056763, train accuracy: 29.000
epoch: 0, time: 1.247s, loss: 0.45675525069236755, train accuracy: 42.000
epoch: 0, time: 2.176s, loss: 0.6455901861190796, train accuracy: 34.000
epoch: 0, time: 3.102s, loss: 0.5113396048545837, train accuracy: 37.000
epoch: 0, time: 4.028s, loss: 0.5532946586608887, train accuracy: 36.000
epoch: 0, time: 4.961s, loss: 0.4647737741470337, train accuracy: 38.000
epoch: 0, time: 5.888s, loss: 0.4111664891242981, train accuracy: 43.000
epoch: 0, time: 6.814s, loss: 0.3205314874649048, train accuracy: 45.000
epoch: 0, time: 7.750s, loss: 0.23671942949295044, train accuracy: 45.000
epoch: 0, time: 8.680s, loss: 0.3134439289569855, train accuracy: 45.000
epoch: 0, time: 9.618s, loss: 0.5508928298950195, train accuracy: 37.000
epoch: 0, time: 10.545s, loss: 0.4626207947731018, train accuracy: 42.000
epoch: 0, time: 11.461s, loss: 0.3699086010456085, train accuracy: 44.000
epoch: 0, time: 12.379s, loss: 0.53865307569503

## Load Model

In [343]:
import os
# function for loaduing weights of a trained model
def load_weights(model, weights_dir):
    files = os.listdir(weights_dir)
    weight_paths = [os.path.join(weights_dir, basename) for basename in files]
    # get the latest file in the directory
    final_weight_file = os.path.basename(max(weight_paths, key=os.path.getctime))

    # first model needs to be loaded
    model = SentimentRNN(no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5)

    # fixes odd error when state_dict has prescript "model."" in keys
    state_dict = torch.load(os.path.join(weights_dir, final_weight_file))
    for key in list(state_dict.keys()):
        if key.startswith("model."):
            state_dict[key[6:]] = state_dict.pop(key)
    model.load_state_dict(state_dict)

    print('Loaded weights: ' + final_weight_file)
    
    return model

In [344]:
model = load_weights(model, 'weights2')
model.to(device)
model.eval()

Loaded weights: epoch-4_accuracy-42.000.pth


SentimentRNN(
  (embedding): Embedding(28008, 64)
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)

In [345]:
val_h = model.init_hidden(batch_size)
accuracy = 0

for i, (inputs, labels) in enumerate(test_loader):
        if inputs.size(0) != batch_size:  # Check if batch size is not 50
            continue  # Skip the batch and move to the next iteration

        val_h = tuple([each.data for each in val_h])

        inputs, labels = inputs.to(device), labels.to(device)

        output, val_h = model(inputs, val_h)

        accuracy += acc(output,labels)

accuracy /= len(test_loader.dataset)

print(f'Accuracy on the test set: {accuracy:.3f}')

Accuracy on the test set: 0.874


In [346]:
from string import punctuation

def tokenize_review(test_review):
    test_review = test_review.lower() # lowercase
    # get rid of punctuatuon
    test_text = ''.join([c for c in test_review if c not in punctuation])
    
    # splitting by spaces
    test_words = test_text.split()
    
    # tokens
    test_ints = []
    test_ints.append([vocab_to_int[word] for word in test_words])
    
    return test_ints

In [347]:
def pad_features(reviews_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    ## getting the correct rows x cols shape
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)
    
    ## for each review, I grab that review
    for i, row in enumerate(reviews_ints):
      features[i, -len(row):] = np.array(row)[:seq_length]

In [348]:
def predict(test_review, sequence_length=200):
    ''' Prints out whether a give review is predicted to be 
        positive or negative in sentiment, using a trained model.
        
        params:
        net - A trained net 
        test_review - a review made of normal text and punctuation
        sequence_length - the padded length of a review
        '''
    
    model.eval()
    
    # tokenize review
    test_ints = tokenize_review(test_review)
    
    # pad tokenize sequence
    seq_length = sequence_length
    features = pad_features(test_ints, seq_length)
    
    # convert to tensor to pass to model
    feature_tensor = torch.from_numpy(features)
    
    batch_size = feature_tensor.size(0)
    
    # initialize hidden state
    h = model.init_hidden(batch_size)
    
    # if(train_on_gpu):
    #   feature_tensor = feature_tensor.cuda()
    
    feature_tensor.to(device)
      
    # get the output from the model
    output, h = model(feature_tensor, h)
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())
    # printing output value, before rounding
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))
    
    # print custom response based on whether test_review is pos/neg
    if(pred.item()==1):
      print('Positive review detected!')
    else:
      print('Negative review detected!')

In [363]:
def predict(text):
    model.eval()
    
    # get review and remove non alpha chars
    text = re.sub('[^a-zA-Z]', ' ', text)
    # to lower-case
    text = text.lower()
    # split into tokens, apply stemming and remove stop words
    text = ' '.join([ps.stem(w) for w in text.split() if w not in sw])

    encoded_review=list()
    for word in text:
        if word not in vocab_to_int.keys():
        #if word is not available in vocab_to_int put 0 in that place
            encoded_review.append(0)
        else:
            encoded_review.append(vocab_to_int[word])
    #review_len=len(encoded_review)

    features = pad_features(encoded_review, 250)
    # if (review_len<=sequence_length):
    #     zeros=list(np.zeros(sequence_length-review_len))
    #     new=zeros+encoded_review
    # else:
    #     new=encoded_review[:sequence_length]
    

    #feature_tensor = torch.tensor(new).to(torch.int64)

    feature_tensor = torch.from_numpy(features)

    batch_size = feature_tensor.size(0)
    
    # initialize hidden state
    h = model.init_hidden(batch_size)
    #h = tuple([each.data for each in h])

    

    
    #feature_tensor = torch.from_numpy(new)
    #batch_size = feature_tensor.size(0)
    # initialize hidden state
    #h = tuple([each.data for each in h])



    ################### nao da erro mas funciona mal
    #print(feature_tensor.shape)
    # feature_tensor = feature_tensor.view(250, -1)
    # #print(feature_tensor.shape)
    # h = model.init_hidden(250)
    # ###########################

    
    feature_tensor = feature_tensor.to(device)

    output, h = model(feature_tensor, h)
    print(output)
    pred = torch.round(output.squeeze()) 
    print(pred)

    return pred
    

In [364]:
review_text = "The worst  I have seen  and I want  money back   had bad acting and the dialogue was slow."
print(f'Review text: {review_text}')
print(f'Sentiment  : {predict(review_text)}')

Review text: The worst  I have seen  and I want  money back   had bad acting and the dialogue was slow.


TypeError: object of type 'int' has no len()

In [372]:
test_review_neg = 'The worst  I have seen and I want  money back'
def pad_features(reviews_int, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's or truncated to the input seq_length.
    '''
    features = np.zeros((len(reviews_int), seq_length), dtype = int)
    
    for i, review in enumerate(reviews_int):
        review_len = len(review)
        
        if review_len <= seq_length:
            zeroes = list(np.zeros(seq_length-review_len))
            new = zeroes+review
        elif review_len > seq_length:
            new = review[0:seq_length]
        
        features[i,:] = np.array(new)
    
    return features

from string import punctuation

def tokenize_review(test_review):
    test_review = test_review.lower() # lowercase
    # get rid of punctuation
    test_text = ''.join([c for c in test_review if c not in punctuation])

    # splitting by spaces
    test_words = test_text.split()

    # tokens
    test_ints = []
    # for word in test_words:
    #     if word in vocab_to_int.keys():
    #         test_ints.append(vocab_to_int[word])
        #     test_ints.append(0)
        # else:
        #     test_ints.append(vocab_to_int[word])
    test_ints.append([vocab_to_int[word] for word in test_words])
    print(test_ints)

    return test_ints

# test code and generate tokenized review
test_ints = tokenize_review(test_review_neg)
print(test_ints)


# test sequence padding
seq_length=200
features = pad_features(test_ints, seq_length)

print(features)


# test conversion to tensor and pass into your model
feature_tensor = torch.from_numpy(features)
print(feature_tensor.size())


def predict(net, test_review, sequence_length=200):
    
    net.eval()
    
    # tokenize review
    test_ints = tokenize_review(test_review)
    
    # pad tokenized sequence
    seq_length=sequence_length
    features = pad_features(test_ints, seq_length)
    
    # convert to tensor to pass into your model
    feature_tensor = torch.from_numpy(features)
    
    batch_size = feature_tensor.size(0)
    
    # initialize hidden state
    h = net.init_hidden(batch_size)
    
    feature_tensor = feature_tensor.to(device)
    
    # get the output from the model
    output, h = net(feature_tensor, h)
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze()) 
    # printing output value, before rounding
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))
    
    # print custom response
    if(pred.item()==1):
        print("Positive review detected!")
    else:
        print("Negative review detected.")

predict(model, test_review_neg, seq_length)

[[10034, 544, 12435, 7485, 657, 19552, 12435, 39, 48, 52]]
[[10034, 544, 12435, 7485, 657, 19552, 12435, 39, 48, 52]]
[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0 