In [None]:
import pandas as pd
import numpy as np
from collections import Counter

# The Amazon Customer Reviews TSV files are listed at
# https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt
# NOTE(review): the example URL originally cited here is the Books file
# (https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_02.tsv.gz),
# whereas this cell reads the Digital_Software file from the working directory -- confirm
# which dataset is intended.
# header=None treats every line of the file (the first one included) as a data row and
# leaves integer column labels; lines pandas cannot parse are skipped instead of raising.
reviews_tsv_path = 'amazon_reviews_us_Digital_Software_v1_00.tsv'
df = pd.read_csv(reviews_tsv_path, sep='\t', header=None, on_bad_lines='skip')

print(df.head())

In [None]:
# Only the final expression of a cell is echoed, so a bare `df.shape` placed before
# another statement is computed and thrown away. Print it explicitly instead.
print(df.shape)
# Column-by-column summary: non-null counts and dtypes.
df.info()

In [None]:
# Replace the positional column labels with the 15 field names of the review schema.
review_columns = [
    'marketplace',
    'customer_id',
    'review_id',
    'product_id',
    'product_parent',
    'product_title',
    'product_category',
    'star_rating',
    'helpful_votes',
    'total_votes',
    'vine',
    'verified_purchase',
    'review_headline',
    'review_body',
    'review_date',
]
df.columns = review_columns

print(df.head())

In [None]:
# Show rows 1 through 9 of the review text and of the star rating.
# Row 0 is skipped -- presumably it holds the file's own header line, since the file
# was read with header=None (confirm against the raw TSV).
for column_name in ('review_body', 'star_rating'):
    print(df[column_name].iloc[1:10])

In [None]:
# Everything after row 0: review texts stay a pandas Series, star ratings become a
# plain list of Python ints (int() raises if a rating cannot be parsed).
all_reviews = df['review_body'].iloc[1:]
all_ratings = [int(raw_rating) for raw_rating in df['star_rating'].iloc[1:].to_list()]

In [None]:
# Binarize the ratings in place: 3 stars and above -> 1, anything lower -> 0.
# Caution: running this cell a second time maps every (already 0/1) entry to 0.
all_ratings[:] = [1 if rating >= 3 else 0 for rating in all_ratings]

In [None]:
# Spot-check a handful of the labels (positions 1 through 9).
label_preview = all_ratings[1:10]
print(label_preview)

In [None]:
# Materialize the review texts as a plain Python list, in the Series' own order.
reviews = list(all_reviews)

print(reviews[:10])

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Text normalization: keep letters only, lower-case, drop English stop words, Porter-stem.
# stopwords.words('english') needs the NLTK stop-word corpus to be available locally.
corpus = list()
ps = PorterStemmer()
sw = set(stopwords.words('english'))
# Compile the pattern once rather than re-parsing it for every review.
non_alpha = re.compile('[^a-zA-Z]')
for raw_review in reviews:
    # pandas reads empty fields as NaN (a float), which re.sub() rejects with a TypeError.
    # Treat any non-string entry as empty text so `corpus` keeps one entry per review
    # and stays index-aligned with the labels.
    text = raw_review if isinstance(raw_review, str) else ''
    # Replace non-alphabetic characters with spaces, lower-case, split on whitespace.
    tokens = non_alpha.sub(' ', text).lower().split()
    # Filter stop words on the raw token, then stem whatever survives.
    corpus.append(' '.join(ps.stem(w) for w in tokens if w not in sw))

print(corpus[:10])


In [None]:
# Concatenate the cleaned reviews into one space-separated string, then break it
# back into a single flat list of tokens covering the whole corpus.
all_text = " ".join(corpus)
all_words = [token for token in all_text.split()]

print(all_words[:10])

In [None]:
from collections import Counter 
# Tally how often each token occurs across the whole corpus.
count_words = Counter(all_words)
total_words = len(all_words)
# Every (word, count) pair, most frequent first. Requesting all entries yields the
# same ordering as passing a limit that is at least the number of distinct words.
sorted_words = count_words.most_common()
print(sorted_words[:10])

In [None]:
# Give each word an integer id by frequency rank, starting at 1 for the most common
# word. Id 0 is never assigned by this mapping.
vocab_to_int = {word: rank for rank, (word, _count) in enumerate(sorted_words, start=1)}
# Printing the whole dictionary floods the cell output with one entry per distinct
# word; report the vocabulary size and a short preview instead.
print(len(vocab_to_int))
print(list(vocab_to_int.items())[:10])

In [None]:
# Translate each cleaned review into a list of vocabulary ids, token by token.
# A token that is missing from vocab_to_int is encoded as 0.
encoded_reviews = [
    [vocab_to_int.get(token, 0) for token in document.split()]
    for document in corpus
]

print(corpus[1])
print(encoded_reviews[1])

In [None]:
sequence_length = 250
# Give every review exactly `sequence_length` ids: shorter reviews are left-padded with 0,
# longer ones keep only their first `sequence_length` ids.
# The padding uses integer zeros (not np.zeros, which yields float64) so each row is a
# homogeneous list of ints and no float round-trip is needed when building tensors.
features = [
    [0] * max(sequence_length - len(review), 0) + review[:sequence_length]
    for review in encoded_reviews
]

print(features[0])

In [None]:
# Use the rating list as the label vector. This binds a second name to the same
# list object (no copy is made), so `labels` and `all_ratings` always stay identical.
labels = all_ratings

In [None]:
from sklearn.model_selection import train_test_split 

# Fixed seed so the train/validation/test partition is identical on every run;
# without it each kernel restart reshuffles the split and results are not comparable.
split_seed = 42
# 80% of the rows go to training; the remaining 20% is halved into validation and test.
X_train, X_rem, y_train, y_rem = train_test_split(features, labels, train_size=0.8, random_state=split_seed)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=split_seed)

In [None]:
import torch
# Pick the compute device once: CUDA when PyTorch reports a usable GPU, CPU otherwise.
# `device` is referenced later in the notebook.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Create the tensor datasets. All three splits are built the same way: torch.tensor()
# infers the dtype from the data and the result is cast to int64. The validation and
# test splits previously went through torch.FloatTensor, which forces a float32
# round-trip; float32 cannot represent every integer above 2**24, so token ids are kept
# out of that path.
train_data = TensorDataset(torch.tensor(X_train).to(torch.int64), torch.tensor(y_train).to(torch.int64))
val_data = TensorDataset(torch.tensor(X_valid).to(torch.int64), torch.tensor(y_valid).to(torch.int64))
test_data = TensorDataset(torch.tensor(X_test).to(torch.int64), torch.tensor(y_test).to(torch.int64))

# Data loaders. Only the training loader is shuffled; evaluation batches are served in a
# fixed order so validation/test passes are repeatable.
batch_size = 50
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [None]:
# Pull a single batch from the training loader to sanity-check shapes and contents.
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

for caption, shown in (
    ('Sample input size: ', sample_x.size()),   # batch_size, seq_length
    ('Sample input: \n', sample_x),
    ('Sample label size: ', sample_y.size()),   # batch_size
    ('Sample label: \n', sample_y),
):
    print(caption, shown)

In [None]:
import torch.nn as nn
class SentimentRNN(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear -> sigmoid sentiment classifier.

    Args:
        no_layers: number of stacked LSTM layers.
        vocab_size: number of rows in the embedding table (largest token id + 1).
        hidden_dim: size of the LSTM hidden state.
        embedding_dim: size of each token embedding vector.
        drop_prob: dropout probability applied to the LSTM outputs before the
            linear layer. (This argument used to be ignored in favour of a
            hard-coded 0.3.)
        output_dim: number of units the linear layer emits per time step.
            forward() yields one score per sequence only when this is 1, which
            is the default. (Previously read from an undeclared global.)
    """

    def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5,output_dim=1):
        super(SentimentRNN,self).__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim

        self.no_layers = no_layers
        self.vocab_size = vocab_size

        # embedding layer: token id -> dense vector
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # lstm; batch_first=True means inputs/outputs are (batch, seq, feature)
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,
                           num_layers=no_layers, batch_first=True)

        # dropout layer, honouring the constructor argument
        self.dropout = nn.Dropout(drop_prob)

        # linear and sigmoid layer
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()

    def forward(self,x,hidden):
        """Score a batch of id sequences.

        Args:
            x: integer tensor of token ids; its first dimension is the batch.
            hidden: (h0, c0) tuple as produced by init_hidden().

        Returns:
            (sig_out, hidden): the sigmoid output at the last position of each
            sequence (one value per batch row when output_dim == 1) and the
            LSTM's final (h, c) state.
        """
        batch_size = x.size(0)
        # embeddings and lstm_out
        embeds = self.embedding(x)  # batch x seq x embedding_dim (batch_first=True)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # flatten batch and time so the linear layer sees one row per time step
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        # dropout and fully connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)

        # sigmoid function
        sig_out = self.sig(out)

        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)

        # keep only the final column, i.e. the output at the last time step of each row
        sig_out = sig_out[:, -1]

        # return last sigmoid output and hidden state
        return sig_out, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM.
        # They are placed on whatever device the model's own parameters live on,
        # so the method no longer depends on a module-level `device` variable.
        param_device = next(self.parameters()).device
        state_shape = (self.no_layers, batch_size, self.hidden_dim)
        h0 = torch.zeros(state_shape, device=param_device)
        c0 = torch.zeros(state_shape, device=param_device)

        hidden = (h0,c0)
        return hidden


In [None]:
# Hyperparameters. output_dim is defined at module level and is not passed to the
# constructor call below.
no_layers, embedding_dim, hidden_dim, output_dim = 2, 64, 256, 1
vocab_size = 1 + len(vocab_to_int) #extra 1 for padding

# Build the network, then move its parameters to the selected device (GPU if available).
model = SentimentRNN(no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5)
model.to(device)

print(model)

## Training