In [None]:
import torch
import numpy as np
import torchvision
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
device

In [None]:
# Load the raw corpus: the whole of each file is read into a single string.
with open('reviews.txt', mode='r') as reviews_file:
    reviews = reviews_file.read()
with open('labels.txt', mode='r') as labels_file:
    labels = labels_file.read()

In [None]:
from string import punctuation
from collections import Counter

# Normalise the corpus: lower-case, drop ASCII punctuation, then one review per line.
punctuation_remover = str.maketrans('', '', punctuation)
reviews = reviews.lower().translate(punctuation_remover).split('\n')

# Flatten every review into one whitespace-tokenised list of words.
allWords = ' '.join(reviews)
words = allWords.split()

# Rank the vocabulary from most to least frequent; ids start at 1, so 0 is never
# assigned to a word.
word_counter = Counter(words)
sorted_word_counter = [word for word, _ in word_counter.most_common()]
word_to_int = dict(zip(sorted_word_counter, range(1, len(sorted_word_counter) + 1)))

# Replace each review by the list of ids of its words.
encoded_reviews = [[word_to_int[word] for word in review.split()] for review in reviews]

In [None]:
# Sanity check: peek at the first tokens and at the ids assigned to the first two.
print(words[:10])
for position in (0, 1):
    token = words[position]
    print(token, " -> ", word_to_int[token])

# Size of the token stream, of the vocabulary and of the encoded corpus.
for caption, collection in (("Words Count : ", words),
                            ("Unique Words Count : ", word_to_int),
                            ("Encoded Reviews : ", encoded_reviews)):
    print(caption, len(collection))

In [None]:
# Histogram of review lengths (length -> number of reviews), used to spot outliers.
review_lens = Counter(len(token_ids) for token_ids in encoded_reviews)
print("Zero-length reviews: ", review_lens[0])
# The counter is keyed by length, so its largest key is the longest review.
print("Maximum review length: ", max(review_lens.keys()))

In [None]:
# Drop the reviews that encoded to an empty list, and drop the labels at the same
# positions so that reviews and labels stay aligned.
# NOTE: despite its name, `zero_length_index` holds the positions of the reviews
# that are KEPT (the non-empty ones); the name is retained for other cells.
zero_length_index = [index for index, encoded_review in enumerate(encoded_reviews) if len(encoded_review) > 0]
# Membership is tested once per label: a set makes each test O(1), whereas scanning
# the list made this cell quadratic in the number of reviews.
kept_positions = set(zero_length_index)
encoded_reviews = [encoded_review for encoded_review in encoded_reviews if len(encoded_review) > 0]
labels = labels.split('\n')
labels = [label for index, label in enumerate(labels) if index in kept_positions]
# Any label that is not exactly 'positive' is encoded as 0.
encoded_labels = [1 if label == 'positive' else 0 for label in labels]
print("Encoded Reviews : ",len(encoded_reviews))
print("Encoded Labels : ",len(encoded_labels))

In [None]:
print(encoded_reviews[:1])
print(encoded_labels[:1])

In [None]:
# Every review is padded or truncated to exactly this many token ids.
seq_length = 200
def pad_truncate_features(original_features,length):
    """Turn ragged integer sequences into a fixed-width integer matrix.

    Sequences shorter than `length` are left-padded with zeros; longer ones keep
    only their first `length` items. Empty sequences become an all-zero row.

    Args:
        original_features: sequence of integer sequences, one per row.
        length: number of columns of the returned matrix.

    Returns:
        np.ndarray of shape (len(original_features), length) with dtype int.
    """
    features = np.zeros((len(original_features), length), dtype=int)
    for indx, feature in enumerate(original_features):
        kept = np.asarray(feature, dtype=int)[:length]
        # Skip empty rows: a `-0:` slice would select the WHOLE row and the
        # assignment of an empty array into it would raise a broadcast error.
        if kept.size:
            features[indx, length - kept.size:] = kept
    return features
        
# Replace the ragged id lists by one integer matrix with `seq_length` columns.
encoded_reviews = pad_truncate_features(original_features=encoded_reviews, length=seq_length)

In [None]:
# Positional split (no shuffling here): first 80% train, next 10% test, last 10% validation.
n_reviews = len(encoded_reviews)
split_idx_train = int(n_reviews*0.8)     # end of the training slice
split_idx_validate = int(n_reviews*0.9)  # end of the test slice / start of validation

train_x, train_y = encoded_reviews[:split_idx_train], encoded_labels[:split_idx_train]
test_x, test_y = encoded_reviews[split_idx_train:split_idx_validate], encoded_labels[split_idx_train:split_idx_validate]
validate_x, validate_y = encoded_reviews[split_idx_validate:], encoded_labels[split_idx_validate:]

print("train set : ",len(train_x))
print("validation set : ",len(validate_x))
# This line used to be labelled "train set" although it reports the test set size.
print("test set : ",len(test_x))

In [None]:
batch_size = 64
# Wrap each split in a TensorDataset, then batch it with a DataLoader.
# Labels are stored as float32: nn.BCELoss expects floating-point targets, and the
# int64 tensors produced from a plain list of 0/1 ints are rejected by it.
train_data = TensorDataset(torch.from_numpy(train_x),
                           torch.from_numpy(np.array(train_y, dtype=np.float32)))
validate_data = TensorDataset(torch.from_numpy(validate_x),
                              torch.from_numpy(np.array(validate_y, dtype=np.float32)))
test_data = TensorDataset(torch.from_numpy(test_x),
                          torch.from_numpy(np.array(test_y, dtype=np.float32)))

# Only the training batches are shuffled; evaluation gains nothing from a random
# order, and a fixed order keeps validation/test runs comparable.
train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True)
validate_loader = DataLoader(validate_data,batch_size=batch_size,shuffle=False)
test_loader = DataLoader(test_data,batch_size=batch_size,shuffle=False)

In [None]:
class RNNModel(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear -> sigmoid classifier.

    For every input sequence the model emits one value in [0, 1]: the sigmoid
    output computed at the last time step.
    """

    def __init__(self,vocab_size,embedding_dim,hidden_dim,num_layers,drop_prob,device):
        """Build the layers.

        Args:
            vocab_size: number of rows of the embedding table.
            embedding_dim: size of each embedding vector (LSTM input size).
            hidden_dim: size of the LSTM hidden state.
            num_layers: number of stacked LSTM layers.
            drop_prob: dropout applied between LSTM layers.
            device: device on which `init_hidden` places the state tensors.
        """
        super(RNNModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.device = device
        self.n_layers = num_layers
        self.Embeddings = nn.Embedding(vocab_size,embedding_dim)
        self.LSTM = nn.LSTM(embedding_dim,hidden_dim,num_layers,batch_first=True,dropout=drop_prob)
        # Dropout on the LSTM outputs; its rate is fixed and independent of drop_prob.
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sig = nn.Sigmoid()

    def forward(self,x,hidden):
        """Run one batch through the network.

        Args:
            x: integer token ids; the LSTM is batch_first, so the batch is
               taken from dimension 0 (assumed shape (batch, seq_len)).
            hidden: (h, c) tuple as returned by `init_hidden`, built for the
                    same batch size as `x`.

        Returns:
            (sig_out, hidden): `sig_out` has shape (batch,) and holds the
            sigmoid output of the last time step; `hidden` is the updated state.
        """
        # Use the size of THIS batch rather than a notebook-level constant, so a
        # smaller final batch is reshaped correctly.
        n_samples = x.size(0)
        embedded = self.Embeddings(x)
        lstm_out, hidden = self.LSTM(embedded,hidden)

        # Stack every time step of every sample into rows of size hidden_dim.
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)
        # Back to (batch, seq_len), then keep only the last time step.
        sig_out = sig_out.view(n_samples, -1)
        sig_out = sig_out[:, -1]
        return sig_out, hidden

    def init_hidden(self,batch_size):
        """Return zeroed (h, c) tensors of shape (n_layers, batch_size, hidden_dim)."""
        # Derive the new tensors from an existing parameter so they share its dtype.
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(self.device),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(self.device))
        return hidden

In [None]:
# Word ids start at 1, so the embedding table needs one extra row for the padding id 0.
word_size = 1 + len(word_to_int)
embedding_dim, hidden_dim, n_layers = 400, 256, 2

# Build the network with 0.5 dropout between LSTM layers and move it to `device`.
net = RNNModel(
    vocab_size=word_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=n_layers,
    drop_prob=0.5,
    device=device,
).to(device)
print(net)

In [None]:
# Binary cross-entropy computed on the sigmoid outputs of the network.
criterion = nn.BCELoss()
# Adam over all parameters of `net`, learning rate 0.01.
optimizer = torch.optim.Adam(params=net.parameters(), lr=0.01)

In [None]:
epochs = 10
clip = 5  # maximum gradient norm


def _hidden_for_batch(model, hidden, n_samples):
    """Return a hidden state usable for a batch of `n_samples` sequences.

    The carried state is detached from the previous graph so that back-propagation
    stops at the batch boundary. When its batch dimension (dim 1) does not match
    `n_samples` -- e.g. the final, smaller batch of an epoch -- a fresh zero state
    is created instead, because the LSTM rejects a state of the wrong size.
    """
    if hidden[0].size(1) != n_samples:
        return model.init_hidden(n_samples)
    return tuple(each.data for each in hidden)


net.train()
for e in range(epochs):
    # initialize hidden state
    hidden = net.init_hidden(batch_size)
    # `targets` (not `labels`) so the notebook-level `labels` list is not overwritten.
    for batch, targets in train_loader:
        # BCELoss needs floating-point targets.
        batch, targets = batch.to(device), targets.to(device).float()
        hidden = _hidden_for_batch(net, hidden, batch.size(0))

        # zero accumulated gradients
        net.zero_grad()
        out, hidden = net(batch, hidden)
        loss = criterion(out, targets)
        loss.backward()
        # clip gradient to prevent exploding
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

    # Validation: dropout disabled (eval mode) and no autograd bookkeeping.
    net.eval()
    val_h = net.init_hidden(batch_size)
    val_losses = []
    with torch.no_grad():
        for v_batch, v_targets in validate_loader:
            v_batch, v_targets = v_batch.to(device), v_targets.to(device).float()
            val_h = _hidden_for_batch(net, val_h, v_batch.size(0))
            out, val_h = net(v_batch, val_h)
            val_losses.append(criterion(out, v_targets).item())
    print("Epoch: {}".format(e+1),
         "  Validation loss: {}".format(np.mean(val_losses)))
    net.train()