In [1]:
import torch
import numpy as np
import torchvision
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [2]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

'cuda:0'

In [3]:
with open('reviews.txt', 'r') as f:
    reviews = f.read()
with open('labels.txt', 'r') as f:
    labels = f.read()

In [4]:
from string import punctuation
from collections import Counter

reviews = reviews.lower()
reviews = ''.join(char for char in reviews if char not in punctuation)
reviews = reviews.split('\n')

##get list of all words
allWords = ' '.join(reviews)
words = allWords.split()

word_counter = Counter(words)
sorted_word_counter = sorted(word_counter,key=word_counter.get,reverse=True)
word_to_int = {word: indx for indx, word in enumerate(sorted_word_counter,1)}

##encoding reviews using word_to_int
encoded_reviews = []
for rev in reviews:
    encoded_reviews.append([word_to_int[word] for word in rev.split()])

In [5]:
print(words[:10])
print(words[0]," -> " , word_to_int[words[0]])
print(words[1]," -> " ,word_to_int[words[1]])
print("Words Count : " , len(words))
print("Unique Words Count : " , len(word_to_int))
print("Encoded Reviews : ",len(encoded_reviews))

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the']
bromwell  ->  21025
high  ->  308
Words Count :  6020196
Unique Words Count :  74072
Encoded Reviews :  25001


In [6]:
# outlier review stats
review_lens = Counter([len(x) for x in encoded_reviews])
print("Zero-length reviews: ",review_lens[0])
print("Maximum review length: ",max(review_lens))

Zero-length reviews:  1
Maximum review length:  2514


In [7]:
#remove the zero length review
#must get index to delete same index from labels
zero_length_index = [index  for index,encoded_review in enumerate(encoded_reviews) if len(encoded_review) > 0 ]
#[ii for ii, review in enumerate(reviews_ints) if len(review) != 0]
encoded_reviews = [encoded_review for encoded_review in encoded_reviews if len(encoded_review) > 0 ]
labels = labels.split('\n')
labels = [label for index,label in enumerate(labels) if index  in zero_length_index]
encoded_labels = [1 if label=='positive' else 0 for label in labels]
print("Encoded Reviews : ",len(encoded_reviews))
print("Encoded Labels : ",len(encoded_labels))

Encoded Reviews :  25000
Encoded Labels :  25000


In [8]:
print(encoded_reviews[:1])
print(encoded_labels[:1])

[[21025, 308, 6, 3, 1050, 207, 8, 2138, 32, 1, 171, 57, 15, 49, 81, 5785, 44, 382, 110, 140, 15, 5194, 60, 154, 9, 1, 4975, 5852, 475, 71, 5, 260, 12, 21025, 308, 13, 1978, 6, 74, 2395, 5, 613, 73, 6, 5194, 1, 24103, 5, 1983, 10166, 1, 5786, 1499, 36, 51, 66, 204, 145, 67, 1199, 5194, 19869, 1, 37442, 4, 1, 221, 883, 31, 2988, 71, 4, 1, 5787, 10, 686, 2, 67, 1499, 54, 10, 216, 1, 383, 9, 62, 3, 1406, 3686, 783, 5, 3483, 180, 1, 382, 10, 1212, 13583, 32, 308, 3, 349, 341, 2913, 10, 143, 127, 5, 7690, 30, 4, 129, 5194, 1406, 2326, 5, 21025, 308, 10, 528, 12, 109, 1448, 4, 60, 543, 102, 12, 21025, 308, 6, 227, 4146, 48, 3, 2211, 12, 8, 215, 23]]
[1]


In [9]:
#make all reviews same size
seq_length = 200 
def pad_truncate_features(original_features,length):
    #define list of (length) size lists filled with 0
    features = np.zeros((len(original_features),length),dtype=int)
    #add first (length) integers and pad 0's if len(row) < length
    for indx, feature in enumerate(original_features):
        features[indx, -len(feature):] = np.array(feature)[:length]
    return features
        
#len(pad_truncate_features(encoded_reviews[:1],seq_length)[0])
encoded_reviews = pad_truncate_features(encoded_reviews,seq_length)

In [20]:
#create train sets, validate sets, test sets
split_idx_train = int(len(encoded_reviews)*0.8) # train : 80%  , test & validation 20%
split_idx_validate = int(len(encoded_reviews)*0.9) # test : 10% , validation : 10%

train_x, test_x, validate_x = encoded_reviews[:split_idx_train], encoded_reviews[split_idx_train:split_idx_validate], encoded_reviews[split_idx_validate:]
train_y, test_y, validate_y = encoded_labels[:split_idx_train], encoded_labels[split_idx_train:split_idx_validate], encoded_labels[split_idx_validate:]

print("train set : ",len(train_x))
print("validation set : ",len(validate_x))
print("testset : ",len(test_x))

train set :  20000
validation set :  2500
testset :  2500


In [21]:
batch_size = 50
#create tensor dataset then use dataloader on it
train_data = TensorDataset(torch.from_numpy(train_x),torch.from_numpy(np.array(train_y)))
validate_data = TensorDataset(torch.from_numpy(validate_x),torch.from_numpy(np.array(validate_y)))
test_data = TensorDataset(torch.from_numpy(test_x),torch.from_numpy(np.array(test_y)))

train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True)
validate_loader = DataLoader(validate_data,batch_size=batch_size,shuffle=True)
test_loader = DataLoader(test_data,batch_size=batch_size,shuffle=True)

In [22]:
class RNNModel(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_dim,num_layers,drop_prob,device):
        super(RNNModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.device = device
        self.n_layers = num_layers
        self.Embeddings = nn.Embedding(vocab_size,embedding_dim)
        self.LSTM = nn.LSTM(embedding_dim,hidden_dim,num_layers,batch_first=True,dropout=drop_prob)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sig = nn.Sigmoid()
    def forward(self,x,hidden):
        batch_size = x.size(0)
        x = self.Embeddings(x)
        lstm_out, hidden = self.LSTM(x,hidden)
        
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.sig(out)
        sig_out = out.view(batch_size, -1)
        sig_out = sig_out[:, -1]
        return sig_out, hidden
    def init_hidden(self,batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(self.device),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(self.device))
        return hidden

In [23]:
word_size = len(word_to_int)+1 # +1 for the 0 padding + our word tokens
embedding_dim = 400
hidden_dim = 256
n_layers = 2

net = RNNModel(word_size, embedding_dim, hidden_dim, n_layers,0.5,device)
net.to(device)
print(net)

RNNModel(
  (Embeddings): Embedding(74073, 400)
  (LSTM): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


  "PyTorch was compiled without cuDNN support. To use cuDNN, rebuild "


In [None]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(),lr=0.01)

In [None]:
epochs = 10
clip = 5
net.train()
iter_counter = 0
for e in range(epochs):
    #initialize hidden state
    hidden = net.init_hidden(batch_size)
    train_loss = 0
    for batch, labels in train_loader:
        iter_counter+=1
        batch, labels = batch.to(device), labels.to(device)
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        hidden = tuple([each.data for each in hidden])
        
        out, hidden = net.forward(batch,hidden)
        # zero accumulated gradients
        net.zero_grad()
        loss = criterion(out,labels.float())
        loss.backward()
        #clip gradient to prevent exploding
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()
        train_loss += loss.item()*batch.size(0)
    #Validation
    val_h = net.init_hidden(batch_size)
    validation_loss = 0
    correct_predictions_count = 0
    net.eval()
    for v_batch, labels in validate_loader:
        v_batch, labels = v_batch.to(device), labels.to(device)
        val_h = tuple([each.data for each in val_h])
        out, val_h = net.forward(v_batch,val_h)
        loss = criterion(out, labels.float())
        validation_loss += loss.item()*v_batch.size(0)
        
        out = torch.round(out)
        correct = out.eq(labels.data.view_as(out).float())
        correct_predictions_count += torch.sum(correct)
        
        
    train_loss /=len(train_loader.dataset)
    validation_loss /= len(validate_loader.dataset)
    correct_predictions_count = correct_predictions_count.cpu().numpy()/len(validate_loader.dataset)
    print("Epoch: {}".format(e+1),
         "\ttraining loss: {}\t\tValidation loss: {}\nvalidation accuracy: {}%".format(train_loss,
                                                validation_loss,correct_predictions_count*100))
    net.train()

Epoch: 1 	training loss: 0.578457690924406		Validation loss: 0.47609719276428225
validation accuracy: 76.44%
Epoch: 2 	training loss: 0.3790711837634444		Validation loss: 0.49164815604686735
validation accuracy: 78.0%
Epoch: 3 	training loss: 0.28937404915690423		Validation loss: 0.5157116782665253
validation accuracy: 77.03999999999999%
Epoch: 4 	training loss: 0.25124864804558455		Validation loss: 0.5582127249240876
validation accuracy: 76.6%
Epoch: 5 	training loss: 0.23063293190672993		Validation loss: 0.5611292371153831
validation accuracy: 77.56%
Epoch: 6 	training loss: 0.21957946578040718		Validation loss: 0.524649131000042
validation accuracy: 76.92%
