In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np

# read data from text files
with open('/content/drive/My Drive/Colab Notebooks/Sentiment/data/reviews.txt', 'r') as f:
    reviews = f.read()
with open('/content/drive/My Drive/Colab Notebooks/Sentiment/data/labels.txt', 'r') as f:
    labels = f.read()

In [0]:
from string import punctuation

# get rid of punctuation
reviews = reviews.lower() # lowercase, standardize
all_text = ''.join([c for c in reviews if c not in punctuation])

# split by new lines and spaces
reviews_split = all_text.split('\n')
all_text = ' '.join(reviews_split)

# create a list of words
words = all_text.split()

In [0]:
# feel free to use this import 
from collections import Counter

## Build a dictionary that maps words to integers
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}
vocab_to_int['<pad>']=0

int2vocab={i:w for w,i in vocab_to_int.items()}

## use the dict to tokenize each review in reviews_split
## store the tokenized reviews in reviews_ints
reviews_ints = []
for review in reviews_split:
    reviews_ints.append([vocab_to_int[word] for word in review.split()])

In [0]:
# 1=positive, 0=negative label conversion
labels_split = labels.split('\n')
encoded_labels = np.array([1 if label == 'positive' else 0 for label in labels_split])

In [0]:
non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]

# remove 0-length reviews and their labels
reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
encoded_labels = np.array([encoded_labels[ii] for ii in non_zero_idx])
lens=list(map(len, reviews_ints))

In [0]:
def pad_features(reviews_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    
    # getting the correct rows x cols shape
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)

    # for each review, I grab that review and 
    for i, row in enumerate(reviews_ints):
        features[i, :len(row)] = np.array(row)[:seq_length]
    
    return features

In [0]:

seq_length = 200
seq_length = max(lens)
# seq_length = int(np.percentile(lens,95))

features = pad_features(reviews_ints,seq_length)

review_lengths = np.sum(features!=0,axis=1)


In [9]:
split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)

split_idx = int(len(features)*split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]
train_length, remaining_length=review_lengths[:split_idx], review_lengths[split_idx:]

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]
val_legth, test_length = remaining_length[:test_idx], remaining_length[test_idx:]

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 2514) 
Validation set: 	(2500, 2514) 
Test set: 		(2500, 2514)


In [0]:
import torch
from torch.utils.data import TensorDataset, DataLoader

train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_length), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_legth), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_length), torch.from_numpy(test_y))

# dataloaders
BATCH_SIZE = 64

# make sure the SHUFFLE your training data
train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, shuffle=True, batch_size=BATCH_SIZE)

In [0]:
import torch.nn as nn


class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(LSTM, self).__init__()
  
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout = 0 if n_layers < 2 else dropout)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)


    def forward(self, text, text_lengths):

        text=text.long() # ojo con esto
        text=text.transpose(1,0) # ojo con esto too
        text_lengths = text_lengths.long()

        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, enforce_sorted=False)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
                            
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden)

In [0]:
INPUT_DIM = len(vocab_to_int)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 1
BIDIRECTIONAL = True
DROPOUT = 0.5
# PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = LSTM(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT)

In [13]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 8,140,997 trainable parameters


In [0]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [0]:
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [0]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [0]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for text, text_lengths, label in iterator:
        text, text_lengths, label=text.cuda(), text_lengths.cuda(), label.cuda()

        optimizer.zero_grad()
      
        predictions = model(text, text_lengths).squeeze(1)
        
        loss = criterion(predictions, label.float())
        
        acc = binary_accuracy(predictions, label.float())
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for text, text_lengths, label in iterator:
            text, text_lengths, label=text.cuda(), text_lengths.cuda(), label.cuda()
            
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, label.float())
            
            acc = binary_accuracy(predictions, label.float())

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [21]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 58s
	Train Loss: 0.656 | Train Acc: 60.17%
	 Val. Loss: 0.558 |  Val. Acc: 72.34%
Epoch: 02 | Epoch Time: 0m 59s
	Train Loss: 0.582 | Train Acc: 69.33%
	 Val. Loss: 0.556 |  Val. Acc: 72.93%
Epoch: 03 | Epoch Time: 1m 0s
	Train Loss: 0.551 | Train Acc: 71.84%
	 Val. Loss: 0.613 |  Val. Acc: 67.46%
Epoch: 04 | Epoch Time: 0m 55s
	Train Loss: 0.522 | Train Acc: 73.86%
	 Val. Loss: 0.508 |  Val. Acc: 74.26%
Epoch: 05 | Epoch Time: 0m 54s
	Train Loss: 0.506 | Train Acc: 75.48%
	 Val. Loss: 0.522 |  Val. Acc: 74.92%
Epoch: 06 | Epoch Time: 0m 59s
	Train Loss: 0.397 | Train Acc: 82.19%
	 Val. Loss: 0.767 |  Val. Acc: 68.79%
Epoch: 07 | Epoch Time: 0m 55s
	Train Loss: 0.361 | Train Acc: 84.27%
	 Val. Loss: 0.357 |  Val. Acc: 85.74%
Epoch: 08 | Epoch Time: 1m 0s
	Train Loss: 0.310 | Train Acc: 87.02%
	 Val. Loss: 0.349 |  Val. Acc: 85.51%
Epoch: 09 | Epoch Time: 0m 59s
	Train Loss: 0.277 | Train Acc: 88.58%
	 Val. Loss: 0.331 |  Val. Acc: 86.29%
Epoch: 10 | Epoch Tim

In [22]:
# model.load_state_dict(torch.load('tut2-model.pt'))

test_loss, test_acc = evaluate(model, test_loader, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.368 | Test Acc: 83.91%


In [0]:
def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok for tok in sentence.split()]
    indexed = [vocab_to_int[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    tensor=tensor.reshape(1,-1)
    length_tensor = torch.LongTensor(length)
    pred = model(tensor)
    prediction = torch.sigmoid(pred)
    return prediction.item()

In [24]:
predict_sentiment(model, "this film is terrible")


TypeError: ignored

In [0]:

predict_sentiment(model, "this film is great")