In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np

# read data from text files
with open('/content/drive/My Drive/Colab Notebooks/Sentiment/data/reviews.txt', 'r') as f:
    reviews = f.read()
with open('/content/drive/My Drive/Colab Notebooks/Sentiment/data/labels.txt', 'r') as f:
    labels = f.read()

In [0]:
from string import punctuation

# get rid of punctuation
reviews = reviews.lower() # lowercase, standardize
all_text = ''.join([c for c in reviews if c not in punctuation])

# split by new lines and spaces
reviews_split = all_text.split('\n')
all_text = ' '.join(reviews_split)

# create a list of words
words = all_text.split()

In [0]:
# feel free to use this import 
from collections import Counter

## Build a dictionary that maps words to integers
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}
vocab_to_int['<pad>']=0

int2vocab={i:w for w,i in vocab_to_int.items()}

## use the dict to tokenize each review in reviews_split
## store the tokenized reviews in reviews_ints
reviews_ints = []
for review in reviews_split:
    reviews_ints.append([vocab_to_int[word] for word in review.split()])

In [0]:
# 1=positive, 0=negative label conversion
labels_split = labels.split('\n')
encoded_labels = np.array([1 if label == 'positive' else 0 for label in labels_split])

In [0]:
non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]

# remove 0-length reviews and their labels
reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
encoded_labels = np.array([encoded_labels[ii] for ii in non_zero_idx])

In [0]:
review_lengths = np.array(list(map(len,reviews_ints)))

In [0]:
def pad_features(reviews_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    
    # getting the correct rows x cols shape
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)

    # for each review, I grab that review and 
    for i, row in enumerate(reviews_ints):
        features[i, :len(row)] = np.array(row)[:seq_length]
    
    return features

In [0]:

seq_length = max(review_lengths)
# seq_length = int(np.percentile(review_lengths,95))
features = pad_features(reviews_ints,seq_length)

In [0]:
review_lengths = np.sum(features!=0,axis=1)


In [0]:
from torch.utils.data import TensorDataset, DataLoader
import torch

In [12]:
split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)

split_idx = int(len(features)*split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]
train_length, remaining_length=review_lengths[:split_idx], review_lengths[split_idx:]

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]
val_legth, test_length = remaining_length[:test_idx], remaining_length[test_idx:]

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 2514) 
Validation set: 	(2500, 2514) 
Test set: 		(2500, 2514)


In [0]:
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_length), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_legth), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_length), torch.from_numpy(test_y))


In [0]:
def my_collate(batch):
    # batch contains a list of tuples of structure (sequence, target)
    data = [item[0] for item in batch]
    lengths = [item[1] for item in batch]
    targets = [item[2] for item in batch]
    tuples=list(zip(data, lengths, targets))
    tuples = sorted(tuples, key=lambda s: s[1],reverse=True)
    data = [item[0] for item in tuples]
    lengths = [item[1] for item in tuples]
    targets = [item[2] for item in tuples]
    return [torch.stack(data), torch.stack(lengths), torch.stack(targets)]


In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [0]:
# from torchtext import data
# from torchtext import datasets
# # dataloaders


# train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
#     (train_data, valid_data, test_data),
#     sort_key = lambda x: x.s, #sort by s attribute (quote)
#     batch_size=BATCH_SIZE,
#     device=device)


BATCH_SIZE = 64
# make sure the SHUFFLE your training data
train_loader = DataLoader(train_data, shuffle=True,batch_size=BATCH_SIZE)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, shuffle=True,batch_size=BATCH_SIZE)


In [0]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, PackedSequence

class WordAttention(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, att_size, output_dim, dropout):
        super(WordAttention, self).__init__()

        self.emb = nn.Embedding(vocab_size, embedding_dim)
        
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim, 
                          num_layers=n_layers, 
                          bidirectional=True,
                          dropout = 0 if n_layers < 2 else dropout,
                          batch_first=True)
        
        self.att = nn.Linear(2 * hidden_dim, att_size)
        
        self.context_vector = nn.Linear(att_size, 1, bias=False)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)


    def forward(self, text, text_lengths):
        #text = [batch size, sent len]

        text=text.long()
        text_lengths = text_lengths.long()

        embedded = self.dropout(self.emb(text))  
        
        #embedded = [batch size, sent len, emb dim]
        
        packed_words = pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
        
        packed_words, _ = self.rnn(packed_words)
        
        #packed_words.data = [n words, hid dim * num directions]
        
        att_w = torch.tanh(self.att(packed_words.data))
        
        #att_w = [n words, att size]
            
        att_w = self.context_vector(att_w).squeeze(1)
        
        #att_w = [n words]
        
        max_value = att_w.max()  # scalar, for numerical stability during exponent calculation
        
        att_w = torch.exp(att_w - max_value)
        
        #att_w = [n words]
        
        att_w, _ = pad_packed_sequence(PackedSequence(data=att_w,
                                                      batch_sizes=packed_words.batch_sizes,
                                                      sorted_indices=packed_words.sorted_indices,
                                                      unsorted_indices=packed_words.unsorted_indices),
                                       batch_first=True)  
        
        #att_w = [batch size, max(text_lengths)]
        
        word_alphas = att_w / torch.sum(att_w, dim=1, keepdim=True)
        
        #word_alphas = [batch size, max(text_lengths)]
        
        sentences, _ = pad_packed_sequence(packed_words,
                                           batch_first=True)
        
        #sentences = [batch size, max(text_lengths), hid dim * num directions ]
        
        sentences = sentences * word_alphas.unsqueeze(2)
        
        #sentences = [batch size, max(text_lengths), hid dim * num directions ]
        
        sentences = sentences.sum(dim=1)
        
        #sentences = [batch size, hid dim * num directions]
        
        output = self.fc(sentences)
        
        #output = [batch size, output dim]
        
        return output


In [0]:
INPUT_DIM = len(vocab_to_int)
EMBEDDING_DIM = 100
OUTPUT_DIM = 1
HIDDEN_DIM = 100
N_LAYERS = 1
ATT_DIM = 100
DROPOUT = 0.5
# PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model=WordAttention(INPUT_DIM, 
                    EMBEDDING_DIM, 
                    HIDDEN_DIM, 
                    N_LAYERS, 
                    ATT_DIM,
                    OUTPUT_DIM,
                    DROPOUT)

In [19]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 7,548,901 trainable parameters


In [0]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [0]:
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [0]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [0]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for text, text_lengths, label in iterator:
        text, text_lengths, label=text.cuda(), text_lengths.cuda(), label.cuda()

        optimizer.zero_grad()
      
        predictions = model(text, text_lengths).squeeze(1)
        
        loss = criterion(predictions, label.float())
        
        acc = binary_accuracy(predictions, label.float())
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for text, text_lengths, label in iterator:
            text, text_lengths, label=text.cuda(), text_lengths.cuda(), label.cuda()
            
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, label.float())
            
            acc = binary_accuracy(predictions, label.float())

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [26]:
N_EPOCHS = 20

best_valid_loss = float('inf')
counter = 0
patience = 2

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    
    if valid_loss < best_valid_loss:
      best_valid_loss = valid_loss
      torch.save(model.state_dict(), 'tut2-model.pt')
      counter = 0 
    else:
      counter += 1
      if counter >= patience:
          break
    
    

Epoch: 01 | Epoch Time: 0m 48s
	Train Loss: 0.569 | Train Acc: 69.61%
	 Val. Loss: 0.527 |  Val. Acc: 76.05%
Epoch: 02 | Epoch Time: 0m 49s
	Train Loss: 0.418 | Train Acc: 81.03%
	 Val. Loss: 0.460 |  Val. Acc: 82.93%
Epoch: 03 | Epoch Time: 0m 49s
	Train Loss: 0.340 | Train Acc: 85.44%
	 Val. Loss: 0.393 |  Val. Acc: 85.04%
Epoch: 04 | Epoch Time: 0m 49s
	Train Loss: 0.281 | Train Acc: 88.60%
	 Val. Loss: 0.400 |  Val. Acc: 87.19%
Epoch: 05 | Epoch Time: 0m 49s
	Train Loss: 0.250 | Train Acc: 89.93%
	 Val. Loss: 0.371 |  Val. Acc: 87.73%
Epoch: 06 | Epoch Time: 0m 49s
	Train Loss: 0.222 | Train Acc: 91.34%
	 Val. Loss: 0.417 |  Val. Acc: 86.95%
Epoch: 07 | Epoch Time: 0m 49s
	Train Loss: 0.198 | Train Acc: 92.32%
	 Val. Loss: 0.409 |  Val. Acc: 87.54%


In [27]:
# model.load_state_dict(torch.load('tut2-model.pt'))
test_loss, test_acc = evaluate(model, test_loader, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.405 | Test Acc: 86.88%
