In [1]:
import numpy as np

# Load the raw corpus: review text and sentiment labels are kept in two separate files.
with open('data/reviews_sample.txt') as f:
    reviews = f.read()
with open('data/labels_sample.txt') as f:
    labels = f.read()

In [0]:
from string import punctuation

# Standardise case first, then delete every punctuation character in a single pass.
reviews = reviews.lower()
all_text = reviews.translate(str.maketrans('', '', punctuation))

# Each line of the file is one review; keep them separate for per-review tokenisation.
reviews_split = all_text.split('\n')
# Re-join with spaces so the whole corpus becomes one whitespace-delimited string.
all_text = ' '.join(reviews_split)

# Flat list of every word occurrence in the corpus.
words = all_text.split()

In [0]:
# feel free to use this import 
from collections import Counter

## Word -> integer lookup, most frequent word first.
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
# Real words are numbered from 1 so that 0 stays free for the padding token.
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))
vocab_to_int['<pad>'] = 0

# Reverse lookup: integer -> word.
int2vocab = {idx: word for word, idx in vocab_to_int.items()}

## Tokenise every review into its list of integer ids.
reviews_ints = [
    [vocab_to_int[word] for word in review.split()]
    for review in reviews_split
]

In [0]:
# Encode sentiment as integers: 'positive' -> 1, anything else -> 0.
labels_split = labels.split('\n')
encoded_labels = np.array([int(label == 'positive') for label in labels_split])

In [0]:
# Positions of the reviews that contain at least one token.
non_zero_idx = [idx for idx, tokens in enumerate(reviews_ints) if tokens]


In [0]:
# Keep only the non-empty reviews, and the labels at the same positions.
# NOTE: this rebinds reviews_ints / encoded_labels, so the cell must run exactly
# once after non_zero_idx is computed.
reviews_ints = [reviews_ints[idx] for idx in non_zero_idx]
encoded_labels = np.array([encoded_labels[idx] for idx in non_zero_idx])
# Token count of every remaining review.
lens = [len(tokens) for tokens in reviews_ints]

In [0]:
def pad_features(reviews_ints, seq_length):
    """Build a fixed-width integer matrix from variable-length token lists.

    Each review fills one row of width ``seq_length``: reviews longer than that
    are cut off after ``seq_length`` tokens; shorter ones keep their tokens at
    the start of the row and are filled with trailing zeros.

    Returns a NumPy integer array of shape ``(len(reviews_ints), seq_length)``.
    """
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)

    # Iterating a 2-D array yields row views, so writing to out_row fills features.
    for out_row, review in zip(features, reviews_ints):
        clipped = np.array(review)[:seq_length]
        out_row[:len(clipped)] = clipped

    return features

In [0]:
# Every review is padded or truncated to this many tokens.
seq_length = 200

features = pad_features(reviews_ints, seq_length=seq_length)

In [0]:
# Fraction of the data used for training; the rest is halved into validation and test.
split_frac = 0.8

## Cut off the training portion first (features and labels are sliced identically).
split_idx = int(len(features) * split_frac)
train_x, train_y = features[:split_idx], encoded_labels[:split_idx]
remaining_x, remaining_y = features[split_idx:], encoded_labels[split_idx:]

## Split what is left down the middle: first half validation, second half test.
test_idx = int(len(remaining_x) * 0.5)
val_x, val_y = remaining_x[:test_idx], remaining_y[:test_idx]
test_x, test_y = remaining_x[test_idx:], remaining_y[test_idx:]

## Report the resulting feature shapes.
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


In [0]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Wrap each NumPy split as a (features, labels) tensor dataset.
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
BATCH_SIZE = 64

# Shuffle ONLY the training data. The evaluation loaders keep a fixed order:
# evaluate() averages per-batch metrics, so if the samples landing in the final,
# smaller batch changed on every pass, validation/test numbers would not be
# reproducible from run to run.
train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = DataLoader(valid_data, shuffle=False, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, shuffle=False, batch_size=BATCH_SIZE)

In [0]:
import torch.nn as nn
import torch.nn.functional as F

class FastText(nn.Module):
    """Bag-of-embeddings classifier: embed tokens, average them, apply a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        output_dim: number of output units of the final linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim):
        # nn.Module.__init__ must run before any sub-module is assigned.
        # The previous `super(FastText).__init__()` built an unbound super object
        # and never initialised the Module, so `self.embedding = ...` raised
        # "cannot assign module before Module.__init__() call".
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text):
        """Map a batch of token ids to raw (pre-sigmoid) scores.

        Args:
            text: integer tensor of shape [batch size, sent len].

        Returns:
            Tensor of shape [batch size, output_dim].
        """
        # Careful: nn.Embedding needs integer (long) indices, so cast defensively.
        text = text.long()

        embedded = self.embedding(text)  # embedded = [batch size, sent len, emb dim]

        # Average over the sequence axis. Every position contributes equally,
        # including any padding ids present in `text`.
        pooled = embedded.mean(dim=1)  # pooled = [batch size, emb dim]

        return self.fc(pooled)

In [0]:
# Pull a single mini-batch (features, labels) from the training loader for inspection.
batch=next(iter(train_loader))

In [0]:
# Shape of the feature tensor in the sampled batch.
batch[0].shape

torch.Size([64, 200])

In [0]:
# Model hyperparameters: one embedding row per vocabulary entry (padding included),
# 100-dimensional embeddings, and a single output unit.
INPUT_DIM = len(vocab_to_int)
EMBEDDING_DIM, OUTPUT_DIM = 100, 1

model = FastText(vocab_size=INPUT_DIM, embedding_dim=EMBEDDING_DIM, output_dim=OUTPUT_DIM)

In [0]:
def count_parameters(model):
    """Return the total number of scalar weights in ``model`` that require gradients."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad == False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

# Report the number of trainable weights, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 7,407,401 trainable parameters


In [0]:
import torch.optim as optim

# Adam over all model parameters.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [0]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [0]:
# Place the model and the loss on the selected device.
# BCEWithLogitsLoss takes raw logits, matching the model's un-activated output.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)

In [0]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (e.g. 8 right out of 10 -> 0.8, not 8).

    ``preds`` are raw logits; they are passed through a sigmoid and rounded to
    0/1 before being compared element-wise with the targets ``y``.
    """
    probabilities = torch.sigmoid(preds)
    # Cast the boolean matches to float so they can be summed and divided.
    hits = torch.eq(torch.round(probabilities), y).float()
    return hits.sum() / len(hits)

In [0]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator`` and update ``model`` in place.

    Args:
        model: module mapping a batch of inputs to scores of shape [batch, 1].
        iterator: sized iterable of ``(text, label)`` tensor pairs; must be
            non-empty because the totals are divided by ``len(iterator)``.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss called as ``criterion(predictions, float_labels)``.

    Returns:
        ``(mean batch loss, mean batch accuracy)`` as Python floats.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    # Send each batch to wherever the model's weights live instead of calling
    # .cuda() unconditionally, which crashes on machines without a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    for text, label in iterator:
        text = text.to(device)
        # Labels are cast to float before the loss/accuracy computation.
        label = label.to(device).float()

        optimizer.zero_grad()

        # Drop the trailing singleton dimension: [batch, 1] -> [batch].
        predictions = model(text).squeeze(1)

        loss = criterion(predictions, label)
        acc = binary_accuracy(predictions, label)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating any weights.

    Args:
        model: module mapping a batch of inputs to scores of shape [batch, 1].
        iterator: sized iterable of ``(text, label)`` tensor pairs; must be
            non-empty because the totals are divided by ``len(iterator)``.
        criterion: loss called as ``criterion(predictions, float_labels)``.

    Returns:
        ``(mean batch loss, mean batch accuracy)`` as Python floats.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    # Send each batch to wherever the model's weights live instead of calling
    # .cuda() unconditionally, which crashes on machines without a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # No gradients are needed for evaluation.
    with torch.no_grad():

        for text, label in iterator:
            text = text.to(device)
            # Labels are cast to float before the loss/accuracy computation.
            label = label.to(device).float()

            # Drop the trailing singleton dimension: [batch, 1] -> [batch].
            predictions = model(text).squeeze(1)

            loss = criterion(predictions, label)
            acc = binary_accuracy(predictions, label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns ``(minutes, seconds)`` as ints, each truncated toward zero.
    """
    total = end_time - start_time
    mins = int(total / 60)
    # Whatever is left after removing the whole minutes, truncated to an int.
    secs = int(total - 60 * mins)
    return mins, secs

In [0]:
# Number of full passes over the training data.
N_EPOCHS = 12

# Lowest validation loss seen so far; starts at +inf so the first epoch always checkpoints.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One pass over the training data (updates the weights), then score the validation set.
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when validation loss improves, so the file holds the best weights so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # NOTE(review): absolute Colab / Google Drive path — presumably requires Drive
        # to be mounted and the folder to exist; confirm before running elsewhere.
        torch.save(model.state_dict(), '/content/gdrive/My Drive/Colab Notebooks/Sentiment/tut2-model.pt')
    
    # Per-epoch summary: wall-clock time plus train/validation loss and accuracy.
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 2s
	Train Loss: 0.674 | Train Acc: 63.21%
	 Val. Loss: 0.643 |  Val. Acc: 71.99%
Epoch: 02 | Epoch Time: 0m 2s
	Train Loss: 0.582 | Train Acc: 76.75%
	 Val. Loss: 0.535 |  Val. Acc: 79.14%
Epoch: 03 | Epoch Time: 0m 2s
	Train Loss: 0.462 | Train Acc: 83.02%
	 Val. Loss: 0.452 |  Val. Acc: 82.42%
Epoch: 04 | Epoch Time: 0m 2s
	Train Loss: 0.375 | Train Acc: 86.53%
	 Val. Loss: 0.408 |  Val. Acc: 83.59%
Epoch: 05 | Epoch Time: 0m 2s
	Train Loss: 0.318 | Train Acc: 88.60%
	 Val. Loss: 0.376 |  Val. Acc: 85.08%
Epoch: 06 | Epoch Time: 0m 2s
	Train Loss: 0.278 | Train Acc: 90.27%
	 Val. Loss: 0.369 |  Val. Acc: 84.96%
Epoch: 07 | Epoch Time: 0m 2s
	Train Loss: 0.247 | Train Acc: 91.54%
	 Val. Loss: 0.358 |  Val. Acc: 85.74%
Epoch: 08 | Epoch Time: 0m 2s
	Train Loss: 0.222 | Train Acc: 92.66%
	 Val. Loss: 0.355 |  Val. Acc: 85.23%
Epoch: 09 | Epoch Time: 0m 2s
	Train Loss: 0.200 | Train Acc: 93.42%
	 Val. Loss: 0.349 |  Val. Acc: 85.82%
Epoch: 10 | Epoch Time: 0m 2

In [0]:
# Restore the checkpoint written by the training loop (the weights with the lowest
# validation loss). Without this, the test metrics describe whatever weights the
# final epoch left behind, not the model that validation selected.
model.load_state_dict(
    torch.load('/content/gdrive/My Drive/Colab Notebooks/Sentiment/tut2-model.pt',
               map_location=device)
)

test_loss, test_acc = evaluate(model, test_loader, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.387 | Test Acc: 84.49%
