In [0]:
import numpy as np

# read data from text files
with open('data/reviews_sample.txt', 'r') as f:
    reviews = f.read()
with open('data/labels_sample.txt', 'r') as f:
    labels = f.read()

In [0]:
from string import punctuation

# get rid of punctuation
reviews = reviews.lower() # lowercase, standardize
all_text = ''.join([c for c in reviews if c not in punctuation])

# split by new lines and spaces
reviews_split = all_text.split('\n')

In [0]:
reviews_split=reviews_split[:-1]

In [0]:
# 1=positive, 0=negative label conversion
labels_split = labels.split('\n')
labels_split=labels_split[:-1]
encoded_labels_ = np.array([1 if label == 'positive' else 0 for label in labels_split])

In [6]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/70/1a/364556102943cacde1ee00fdcae3b1615b39e52649eddbf54953e5b144c9/transformers-2.2.1-py3-none-any.whl (364kB)
[K     |█                               | 10kB 21.7MB/s eta 0:00:01[K     |█▉                              | 20kB 1.7MB/s eta 0:00:01[K     |██▊                             | 30kB 2.5MB/s eta 0:00:01[K     |███▋                            | 40kB 1.7MB/s eta 0:00:01[K     |████▌                           | 51kB 2.1MB/s eta 0:00:01[K     |█████▍                          | 61kB 2.5MB/s eta 0:00:01[K     |██████▎                         | 71kB 2.8MB/s eta 0:00:01[K     |███████▏                        | 81kB 3.2MB/s eta 0:00:01[K     |████████                        | 92kB 3.6MB/s eta 0:00:01[K     |█████████                       | 102kB 2.7MB/s eta 0:00:01[K     |█████████▉                      | 112kB 2.7MB/s eta 0:00:01[K     |██████████▊                     | 122kB 2.7M

In [7]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import DistilBertModel, DistilBertTokenizer, AdamW


BERT_MODEL='distilbert-base-uncased'

tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL)
model = DistilBertModel.from_pretrained(BERT_MODEL)

100%|██████████| 231508/231508 [00:00<00:00, 4550338.48B/s]
100%|██████████| 492/492 [00:00<00:00, 252922.85B/s]
100%|██████████| 267967963/267967963 [00:04<00:00, 56366420.60B/s]


In [8]:
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [9]:
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [10]:
max_input_length = tokenizer.max_model_input_sizes[BERT_MODEL]

print(max_input_length)

512


In [0]:
def tokenize_and_cut(sentence):
    tokens="[CLS] "+sentence+" [SEP]"
    tokens = tokenizer.tokenize(tokens) 
    tokens = tokens[:max_input_length-2]
    return tokens

def pad_features(reviews_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    
    # getting the correct rows x cols shape
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)

    # for each review, I grab that review and 
    for i, row in enumerate(reviews_ints):
        features[i, :len(row)] = np.array(row)[:seq_length]
    
    return features

In [0]:
tokenized_texts = [tokenize_and_cut(sent) for sent in reviews_split]


In [0]:
input_ids=[tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts]

In [0]:
max_len=200

input_ids=pad_features(input_ids, max_len)

In [0]:
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

In [16]:
split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)

split_idx = int(len(input_ids)*split_frac)
tr_inputs, remaining_inputs = input_ids[:split_idx], input_ids[split_idx:]
tr_tags, remaining_tags = encoded_labels_[:split_idx], encoded_labels_[split_idx:]
tr_masks, remaining_masks = attention_masks[:split_idx], attention_masks[split_idx:]

test_idx = int(len(remaining_inputs)*0.5)
val_inputs, test_inputs = remaining_inputs[:test_idx], remaining_inputs[test_idx:]
val_tags, test_tags = remaining_tags[:test_idx], remaining_tags[test_idx:]
val_masks, test_masks = remaining_masks[:test_idx], remaining_masks[test_idx:]

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(tr_inputs.shape), 
      "\nValidation set: \t{}".format(val_inputs.shape),
      "\nTest set: \t\t{}".format(test_inputs.shape))

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


In [0]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
test_inputs=torch.tensor(test_inputs)

tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
test_tags=torch.tensor(test_tags)

tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
test_masks=torch.tensor(test_masks)


In [0]:
BATCH_SIZE=64

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=BATCH_SIZE)

test_data = TensorDataset(test_inputs, test_masks, test_tags)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

In [0]:
import torch.nn as nn
import torch.nn.functional as F

class DistilBert_FFNN(nn.Module):
    def __init__(self, bert):
        super(DistilBert_FFNN, self).__init__()
        self.bert=bert
        self.fc1=nn.Linear(768,100)
        self.fc2=nn.Linear(100,1)
        self.drop=nn.Dropout(0.3)

    def forward(self, text, mask):
        with torch.no_grad():
            last_hidden_states=self.bert(text, attention_mask=mask)[0][:,0,:]
        out=self.drop(F.relu(self.fc1(last_hidden_states)))
        out=F.relu(self.fc2(out))
        return out.squeeze()


In [0]:
net=DistilBert_FFNN(model)

In [0]:
for name, param in net.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

In [22]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(net):,} trainable parameters')

The model has 77,001 trainable parameters


In [23]:
for name, param in net.named_parameters():                
    if param.requires_grad:
        print(name)

fc1.weight
fc1.bias
fc2.weight
fc2.bias


In [0]:
import torch.optim as optim

optimizer = optim.Adam(net.parameters(), lr=0.0001) #cuidado con el lr. Si lr=0.003 no aprende res...

In [0]:
criterion = nn.BCEWithLogitsLoss()


In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)
net = net.to(device)
criterion = criterion.to(device)

cuda


In [0]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [0]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for text, mask, labs in iterator:
        text=text.cuda()
        mask=mask.cuda()
        labs=labs.cuda()
        labs=labs.float()
        
        optimizer.zero_grad()
        
        predictions = model(text, mask)
        
        loss = criterion(predictions, labs)
        
        acc = binary_accuracy(predictions, labs)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for text, mask, labs in iterator:
            text=text.cuda()
            mask=mask.cuda()
            labs=labs.cuda()
            labs=labs.float()

            predictions = model(text, mask)
            
            loss = criterion(predictions, labs)
            
            acc = binary_accuracy(predictions, labs)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [31]:
N_EPOCHS = 20

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss, train_acc = train(net, train_dataloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(net, valid_dataloader, criterion)
        
    end_time = time.time()
        
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(net.state_dict(), 'tut6-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 4m 41s
	Train Loss: 0.644 | Train Acc: 63.49%
	 Val. Loss: 0.594 |  Val. Acc: 77.03%
Epoch: 02 | Epoch Time: 4m 41s
	Train Loss: 0.579 | Train Acc: 75.23%
	 Val. Loss: 0.563 |  Val. Acc: 78.59%
Epoch: 03 | Epoch Time: 4m 41s
	Train Loss: 0.561 | Train Acc: 78.27%
	 Val. Loss: 0.556 |  Val. Acc: 78.16%
Epoch: 04 | Epoch Time: 4m 41s
	Train Loss: 0.553 | Train Acc: 79.55%
	 Val. Loss: 0.550 |  Val. Acc: 81.60%
Epoch: 05 | Epoch Time: 4m 41s
	Train Loss: 0.548 | Train Acc: 80.45%
	 Val. Loss: 0.547 |  Val. Acc: 80.74%
Epoch: 06 | Epoch Time: 4m 41s
	Train Loss: 0.547 | Train Acc: 80.84%
	 Val. Loss: 0.545 |  Val. Acc: 81.29%
Epoch: 07 | Epoch Time: 4m 41s
	Train Loss: 0.543 | Train Acc: 81.18%
	 Val. Loss: 0.543 |  Val. Acc: 81.76%
Epoch: 08 | Epoch Time: 4m 41s
	Train Loss: 0.542 | Train Acc: 81.50%
	 Val. Loss: 0.543 |  Val. Acc: 81.41%
Epoch: 09 | Epoch Time: 4m 41s
	Train Loss: 0.540 | Train Acc: 81.53%
	 Val. Loss: 0.540 |  Val. Acc: 82.19%
Epoch: 10 | Epoch T

In [33]:
test_loss, test_acc = evaluate(net, test_dataloader,criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.535 | Test Acc: 82.03%
