In [0]:
import numpy as np

# Load the raw review corpus and its labels into memory as two strings.
with open('data/reviews_sample.txt', 'r') as reviews_file, open('data/labels_sample.txt', 'r') as labels_file:
    reviews = reviews_file.read()
    labels = labels_file.read()

In [0]:
from string import punctuation

# Standardize the corpus: lowercase it, then delete every ASCII punctuation character.
reviews = reviews.lower()
all_text = reviews.translate(str.maketrans('', '', punctuation))

# One review per line (the split is on newlines only).
reviews_split = all_text.split('\n')

In [0]:
# split('\n') leaves an empty last entry when the text ends with a newline.
# Drop it only when it really is empty, so re-running this cell (or loading a
# file without a trailing newline) never throws away a real review.
if reviews_split and not reviews_split[-1].strip():
    reviews_split = reviews_split[:-1]

In [0]:
# 1=positive, 0=negative label conversion
labels_split = labels.split('\n')
# Drop only the empty entry produced by a trailing newline; an unconditional
# [:-1] would discard a real label when the file does not end with a newline.
if labels_split and not labels_split[-1].strip():
    labels_split = labels_split[:-1]
# strip() keeps the comparison correct for '\r\n' line endings or stray spaces.
encoded_labels_ = np.array([1 if label.strip() == 'positive' else 0 for label in labels_split])

In [0]:
# One-hot encode the integer labels: row i is the identity-matrix row selected by label i.
encoded_labels = np.eye(encoded_labels_.max() + 1)[encoded_labels_]

In [0]:
# Number of classes = width of the one-hot label matrix.
nb_labels=encoded_labels.shape[1]

In [0]:
!pip install transformers



In [0]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from transformers import DistilBertTokenizer, DistilBertModel, AdamW



# Name of the pretrained checkpoint shared by the tokenizer and the encoder.
BERT_MODEL='distilbert-base-uncased'

tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL)
# NOTE(review): num_labels is passed to the bare encoder here, while the output
# layer is built separately in BERTGRUSentiment — confirm the argument is needed.
bert = DistilBertModel.from_pretrained(BERT_MODEL, num_labels=nb_labels)

Using TensorFlow backend.


In [0]:
# Special tokens exposed by the tokenizer: CLS, SEP, PAD and UNK.
init_token, eos_token, pad_token, unk_token = (
    tokenizer.cls_token,
    tokenizer.sep_token,
    tokenizer.pad_token,
    tokenizer.unk_token,
)

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [0]:
# Vocabulary ids of the four special tokens, looked up one token at a time.
init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx = (
    tokenizer.convert_tokens_to_ids(special)
    for special in (init_token, eos_token, pad_token, unk_token)
)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [0]:
# Maximum input length the tokenizer registers for this checkpoint name.
# NOTE(review): max_model_input_sizes may not exist in other transformers
# releases — confirm against the installed version.
max_input_length = tokenizer.max_model_input_sizes[BERT_MODEL]

print(max_input_length)

512


In [0]:
def tokenize_and_cut(sentence, max_length=None):
    """Tokenize a sentence and wrap it in the tokenizer's CLS/SEP tokens.

    The word pieces are truncated *before* the special tokens are attached, so
    the result always starts with the CLS token, ends with the SEP token and
    holds at most ``max_length`` tokens. Truncating after attaching them would
    cut the SEP token off every over-long sentence.

    Args:
        sentence: Raw text to tokenize.
        max_length: Upper bound on the returned length, special tokens
            included. Defaults to the module-level ``max_input_length``.

    Returns:
        A list of token strings.
    """
    limit = max_input_length if max_length is None else max_length
    # Two slots are reserved for the CLS and SEP tokens.
    pieces = tokenizer.tokenize(sentence)[:max(limit - 2, 0)]
    return [tokenizer.cls_token] + pieces + [tokenizer.sep_token]

def pad_features(reviews_ints, seq_length):
    """Build a fixed-width integer matrix from variable-length id sequences.

    Each sequence fills one row from the left. Sequences shorter than
    ``seq_length`` are right-padded with zeros; longer ones keep only their
    first ``seq_length`` entries.

    Args:
        reviews_ints: Sequence of integer id sequences, one per review.
        seq_length: Number of columns of the returned matrix.

    Returns:
        An integer array of shape ``(len(reviews_ints), seq_length)``.
    """
    padded = np.zeros((len(reviews_ints), seq_length), dtype=int)

    for row_idx, review in enumerate(reviews_ints):
        # Clip first, then write exactly as many columns as survived the clip.
        clipped = np.array(review)[:seq_length]
        padded[row_idx, :len(clipped)] = clipped

    return padded

In [0]:
# Tokenize every review (special tokens included).
tokenized_texts = list(map(tokenize_and_cut, reviews_split))


In [0]:
# Map each token list to its list of vocabulary ids.
input_ids = list(map(tokenizer.convert_tokens_to_ids, tokenized_texts))

In [0]:
# Fixed sequence length (in tokens) every review is padded or truncated to.
max_len=200

# NOTE(review): rows longer than max_len are cut on the right, which presumably
# also removes their trailing [SEP] token — confirm this is acceptable.
input_ids=pad_features(input_ids, max_len)

In [0]:
# 1.0 where a real token is present, 0.0 on padding positions (id 0).
attention_masks = [[1.0 if token_id > 0 else 0.0 for token_id in row] for row in input_ids]

In [0]:
# Sanity check: labels, token ids and masks should all have the same number of rows.
len(encoded_labels), len(input_ids), len(attention_masks)

(25000, 25000, 25000)

In [0]:
split_frac = 0.8

# Sequential (unshuffled) split: the leading `split_frac` of the rows is the
# training set, everything after it is held out.
split_idx = int(len(input_ids)*split_frac)
tr_inputs, tr_tags, tr_masks = (
    input_ids[:split_idx], encoded_labels_[:split_idx], attention_masks[:split_idx])
remaining_inputs, remaining_tags, remaining_masks = (
    input_ids[split_idx:], encoded_labels_[split_idx:], attention_masks[split_idx:])

# The held-out rows are halved: first half validation, second half test.
test_idx = int(len(remaining_inputs)*0.5)
val_inputs, val_tags, val_masks = (
    remaining_inputs[:test_idx], remaining_tags[:test_idx], remaining_masks[:test_idx])
test_inputs, test_tags, test_masks = (
    remaining_inputs[test_idx:], remaining_tags[test_idx:], remaining_masks[test_idx:])

# Report the shape of each feature matrix.
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(tr_inputs.shape),
      "\nValidation set: \t{}".format(val_inputs.shape),
      "\nTest set: \t\t{}".format(test_inputs.shape))

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


In [0]:
# Convert every split (ids, labels, masks) into torch tensors.
tr_inputs, val_inputs, test_inputs = map(torch.tensor, (tr_inputs, val_inputs, test_inputs))

tr_tags, val_tags, test_tags = map(torch.tensor, (tr_tags, val_tags, test_tags))

tr_masks, val_masks, test_masks = map(torch.tensor, (tr_masks, val_masks, test_masks))

In [0]:
# Number of reviews per mini-batch for all three data loaders.
BATCH_SIZE=64

In [0]:
# Training batches are drawn in random order.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, sampler=train_sampler)

# Validation and test batches are read in their stored order.
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(dataset=valid_data, batch_size=BATCH_SIZE, sampler=valid_sampler)

test_data = TensorDataset(test_inputs, test_masks, test_tags)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE, sampler=test_sampler)

In [0]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    """Sentiment classifier: a pretrained encoder followed by a GRU and a linear layer.

    The encoder turns token ids into contextual embeddings (computed without
    gradient tracking), the GRU summarises the sequence into its final hidden
    state, and the linear layer maps that state to ``output_dim`` raw logits.

    Args:
        bert: Encoder module. Must expose ``config.to_dict()['dim']`` and, when
            called with ids and an attention mask, return a tuple whose first
            item is shaped [batch size, sent len, emb dim].
        hidden_dim: Size of the GRU hidden state (per direction).
        output_dim: Number of output logits.
        n_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU reads the sequence in both directions.
        dropout: Dropout probability applied to the final hidden state, and
            between GRU layers when there are at least two of them.
    """

    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.bert = bert
        embedding_dim = bert.config.to_dict()['dim']
        # Inter-layer GRU dropout only applies when the GRU has 2+ layers.
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, mask):
        """Return raw logits of shape [batch size, out dim].

        Args:
            text: Token ids, shape [batch size, sent len].
            mask: Attention mask aligned with ``text``.
        """
        text = text.long() #text = [batch size, sent len]

        # Use the encoder owned by this module (not a same-named global) so the
        # model works wherever it is constructed and follows .to(device).
        with torch.no_grad():
            embedded = self.bert(text, attention_mask=mask)[0] #embedded = [batch size, sent len, emb dim]

        _, hidden = self.rnn(embedded) #hidden = [n layers * n directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # Concatenate the last layer's forward and backward final states.
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)) #hidden = [batch size, hid dim * 2]
        else:
            hidden = self.dropout(hidden[-1,:,:]) #hidden = [batch size, hid dim]

        output = self.out(hidden) #output = [batch size, out dim]

        return output

In [0]:
# Hyper-parameters of the recurrent head placed on top of the DistilBERT encoder.
HIDDEN_DIM, OUTPUT_DIM, N_LAYERS = 256, 1, 1
BIDIRECTIONAL, DROPOUT = True, 0.5

model = BERTGRUSentiment(
    bert=bert,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)

In [0]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for weight in model.parameters():
        # Parameters with requires_grad=False are not counted.
        if weight.requires_grad:
            total += weight.numel()
    return total

# Trainable-parameter count before any parameter is frozen.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 67,939,329 trainable parameters


In [0]:
# Freeze the encoder: every parameter whose name starts with 'bert' stops requiring gradients.
for param_name, weight in model.named_parameters():
    if param_name.startswith('bert'):
        weight.requires_grad_(False)

In [0]:
# Trainable-parameter count once the 'bert*' parameters have been frozen.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,576,449 trainable parameters


In [0]:
# Print the name of every parameter that still requires gradients.
for param_name, weight in model.named_parameters():
    if not weight.requires_grad:
        continue
    print(param_name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
out.weight
out.bias


In [0]:
import torch.optim as optim

# Adam optimizer over the model's parameters with a learning rate of 2e-5.
optimizer = optim.Adam(model.parameters(), lr=2e-5)

In [0]:
# Binary cross-entropy on raw logits; the model's output layer applies no sigmoid.
criterion = nn.BCEWithLogitsLoss()

In [0]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)
# Move the model and the loss module onto the selected device.
model = model.to(device)
criterion = criterion.to(device)

cuda


In [0]:
def binary_accuracy(preds, y):
    """
    Fraction of predictions in a batch that match the targets
    (8 correct out of 10 gives 0.8, not 8).

    `preds` are raw logits: they go through a sigmoid and are rounded to
    0 or 1 before being compared element-wise with `y`.
    """
    probabilities = torch.sigmoid(preds)
    hits = probabilities.round().eq(y).float()  # 1.0 where prediction == target
    return hits.sum() / len(hits)

In [0]:

def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return ``(mean loss, mean accuracy)``.

    Both values are averaged over batches, not over individual samples.

    Args:
        model: Module called as ``model(text, masks)``; returns logits of shape
            [batch size, 1].
        iterator: Iterable yielding ``(text, masks, labels)`` batches; must
            support ``len()``.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss taking ``(logits, float labels)``.
    """
    # Follow the device the model already lives on, so the loop also runs on
    # CPU-only machines instead of failing on a hard-coded .cuda() call.
    device = next(model.parameters()).device

    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for text, masks, labs in iterator:
        text = text.to(device)
        masks = masks.to(device)
        # The loss expects float targets.
        labs = labs.to(device).float()

        optimizer.zero_grad()

        # Drop the singleton logit dimension: [batch size, 1] -> [batch size].
        predictions = model(text, masks).squeeze(1)

        loss = criterion(predictions, labs)

        acc = binary_accuracy(predictions, labs)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:

def evaluate(model, iterator, criterion):
    """Evaluate the model on a data loader and return ``(mean loss, mean accuracy)``.

    Runs in eval mode with gradient tracking disabled. Both values are averaged
    over batches, not over individual samples.

    Args:
        model: Module called as ``model(text, masks)``; returns logits of shape
            [batch size, 1].
        iterator: Iterable yielding ``(text, masks, labels)`` batches; must
            support ``len()``.
        criterion: Loss taking ``(logits, float labels)``.
    """
    # Follow the device the model already lives on, so evaluation also runs on
    # CPU-only machines instead of failing on a hard-coded .cuda() call.
    device = next(model.parameters()).device

    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():

        for text, masks, labs in iterator:
            text = text.to(device)
            masks = masks.to(device)
            # The loss expects float targets.
            labs = labs.to(device).float()

            # Drop the singleton logit dimension: [batch size, 1] -> [batch size].
            predictions = model(text, masks).squeeze(1)

            loss = criterion(predictions, labs)

            acc = binary_accuracy(predictions, labs)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and leftover whole seconds."""
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [0]:
N_EPOCHS = 15

# Lowest validation loss seen so far; +inf makes the first finite loss an improvement.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # Time one full pass: training followed by validation.
    start_time = time.time()
    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_dataloader, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint the weights whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut6-model.pt')

    print(
        f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s',
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%',
        sep='\n',
    )

Epoch: 01 | Epoch Time: 1m 25s
	Train Loss: 0.653 | Train Acc: 64.17%
	 Val. Loss: 0.600 |  Val. Acc: 76.17%
Epoch: 02 | Epoch Time: 1m 25s
	Train Loss: 0.520 | Train Acc: 76.71%
	 Val. Loss: 0.451 |  Val. Acc: 80.00%
Epoch: 03 | Epoch Time: 1m 25s
	Train Loss: 0.426 | Train Acc: 80.92%
	 Val. Loss: 0.409 |  Val. Acc: 82.89%
Epoch: 04 | Epoch Time: 1m 25s
	Train Loss: 0.403 | Train Acc: 81.98%
	 Val. Loss: 0.392 |  Val. Acc: 83.59%
Epoch: 05 | Epoch Time: 1m 26s
	Train Loss: 0.391 | Train Acc: 82.90%
	 Val. Loss: 0.385 |  Val. Acc: 83.55%
Epoch: 06 | Epoch Time: 1m 26s
	Train Loss: 0.384 | Train Acc: 83.11%
	 Val. Loss: 0.376 |  Val. Acc: 84.10%
Epoch: 07 | Epoch Time: 1m 26s
	Train Loss: 0.373 | Train Acc: 83.67%
	 Val. Loss: 0.365 |  Val. Acc: 84.65%
Epoch: 08 | Epoch Time: 1m 26s
	Train Loss: 0.367 | Train Acc: 83.74%
	 Val. Loss: 0.367 |  Val. Acc: 84.10%
Epoch: 09 | Epoch Time: 1m 26s
	Train Loss: 0.363 | Train Acc: 84.17%
	 Val. Loss: 0.357 |  Val. Acc: 84.92%
Epoch: 10 | Epoch T

In [0]:
# Score the checkpoint with the lowest validation loss (saved by the training
# loop) rather than whatever weights the final epoch happened to leave behind.
model.load_state_dict(torch.load('tut6-model.pt', map_location=device))

test_loss, test_acc = evaluate(model, test_dataloader,criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.380 | Test Acc: 83.24%
