In [5]:
import torch
from torchtext.legacy import data

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm')
LABEL = data.LabelField(dtype = torch.float)

In [6]:
from torchtext.legacy import datasets

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')
print(vars(train_data.examples[0]))

Number of training examples: 25000
Number of testing examples: 25000
{'text': ['This', 'was', 'a', 'pretty', 'good', 'movie', ',', 'I', 'liked', 'it', '.', 'I', 'thought', 'it', 'was', 'a', 'pretty', 'accurate', 'look', 'at', 'bulimia', 'and', 'how', 'it', "'s", 'not', 'about', 'dieting', ',', 'it', "'s", 'about', 'having', 'a', 'pain', 'so', 'deep', 'that', 'they', 'have', 'to', 'find', 'a', 'way', 'to', 'deal', 'with', 'it', 'and', 'they', 'choose', 'this', '.', 'Beth', 'was', 'a', 'very', 'accurately', 'drawn', 'character', 'and', 'in', 'the', 'scene', 'where', 'she', 'confronts', 'her', 'mom', 'about', 'the', 'eating', 'disorder', 'you', 'can', 'see', 'the', 'pain', 'inside', 'her', 'and', 'hear', 'it', 'in', 'her', 'voice', 'and', 'you', 'know', 'how', 'deep', 'the', 'pain', 'is', 'that', 'she', 'is', 'feeling', '.', 'I', 'also', 'think', 'one', 'of', 'the', 'best', 'lines', 'in', 'the', 'movie', 'is', 'where', 'Beth', 'yells', 'the', 'words', ',', '"', 'It', "'s", 'not', 'about',

In [7]:
import random

train_data, valid_data = train_data.split(random_state = random.seed(SEED))
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [8]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
print(TEXT.vocab.freqs.most_common(20))
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')
# Do not use BucketIterator in your implementation because you are required to implement the padding and masking yourself.
#train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits((train_data, valid_data, test_data), batch_size=1, device=device)

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2
[('the', 202525), (',', 193092), ('.', 165949), ('and', 109557), ('a', 108931), ('of', 100881), ('to', 93484), ('is', 76238), ('in', 61560), ('I', 54174), ('it', 53506), ('that', 49084), ('"', 44420), ("'s", 43297), ('this', 42094), ('-', 37134), ('/><br', 35549), ('was', 35248), ('as', 30532), ('with', 30005)]
['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']
defaultdict(None, {'neg': 0, 'pos': 1})


In [9]:
#split and pad section
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
training_data = train_data
#looks like thing is tensored so can use pad sequences
temp = (vars(training_data.examples[0])['text'])
#print (temp)
train_labels = []
train_tensors = []
train_lens = []
max_len = 0

#for i in range (0, len(train_data)):
#  example = vars(train_data.examples[i])
#  if len(example['text']) > max_len:
#    max_len = len(example['text']) 

#print (max_len)
vocab = TEXT.vocab.itos
for i in range (0, len(train_data)):
  example = vars(training_data.examples[i])
  temp = []
  for j in (example['text']):
    temp.append(TEXT.vocab.stoi[j])
  #convert to tensor
  temp = torch.tensor(temp, dtype = int, device = device) 
  train_labels.append(example['label'])
  train_tensors.append(temp)
  train_lens.append(len(temp))
#print (len(train_lens))
#print (train_lens)

valid_labels = []
valid_tensors = []
valid_lens = []
# validation data
for i in range (0, len(valid_data)):
  example = vars(valid_data.examples[i])
  temp = []
  for j in (example['text']):
    temp.append(TEXT.vocab.stoi[j])
  #convert to tensor
  temp = torch.tensor(temp, dtype = int, device = device)
  valid_labels.append(example['label'])
  valid_tensors.append(temp)
  valid_lens.append(len(temp))


test_labels = []
test_tensors = []
test_lens = []
# validation data
for i in range (0, len(test_data)):
  example = vars(test_data.examples[i])
  temp = []
  for j in (example['text']):
    temp.append(TEXT.vocab.stoi[j])
  #convert to tensor
  temp = torch.tensor(temp, dtype = int, device = device)
  test_labels.append(example['label'])
  test_tensors.append(temp)
  test_lens.append(len(temp))

In [10]:
class Dataset(torch.utils.data.Dataset):
  'Characterizes a dataset for PyTorch'
  def __init__(self, list_IDs, labels, seq_lens):
        'Initialization'
        self.labels = labels
        self.list_IDs = list_IDs
        self.seq_lens = seq_lens

  def __len__(self):
        'Denotes the total number of samples'
        return len(self.list_IDs)

  def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        ID = self.list_IDs[index]

        # Load data and get label
        X =  ID
        if self.labels[index] == 'neg':
          y = torch.tensor(0, dtype = float, device = device)
        else:
          y = torch.tensor(1, dtype = float, device = device)
        #print (y)
        len = self.seq_lens[index]
        return X, y, len


In [11]:
BATCH_SIZE = 128
#pad sequence
#print (train_tensors)
pad_val = TEXT.vocab.stoi["<pad>"]
print (pad_val)
padded = pad_sequence(train_tensors, batch_first = True, padding_value= pad_val)
print (len(padded[0]))
train_set = Dataset(padded, train_labels, train_lens)
#for i in range (0, 2):#len(train_data)):
  #vars(training_data.examples[i])['text'] = train_tensors[i]
  #if vars(training_data.examples[i])['label'] =='neg':
    #vars(training_data.examples[i])['label'] = 0
  #else:
    #vars(training_data.examples[i])['label'] = 1

  #print (train_tensors[i])
#print ((TEXT.vocab.freqs.most_common()))

train_dataloader = DataLoader(train_set, batch_size = BATCH_SIZE)
#print (train_tensors[0])
train_features= (iter (train_dataloader))
print (vars(train_features))

#pad sequence
padded = pad_sequence(valid_tensors, batch_first = True)
#print (padded)
valid_set = Dataset(padded, valid_labels, valid_lens)
#
valid_dataloader = DataLoader(valid_set, batch_size = BATCH_SIZE)
#print (train_tensors[0])
valid_features= (iter (valid_dataloader))
print (vars(valid_features))

padded = pad_sequence(test_tensors, batch_first = True)
#print (padded)
test_set = Dataset(padded, test_labels, test_lens)
#
test_dataloader = DataLoader(test_set, batch_size = BATCH_SIZE)
#print (train_tensors[0])
test_features= (iter (test_dataloader))
print (vars(test_features))

1
1989
{'_dataset': <__main__.Dataset object at 0x7febbcddc1d0>, '_dataset_kind': 0, '_IterableDataset_len_called': None, '_auto_collation': True, '_drop_last': False, '_index_sampler': <torch.utils.data.sampler.BatchSampler object at 0x7febbcd74cd0>, '_num_workers': 0, '_prefetch_factor': 2, '_pin_memory': False, '_timeout': 0, '_collate_fn': <function default_collate at 0x7febc50814d0>, '_sampler_iter': <generator object BatchSampler.__iter__ at 0x7feb8e4e56d0>, '_base_seed': 3532910284440527571, '_persistent_workers': False, '_num_yielded': 0, '_profile_name': 'enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__', '_dataset_fetcher': <torch.utils.data._utils.fetch._MapDatasetFetcher object at 0x7febbcd74e90>}
{'_dataset': <__main__.Dataset object at 0x7febc4c6d090>, '_dataset_kind': 0, '_IterableDataset_len_called': None, '_auto_collation': True, '_drop_last': False, '_index_sampler': <torch.utils.data.sampler.BatchSampler object at 0x7febc4f52e50>, '_num_workers': 0, '_pref

In [12]:
import torch.nn as nn
class LR(nn.Module):
    def __init__(self, input_dim, embedding_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx= 1)
        self.fc = nn.Linear(embedding_dim, output_dim)
    def forward(self, text, seq_lens=None):
        embedded = self.embedding(text).sum(1)
        return self.fc(embedded)

In [13]:

#seq_lengths = LongTensor(list(map(len, vectorized_seqs))) this line needs to run in the data structure
#
#

#implementation with the lstm and pack pads
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
class LSTM(nn.Module):
    def __init__(self, input_dim, embedding_dim, output_dim, device):
        super().__init__()
        self.device = device
        self.embedding = nn.Embedding(input_dim, embedding_dim,padding_idx=1)
        self.lstm = nn.LSTM(embedding_dim, embedding_dim, batch_first=True)
        self.fc = nn.Linear(embedding_dim, output_dim)
    def forward(self, text, seq_lengths):
        embedded = self.embedding(text) #.squeeze().sum(1)
        #we need the lengths of all the strings before hand possibly that or we need an identifier for packed stuffs
        print (embedded[1])
        print (embedded.shape)
        print (seq_lengths.shape)
        pack = pack_padded_sequence(embedded, seq_lengths, batch_first=True, enforce_sorted=False)
        lstm, (ht, ct) = self.lstm(pack) 
        unpack, input_sizes = pad_packed_sequence(lstm, batch_first=True)
        unpack = torch.div(unpack.sum(1), seq_lengths.to(self.device).unsqueeze(1))
        return self.fc(unpack)


In [14]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 64
OUTPUT_DIM = 1
lr = 0.1 # 1e-3
#model = LR(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)
model =LSTM(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, device)

In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,633,473 trainable parameters


In [16]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=lr)

In [17]:
criterion = nn.BCEWithLogitsLoss()

In [18]:
model = model.to(device)
criterion = criterion.to(device)

In [19]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [20]:
from tqdm import tqdm
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for instance in tqdm(iterator, desc="Training...", total=len(iterator)):
        
        optimizer.zero_grad()
        #not fine 
        #print (instance[1])  
        #print (instance[0].reshape(203, 2))
        #temp = (instance[0][0], instance[0][1])
        temp = torch.unbind(instance[0])
        #print (temp)
        reshaped = torch.stack(temp , dim = 1) #(instance[0][0], instance[0][1])
        #print (instance[0])
        predictions = model(instance[0], instance[2])
        #print (instance[0])
        #print (len(instance[0]))
        #not fine
        loss = criterion(predictions, instance[1].float().unsqueeze(1))
        #not fine
        acc = binary_accuracy(predictions, instance[1].float().unsqueeze(1))
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [21]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for instance in iterator:
            #changed this to new format
            temp = torch.unbind(instance[0])
           #print (temp)
            reshaped = torch.stack(temp , dim = 1)
            predictions = model(instance[0], instance[2])
            
            loss = criterion(predictions, instance[1].float().unsqueeze(1))
            
            acc = binary_accuracy(predictions, instance[1].float().unsqueeze(1))

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [22]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [23]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    #print (train_dataloader)
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_dataloader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Training...:   0%|          | 0/137 [00:00<?, ?it/s]

tensor([[ 1.0603, -0.1789, -1.1871,  ..., -1.7627,  0.4865, -0.2717],
        [-1.0277,  0.9664,  1.2639,  ...,  0.8505, -1.3415, -1.3633],
        [ 0.7888, -1.2477, -1.7592,  ...,  0.3942,  0.8515,  0.5374],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<SelectBackward0>)
torch.Size([128, 1989, 64])
torch.Size([128])


Training...:   1%|          | 1/137 [00:23<53:06, 23.43s/it]

tensor([[-0.0936,  0.1063, -2.9441,  ...,  0.1267,  1.8277,  0.6758],
        [ 0.2158, -1.5504,  0.3169,  ..., -0.3651, -1.0719,  0.9414],
        [ 0.2209, -0.8245, -0.0256,  ..., -0.6476, -1.8021,  2.4911],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<SelectBackward0>)
torch.Size([128, 1989, 64])
torch.Size([128])


Training...:   1%|▏         | 2/137 [00:47<52:59, 23.55s/it]

tensor([[-0.0936,  0.1063, -2.9441,  ...,  0.1267,  1.8277,  0.6758],
        [ 0.5154,  0.2643, -2.1439,  ...,  0.1590, -0.2309, -1.8214],
        [ 0.1092, -0.2081, -1.7573,  ..., -1.2861,  0.9599,  0.1989],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<SelectBackward0>)
torch.Size([128, 1989, 64])
torch.Size([128])


Training...:   2%|▏         | 3/137 [01:08<50:01, 22.40s/it]

tensor([[ 2.7441,  1.1270,  0.0060,  ...,  0.7659, -0.8096, -0.5267],
        [-0.5113, -0.5124, -0.6501,  ...,  1.4856, -0.6763,  0.0937],
        [ 1.0027,  0.2835, -0.4596,  ...,  2.2802, -1.5437, -2.1139],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<SelectBackward0>)
torch.Size([128, 1989, 64])
torch.Size([128])


Training...:   3%|▎         | 4/137 [01:27<47:26, 21.40s/it]

tensor([[-1.4759, -0.3410,  0.0238,  ...,  1.3143,  0.1213,  0.0282],
        [-0.8659,  0.5026, -0.1108,  ...,  0.0097,  0.9155, -2.2089],
        [-0.9036,  0.9467,  0.4456,  ..., -0.9070,  0.2460, -0.3401],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<SelectBackward0>)
torch.Size([128, 1989, 64])
torch.Size([128])


Training...:   4%|▎         | 5/137 [01:59<54:47, 24.90s/it]

tensor([[-0.0937,  0.1063, -2.9441,  ...,  0.1267,  1.8277,  0.6758],
        [-0.8840,  0.7559, -0.3052,  ..., -0.6270,  0.0283,  1.1135],
        [ 0.5409, -1.2138,  0.6935,  ..., -0.5647,  0.9990,  0.7565],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<SelectBackward0>)
torch.Size([128, 1989, 64])
torch.Size([128])


Training...:   4%|▍         | 6/137 [02:27<56:47, 26.01s/it]

tensor([[-1.0154, -1.3935,  0.5388,  ..., -0.6406,  0.4578,  1.6145],
        [ 1.3442, -1.0937, -0.7099,  ...,  0.4685, -0.5173, -1.0919],
        [-0.3443, -0.5664,  2.1890,  ...,  1.0872,  1.7377,  0.3472],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<SelectBackward0>)
torch.Size([128, 1989, 64])
torch.Size([128])


Training...:   5%|▌         | 7/137 [02:57<59:26, 27.44s/it]

tensor([[-0.0937,  0.1063, -2.9441,  ...,  0.1267,  1.8277,  0.6758],
        [-0.7625,  1.2473, -1.8907,  ...,  0.0493, -1.8683,  1.8088],
        [-0.4573, -0.8678, -0.7842,  ...,  0.0071, -0.1536,  1.1330],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<SelectBackward0>)
torch.Size([128, 1989, 64])
torch.Size([128])


Training...:   5%|▌         | 7/137 [03:25<1:03:34, 29.34s/it]


KeyboardInterrupt: ignored

In [None]:
model.load_state_dict(torch.load('tut1-model.pt'))

valid_loss, valid_acc = evaluate(model, valid_dataloader, criterion)

print(f'Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}%')

In [None]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc = evaluate(model, test_dataloader, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

In [None]:
#confusion matrix

In [None]:
# Test of model correctness
max_n_test_instances = 5
i = 1
for instance in valid_dataloader:
  score = model(instance[0], instance[1])
  print(score)
  if i >= max_n_test_instances:
    break
  else:
    i += 1
