# Hometask

1) Find text to train (any book)<br>
2) Build train and validation set <br>
3) Train bidirectional language model that predicts the POS of word being based on its `n_context= 3` neighbours from the left and `n_context= 3` neighbours from the right <br>
4) Evaluate the model 

### Loading data

In [1]:
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Dense, Dropout
from keras.optimizers import Adam
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import torch
import numpy as np
import re




In [2]:
def load_data():
    with open('Frankenstein; Or, The Modern Prometheus.txt', 'r', encoding='utf-8') as file:
        data = file.read().lower()
        
    return data

In [3]:
text = load_data()
text = re.sub(r'\[.*\]', "", text)
text = re.sub(r'\d+', "", text)
text = re.sub(r'«|»', "", text)
print(text)

 the project gutenberg ebook of frankenstein; or, the modern
                         prometheus

  this ebook is for the use of anyone anywhere in the united states and most
  other parts of the world at no cost and with almost no restrictions whatsoever.
  you may copy it, give it away or re-use it under the terms of the project
  gutenberg license included with this ebook or online at www.gutenberg.org. if
  you are not located in the united states, you will have to check the laws of the
  country where you are located before using this ebook.

  title: frankenstein; or, the modern prometheus

      author: mary wollstonecraft shelley

  release date: october ,  
          most recently updated: december , 

  language: english

  credits: judith boss, christy phillips, lynn hanninen and david meltzer. html
           version by al haines.
           further corrections by menno de leeuw.


*** start of the project gutenberg ebook frankenstein; or, the
                     modern pr

In [4]:
text = text[:80000]

### Preprocessing data

In [5]:
vectorizer = CountVectorizer(token_pattern=r'(?u)(?:\b\w+\b|\.|\,|\!|\?|\-|\n)').fit([text])
vocab = vectorizer.get_feature_names_out()

In [6]:
vocab_word = vectorizer.vocabulary_
vocab_word

{'the': 2680,
 'project': 2090,
 'gutenberg': 1227,
 'ebook': 855,
 'of': 1850,
 'frankenstein': 1121,
 'or': 1869,
 ',': 2,
 'modern': 1738,
 '\n': 0,
 'prometheus': 2092,
 'this': 2703,
 'is': 1480,
 'for': 1088,
 'use': 2846,
 'anyone': 139,
 'anywhere': 141,
 'in': 1368,
 'united': 2820,
 'states': 2533,
 'and': 124,
 'most': 1751,
 'other': 1874,
 'parts': 1917,
 'world': 2998,
 'at': 201,
 'no': 1807,
 'cost': 595,
 'with': 2974,
 'almost': 98,
 'restrictions': 2268,
 'whatsoever': 2942,
 '.': 4,
 'you': 3026,
 'may': 1670,
 'copy': 590,
 'it': 1484,
 'give': 1171,
 'away': 239,
 're': 2156,
 '-': 3,
 'under': 2799,
 'terms': 2670,
 'license': 1574,
 'included': 1379,
 'online': 1861,
 'www': 3016,
 'org': 1871,
 'if': 1329,
 'are': 166,
 'not': 1816,
 'located': 1601,
 'will': 2964,
 'have': 1255,
 'to': 2726,
 'check': 436,
 'laws': 1542,
 'country': 603,
 'where': 2946,
 'before': 273,
 'using': 2851,
 'title': 2725,
 'author': 231,
 'mary': 1658,
 'wollstonecraft': 2981,
 'sh

In [7]:
word_tokenizer = vectorizer.build_tokenizer()
all_tokens = word_tokenizer(text)

print('len of all tokens = {:,}'.format(len(all_tokens)))

len of all tokens = 16,433


In [8]:
n_context = 3
training_pairs = []

for i in range(n_context, len(all_tokens) - n_context):
    context_words = all_tokens[i - n_context:i] + all_tokens[i + 1:i + n_context + 1]
    target_word = all_tokens[i]
    training_pairs.append((context_words, target_word))

print('len(samples) = {:,}'.format(len(training_pairs)))
    
training_pairs[:20]

len(samples) = 16,427


[(['the', 'project', 'gutenberg', 'of', 'frankenstein', 'or'], 'ebook'),
 (['project', 'gutenberg', 'ebook', 'frankenstein', 'or', ','], 'of'),
 (['gutenberg', 'ebook', 'of', 'or', ',', 'the'], 'frankenstein'),
 (['ebook', 'of', 'frankenstein', ',', 'the', 'modern'], 'or'),
 (['of', 'frankenstein', 'or', 'the', 'modern', '\n'], ','),
 (['frankenstein', 'or', ',', 'modern', '\n', 'prometheus'], 'the'),
 (['or', ',', 'the', '\n', 'prometheus', '\n'], 'modern'),
 ([',', 'the', 'modern', 'prometheus', '\n', '\n'], '\n'),
 (['the', 'modern', '\n', '\n', '\n', 'this'], 'prometheus'),
 (['modern', '\n', 'prometheus', '\n', 'this', 'ebook'], '\n'),
 (['\n', 'prometheus', '\n', 'this', 'ebook', 'is'], '\n'),
 (['prometheus', '\n', '\n', 'ebook', 'is', 'for'], 'this'),
 (['\n', '\n', 'this', 'is', 'for', 'the'], 'ebook'),
 (['\n', 'this', 'ebook', 'for', 'the', 'use'], 'is'),
 (['this', 'ebook', 'is', 'the', 'use', 'of'], 'for'),
 (['ebook', 'is', 'for', 'use', 'of', 'anyone'], 'the'),
 (['is', 

In [9]:
contexts = [pair[0] for pair in training_pairs]
targets = [pair[1] for pair in training_pairs]

In [10]:
contexts[:5]

[['the', 'project', 'gutenberg', 'of', 'frankenstein', 'or'],
 ['project', 'gutenberg', 'ebook', 'frankenstein', 'or', ','],
 ['gutenberg', 'ebook', 'of', 'or', ',', 'the'],
 ['ebook', 'of', 'frankenstein', ',', 'the', 'modern'],
 ['of', 'frankenstein', 'or', 'the', 'modern', '\n']]

In [11]:
targets[:5]

['ebook', 'of', 'frankenstein', 'or', ',']

### Datasets

In [12]:
class Datasets(torch.utils.data.Dataset):
    def __init__(self, contexts, targets, vocab_word, n_context):
        self.contexts = contexts
        self.targets = targets
        self.vocab_word = vocab_word
        self.n_context = n_context

    def __len__(self):
        return len(self.contexts)
    
    def __getitem__(self, idx):
        context = self.contexts[idx]
        target = self.targets[idx]
        context, target = self.replace_word_by_id(context, target)
        
        vector_context = np.zeros((2*self.n_context, len(self.vocab_word)), dtype=np.float32)
        vector_target = np.zeros(len(self.vocab_word), dtype=np.float32)
        
        vector_context[np.arange(2*self.n_context), context] = 1
        vector_target[target] = 1
        
        return vector_context, vector_target
    
    def replace_word_by_id(self, context, target):
        for i in range(len(context)):
            if context[i] in self.vocab_word:
                context[i] = self.vocab_word[context[i]]
                
        if target in self.vocab_word:
            target = self.vocab_word[target]

        return context, target

### Build train and validation set

In [13]:
contexts_train, contexts_test, targets_train, targets_test = train_test_split(contexts, targets, test_size=0.2, random_state=42)

In [14]:
train_data = Datasets(contexts_train, targets_train, vocab_word, n_context)
test_data = Datasets(contexts_test, targets_test, vocab_word, n_context)

### DataLoader

In [15]:
from torch.utils.data import DataLoader

In [16]:
batc_size = 32

train_loader = DataLoader(train_data, batch_size=batc_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batc_size, shuffle=False)

### Input and output

In [17]:
input_size = torch.numel(torch.tensor(train_data[0][0]))
output_size = len(vocab_word)

input_size, output_size

(18390, 3065)

### Create model

In [18]:
class BiLSTM(torch.nn.Module):
    def __init__(self, input_size, output_size, hidden_size, n_layers, dropout):
        super(BiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = torch.nn.Dropout(dropout)
        self.lstm = torch.nn.LSTM(input_size, hidden_size, n_layers, batch_first=True, bidirectional=True)
        self.fc = torch.nn.Linear(hidden_size * 2, output_size)
        
    def forward(self, x):
        x, _ = self.lstm(x)
        x = self.fc(self.dropout(x[:, -1, :]))
        return x

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

  return torch._C._cuda_getDeviceCount() > 0


device(type='cpu')

In [20]:
hidden_size = 128
n_layers = 2
dropout = 0.5

model = BiLSTM(input_size, output_size, hidden_size, n_layers, dropout).to(device)

### Loos function and optimizer

In [21]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

### Train model

In [22]:
from tqdm import tqdm
from torch.autograd import Variable

In [23]:

def train_loop(model, train_loader, optimizer, loss_function, device, epoch):
    model.train()
    train_loss = 0.0
    train_correct = 0

    with tqdm(total=len(train_loader), desc=f"Epoch {epoch+1}", unit='batch') as progress_bar:
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_function(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_correct += (outputs.argmax(dim=1) == targets.argmax(dim=1)).sum().item()

            progress_bar.update(1)
            progress_bar.set_postfix(loss=train_loss / len(train_loader.dataset))

    train_loss /= len(train_loader)
    train_correct /= len(train_loader.dataset)
    print(f"Train Error: \n Accuracy: {(100*train_correct):>0.1f}%, Avg loss: {train_loss:>8f} \n")

    return train_loss, train_correct

def test_loop(model, test_loader, loss_function, device):
    model.eval()
    test_loss = 0.0
    test_correct = 0

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)
            loss = loss_function(outputs, targets)

            test_loss += loss.item()
            test_correct += (outputs.argmax(dim=1) == targets.argmax(dim=1)).sum().item()

    test_loss /= len(test_loader)
    test_correct /= len(test_loader.dataset)
    print(f"Test Error: \n Accuracy: {(100*test_correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    
    return test_loss, test_correct

In [25]:
num_epochs = 40

for epoch in range(num_epochs):
    train_loss, train_accuracy = train_loop(model, train_loader, optimizer, loss_function, device, epoch)
    test_loss, test_accuracy = test_loop(model, test_loader, loss_function, device)

Epoch 1: 100%|██████████| 411/411 [02:06<00:00,  3.26batch/s, loss=0.193] 


Train Error: 
 Accuracy: 6.0%, Avg loss: 6.183283 

Test Error: 
 Accuracy: 7.2%, Avg loss: 6.035417 



Epoch 2: 100%|██████████| 411/411 [02:56<00:00,  2.33batch/s, loss=0.179] 


Train Error: 
 Accuracy: 6.6%, Avg loss: 5.713398 

Test Error: 
 Accuracy: 7.8%, Avg loss: 5.979257 



Epoch 3: 100%|██████████| 411/411 [02:22<00:00,  2.88batch/s, loss=0.171] 


Train Error: 
 Accuracy: 7.9%, Avg loss: 5.464602 

Test Error: 
 Accuracy: 8.9%, Avg loss: 6.021592 



Epoch 4: 100%|██████████| 411/411 [01:56<00:00,  3.53batch/s, loss=0.163] 


Train Error: 
 Accuracy: 11.2%, Avg loss: 5.218881 

Test Error: 
 Accuracy: 12.4%, Avg loss: 5.973874 



Epoch 5: 100%|██████████| 411/411 [02:14<00:00,  3.06batch/s, loss=0.153] 


Train Error: 
 Accuracy: 16.3%, Avg loss: 4.882967 

Test Error: 
 Accuracy: 13.5%, Avg loss: 6.059441 



Epoch 6: 100%|██████████| 411/411 [02:06<00:00,  3.24batch/s, loss=0.14]  


Train Error: 
 Accuracy: 21.9%, Avg loss: 4.482983 

Test Error: 
 Accuracy: 13.6%, Avg loss: 6.204383 



Epoch 7: 100%|██████████| 411/411 [02:17<00:00,  2.99batch/s, loss=0.126] 


Train Error: 
 Accuracy: 27.5%, Avg loss: 4.040380 

Test Error: 
 Accuracy: 14.4%, Avg loss: 6.385246 



Epoch 8: 100%|██████████| 411/411 [02:00<00:00,  3.40batch/s, loss=0.113] 


Train Error: 
 Accuracy: 32.6%, Avg loss: 3.620932 

Test Error: 
 Accuracy: 14.2%, Avg loss: 6.374951 



Epoch 9: 100%|██████████| 411/411 [02:01<00:00,  3.37batch/s, loss=0.0993]


Train Error: 
 Accuracy: 38.6%, Avg loss: 3.173622 

Test Error: 
 Accuracy: 14.8%, Avg loss: 6.782036 



Epoch 10: 100%|██████████| 411/411 [02:07<00:00,  3.22batch/s, loss=0.0859]


Train Error: 
 Accuracy: 44.7%, Avg loss: 2.745598 

Test Error: 
 Accuracy: 14.7%, Avg loss: 7.097278 



Epoch 11: 100%|██████████| 411/411 [02:01<00:00,  3.39batch/s, loss=0.0729]


Train Error: 
 Accuracy: 51.7%, Avg loss: 2.329878 

Test Error: 
 Accuracy: 14.9%, Avg loss: 7.387132 



Epoch 12: 100%|██████████| 411/411 [01:53<00:00,  3.62batch/s, loss=0.0603]


Train Error: 
 Accuracy: 58.5%, Avg loss: 1.928219 

Test Error: 
 Accuracy: 15.2%, Avg loss: 7.495680 



Epoch 13: 100%|██████████| 411/411 [01:51<00:00,  3.69batch/s, loss=0.0494] 


Train Error: 
 Accuracy: 65.2%, Avg loss: 1.580878 

Test Error: 
 Accuracy: 14.1%, Avg loss: 7.719306 



Epoch 14: 100%|██████████| 411/411 [01:53<00:00,  3.62batch/s, loss=0.0399] 


Train Error: 
 Accuracy: 71.6%, Avg loss: 1.277070 

Test Error: 
 Accuracy: 14.6%, Avg loss: 7.880808 



Epoch 15: 100%|██████████| 411/411 [02:19<00:00,  2.96batch/s, loss=0.0316] 


Train Error: 
 Accuracy: 77.6%, Avg loss: 1.011431 

Test Error: 
 Accuracy: 14.5%, Avg loss: 8.175559 



Epoch 16: 100%|██████████| 411/411 [01:52<00:00,  3.64batch/s, loss=0.0253] 


Train Error: 
 Accuracy: 82.6%, Avg loss: 0.808067 

Test Error: 
 Accuracy: 14.5%, Avg loss: 8.289291 



Epoch 17: 100%|██████████| 411/411 [01:59<00:00,  3.43batch/s, loss=0.0202] 


Train Error: 
 Accuracy: 85.7%, Avg loss: 0.646800 

Test Error: 
 Accuracy: 14.2%, Avg loss: 8.318532 



Epoch 18: 100%|██████████| 411/411 [02:00<00:00,  3.40batch/s, loss=0.0159] 


Train Error: 
 Accuracy: 89.2%, Avg loss: 0.509216 

Test Error: 
 Accuracy: 14.8%, Avg loss: 8.401851 



Epoch 19: 100%|██████████| 411/411 [01:57<00:00,  3.50batch/s, loss=0.0132] 


Train Error: 
 Accuracy: 91.2%, Avg loss: 0.420587 

Test Error: 
 Accuracy: 15.1%, Avg loss: 8.662894 



Epoch 20: 100%|██████████| 411/411 [01:59<00:00,  3.45batch/s, loss=0.0103] 


Train Error: 
 Accuracy: 93.4%, Avg loss: 0.330275 

Test Error: 
 Accuracy: 15.0%, Avg loss: 8.897139 



Epoch 21: 100%|██████████| 411/411 [01:58<00:00,  3.47batch/s, loss=0.00881]


Train Error: 
 Accuracy: 94.3%, Avg loss: 0.281807 

Test Error: 
 Accuracy: 14.3%, Avg loss: 8.829869 



Epoch 22: 100%|██████████| 411/411 [01:58<00:00,  3.48batch/s, loss=0.00743]


Train Error: 
 Accuracy: 95.4%, Avg loss: 0.237553 

Test Error: 
 Accuracy: 15.3%, Avg loss: 9.016727 



Epoch 23: 100%|██████████| 411/411 [01:58<00:00,  3.47batch/s, loss=0.00628]


Train Error: 
 Accuracy: 96.4%, Avg loss: 0.200874 

Test Error: 
 Accuracy: 14.1%, Avg loss: 9.177000 



Epoch 24: 100%|██████████| 411/411 [01:58<00:00,  3.46batch/s, loss=0.00543]


Train Error: 
 Accuracy: 96.8%, Avg loss: 0.173561 

Test Error: 
 Accuracy: 13.8%, Avg loss: 9.097756 



Epoch 25: 100%|██████████| 411/411 [01:58<00:00,  3.46batch/s, loss=0.00471]


Train Error: 
 Accuracy: 97.2%, Avg loss: 0.150731 

Test Error: 
 Accuracy: 15.4%, Avg loss: 9.259471 



Epoch 26: 100%|██████████| 411/411 [01:59<00:00,  3.45batch/s, loss=0.00434] 


Train Error: 
 Accuracy: 97.4%, Avg loss: 0.138857 

Test Error: 
 Accuracy: 14.4%, Avg loss: 9.531257 



Epoch 27: 100%|██████████| 411/411 [02:03<00:00,  3.33batch/s, loss=0.0042]  


Train Error: 
 Accuracy: 97.4%, Avg loss: 0.134246 

Test Error: 
 Accuracy: 14.2%, Avg loss: 9.259469 



Epoch 28: 100%|██████████| 411/411 [02:08<00:00,  3.19batch/s, loss=0.00363] 


Train Error: 
 Accuracy: 97.8%, Avg loss: 0.116058 

Test Error: 
 Accuracy: 15.9%, Avg loss: 9.489871 



Epoch 29: 100%|██████████| 411/411 [02:10<00:00,  3.15batch/s, loss=0.00314] 


Train Error: 
 Accuracy: 98.0%, Avg loss: 0.100402 

Test Error: 
 Accuracy: 15.2%, Avg loss: 9.436438 



Epoch 30: 100%|██████████| 411/411 [02:28<00:00,  2.77batch/s, loss=0.00304] 


Train Error: 
 Accuracy: 98.2%, Avg loss: 0.097195 

Test Error: 
 Accuracy: 14.9%, Avg loss: 9.555924 



Epoch 31: 100%|██████████| 411/411 [02:08<00:00,  3.21batch/s, loss=0.00263] 


Train Error: 
 Accuracy: 98.5%, Avg loss: 0.083984 

Test Error: 
 Accuracy: 14.5%, Avg loss: 9.741282 



Epoch 32: 100%|██████████| 411/411 [02:05<00:00,  3.27batch/s, loss=0.00257] 


Train Error: 
 Accuracy: 98.4%, Avg loss: 0.082143 

Test Error: 
 Accuracy: 14.9%, Avg loss: 9.815140 



Epoch 33: 100%|██████████| 411/411 [02:07<00:00,  3.21batch/s, loss=0.00274] 


Train Error: 
 Accuracy: 98.3%, Avg loss: 0.087524 

Test Error: 
 Accuracy: 15.4%, Avg loss: 9.690003 



Epoch 34: 100%|██████████| 411/411 [02:04<00:00,  3.30batch/s, loss=0.00233] 


Train Error: 
 Accuracy: 98.6%, Avg loss: 0.074346 

Test Error: 
 Accuracy: 15.7%, Avg loss: 9.790952 



Epoch 35: 100%|██████████| 411/411 [02:11<00:00,  3.13batch/s, loss=0.0022]  


Train Error: 
 Accuracy: 98.6%, Avg loss: 0.070469 

Test Error: 
 Accuracy: 16.1%, Avg loss: 9.848913 



Epoch 36: 100%|██████████| 411/411 [02:07<00:00,  3.23batch/s, loss=0.00191] 


Train Error: 
 Accuracy: 98.9%, Avg loss: 0.061108 

Test Error: 
 Accuracy: 14.3%, Avg loss: 9.872866 



Epoch 37: 100%|██████████| 411/411 [02:18<00:00,  2.97batch/s, loss=0.00215] 


Train Error: 
 Accuracy: 98.6%, Avg loss: 0.068736 

Test Error: 
 Accuracy: 15.2%, Avg loss: 9.882228 



Epoch 38: 100%|██████████| 411/411 [02:08<00:00,  3.20batch/s, loss=0.00173] 


Train Error: 
 Accuracy: 98.9%, Avg loss: 0.055193 

Test Error: 
 Accuracy: 14.6%, Avg loss: 9.755190 



Epoch 39: 100%|██████████| 411/411 [02:07<00:00,  3.22batch/s, loss=0.00153] 


Train Error: 
 Accuracy: 99.1%, Avg loss: 0.048798 

Test Error: 
 Accuracy: 14.3%, Avg loss: 9.871372 



Epoch 40: 100%|██████████| 411/411 [02:05<00:00,  3.28batch/s, loss=0.00145] 


Train Error: 
 Accuracy: 99.1%, Avg loss: 0.046303 

Test Error: 
 Accuracy: 15.3%, Avg loss: 9.990540 

