#author: Jedrzej Chmiel

In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
# Install NLTK pinned to version 3.7; the captured output below shows it replacing the
# runtime's preinstalled nltk 3.2.5.
!pip install nltk==3.7

Collecting nltk==3.7
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 30.0 MB/s 
[?25hCollecting regex>=2021.8.3
  Downloading regex-2022.4.24-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (749 kB)
[K     |████████████████████████████████| 749 kB 59.8 MB/s 
Installing collected packages: regex, nltk
  Attempting uninstall: regex
    Found existing installation: regex 2019.12.20
    Uninstalling regex-2019.12.20:
      Successfully uninstalled regex-2019.12.20
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.7 regex-2022.4.24


In [3]:
!pip install git+https://12jerek34jeremi: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@github.com/12jerek34jeremi/harry_potter.git#subdirectory=hpcw

Collecting git+https://12jerek34jeremi:****@github.com/12jerek34jeremi/harry_potter.git#subdirectory=hpcw
  Cloning https://12jerek34jeremi:****@github.com/12jerek34jeremi/harry_potter.git to /tmp/pip-req-build-gstu493r
  Running command git clone -q 'https://12jerek34jeremi:****@github.com/12jerek34jeremi/harry_potter.git' /tmp/pip-req-build-gstu493r
Building wheels for collected packages: hpcw
  Building wheel for hpcw (setup.py) ... [?25l[?25hdone
  Created wheel for hpcw: filename=hpcw-0.1.0-py3-none-any.whl size=14064 sha256=4b1ae3573ec1a3dc84b5d59d8dd6064506612c05d1ee6dec407fdd562d6694fb
  Stored in directory: /tmp/pip-ephem-wheel-cache-axyh6fu7/wheels/db/3b/ad/7fbde81314b74a6b232925f5b648d728718062aa6e4ebaf0ca
Successfully built hpcw
Installing collected packages: hpcw
Successfully installed hpcw-0.1.0


In [4]:
# Print the Python version of the runtime (3.7.13 in the captured output below).
!python --version

Python 3.7.13


In [29]:
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
from hpcw.datasets.words_batch_dataset import WordsBatchDataset
from hpcw.datasets.one_item_dataset import OneItemDataset
from hpcw.corpus import Corpus
from hpcw.models.embedding import Embedding
from hpcw.models.words_batch import WordsBatch
from tqdm import tqdm
import os
import os.path
import pickle
from typing import List, Dict, Tuple
from datetime import datetime
from hpcw.utils import count_distance
from google.colab import drive

In [6]:
import nltk
# Download the NLTK 'punkt' package; the captured output below shows it being unzipped
# to tokenizers/punkt under /root/nltk_data.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
# Device used for every tensor and model in this notebook: the GPU when CUDA is
# available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
DEVICE

'cuda'

In [8]:
# Mount Google Drive at /content/drive (uses google.colab, so this cell only works in Colab).
# The checkpoint paths loaded further down point into /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Root directory of the project data: the dictionary pickle, the prepared book texts and
# (in the commented-out training cells) the model save directories are all built from it.
# The path is relative, so it is resolved against the current working directory.
MAIN_DATA_DIR = 'drive/MyDrive/data_harry_potter'
# Alternative location, presumably for running with a local copy of the data:
# MAIN_DATA_DIR = 'data'

In [10]:
# Create the corpus from the pickled dictionary file stored under MAIN_DATA_DIR.
corpus = Corpus(MAIN_DATA_DIR+'/dictionary/second_dictionary_30_04.pickle')
# Dataset parameterised only by the corpus size. train_encoding / test_encoding document
# their `dataset` argument as an OneItemDataset and read a single tensor per batch from it
# (presumably word indices in range(len(corpus)): confirm in OneItemDataset).
one_item_dataset = OneItemDataset(len(corpus))

In [11]:
# One WordsBatchDataset per prepared book file (harry_potter_1 ... harry_potter_7), all built
# with the corpus dictionary and a sequence length of 6.
words_batch_datasets = [
    WordsBatchDataset(
        f"{MAIN_DATA_DIR}/harry_potter_books/prepared_txt/harry_potter_{book_number}_prepared.txt",
        corpus.dictionary,
        sequence_length=6,
    )
    for book_number in range(1, 8)
]

In [12]:
def test_words_batch(model:WordsBatch, datasets: List[WordsBatchDataset], batch_size=2048) -> float:
    """
Tests passed words batch model using all datasets and returns its error.

The model is put into evaluation mode while it is tested (the models in this notebook are
built with a dropout factor, and dropout must not be active during evaluation). The mode
the model was in before the call is restored before returning.

Parameters:
    model:
        Object of WordsBatch class to be tested.
    datasets:
        List of objects of WordsBatchDataset class. Datasets on which model will be tested.
    batch_size:
        Number of samples in one evaluation batch.
Returns:
    Sum of squared differences between predictions and the dense embeddings of the targets,
    divided by the total number of samples in all datasets (so the error is averaged per
    sample, not per embedding component).
Raises:
    ZeroDivisionError: if the datasets contain no samples at all.
    """
    was_training = model.training
    model.eval()
    squared_error_sum = 0.0
    # reduction='sum' so that batches of different sizes can simply be added up.
    loss_function = nn.MSELoss(reduction='sum')
    try:
        with torch.no_grad():
            for dataset in datasets:
                loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=False)
                for X, y in loader:
                    X = X.to(DEVICE)
                    y = y.to(DEVICE)
                    # The target of the model is the dense embedding of the target item.
                    y = model.embedding.to_dense(y)
                    pred = model(X)
                    squared_error_sum += loss_function(pred, y).item()
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    total_length = sum(len(dataset) for dataset in datasets)
    return squared_error_sum / total_length

In [13]:
def train_words_batch(model:WordsBatch, datasets: List[WordsBatchDataset], batch_size: int, epochs: int,
                    optimizer: optim.Optimizer, saves_dir:str = None, results:Dict[str, int or float] = None,
                    start_epoch:int = 0):
    """
This function trains words_batch model. It uses MSE loss function to do this.
Parameters:
    model:
        Model to be trained. Object of WordsBatch class.
    datasets:
        List of all datasets on which model will be trained. In every epoch the function first
        uses all (shuffled) items of the first dataset, then all items of the second dataset, ...,
        then all items of the last dataset.
    batch_size: the batch size to use.
    epochs: number of epochs to train for.
    optimizer: optimizer to use to update weights and biases.
    saves_dir: If not None, the model is saved in this directory after each epoch. Files are named
        words_batch_epoch_{epoch}.pth, where epoch runs from start_epoch to start_epoch+epochs-1.
        A file that already exists under such a name is overwritten.
        If passed directory does not exist, it will be created.
    results: If not None (and saves_dir is not None), the model is tested after every epoch with
        test_words_batch and the result is stored in this dictionary under the key
        'mse_after_epoch_{epoch}'. The dictionary is written to saves_dir as results.pickle and
        results.txt before training and after every epoch.
    start_epoch: number of the first epoch; used to continue numbering of an earlier training.
Returns:
    None. The model and the results dictionary are modified in place.
    """
    model.train()
    loss_function = nn.MSELoss()
    loaders = [DataLoader(dataset, batch_size, shuffle=True) for dataset in datasets]
    if saves_dir is not None:
        os.makedirs(saves_dir, exist_ok=True)
        # Results are optional: without them there is nothing to write yet.
        if results is not None:
            with open(saves_dir+'/results.pickle', 'wb') as file:
                pickle.dump(results, file)
            with open(saves_dir+'/results.txt', 'wt') as file:
                for key, item in results.items():
                    file.write(key + ': ' + str(item) + '\n')

    end_epoch = start_epoch+epochs
    for epoch in range(start_epoch, end_epoch):
        for i,loader in enumerate(loaders):
            print(f"Epoch {epoch}/{end_epoch}, dataset {i+1}/{len(loaders)}")
            for X, y in tqdm(loader, desc="batch: "):
                X = X.to(DEVICE)
                # The target is the dense embedding of the target item; it is computed without
                # gradient so that the loss is not back-propagated through the target.
                with torch.no_grad():
                    y = model.embedding.to_dense(y.to(DEVICE))
                pred = model(X)
                loss = loss_function(pred, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        if saves_dir is not None:
            model.save(saves_dir+'/'+f"words_batch_epoch_{epoch}.pth")
            if results is not None:
                mse = test_words_batch(model, datasets)
                print(f"MSE is: {mse}")
                results[f'mse_after_epoch_{epoch}'] = mse
                with open(saves_dir+'/results.pickle', 'wb') as file:
                    pickle.dump(results, file)
                with open(saves_dir+'/results.txt', 'at') as file:
                    file.write(f'mse_after_epoch_{epoch} : {mse}\n')
                # Make sure training continues in training mode after the evaluation.
                model.train()

In [14]:
def train_encoding(model: Embedding, dataset: OneItemDataset,  batch_size: int, epochs: int,
                    optimizer: optim.Optimizer, saves_dir:str = None, results:Dict[str, int or float] = None,
                    start_epoch:int = 0):
    """
This function trains encoding part of Embedding class object. It uses CrossEntropyLoss.

For every batch y of the dataset the dense representation model.to_dense(y) is computed
(without gradient) and the model is trained to predict y back from it.

Parameters:
    model:
        Model to be trained. Object of Embedding class.
    dataset:
        Object of OneItemDataset class. Data on which to train.
    batch_size: the batch size to use.
    epochs: number of epochs to train for.
    optimizer: optimizer to use to update weights and biases.
    saves_dir: If not None, the model is saved in this directory after each epoch. Files are named
        embedding_epoch_{epoch}.pth, where epoch runs from start_epoch to start_epoch+epochs-1.
        A file that already exists under such a name is overwritten.
        If passed directory does not exist, it will be created.
    results: If not None (and saves_dir is not None), the model is tested after every epoch with
        test_encoding and the accuracy is stored in this dictionary under the key
        'acurate_after_epoch_{epoch}'. The dictionary is written to saves_dir as results.pickle and
        results.txt before training and after every epoch.
    start_epoch: number of the first epoch; used to continue numbering of an earlier training.
Returns:
    None. The model and the results dictionary are modified in place.
    """

    model.train()
    loss_function = nn.CrossEntropyLoss()
    loader = DataLoader(dataset, batch_size, shuffle=True)

    if saves_dir is not None:
        os.makedirs(saves_dir, exist_ok=True)
        # Results are optional: without them there is nothing to write yet.
        if results is not None:
            with open(saves_dir+'/results.pickle', 'wb') as file:
                pickle.dump(results, file)
            with open(saves_dir+'/results.txt', 'wt') as file:
                for key, item in results.items():
                    file.write(key + ': ' + str(item) + '\n')

    end_epoch = start_epoch+epochs
    for epoch in range(start_epoch, end_epoch):
        print(f"Epoch {epoch}/{end_epoch}")
        for y in tqdm(loader, desc="batch: "):
            # Same device handling as in test_encoding: the batch has to live where the model does.
            y = y.to(DEVICE)
            with torch.no_grad():
                X = model.to_dense(y)
            pred = model(X)
            loss = loss_function(pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if saves_dir is not None:
            model.save(saves_dir+'/'+f"embedding_epoch_{epoch}.pth")
            if results is not None:
                acurate = test_encoding(model, dataset)
                print(f"acurate is: {acurate}")
                results[f'acurate_after_epoch_{epoch}'] = acurate
                with open(saves_dir+'/results.pickle', 'wb') as file:
                    pickle.dump(results, file)
                with open(saves_dir+'/results.txt', 'at') as file:
                    file.write(f'acurate_after_epoch_{epoch} : {acurate}\n')
                # test_encoding leaves the model in evaluation mode; switch back for training.
                model.train()

In [15]:
def test_encoding(model, dataset, batch_size=1024):
    """
Tests passed embedding model on the dataset and returns its accuracy.

For every batch y the dense representation model.to_dense(y) is passed through the model and
the class with the highest score is compared with y. (The model is trained with
CrossEntropyLoss, so the predicted class is the one with the largest output, not the smallest.)

The model is switched to evaluation mode and is left in evaluation mode after the call.

Parameters:
    model:
        Object of Embedding class to be tested.
    dataset:
        Object of OneItemDataset class. Data on which to test.
    batch_size:
        Number of samples in one evaluation batch.
Return:
    Accuracy (float): number of correctly predicted items divided by len(dataset).
    """
    correct = 0
    model.eval()
    with torch.no_grad():
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=False)
        for y in tqdm(loader):
            y = y.to(DEVICE)
            X = model.to_dense(y)
            pred = model(X)
            # Predicted class = index of the largest score along the class dimension.
            correct += torch.count_nonzero(torch.argmax(pred, dim=1) == y).item()
    return correct / len(dataset)

In [100]:
# dir_name = MAIN_DATA_DIR+'/models/'+ datetime.now().strftime("training_words_batch_%d_%m_%Y___%H_%M")
# dir_name = MAIN_DATA_DIR+'/models/training_words_batch_11_05_2022___08_59'

In [17]:
# embedding = Embedding(corpus_size=len(corpus), embedding_size=64, dropout_factor=0.18, sizes=[128, 512])
# embedding = embedding.to(DEVICE)
# words_batch = WordsBatch(embedding, hidden_state_size=96, dropout_factor=0.18, sequence_length=6, dense_layer_size=256)
# words_batch = words_batch.to(DEVICE)

In [18]:
# results = words_batch.info()
# results['mse_initial'] = test_words_batch(words_batch, words_batch_datasets)

In [103]:
# words_batch = WordsBatch.load('/content/drive/MyDrive/data_harry_potter/models/training_words_batch_12_05_2022___09_22/words_batch_epoch_50.pth')
# words_batch = words_batch.to(DEVICE)
# with open(r'/content/drive/MyDrive/data_harry_potter/models/training_words_batch_12_05_2022___09_22/results.pickle', 'rb') as file:
#   results = pickle.load(file)

In [96]:
# for i in range(51, 64):
#   results.pop(f'mse_after_epoch_{i}')

In [21]:
# optimizer_words_batch = optim.Adam(words_batch.parameters())
# # results['optimizer_type'] = 'Adam'

In [102]:
# optimizer_words_batch = optim.SGD(words_batch.parameters(), lr=0.5)

In [None]:
# train_words_batch(words_batch, words_batch_datasets, batch_size=16, epochs=10,
#                   results=results, start_epoch=51, optimizer=optimizer_words_batch,
#                   saves_dir=dir_name)

In [25]:
# Freshly constructed Embedding (nothing is loaded into it), moved to DEVICE; the cells below
# use it as the "initial" reference when comparing word distances.
embedding_initial = Embedding(corpus_size=len(corpus), embedding_size=64, dropout_factor=0.18, sizes=[128, 512])
embedding_initial = embedding_initial.to(DEVICE)
# WordsBatch checkpoints saved after epochs 0, 34, 61 and 93 (taken from three different
# training directories). Paths are absolute Colab paths and require Google Drive to be mounted.
# NOTE(review): unlike embedding_initial these models are not moved to DEVICE here; confirm
# that WordsBatch.load / count_distance handle device placement.
words_batch0 = WordsBatch.load('/content/drive/MyDrive/data_harry_potter/models/training_words_batch_11_05_2022___08_59/words_batch_epoch_0.pth')
words_batch34 = WordsBatch.load('/content/drive/MyDrive/data_harry_potter/models/training_words_batch_11_05_2022___08_59/words_batch_epoch_34.pth')
words_batch61 = WordsBatch.load('/content/drive/MyDrive/data_harry_potter/models/training_words_batch_13_05_2022___09_19/words_batch_epoch_61.pth')
words_batch93 = WordsBatch.load('/content/drive/MyDrive/data_harry_potter/models/training_words_batch_13_05_2022___13_29/words_batch_epoch_93.pth')

In [None]:
def check_distances(words: List[Tuple[str, str]], corpus: Corpus, embedding:Embedding):
    """
Prints the distance between the two words of every passed pair.
Parameters:
    words:
        List of (word, word) pairs to compare.
    corpus:
        Corpus object passed on to count_distance.
    embedding:
        Embedding object passed on to count_distance.
    """
    for pair in words:
        first, second = pair
        distance = count_distance(first, second, corpus, embedding)
        print(f"    Distance between words '{first}' and '{second}' is {distance}")

In [78]:
# words = [('cat', 'dog'), ('cat', 'going'), ('hermione', 'water'), ('hermione', 'harry'),
#          ('is', 'being'), ('is', 'was'), ('hagrid', 'dumbledore'), ('snake', 'snakes'),
#          ('book', 'brilliant'), ('food', 'voldemort'), ('girl', 'boy'), ('girs', 'boys')]

In [79]:
# print("For initial words_batch: ")
# check_distances(words, corpus, embedding_initial)

For initial words_batch: 
    Distance between words 'cat' and 'dog' is 97.79200744628906
    Distance between words 'cat' and 'going' is 88.8342056274414
    Distance between words 'hermione' and 'water' is 169.3144073486328
    Distance between words 'hermione' and 'harry' is 89.15751647949219
    Distance between words 'is' and 'being' is 105.0751953125
    Distance between words 'is' and 'was' is 93.20569610595703
    Distance between words 'hagrid' and 'dumbledore' is 124.45855712890625
    Distance between words 'snake' and 'snakes' is 162.002197265625
    Distance between words 'book' and 'brilliant' is 101.6808090209961
    Distance between words 'food' and 'voldemort' is 142.20803833007812
    Distance between words 'girl' and 'boy' is 119.31204223632812


In [80]:
# print("For words_batch0: ")
# check_distances(words, corpus, words_batch0.embedding)

For words_batch0: 
    Distance between words 'cat' and 'dog' is 188.5636749267578
    Distance between words 'cat' and 'going' is 159.94679260253906
    Distance between words 'hermione' and 'water' is 124.99183654785156
    Distance between words 'hermione' and 'harry' is 87.04646301269531
    Distance between words 'is' and 'being' is 155.7615203857422
    Distance between words 'is' and 'was' is 111.06465148925781
    Distance between words 'hagrid' and 'dumbledore' is 114.62118530273438
    Distance between words 'snake' and 'snakes' is 136.7913055419922
    Distance between words 'book' and 'brilliant' is 134.72833251953125
    Distance between words 'food' and 'voldemort' is 121.21263885498047
    Distance between words 'girl' and 'boy' is 93.92500305175781


In [81]:
# print("For words_batch34: ")
# check_distances(words, corpus, words_batch34.embedding)

For words_batch0: 
    Distance between words 'cat' and 'dog' is 180.727294921875
    Distance between words 'cat' and 'going' is 166.03314208984375
    Distance between words 'hermione' and 'water' is 103.7860336303711
    Distance between words 'hermione' and 'harry' is 37.58721160888672
    Distance between words 'is' and 'being' is 136.8767852783203
    Distance between words 'is' and 'was' is 71.04869079589844
    Distance between words 'hagrid' and 'dumbledore' is 58.26959991455078
    Distance between words 'snake' and 'snakes' is 131.8248291015625
    Distance between words 'book' and 'brilliant' is 129.81053161621094
    Distance between words 'food' and 'voldemort' is 115.80278015136719
    Distance between words 'girl' and 'boy' is 86.43670654296875


In [65]:
# print("For words_batch61: ")
# check_distances(words, corpus, words_batch61.embedding)

In [None]:
# print("For words_batch93: ")
# check_distances(words, corpus, words_batch93.embedding)