In [3]:
#author: Jedrzej Chmiel
import torch
import torch.nn as nn
import torch.nn.functional as f
from typing import Dict

class Embedding(nn.Module):
    """
    Token embedding table plus a decoder network that maps dense vectors back to token scores.

    Use ``to_dense`` to transform tokens (like 1513) into dense vectors, and ``forward`` or
    ``words_probabilities`` to transform dense vectors into (log) probabilities over all tokens.
    """

    def __init__(self, corpus_size: int, embedding_size: int,
                 dropout_factor: float, sizes = [512, 1024, 2048]):
        """
        Creates an Embedding class object.

        Parameters:
            corpus_size: the size of the corpus, i.e. how many words there are in the dictionary.
            embedding_size: the length of the dense vector which represents one word.
            dropout_factor: the dropout probability used after each hidden layer of the decoder.
            sizes: widths of the hidden layers of the decoder, in order. May be empty, in which
                case the decoder is a single linear layer.
        """
        super().__init__()
        # Copy the argument so that neither the shared default list nor the caller's list is
        # aliased by self.sizes (mutating one instance must not affect the others).
        sizes = list(sizes)

        self.__embedding = nn.Embedding(corpus_size, embedding_size)

        layer_dims = [embedding_size] + sizes
        layers = []
        for input_dim, output_dim in zip(layer_dims[:-1], layer_dims[1:]):
            layers.extend([nn.Linear(input_dim, output_dim), nn.ReLU(), nn.Dropout(dropout_factor)])
        layers.append(nn.Linear(layer_dims[-1], corpus_size))
        # nn.Sequential is callable (nn.ModuleList is only a container and has no forward()).
        # Sub-modules are still registered under their integer position, so state-dict keys
        # keep the "<attribute>.<index>.<param>" layout.
        self.__encoding = nn.Sequential(*layers)

        self.corpus_size = corpus_size
        self.embedding_size = embedding_size
        self.dropout_factor = dropout_factor
        self.sizes = sizes

    def to_dense(self, tokens: torch.Tensor):
        """
        Transforms tokens to dense vectors.

        Parameters:
            tokens:
                The tensor of shape (N,) where N is the number of tokens (the batch size).
        Return:
            The tensor of shape (N, e_s), where e_s is the embedding size and N is the batch size.
        """
        return self.__embedding(tokens)

    def words_probabilities(self, dense_embedding: torch.Tensor):
        """
        Transforms dense vectors into probabilities of each token.

        Parameters:
            dense_embedding:
                A tensor of shape (N, e_s), where e_s is the embedding size and N is the batch size.
        Return:
            A tensor of shape (N, c_s) where c_s is the corpus size. Each row sums to 1. If
            dense_embedding is of shape (1, e_s) and the returned tensor looks like
            [[0.01, 0.02, 0.93, ..., 0.01]], then with probability 93% the passed vector
            represents the word of id 2 (because 0.93 is at position [0, 2]).
        """
        return f.softmax(self.__encoding(dense_embedding), dim=-1)

    def forward(self, dense_embedding: torch.Tensor):
        """
        Transforms dense vectors into log probabilities of each token.

        Parameters:
            dense_embedding:
                A tensor of shape (N, e_s), where e_s is the embedding size and N is the batch size.
        Return:
            A tensor of shape (N, c_s) where c_s is the corpus size. If the returned tensor looks
            like [[-2.5, -145.0, -1.0, -2.0, -0.1, -0.2]], then the log probability that the passed
            vector represents the word of id 1 is -145.0.
        """
        return f.log_softmax(self.__encoding(dense_embedding), dim=-1)

    def save(
        self,
        filepath: str,
    ) -> bool:
        """
        Saves this model (hyper-parameters from ``info`` plus the state dict) to ``filepath``.

        The target directory must already exist. An existing file is overwritten.

        Parameters:
            filepath:
                The path of the file in which this model will be saved.
        Return:
            True in case of success, False in case of failure (the error is printed).
        """
        parameters_dict = self.info()
        parameters_dict["state_dict"] = self.state_dict()
        try:
            torch.save(parameters_dict, filepath)
            return True
        except Exception as e:
            print(
                f"Sorry, an exception occurred while trying to save model to file {filepath}"
            )
            print(e)
            return False

    def info(self) -> Dict[str, object]:
        """Returns the constructor arguments of this object as a dictionary."""
        parameters_dict = {
            "corpus_size": self.corpus_size,
            "embedding_size": self.embedding_size,
            "dropout_factor": self.dropout_factor,
            'sizes': self.sizes
        }
        return parameters_dict

    @staticmethod
    def load(filepath: str) -> 'Embedding':
        """
        Loads an Embedding model from a file created by the ``save`` method of this class.

        Parameters:
            filepath:
                The path of the file in which the object is saved.
        Return:
            The Embedding class object (on the CPU) in case of success. None if the file could
            not be read (the error is printed).
        """
        try:
            # map_location lets a checkpoint written on a GPU machine be read on a CPU-only one.
            # The model is constructed on the CPU below either way.
            parameters_dict = torch.load(filepath, map_location="cpu")
        except Exception as e:
            print(
                f"Sorry, an exception occurred while trying to load model from file {filepath}"
            )
            print(e)
            return None

        # Files written before 'sizes' was stored used the default architecture.
        sizes = parameters_dict.get('sizes', [512, 1024, 2048])

        embedding = Embedding(parameters_dict['corpus_size'],
                              parameters_dict['embedding_size'],
                              parameters_dict['dropout_factor'], sizes=sizes)
        embedding.load_state_dict(parameters_dict['state_dict'])
        return embedding

#author: Jedrzej Chmiel

In [4]:
# Author: Jedrzej Chmiel

import torch
import torch.nn as nn
from typing import Dict

class WordsBatch(nn.Module): #second version
    """A neural network which predicts the dense vector of a word from the words around it.

    Architecture (s_l = sequence_length, h = hidden_state_size, e_s = embedding size):

                    predicted dense vector (e_s,)
                                |
                 Linear(dense_layer_size -> e_s)            -
                                |                            |
                 Dropout(dropout_factor)                     |
                                |                            ---- the tail
                              ReLU                           |
                                |                            |
                 Linear(2*h -> dense_layer_size)            -
                                |
                        tail_input (2*h,)
                                |  (concatenation)
          hidden_state (h,) ----+---- hidden_state (h,)
                  |                          |
         2-layer LSTM "before"       2-layer LSTM "after"
                  |                          |
         s_l dense vectors (e_s,)    s_l dense vectors (e_s,)
                  |                          |
         s_l tokens before           s_l tokens after
         the predicted word          the predicted word

    The hidden state taken from each LSTM is the final hidden state of its last (second) layer.
    """

    def __init__(self, embedding: 'Embedding', hidden_state_size: int,
                 dropout_factor: float, sequence_length: int, dense_layer_size: int = 1024):
        """
        Parameters:
            embedding:
                The object of Embedding class. It is used to convert tokens to dense vectors.
            hidden_state_size:
                The size of the hidden state in both layers of each LSTM.
            dropout_factor:
                Dropout factor between the LSTM layers and in the tail.
            sequence_length:
                How many words before and after are used to predict the middle word. If
                sequence_length is 3 then the input to this model should be three words before
                and three words after.
            dense_layer_size:
                Width of the hidden fully connected layer in the tail.
        """
        super().__init__()
        self.embedding = embedding
        embedding_size = embedding.embedding_size
        self.lstm_before = nn.LSTM(embedding_size,
                                   hidden_state_size,
                                   2,
                                   dropout=dropout_factor,
                                   batch_first=True)
        self.lstm_after = nn.LSTM(embedding_size,
                                  hidden_state_size,
                                  2,
                                  dropout=dropout_factor,
                                  batch_first=True)
        self.tail = nn.Sequential(nn.Linear(hidden_state_size * 2, dense_layer_size),
                                  nn.ReLU(), nn.Dropout(dropout_factor),
                                  nn.Linear(dense_layer_size, embedding_size))
        self.sequence_length = sequence_length
        self.hidden_state_size = hidden_state_size
        self.dropout_factor = dropout_factor
        self.dense_layer_size = dense_layer_size

    def forward(self, input):
        """
        Returns a dense vector (embedding) representing the predicted middle word.

        Parameters:
            input:
                The tensor of shape (N, 2*s_l), where N is batch size and s_l is sequence_length.
                The first s_l columns are the words before the predicted word and the remaining
                columns are the words after it.
        Return:
            The tensor of shape (N, e_s), where N is batch size and e_s is embedding_size.

        Suppose sequence_length is 3, batch size is 1, embedding_size is 128 and we want to
        predict the word "best" in the sentence "hogward is the best school for wizzards" with ids
        {'hogward': 123, 'is': 34, 'the': 13645, 'best': 7452, 'school': 15123, 'for': 541,
        'wizzards': 231}. The input is then [[123, 34, 13645, 231, 541, 15123]] and the output
        has shape (1, 128).
        """
        dense = self.embedding.to_dense(input)
        _, (hiddens_before, _) = self.lstm_before(dense[:, :self.sequence_length, :])
        _, (hiddens_after, _) = self.lstm_after(dense[:, self.sequence_length:, :])
        # hiddens_* have shape (num_layers, N, h); [-1] selects the last layer for the whole
        # batch at once, so no per-sample Python loop is needed.
        tail_input = torch.cat((hiddens_before[-1], hiddens_after[-1]), dim=1)
        return self.tail(tail_input)

    def save(
        self,
        filepath: str,
    ) -> bool:
        """
        Saves this model (hyper-parameters from ``info`` plus state dicts) to ``filepath``.

        The target directory must already exist. An existing file is overwritten. The state
        dict of the embedding is stored both inside 'words_batch_state_dict' (it is a
        sub-module) and separately under 'embedding_state_dict'; ``load`` reads both.

        Parameters:
            filepath:
                The path of the file in which this model will be saved.
        Return:
            True in case of success, False in case of failure (the error is printed).
        """
        parameters_dict = self.info()
        parameters_dict['words_batch_state_dict'] = self.state_dict()
        parameters_dict['embedding_state_dict'] = self.embedding.state_dict()
        try:
            torch.save(parameters_dict, filepath)
            return True
        except Exception as e:
            print(
                f"Sorry, an exception occurred while trying to save model to file {filepath}"
            )
            print(e)
            return False

    def info(self) -> Dict[str, object]:
        """Returns the hyper-parameters of this model and of its embedding as a dictionary."""
        parameters_dict = {
            "sequence_length": self.sequence_length,
            "hidden_state_size": self.hidden_state_size,
            "words_batch_dropout_factor": self.dropout_factor,
            "corpus_size": self.embedding.corpus_size,
            "embedding_size": self.embedding.embedding_size,
            "embedding_dropout_factor": self.embedding.dropout_factor,
            "dense_layer_size": self.dense_layer_size,
            'embedding_sizes': self.embedding.sizes
        }
        return parameters_dict

    @staticmethod
    def load(filepath: str) -> 'WordsBatch':
        """
        Loads a WordsBatch model (together with its embedding) from a file created by ``save``.

        Parameters:
            filepath:
                The path of the file in which the object is saved.
        Return:
            The WordsBatch class object (on the CPU) in case of success. None if the file could
            not be read (the error is printed).
        """
        try:
            # map_location lets a checkpoint written on a GPU machine be read on a CPU-only one.
            # The model is constructed on the CPU below either way.
            parameters_dict = torch.load(filepath, map_location="cpu")
        except Exception as e:
            print(
                f"Sorry, an exception occurred while trying to load model from file {filepath}"
            )
            print(e)
            return None

        # Older files may lack these keys; fall back to the defaults used at that time.
        dense_layer_size = parameters_dict.get('dense_layer_size', 1024)
        embedding_sizes = parameters_dict.get('embedding_sizes', [512, 1024, 2048])

        embedding = Embedding(parameters_dict['corpus_size'],
                              parameters_dict['embedding_size'],
                              parameters_dict['embedding_dropout_factor'],
                              sizes=embedding_sizes)
        embedding.load_state_dict(parameters_dict['embedding_state_dict'])

        words_batch = WordsBatch(embedding,
                                 parameters_dict['hidden_state_size'],
                                 parameters_dict['words_batch_dropout_factor'],
                                 parameters_dict['sequence_length'],
                                 dense_layer_size = dense_layer_size)
        words_batch.load_state_dict(parameters_dict['words_batch_state_dict'])
        return words_batch

# Author: Jedrzej Chmiel

In [5]:
#author: Jedrzej Chmiel
import pickle

class Corpus:
    """
    Two-way mapping between words and their ids (tokens).

    After creating an object of this class named 'the_corpus' you can check what word has id
    15432 using the_corpus[15432], and what is the id of the word 'cat' using the_corpus['cat'].
    """

    def __init__(self, dictionary_filepath: str):
        """
        Creates object of class Corpus.

        Parameters:
            dictionary_filepath:
                The file path to a dict[str, int] saved using the pickle module, e.g.
                {"cat": 0, "wizzard": 1, "grass": 2}. The ids must be exactly 0..len-1 with no
                gaps and no duplicates.
        Raises:
            The original exception if the file cannot be opened or unpickled (after printing it).
            Exception("Id gap in dictionary.") if some id in 0..len-1 has no word assigned.

        The constructor also builds a list in which the word with id i is stored at index i; it
        is used to quickly look up the word for a given id.
        """
        try:
            # NOTE: pickle.load can execute arbitrary code; only open dictionary files you trust.
            with open(dictionary_filepath, 'rb') as file:
                self.dictionary = pickle.load(file)
        except Exception as e:
            print(
                "There was an error while trying to read dictionary from file: ",
                dictionary_filepath)
            print(e)
            # Re-raise: returning here would hand back a half-initialized object.
            raise

        self.__length = len(self.dictionary)
        words = [None] * self.__length
        for word, word_id in self.dictionary.items():
            # An id outside 0..len-1 necessarily leaves a gap inside the range; skip it here so
            # the gap is reported below instead of failing with IndexError or wrapping around.
            if 0 <= word_id < self.__length:
                words[word_id] = word

        if None in words:
            print("Dictionary saved in file: ", dictionary_filepath,
                  " has a id gap.")
            print("There is no word assigned to id: ", words.index(None))
            raise Exception("Id gap in dictionary.")

        self.__words = words

    def __getitem__(self, index: str or int):
        """
        If type of index is str, returns the id (int) assigned to that word.
        If type of index is int, returns the word (str) assigned to that id.
        If index is of any other type, raises a TypeError.

        Parameters:
            index:
                word (str) (to get id) or id (int) (to get word)
        Return:
            word (str) or id (int)
        """
        if isinstance(index, int):
            return self.__words[index]
        elif isinstance(index, str):
            return self.dictionary[index]
        else:
            raise TypeError(
                "Unsupported index type: " + str(type(index)) +
                " (in __getitem__ function of Corpus class object)")

    def __len__(self):
        "Returns the number of words in dictionary. (max_id+1)"
        return self.__length

#author: Jedrzej Chmiel

In [6]:
import torch
from torch.utils.data import Dataset
from nltk.tokenize import word_tokenize

class WordsBatchDataset(Dataset):
    """Dataset for training WordsBatch. One dataset is one book.

    To train over all 7 books you need to create 7 datasets. Uses word_tokenize from
    nltk.tokenize to split the file into words and stores the ids of consecutive words in one
    tensor (``tokens``). So if the file starts with "you are the wizzard, harry." and the words
    have ids {'you':124, 'are':412, 'the':26, 'wizzard':25, ',':432, 'harry':622, '.':11324},
    then ``tokens`` starts with [124, 412, 26, 25, 432, 622, 11324, ...].
    """

    def __init__(self,
                 book_filapath,
                 dictionary,
                 sequence_length,
                 transform=None,
                 target_transform=None):
        """
        Creates dataset from one file.

        Parameters:
            book_filapath:
                path of the file to read; should be a .txt file with UTF-8 encoding.
            dictionary:
                dictionary of ids of each word, like {'cat': 0, 'wizzard': 1, 'hermione': 2, ...}.
                Every word produced by word_tokenize must be a key (otherwise KeyError).
            sequence_length:
                how many words before and after are used to predict the middle word.
            transform:
                function to be applied on each input in __getitem__ method.
            target_transform:
                function to be applied on each target in __getitem__ method.
        Raises:
            The original exception if the file cannot be read or tokenized (after printing it).
        """
        super().__init__()
        self.__transform = transform
        self.__target_transform = target_transform
        self.__sequence_length = sequence_length

        try:
            with open(book_filapath, 'rt', encoding='UTF-8') as file:
                words = word_tokenize(file.read())
        except Exception as e:
            print("There was an error while trying to read words from file: ",
                  book_filapath)
            print(e)
            # Re-raise: returning here would leave the object without tokens and length.
            raise
        self.tokens = torch.tensor([dictionary[word] for word in words],
                                   dtype=torch.long)
        # A book shorter than 2 * sequence_length has no usable middle word; len() must not be
        # negative.
        self.__length = max(0, len(self.tokens) - (2 * sequence_length))

    def __getitem__(self, index):
        """
        Parameters:
            index:
                The index of the sample, 0 <= index < len(self). Negative indices count from
                the end. Sample i predicts the word at position i + sequence_length in the book.
        Return:
            A tuple (X, y). X is a tensor of shape (2*s_l,), where s_l is sequence_length: the
            first s_l entries are the words before the predicted word (in reading order), the
            last s_l entries are the words after it in reversed order. y is the token of the
            predicted word.
        Raises:
            IndexError if index is out of range.

        Suppose sequence_length is 3 and the file starts with
        "hogward is the best school for wizzards." with ids
        {'hogward': 123, 'is': 34, 'the': 13645, 'best': 7452, 'school': 15123, 'for': 541,
        'wizzards': 231}. Then obj[0] is (tensor([123, 34, 13645, 231, 541, 15123]), tensor(7452)).
        'hogward' cannot be a middle word because there are no words before it.
        """
        if index < 0:
            index += self.__length
        if not 0 <= index < self.__length:
            # Required so that plain iteration (for x in dataset) stops at the end.
            raise IndexError("WordsBatchDataset index out of range")

        span = self.__sequence_length
        center = index + span
        before = self.tokens[center - span:center]
        after = torch.flip(self.tokens[center + 1:center + span + 1], (0, ))
        X = torch.cat((before, after), dim=0)
        y = self.tokens[center]
        if self.__transform is not None:
            X = self.__transform(X)
        if self.__target_transform is not None:
            y = self.__target_transform(y)
        return X, y

    def __len__(self):
        """Returns length of this dataset: number of words in the file (len of list produced by
        word_tokenize) - 2 * sequence_length, but never less than 0.
        """
        return self.__length

In [7]:
#author: Jedrzej Chmiel
import torch
from torch.utils.data import Dataset

class OneItemDataset(Dataset):
    """This class is used to train encoding and embedding without words_batch. Item number i of
    this dataset is a zero-dimensional tensor holding the token i, for 0 <= i < dictionary_length."""

    def __init__(self, dictionary_length, transform=None):
        """
        Parameters:
            dictionary_length:
                Number of tokens; it is also the length of this dataset.
            transform:
                Optional function applied to every item in __getitem__.
        """
        super().__init__()
        self.__transform = transform
        self.__length = dictionary_length

    def __getitem__(self, index: int) -> torch.Tensor:
        """
        Parameters:
            index:
                A token of a word, 0 <= index < len(self).
        Return:
            A zero-dimensional long tensor with this token (after ``transform``, if given).
        Raises:
            IndexError if index is out of range.
        """
        if not 0 <= index < self.__length:
            # Without this, plain iteration (for x in dataset) would never terminate.
            raise IndexError("OneItemDataset index out of range")
        x = torch.tensor(index, dtype=torch.long)
        if self.__transform is not None:
            x = self.__transform(x)
        return x

    def __len__(self):
        """Returns the number of tokens (the dictionary_length passed to the constructor)."""
        return self.__length

#author: Jedrzej Chmiel

In [None]:
!pip install nltk==3.7

In [None]:
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm
import os
import os.path
import pickle
from typing import List, Dict
from datetime import datetime
from google.colab import drive

In [None]:
import nltk
# Fetch NLTK's 'punkt' resource; presumably needed by word_tokenize in WordsBatchDataset
# (TODO confirm for the pinned nltk version).
nltk.download('punkt')

In [None]:
# Device used by the train/test helpers below: the GPU when CUDA is available, else the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE  # last expression of the cell, so the chosen device is displayed

In [None]:
# Mount Google Drive (Colab only) so that the data directory configured below is reachable.
drive.mount('/content/drive')

In [None]:
# Root directory of the dictionary, the prepared books and the saved models.
# NOTE(review): relative path, presumably resolved against /content on Colab - confirm.
MAIN_DATA_DIR = 'drive/MyDrive/data_harry_potter'
# MAIN_DATA_DIR = 'data'

In [None]:
# Word <-> id mapping loaded from the pickled dictionary.
corpus = Corpus(MAIN_DATA_DIR+'/dictionary/second_dictionary_30_04.pickle')
# One item per token id in the corpus.
one_item_dataset = OneItemDataset(len(corpus))

In [None]:
# One dataset per book (files harry_potter_1 ... harry_potter_7).
# sequence_length here must match the sequence_length given to WordsBatch further down.
words_batch_datasets = [WordsBatchDataset(MAIN_DATA_DIR+'/harry_potter_books/prepared_txt/harry_potter_'+str(i)+'_prepared.txt', corpus.dictionary, sequence_length=6)
                        for i in range(1,8)]

In [None]:
def test_words_batch(model:WordsBatch, datasets: List[WordsBatchDataset], batch_size=2048) -> float:
    """
    Tests the passed WordsBatch model on all datasets and returns its error.

    Parameters:
        model:
            Object of WordsBatch class to be tested.
        datasets:
            List of objects of WordsBatchDataset class. Datasets on which model will be tested.
        batch_size:
            Batch size used while iterating over the datasets. It does not change the result.
    Return:
        The sum of squared differences between the predicted and the target dense vectors,
        divided by the total number of samples (so it is an error per sample, not per element).

    The model is switched to evaluation mode for the measurement (dropout disabled) and its
    previous mode is restored afterwards.
    """
    was_training = model.training
    model.eval()
    loss_function = nn.MSELoss(reduction='sum')
    squared_error = 0.0
    try:
        with torch.no_grad():
            for dataset in datasets:
                loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=False)
                for X, y in loader:
                    X = X.to(DEVICE)
                    # The target is the dense vector of the true middle word.
                    y = model.embedding.to_dense(y.to(DEVICE))
                    squared_error += loss_function(model(X), y).item()
    finally:
        model.train(was_training)

    total_length = sum(len(dataset) for dataset in datasets)
    return squared_error / total_length

In [None]:
def train_words_batch(model:WordsBatch, datasets: List[WordsBatchDataset], batch_size: int, epochs: int,
                    optimizer: optim.Optimizer, saves_dir:str = None, results:Dict[str, object] = None,
                    start_epoch:int = 0):
    """
    Trains a WordsBatch model using the MSE loss between the predicted dense vector and the
    dense vector of the true middle word.

    Parameters:
        model:
            Model to be trained. Object of WordsBatch class.
        datasets:
            List of all datasets on which model will be trained. In each epoch the function uses
            all samples from the first dataset, then all from the second, ..., then all from the
            last one.
        batch_size: the batch size to use.
        epochs: how many epochs to run.
        optimizer: optimizer used to update weights and biases.
        saves_dir: If not None, the model is saved in this directory after each epoch as
            words_batch_epoch_<epoch>.pth (existing files are overwritten). The directory is
            created if it does not exist.
        results: Optional dictionary. If given together with saves_dir, it is written to
            results.pickle / results.txt in saves_dir, and after each epoch the error returned by
            test_words_batch is added under 'mse_after_epoch_<epoch>' and the files are updated.
            The dictionary is modified in place.
        start_epoch: number of the first epoch (used for file names when resuming training).
    Returns:
        None.
    """
    model.train()
    loss_function = nn.MSELoss()
    loaders = [DataLoader(dataset, batch_size, shuffle=True) for dataset in datasets]

    results_pickle_path = None
    results_txt_path = None
    if saves_dir is not None:
        os.makedirs(saves_dir, exist_ok=True)
        results_pickle_path = os.path.join(saves_dir, 'results.pickle')
        results_txt_path = os.path.join(saves_dir, 'results.txt')
        if results is not None:
            with open(results_pickle_path, 'wb') as file:
                pickle.dump(results, file)
            with open(results_txt_path, 'wt') as file:
                for key, item in results.items():
                    file.write(key + ': ' + str(item) + '\n')

    end_epoch = start_epoch+epochs
    for epoch in range(start_epoch, end_epoch):
        for i,loader in enumerate(loaders):
            print(f"Epoch {epoch}/{end_epoch}, dataset {i+1}/{len(loaders)}")
            for X, y in tqdm(loader, desc="batch: "):
                X = X.to(DEVICE)
                # The target is treated as a constant: no gradient flows through it.
                with torch.no_grad():
                    y = model.embedding.to_dense(y.to(DEVICE))
                pred = model(X)
                loss = loss_function(pred, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        if saves_dir is not None:
            model.save(os.path.join(saves_dir, f"words_batch_epoch_{epoch}.pth"))
            if results is not None:
                # Evaluate without dropout, then return to training mode for the next epoch.
                model.eval()
                mse = test_words_batch(model, datasets)
                model.train()
                print(f"MSE is: {mse}")
                results[f'mse_after_epoch_{epoch}'] = mse
                # Same file as written before the first epoch, so it always holds the latest
                # results.
                with open(results_pickle_path, 'wb') as file:
                    pickle.dump(results, file)
                with open(results_txt_path, 'at') as file:
                    file.write(f'mse_after_epoch_{epoch} : {mse}\n')

In [None]:
def train_encoding(model: Embedding, dataset: OneItemDataset, batch_size: int, epochs: int,
                   optimizer: optim.Optimizer, saves_dir: str = None, with_training:bool = True) -> List[float]:
    """
    Trains the decoding ("encoding") network of an Embedding object so that the dense vector of
    a token is mapped back to that token. Uses CrossEntropyLoss.

    Parameters:
        model:
            Model to be trained. Object of Embedding class.
        dataset:
            Object of OneItemDataset class. Data on which to train.
        batch_size: the batch size to use.
        epochs: how many epochs to run.
        optimizer: optimizer used to update weights and biases.
        saves_dir: If not None, the model is saved in this directory after each epoch as
            embedding_epoch_<epoch>.pth (existing files are overwritten). The directory is
            created if it does not exist.
        with_training: currently unused; kept so that existing calls keep working.
    Returns:
        List with one value per epoch: the fraction of tokens predicted correctly during that
        epoch (measured on the training batches, i.e. with dropout active).
    """
    model.train()
    # forward() returns log-probabilities; applying log-softmax again inside CrossEntropyLoss
    # leaves them unchanged, so the loss value is the usual cross entropy.
    loss_function = nn.CrossEntropyLoss()
    acurates = []
    loader = DataLoader(dataset, batch_size, shuffle=True)
    if saves_dir is not None:
        os.makedirs(saves_dir, exist_ok=True)

    for epoch in range(epochs):
        print(f"Epoch {epoch}/{epochs}")
        acurate = 0
        for y in tqdm(loader, desc="batch: "):
            y = y.to(DEVICE)
            # The dense vectors are inputs only: the embedding table receives no gradient here.
            with torch.no_grad():
                X = model.to_dense(y)
            pred = model(X)
            loss = loss_function(pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # The predicted token is the one with the highest log-probability.
            acurate += torch.count_nonzero(torch.argmax(pred, dim=1) == y).item()

        fraction = acurate / max(len(dataset), 1)
        print(f"Acurate: {fraction}")
        acurates.append(fraction)
        if saves_dir is not None:
            model.save(os.path.join(saves_dir, f"embedding_epoch_{epoch}.pth"))

    return acurates
In [None]:
def test_encoding(model, dataset, batch_size=1024):
    """
    Tests the passed embedding model on the dataset and returns the fraction of tokens which
    are recovered correctly from their own dense vector.

    Parameters:
        model:
            Object of Embedding class to be tested.
        dataset:
            Object of OneItemDataset class. Data on which to test.
        batch_size:
            Batch size used while iterating over the dataset. It does not change the result.
    Return:
        Accurate fraction (float) in the range [0, 1]; 0.0 for an empty dataset.

    The model is left in evaluation mode.
    """
    acurate = 0
    model.eval()
    # A DataLoader stops after len(dataset) items and evaluates whole batches at once.
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    with torch.no_grad():
        for y in tqdm(loader):
            y = y.to(DEVICE)
            X = model.to_dense(y)
            pred = model(X)
            # The predicted token is the one with the highest log-probability.
            acurate += torch.count_nonzero(torch.argmax(pred, dim=1) == y).item()
    total = len(dataset)
    result = acurate / total if total else 0.0
    print(f"Acurate fraction: {result}")
    return  result

In [None]:
# Build the embedding and the WordsBatch model that uses it, and move both to DEVICE.
embedding = Embedding(corpus_size=len(corpus), embedding_size=64, dropout_factor=0.18, sizes=[128, 512])
embedding = embedding.to(DEVICE)
# sequence_length must match the value used when building words_batch_datasets.
words_batch = WordsBatch(embedding, hidden_state_size=96, dropout_factor=0.18, sequence_length=6, dense_layer_size=256)
words_batch = words_batch.to(DEVICE)

In [None]:
# Time-stamped output directory for this run.
# NOTE(review): the prefix says "training_embedding" although the cells below train words_batch - confirm.
dir_name = MAIN_DATA_DIR+'/models/'+ datetime.now().strftime("training_embedding_%d_%m_%Y___%H_%M")
# Start the results record with the model hyper-parameters and the error before any training.
results = words_batch.info()
results['mse_initial'] = test_words_batch(words_batch, words_batch_datasets)

In [None]:
# Display the initial results record and the output directory of this run.
results, dir_name

In [None]:
# words_batch = WordsBatch.load(dir_name+'/words_batch_epoch_6.pth')
# words_batch = words_batch.to(DEVICE)
# with open(dir_name+'/results.pickle', 'rb') as file:
#   results = pickle.load(file)
# results

In [None]:
# dir_name = MAIN_DATA_DIR+'/models/training_words_batch_09_05_2022___15_58'

In [None]:
# Adam with default settings over all parameters of words_batch; this includes the embedding,
# which is registered as a sub-module of WordsBatch.
optimizer_words_batch = optim.Adam(words_batch.parameters())
results['optimizer_type'] = 'Adam'

In [None]:
# train_words_batch(words_batch, words_batch_datasets, batch_size=16, epochs=15,
#               optimizer=optimizer_words_batch, saves_dir=dir_name, results=results,
#               start_epoch = 0)