## Multi-Layer Perceptron Models

### Imports and Utils

In [1]:
"""
Importing the necessary libraries
"""
import re
import os
from time import time
import pickle

import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torch.utils.data import TensorDataset, DataLoader

import numpy as np
import string

# Remove all the warnings
import warnings
warnings.filterwarnings('ignore')

# Set env CUDA_LAUNCH_BLOCKING=1 for this process.
# NOTE(review): presumably a debugging aid (synchronous CUDA kernel launches) —
# confirm it is still wanted, since it may slow down GPU training.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# Use the GPU when torch reports one as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [2]:
# Function to load the data
def load_text_data(file_path):
    """
    Read a whole file and return its contents as text.

    file_path: str: The path to the file

    Returns: str: The file contents decoded as UTF-8
    """
    # Read raw bytes first so line endings are left untouched, then decode
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()

    text = raw_bytes.decode('utf-8')
    return text

### Dataset Loading and Preprocessing

In [None]:
# Read the corpus from disk
file_path = '/content/corpora/Start-ups.txt'
file_data = load_text_data(file_path)

# Split the corpus into tokens with torchtext's 'basic_english' tokenizer
tokenizer = get_tokenizer('basic_english')
tokens = tokenizer(file_data)

# Vocabulary: the distinct tokens in sorted order, with '<unk>' as the last entry
words = [*sorted(set(tokens)), '<unk>']

# token -> index lookup, and its inverse index -> token
encodings = dict(zip(words, range(len(words))))
decodings = {index: word for word, index in encodings.items()}

In [None]:
block_size = 4

# Encode the whole corpus as a list of vocabulary indices
dataset = [encodings[word] for word in tokens]

# Every window of `block_size` consecutive ids is an input; the id that
# immediately follows the window is its target.
num_windows = len(dataset) - block_size
input_seq = [dataset[start:start + block_size] for start in range(num_windows)]
target_seq = dataset[block_size:]

# Turn both into tensors on the selected device
input_seq = torch.tensor(input_seq).to(device)
target_seq = torch.tensor(target_seq).to(device)

### Model Creation and Training

In [3]:
class MLP(nn.Module):
    """
    A Multi-Layer Perceptron for next-token prediction.

    A block of `block_size` token ids is embedded, flattened into a single
    vector and passed through three fully connected layers that produce one
    logit per vocabulary entry.
    """

    def __init__(self, block_size: int, vocab_size: int, emb_dim: int, random_state: int = None):
        """
        Constructor for Multi-Layer Perceptron.

        block_size: int: number of token ids in one input block
        vocab_size: int: size of the vocabulary (number of embedding rows and output logits)
        emb_dim: int: embedding dimension of each token
        random_state: int: if given, seeds torch's global RNG for reproducibility
        """

        super(MLP, self).__init__()
        if random_state is not None:
            torch.manual_seed(random_state)
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        # (batch, block_size) ids -> (batch, block_size * emb_dim) features
        self.embeddings = nn.Sequential(
            nn.Embedding(vocab_size, emb_dim),
            nn.Flatten()
        )
        self.layers = nn.Sequential(
            nn.Linear(block_size * emb_dim, 256),
            nn.SiLU(),
            nn.Linear(256, 32),
            nn.SiLU(),
            nn.Linear(32, vocab_size)
        )

    def _param_device(self) -> torch.device:
        """
        Return the device the model's parameters currently live on.

        Inputs are moved to this device instead of a module-level global so
        that a model loaded on the CPU keeps working on a CUDA machine.
        """
        return next(self.parameters()).device

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: torch.Tensor: The input tensor of token ids, one block per row.

        Returns: torch.Tensor: One row of `vocab_size` logits per input block.
        """

        x = self.embeddings(x)
        x = self.layers(x)
        return x

    def fit(self, X: torch.Tensor, y: torch.Tensor, epochs: int = 1000, batch_size: int = 4096, learning_rate: float = 0.01, print_cost: bool = False):
        """
        Train the model with Adam and cross-entropy loss.

        X: torch.Tensor: The input tensor
        y: torch.Tensor: The target tensor
        epochs: int: The number of epochs
        batch_size: int: The batch size while applying mini-batch gradient descent
        learning_rate: float: learning rate of the optimizer
        print_cost: bool: Whether to print the cost or not

        Returns: list: The loss of every mini-batch, in training order.
        """
        self.lr = learning_rate

        # Keep the data on the same device as the parameters
        model_device = self._param_device()
        X = X.reshape(-1, self.block_size).to(model_device)
        y = y.reshape(-1).to(model_device)
        dataset = TensorDataset(X, y)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)

        Losses = []
        for i in range(epochs):
            for batch_X, batch_y in dataloader:
                # Forward pass
                predictions = self.forward(batch_X)
                loss = criterion(predictions, batch_y)
                Losses.append(loss.item())

                # Backward pass
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

            # Print the cost (loss of the last mini-batch) every 10th epoch
            if print_cost and (i+1) % 10 == 0:
                print(f'Loss at epoch {i+1}: {loss.item():.3f}')
                print("\n------------------------------------------------------------\n")

        return Losses

    def predict(self, X: torch.Tensor, decodings: dict, context_len: int):
        """
        Generate tokens one at a time by sampling from the model's output.

        X: torch.Tensor: The input tensor holding one block of token ids
        decodings: dict: The dictionary mapping token ids to tokens
        context_len: int: The number of tokens to generate

        Yields: the decoded token for each generated id.
        """

        model_device = self._param_device()
        X = X.reshape(1, self.block_size).to(model_device)

        for _ in range(context_len):
            # Only the forward pass runs without autograd; the context manager
            # is deliberately closed before `yield` so the caller's gradient
            # mode is not affected while the generator is suspended.
            with torch.no_grad():
                y_pred = self.forward(X)
            id_pred = torch.distributions.Categorical(logits=y_pred).sample().item()
            decode = decodings[id_pred]
            # Slide the window: drop the oldest id, append the sampled one
            X = torch.cat((X[:, 1:], torch.tensor([[id_pred]], device=model_device)), 1)
            yield decode

    def save_model(self, path):
        """
        Save the model parameters.

        The constructor arguments are stored next to the weights so that
        `load_model` can rebuild the network without extra information.

        path: str: The path where the model parameters should be saved.
        """

        model_info = {
            'block_size': self.block_size,
            'vocab_size': self.vocab_size,
            'emb_dim': self.emb_dim,
            'state_dict': self.state_dict()
        }

        torch.save(model_info, path)

    @staticmethod
    def load_model(path):
        """
        Load the model parameters.

        path: str: The path from where the model parameters should be loaded.

        Returns: MLP: A new model on the CPU with the stored weights. The
        result must be assigned by the caller; no existing instance is changed.
        """

        model_info = torch.load(path, map_location=torch.device('cpu'))
        model = MLP(block_size=model_info['block_size'], vocab_size=model_info['vocab_size'], emb_dim=model_info['emb_dim'])
        model.load_state_dict(model_info['state_dict'])
        return model

In [None]:
# Build the MLP (seeded for reproducibility) and place it on the selected device
model = MLP(
    block_size=block_size,
    vocab_size=len(words),
    emb_dim=32,
    random_state=42,
)
model = model.to(device)

In [None]:
# Fit the model on the windowed corpus; `Losses` collects the value returned by fit
Losses = model.fit(
    input_seq,
    target_seq,
    epochs=50,
    batch_size=4096,
    learning_rate=0.01,
    print_cost=True,
)

In [None]:
input_idx = 0

# Echo the prompt block: no leading space before the very first token or
# before punctuation, a single space before every other token.
for position, idx in enumerate(input_seq[input_idx]):
    token = decodings[idx.item()]
    glue = '' if position == 0 or token in string.punctuation else ' '
    print(glue + token, end='')

# Continue the prompt with 200 sampled tokens, using the same spacing rule
for token in model.predict(input_seq[input_idx], decodings, 200):
    glue = '' if token in string.punctuation else ' '
    print(glue + token, end='')

the investors so much improve expect forgot organizations won exactly useful story order o article few for yes deliver increasing exampleâ€”quickly promising describe flatter might organization found figure course embition something recipe watch completely might which never credentials some ask prepare interested willful worse make syrelicon substantial function third by so had seem scares smaller borned projects reason subset harder selling turns worse helpful size say mode instead lucky ability wasting problemâ€”people already sentences useless sentenced fast device many months different performance ______ percentage risk outding before judging evidences published replaces driven lead unconscious would public phase everyone kill formidable ideas useless times subsets sensitive treed shows trying labels projects around reasonsâ€”when call schleps incompeted prefer unusual expand problemâ€”people return helpful developers components avoided offering changes wasting launched how sentenc

In [None]:
# Folder that receives the checkpoint; created if it does not exist yet
directory = "Models"
os.makedirs(directory, exist_ok=True)

# Assemble the checkpoint file name, then write the model to it
corpus_stem = os.path.splitext('Shakespheare.txt')[0]
checkpoint_name = f"LSTM_{corpus_stem}_{32}_{block_size}.pth"
filepath = os.path.join(directory, checkpoint_name)
model.save_model(filepath)

In [None]:
# Specify the directory you want to load from
directory = "Models"

# Path of the checkpoint (same naming scheme as the cell that saved it)
filepath = os.path.join(directory, f"LSTM_{os.path.splitext('Shakespheare.txt')[0]}_{32}_{block_size}.pth")

# Load the model.
# MLP.load_model is a static factory: it returns a NEW model on the CPU and
# does not modify an existing instance, so its result has to be assigned.
# The original cell discarded the return value, leaving `model` untrained.
model = MLP.load_model(filepath).to(device)

### Testing and Plotting

In [None]:
test_seq = 'How are you doing'

# Tokenize the input, dropping empty tokens
test_seq_tokens = [tkn for tkn in tokenizer(test_seq) if tkn]

# Fit the prompt to the model's context window
if len(test_seq_tokens) > model.block_size:
    # Keep the LAST block_size tokens: the continuation must follow the end of
    # the prompt, so the words closest to it are the ones the model needs.
    test_seq_tokens = test_seq_tokens[-model.block_size:]
elif len(test_seq_tokens) < model.block_size:
    # Left-pad short prompts with '<unk>' so the real words stay adjacent to
    # the position being predicted.
    test_seq_tokens = ['<unk>'] * (model.block_size - len(test_seq_tokens)) + test_seq_tokens

# Encode the prompt; words outside the vocabulary map to '<unk>'
test_seq_encoded = torch.tensor([encodings.get(token, encodings['<unk>']) for token in test_seq_tokens])

# Print the prompt as the model sees it ('<unk>' entries are hidden)
first_token = True
for idx in test_seq_encoded:
    token = decodings[idx.item()]
    if token != '<unk>':
        if first_token:
            print(token, end='')
            first_token = False
        elif token in string.punctuation:
            print(token, end='')
        else:
            print(' ' + token, end='')

# Print 100 generated tokens, again hiding '<unk>'
for token in model.predict(test_seq_encoded, decodings, 100):
    if token != '<unk>':
        if token in string.punctuation:
            print(token, end='')
        else:
            print(' ' + token, end='')

### Generating and Saving Models

In [None]:
# Specify the directory you want to save in
directory = "Models"
os.makedirs(directory, exist_ok=True)

# Directory containing the corpora
corpus_dir = '/content/corpora'

# Initialize the tokenizer
tokenizer = get_tokenizer('basic_english')

# Different embeddings and block sizes to try
embeddings = [2, 4, 8, 16, 32]
block_sizes = [32]

# For each corpus in the corpus directory (sorted so runs are reproducible)
for corpus_file in sorted(os.listdir(corpus_dir)):
    corpus_path = os.path.join(corpus_dir, corpus_file)

    # os.listdir also returns sub-directories (e.g. notebook checkpoint
    # folders); only regular files can be read as a corpus.
    if not os.path.isfile(corpus_path):
        continue

    # File name without extension, used in every artifact name below.
    # NOTE(review): the "LSTM" prefix in names/messages is kept unchanged even
    # though the model is an MLP — renaming would change the artifact names.
    corpus_name = os.path.splitext(corpus_file)[0]

    # Load the data
    file_data = load_text_data(corpus_path)

    # Implement the tokenizer
    tokens = tokenizer(file_data)

    # Get the unique tokens
    words = sorted(set(tokens))
    words.append('<unk>')

    # Create encoding and decoding dictionaries
    encodings = {token: idx for idx, token in enumerate(words)}
    decodings = {idx: token for token, idx in encodings.items()}

    # Save encodings and decodings files
    filepath = os.path.join(directory, f"LSTM_{corpus_name}_encodings.pkl")
    with open(filepath, 'wb') as file:
        pickle.dump(encodings, file)
    filepath = os.path.join(directory, f"LSTM_{corpus_name}_decodings.pkl")
    with open(filepath, 'wb') as file:
        pickle.dump(decodings, file)

    # Create the dataset
    dataset = [encodings[token] for token in tokens]

    # For each combination of block size and embedding
    for block_size in block_sizes:

        # A corpus that is not longer than the block yields no training
        # window at all, so there is nothing to fit.
        if len(dataset) <= block_size:
            print(f"Skipping {corpus_name}: fewer than {block_size + 1} tokens")
            continue

        # Create the input and target sequences
        input_seq = [dataset[i:i+block_size] for i in range(len(dataset)-block_size)]
        target_seq = dataset[block_size:]

        input_seq = torch.tensor(input_seq).to(device)
        target_seq = torch.tensor(target_seq).to(device)

        for emb_dim in embeddings:
            print(f"LSTM Model with Corpus - {corpus_name}, block size - {block_size}, and embedding dimensions - {emb_dim}")

            # Defining the model
            model = MLP(block_size=block_size, vocab_size=len(words), emb_dim=emb_dim, random_state=42).to(device)
            Losses = model.fit(input_seq, target_seq, epochs=100, batch_size=4096, learning_rate=0.01)
            print(f"Model Loss - {Losses[-1]:.3f}")

            # Saving the model
            filepath = os.path.join(directory, f"LSTM_{corpus_name}_{block_size}_{emb_dim}.pth")
            model.save_model(filepath)
            print("\n-----------------------------------------------------------------------------------------------------------\n")

LSTM Model with Corpus - Leo_Tolstoy, block size - 32, and embedding dimensions - 2


## Convolutional Neural Network Models

### Imports and Utils

### Dataset Loading and Preprocessing

### Model Creation and Training

### Testing and Plotting