# Import modules

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import numpy as np
import matplotlib.pyplot as plt
import unicodedata
import string
import os
from tqdm.auto import tqdm
import io
import glob
import random

# Sample Data Preprocessing for the Names Sequential Data

## Convert letters into ASCII

In [3]:
Letters = string.ascii_letters + " .,;!@'"

def convert_to_ascii(s):
    '''
    Strip accents from ``s`` and keep only the characters listed in ``Letters``.

    The string is decomposed with Unicode NFD normalization so that an
    accented letter becomes a base letter followed by combining marks.
    Combining marks (category ``Mn``) are dropped, and so is every remaining
    character that does not appear in ``Letters``.

    Based on: https://stackoverflow.com/a/518232/2809427
    '''
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        # Skip the combining accent marks produced by the decomposition.
        if unicodedata.category(ch) == 'Mn':
            continue
        # Skip anything outside the allowed alphabet (digits, other scripts, ...).
        if ch not in Letters:
            continue
        kept.append(ch)
    return ''.join(kept)

# Example usage
# print(f"Convert Café à la mode to ASCII: {convert_to_ascii('Café à la mode')}")

## Perform one-hot encoding using OneHotEncoder from sklearn

In [4]:
from sklearn.preprocessing import OneHotEncoder


def line_to_one_hot_encoded_tensor(line):
    """
    One-hot encode ``line`` character by character.

    Args:
        line: String whose characters all belong to ``Letters``.

    Returns:
        A float32 tensor of shape ``(len(line), len(Letters))``. Row ``i``
        holds a single 1.0 in the column matching the position of ``line[i]``
        inside ``Letters``. An empty ``line`` yields a tensor of shape
        ``(0, len(Letters))``.

    Raises:
        ValueError: Raised by the encoder when ``line`` contains a character
            that is not in ``Letters``.
    """
    # An empty line gives a zero-sample array, which scikit-learn's input
    # validation refuses to fit on; build the empty result directly instead.
    if len(line) == 0:
        return torch.zeros((0, len(Letters)), dtype=torch.float32)

    # Convert line to a 2D array of shape (line_length, 1): one sample per character
    line_array = np.array(list(line)).reshape(-1, 1)

    # Fix the category list (and therefore the column order) to Letters so the
    # encoding does not depend on which characters occur in this line
    encoder = OneHotEncoder(categories=[list(Letters)], sparse_output=False)

    # Fit and transform the line array to a dense one-hot encoded array
    one_hot_encoded = encoder.fit_transform(line_array)

    # Convert the one-hot encoded array to a PyTorch tensor
    return torch.tensor(one_hot_encoded, dtype=torch.float32)

# Example usage
# line = "Hello, world!@"
# tensor = line_to_one_hot_encoded_tensor(line)
# print(tensor)

## Load the data and split it into inputs (names) and outputs (categories)

In [5]:
def load_data(pattern='data/names/*.txt'):
    '''
    Load the names dataset, grouped by category.

    Every file matching ``pattern`` is one category: the category name is the
    file name without its extension, and each line of the file is one name.
    Names are passed through ``convert_to_ascii``; names that end up empty
    (blank lines, or lines with no allowed characters) are dropped.

    Args:
        pattern: Glob pattern selecting the per-category text files.

    Returns:
        A tuple ``(category_lines, all_categories)`` where ``category_lines``
        maps each category to its list of names and ``all_categories`` lists
        the categories in sorted file-path order.

    source: https://github.com/patrickloeber/pytorch-examples/blob/master/rnn-name-classification/utils.py
    '''
    # Build the category_lines dictionary, a list of names per language
    category_lines = {}
    all_categories = []

    # Read a file and split into lines
    def read_lines(filename):
        # The context manager closes the file even if reading fails.
        with io.open(filename, encoding='utf-8') as handle:
            raw_lines = handle.read().strip().split('\n')
        converted = (convert_to_ascii(line) for line in raw_lines)
        return [name for name in converted if name]

    # glob returns paths in arbitrary order; sorting keeps the category
    # indices stable across runs and machines.
    for filename in sorted(glob.glob(pattern)):
        category = os.path.splitext(os.path.basename(filename))[0]
        all_categories.append(category)
        category_lines[category] = read_lines(filename)

    return category_lines, all_categories

In [None]:
def random_example(category_lines, all_categories):
    '''
    Draw one random training example.

    A category is picked uniformly from ``all_categories``, then a name is
    picked uniformly from that category's entry in ``category_lines``.

    Returns:
        ``(category, line, category_tensor, line_tensor)`` where
        ``category_tensor`` is a one-element long tensor holding the index of
        ``category`` in ``all_categories`` and ``line_tensor`` is the result
        of ``line_to_one_hot_encoded_tensor(line)``.

    source: https://github.com/patrickloeber/pytorch-examples/blob/master/rnn-name-classification/utils.py
    '''
    # First draw: which category.
    category = all_categories[random.randint(0, len(all_categories) - 1)]

    # Second draw: which name inside that category.
    names = category_lines[category]
    line = names[random.randint(0, len(names) - 1)]

    label = all_categories.index(category)
    category_tensor = torch.tensor([label], dtype=torch.long)
    line_tensor = line_to_one_hot_encoded_tensor(line)
    return category, line, category_tensor, line_tensor


# Creating NAIVE RNN Model

In [19]:
class NaiveRNN(nn.Module):
    """
    Minimal single-step recurrent cell built from two linear layers.

    At each step the current input and the previous hidden state are
    concatenated; one linear layer maps the result to the next hidden state
    and another maps it to the output. No activation or softmax is applied,
    so both returned tensors are raw linear outputs.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Number of features in the hidden state.
        output_size: Number of features in each output vector.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.input2hidden = nn.Linear(input_size + hidden_size, hidden_size)
        self.input2output = nn.Linear(input_size + hidden_size, output_size)
        # self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_data, hidden_layer):
        """
        Run one recurrent step.

        Args:
            input_data: Tensor of shape ``(batch, input_size)``.
            hidden_layer: Previous hidden state, shape ``(batch, hidden_size)``.

        Returns:
            ``(output, hidden)`` with shapes ``(batch, output_size)`` and
            ``(batch, hidden_size)``.
        """
        # Join along the feature axis so both layers see input and state.
        combined = torch.cat((input_data, hidden_layer), 1)
        hidden = self.input2hidden(combined)
        output = self.input2output(combined)
        return output, hidden

    def init_hidden(self, batch_size=1):
        """
        Return an all-zero initial hidden state.

        Args:
            batch_size: Number of rows in the returned state (default 1).

        Returns:
            Zero tensor of shape ``(batch_size, hidden_size)`` with the same
            dtype and device as the model's weights.
        """
        # Use hidden_size here: the original read self.hidden_layer, an
        # attribute that is never set, so the call raised AttributeError.
        return self.input2hidden.weight.new_zeros((batch_size, self.hidden_size))

## Dummy Data for Model Testing

In [None]:
# Smoke test: push one random batch through a single NaiveRNN step and
# print the resulting shapes and values.

# Hyperparameters
input_size = 10
hidden_size = 20
output_size = 5
sequence_length = 8  # defined here but not used anywhere in this cell
batch_size = 4

# Generate dummy data
# Input tensor of shape (batch_size, input_size), drawn from a standard normal
dummy_input = torch.randn(batch_size, input_size)
# Hidden layer tensor of shape (batch_size, hidden_size), all zeros
dummy_hidden = torch.zeros(batch_size, hidden_size)

# Instantiate the model
model = NaiveRNN(input_size, hidden_size, output_size)

# Forward pass with dummy data (one step only, no loop over a sequence)
output, hidden = model(dummy_input, dummy_hidden)

# Report the shapes going in and coming out, then the raw output values
print("Input shape:", dummy_input.shape)
print("Hidden shape:", dummy_hidden.shape)
print("Output shape:", output.shape)
print("Output:", output)


Input shape: torch.Size([4, 10])
Hidden shape: torch.Size([4, 20])
Output shape: torch.Size([4, 5])
Output: tensor([[-0.0545,  0.1814,  0.0032, -0.5043, -0.4034],
        [-0.0982,  0.3429, -0.1974,  0.0364, -0.1794],
        [ 0.4941, -0.6254, -0.0274,  0.2372, -0.0259],
        [ 0.4371, -0.2034,  0.0966, -0.1574,  0.2628]],
       grad_fn=<AddmmBackward0>)
