In [None]:
import random
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt


In [None]:
# Load the corpus, one word per line. The context manager closes the file
# handle as soon as the contents have been read instead of leaving it open.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [None]:
# Hyperparameters

# Presumably the number of training iterations; the training loop is not visible in this section - TODO confirm
EPOCHS = 1000
# Presumably the number of samples per mini-batch; its use is not visible in this section - TODO confirm
BATCH_SIZE = 4

In [None]:
def char_to_int(data):
    '''
    Build the character -> integer id vocabulary used for training.

    The special token '.' is registered first with id 0. Every distinct character found
    in the words is then numbered from 1 upwards in sorted order. If '.' itself occurs
    in the words, it is renumbered like any other character.

    Args:
        data: a list of names

    Returns:
        a dictionary whose keys are characters and whose values are their integer ids
    '''
    alphabet = sorted(set(''.join(data)))

    mapping = {'.': 0}
    # Ids for real characters start at 1 because 0 is taken by '.'
    mapping.update((symbol, position) for position, symbol in enumerate(alphabet, start=1))
    return mapping


def int_to_char(data: dict):
    '''
    Invert a character -> id mapping so that ids can be decoded back into characters.
    This is for inference step.

    Args:
        data: a dictionary of (chars, ids)

    Returns:
        a dictionary of (ids, chars). If several characters share one id, the one that
        appears last in `data` wins.
    '''
    return {identifier: symbol for symbol, identifier in data.items()}


# Build both lookup tables once from the full word list:
# char_ids maps character -> id, id_chars is its inverse (id -> character)
char_ids = char_to_int(words)
id_chars = int_to_char(char_ids)

In [None]:
def make_dataset(data: list, sequence_length: int, *, shuffle: bool = True) -> tuple:
    '''
    Build (context, next character) samples from a list of words.

    Every word is terminated with '.' and scanned left to right. For each character, the
    ids of the `sequence_length` positions that precede it form one sample and the id of
    the character itself is the label. Positions before the start of a word are filled
    with id 0.

    Character ids are read from the module-level `char_ids` mapping.

    Args:
        data: words to convert into samples. The list itself is never modified.
        sequence_length: length of sequential characters to be a sample in the dataset
        shuffle: when True (the default) the words are visited in a random order drawn
            from the `random` module, so that a later positional train/validation/test
            split does not simply follow the order of the input. Pass False to keep the
            input order and get a deterministic dataset.

    Returns:
        X, y: data and labels of the dataset, as a tuple of two tensors
    '''
    # Shuffle the words themselves (on a copy, so the caller's list is untouched).
    # Shuffling the sample list before any sample has been added would be a no-op.
    ordered_words = list(data)
    if shuffle:
        random.shuffle(ordered_words)

    X = []
    y = []

    for word in ordered_words:
        context = [0] * sequence_length
        for ch in word + '.':
            c_id = char_ids[ch]
            X.append(context)
            y.append(c_id)

            # Slide the window: drop the oldest id and append the current one.
            # This creates a new list, so the context stored in X is not altered.
            context = context[1:] + [c_id]

    return torch.tensor(X), torch.tensor(y)


# Build the dataset with a sequence length of 3 characters per sample
X, y = make_dataset(words, 3)

In [None]:
def data_split(X: torch.Tensor, y: torch.Tensor, trian_size: float) -> tuple:
    '''
    Splitting dataset in 3 sets of train, validation, and test

    The first `trian_size` fraction of the samples becomes the training set. The remaining
    samples are divided in half: the first half is the validation set and the rest is the
    test set. Samples are taken in their existing order; nothing is shuffled here.

    Args:
        X: sequential characters dataset
        y: next tokens or labels, one per row of X
        trian_size: fraction of the samples (between 0 and 1) used for training.
            The misspelled name is kept because callers pass it by keyword.

    Returns:
        X_train, X_validation, X_test, y_train, y_validation, y_test

    Raises:
        ValueError: if `trian_size` is not within [0, 1], or if X and y do not contain
            the same number of samples.
    '''
    if not 0 <= trian_size <= 1:
        raise ValueError(f"trian_size must be a fraction between 0 and 1, got {trian_size}")

    n_samples = X.size()[0]
    if y.size()[0] != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples, got {n_samples} and {y.size()[0]}"
        )

    # Slice boundaries: [0, train_end) train, [train_end, validation_end) validation, rest test
    train_end = round(n_samples * trian_size)
    validation_end = train_end + round((n_samples - train_end) * 0.5)

    X_train, y_train = X[:train_end], y[:train_end]
    X_validation, y_validation = X[train_end:validation_end], y[train_end:validation_end]
    X_test, y_test = X[validation_end:], y[validation_end:]

    return X_train, X_validation, X_test, y_train, y_validation, y_test


# Use 80% of the samples for training; data_split divides the rest between validation and test
X_train, X_validation, X_test, y_train, y_validation, y_test = data_split(X, y, trian_size=0.8)
print(f"Number of train samples: {X_train.size()[0]}")
print(f"Number of validation samples: {X_validation.size()[0]}")
print(f"Number of test samples: {X_test.size()[0]}")

In [None]:
def create_model(input_shape: tuple, layer2_neurons: int, embedding_dim: int, n_classes: int) -> dict:
    '''
    Allocate the randomly initialised tensors of the 3-layer model from the paper:
    an embedding table, a hidden layer and an output layer.

    All tensors are filled by torch.rand and are created in the order C, W1, b1, W2, b2.

    Args:
        input_shape: sequence length of the input data. Despite the annotation it is used
            as a single integer (it is multiplied by embedding_dim to size W1).
        layer2_neurons: number of hidden layer neurons
        embedding_dim: embedding layer dimensions. The number of dimensions we want our data to be presented with
        n_classes: number of classes to be predicted. Last layer neurons

    Returns:
        A dictionary of all layers that we call parameters of the network, with keys
        'C', 'W1', 'b1', 'W2' and 'b2'.
    '''
    # Presumably W1 consumes the embeddings of one sample flattened into a single row
    # (input_shape tokens, embedding_dim values each); the forward pass is not shown here.
    flattened_width = input_shape * embedding_dim

    # Name and shape of every tensor, listed in creation order
    layout = (
        ('C', (n_classes, embedding_dim)),
        ('W1', (flattened_width, layer2_neurons)),
        ('b1', (layer2_neurons,)),
        ('W2', (layer2_neurons, n_classes)),
        ('b2', (n_classes,)),
    )

    return {name: torch.rand(shape) for name, shape in layout}

# Sequence length comes from the last dimension of X; 100 hidden neurons,
# 2-dimensional embeddings and 27 output classes
parameters = create_model(X.size()[-1], 100, 2, 27)
print(f"Total number of parameters: {sum(tensor.nelement() for tensor in parameters.values())}")

What happens if w = 0?<br>
Why does batch norm jitter, and why was that bad? When we have only one sample, what are its std and norm at all? Solved (use the whole data)



add batch norm<br>
read andrew's blog post

https://chatgpt.com/share/6778454c-3130-800a-b1b4-a3ba9cb750f1
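LayerNorm does not compute its statistics across the data (batch) dimension; it normalizes over the features of each sample individually.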



We cannot normalize based on all the data in the dataset during training because we need to perform backpropagation in real-time. Instead, the largest normalization we use during training is BatchNorm, which computes mean and variance over mini-batches. (In a sense, using all data in Gradient Descent would resemble BatchNorm over the entire dataset.)

The formula involving 0.999 and 0.001 (seen in videos) is used to ensure that the mean and variance across all training data are tracked and used consistently after training, e.g. `running_mean = 0.999 * running_mean + 0.001 * batch_mean` (and likewise for the variance). This is essential for inference, where we rely on the tracked statistics from the training phase rather than recomputing them on test data. Using test data for normalization would invalidate the model, as it would essentially "peek" into test data, which is against the principles of proper model evaluation. Alternatively, we could pass the training data through the network once more after training is done, compute the mean and variance then, and use them on the test data. However, this becomes impractical as the number of layers grows, and it is neither efficient nor scalable. So the equivalent, cheaper approach is to keep a running estimate during training with the 0.999 and 0.001 weights.


Tracking the mean and variance in BatchNorm layers during training ensures that we can use them during inference. This approach also enables single-data-point training (SGD) since the normalization is no longer based on just one data point but on the accumulated statistics from the entire training process.



BatchNorm Statistics: Batch normalization tracks running mean and variance using an exponential moving average (e.g., weights like 0.999 and 0.001). This ensures that during inference, the model uses statistics computed during training rather than recalculating them on test data, which would break the principle of not relying on unseen data.

SGD Compatibility: BatchNorm enables single-sample SGD to work effectively because it doesn't rely on the statistics of just one sample during inference but uses the accumulated mean and variance from training. This mitigates the issue of unreliable normalization when the batch size is small or consists of just one sample.

Why Statistics Are Tracked: This approach ensures the model generalizes based on training data. Using test data to compute mean and variance would introduce data leakage, compromising model evaluation and fairness.
