# Character Level Language Modeling with MLP
architecture by Bengio et. al.



2 perspectives to look at Embedding layer: 
1) direct access through indexing
2) One-hot filtering
https://chatgpt.com/share/6775af97-1aac-800a-b76a-3a9c2a2043c0



This code does not have a concept of sequence that's why we concatenate the embeddings to be multiplied to W - It doesn't care if x comes before y



In [146]:
import torch
import torch.nn.functional as F

In [3]:
words = open('names.txt', 'r').read().splitlines()

In [4]:
def char_to_int(data):
    '''
    Given a dataset of words(names), char_to_int converts the unique characters to an integer and assigns an id to them.
    This is for train step.

    Args:
        data: a list of names

    Returns:
        char_ids: a dictionary of keys being characters and values the corrosponding integer id to each token
    '''

    char_ids = {}
    chars = sorted(set(''.join(data)))
    char_ids['.'] = 0
    for idx, c in enumerate(chars):
        char_ids[c] = idx + 1
    return char_ids


def int_to_char(data: dict):
    '''
    Given a dataset of ids, int_to_char converts the ids to their original character. This is for inference step.

    Args:
        data: a dictionary of (chars, ids)

    Returns:
        char_ids: a dictionary of (ids, chars)
    '''
    chars = {}
    for k, v in data.items():
        chars[v] = k

    return chars


char_ids = char_to_int(words)
id_chars = int_to_char(char_ids)

In [60]:
char_ids['a']

1

In [169]:
def make_dataset(data: list, sequence_length: int) -> torch.tensor:
    '''
    
    
    '''
    X = []
    y = []

    for word in data:
        context = [0] * sequence_length
        for ch in word + '.':
            X.append(context)
            y.append(char_ids[ch])
            c_id = char_ids[ch]
            
            context = context[1:] + [c_id]
            
    
    return torch.tensor(X), torch.tensor(y)


X, y = make_dataset(words, 3)

In [171]:
def data_split(X, y, trian_size):
    trian_size = round(X.size()[0]*trian_size)
    validation_size = round((X.size()[0] - trian_size)*0.5)

    X_train = X[:trian_size]
    X_validation = X[trian_size:trian_size+validation_size]
    X_test = X[trian_size+validation_size:]

    y_train = y[:trian_size]
    y_validation = y[trian_size:trian_size+validation_size]
    y_test = y[trian_size+validation_size:]

    return X_train, X_validation, X_test, y_train, y_validation, y_test


X_train, X_validation, X_test, y_train, y_validation, y_test = data_split(X, y, trian_size=0.8)
print(f"Numebr of train samples: {X_train.size()[0]}")
print(f"Number of validation samples: {X_validation.size()[0]}")
print(f"Number of test samples: {X_test.size()[0]}")

Numebr of train samples: 182517
Number of validation samples: 22814
Number of test samples: 22815


In [172]:
def create_model(input_shape, layer2_neurons, embedding_dim, n_classes):

    C = torch.rand((n_classes, embedding_dim))
    # embed = C[input_data] # 27*3*2
    # layer1 = embed.view((-1, input_data.size()[-1]*embedding_dim))
    W1 = torch.rand((input_shape*embedding_dim, layer2_neurons))
    b1  = torch.rand(layer2_neurons)
    W2 = torch.rand((layer2_neurons, n_classes))
    b2 = torch.rand(n_classes)

    return {'C': C, 'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

parameters = create_model(X.size()[-1], 100, 2, 27)
print(f"Total number of parameters: {sum(v.nelement() for k, v in parameters.items())}")

Total number of parameters: 3481


In [173]:
def train(parameters, X_train, y_train, embedding_dim, batch_size=0, epochs=10, lr=0.1):
    
    for epoch in range(epochs):
        for batch in range(batch_size):
            data = X_train[batch*(X_train.size()[0] // batch_size): (X_train.size()[0] // batch_size)*(batch+1)]
            label = y_train[batch*(y_train.size()[0] // batch_size): (y_train.size()[0] // batch_size)*(batch+1)]
            embeds = parameters['C'][data]
            layer1 = embeds.view((-1, data.size()[-1]*embedding_dim))
            h = torch.tanh(layer1 @ parameters['W1'])*parameters['b1']
            logits = (h @ parameters['W2'])*parameters['b2']
            loss = F.cross_entropy(logits, label)
            print(f"Epoch= {epoch+1}, iteration={batch}, loss= {loss}")
            
            for _, v in parameters.items():
                v.grad = None

            loss.backward()

            for _, v in parameters.items():
                v.data += -lr*v.grad
        
for k,v in parameters.items():
    v.requires_grad = True

train(parameters, X_train, y_train, 2, batch_size=8, epochs=10)

Epoch= 1, iteration=0, loss= 14.770447731018066
Epoch= 1, iteration=1, loss= 7.94075870513916
Epoch= 1, iteration=2, loss= 8.011354446411133
Epoch= 1, iteration=3, loss= 6.2299580574035645
Epoch= 1, iteration=4, loss= 5.167121887207031
Epoch= 1, iteration=5, loss= 3.960541009902954
Epoch= 1, iteration=6, loss= 3.697648048400879
Epoch= 1, iteration=7, loss= 3.8183038234710693
Epoch= 2, iteration=0, loss= 3.2145400047302246
Epoch= 2, iteration=1, loss= 3.07409405708313
Epoch= 2, iteration=2, loss= 3.429368019104004
Epoch= 2, iteration=3, loss= 2.9411416053771973
Epoch= 2, iteration=4, loss= 3.01171612739563
Epoch= 2, iteration=5, loss= 2.9506006240844727
Epoch= 2, iteration=6, loss= 3.0948920249938965
Epoch= 2, iteration=7, loss= 3.135560989379883
Epoch= 3, iteration=0, loss= 3.201793670654297
Epoch= 3, iteration=1, loss= 2.8715460300445557
Epoch= 3, iteration=2, loss= 2.907761812210083
Epoch= 3, iteration=3, loss= 3.1215295791625977
Epoch= 3, iteration=4, loss= 3.0855050086975098
Epoch=

Softmax + log likelihood = CrossEntropy

merging dimensions is always from the most inner dimension

cross entropy comapres only with relative to the correct label

crossentropy is preferred