# Character-Level Language Modeling with an MLP
Architecture by Bengio et al.



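Two ways to look at the embedding layer:
1) Direct access through indexing: `C[ids]` picks the embedding rows for the given ids.
2) One-hot filtering: multiplying a one-hot vector by `C` selects the same row.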
https://chatgpt.com/share/6775af97-1aac-800a-b76a-3a9c2a2043c0



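This model has no explicit notion of sequence, which is why we concatenate the context embeddings into one vector before multiplying by W1. The network itself does not model that x comes before y; position is only encoded by where each embedding lands in the concatenated vector.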



In [2]:
import torch
import torch.nn.functional as F

In [None]:
# Load the corpus; each line of names.txt becomes one entry of `words`.
# A context manager closes the file handle deterministically, and the
# encoding is pinned so the result does not depend on the platform locale.
with open('names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()

In [None]:
def char_to_int(data):
    '''
    Build the character -> integer id vocabulary used for training.

    The id 0 is assigned to the '.' marker first; every distinct character
    found in `data` is then numbered 1, 2, 3, ... in sorted order.

    Args:
        data: a list of names (strings)

    Returns:
        a dictionary whose keys are characters and whose values are the
        corresponding integer ids
    '''
    alphabet = sorted(set(''.join(data)))

    # Seed the mapping with the '.' marker, then number the alphabet from 1.
    mapping = {'.': 0}
    mapping.update(zip(alphabet, range(1, len(alphabet) + 1)))
    return mapping


def int_to_char(data: dict):
    '''
    Invert a character -> id vocabulary so ids can be decoded back into
    characters at inference time.

    Args:
        data: a dictionary of (char, id) pairs

    Returns:
        a dictionary of (id, char) pairs; if two characters share an id,
        the one that appears later in `data` wins
    '''
    return {token_id: char for char, token_id in data.items()}


# Build both lookup tables once from the full corpus:
# char_ids maps character -> id, id_chars maps id -> character.
char_ids = char_to_int(words)
id_chars = int_to_char(char_ids)

In [5]:
def make_dataset(data: list, sequence_length: int, char_to_id: dict = None) -> tuple[torch.Tensor, torch.Tensor]:
    '''
    Turn a list of words into (context, next-character) training pairs.

    Every word is terminated with '.', and a sliding window of the previous
    `sequence_length` character ids is emitted as the context for each
    character. The window starts as all zeros (id 0 is the '.' marker in the
    vocabulary built by char_to_int).

    Args:
        data: a list of words (strings)
        sequence_length: number of previous character ids in each context
        char_to_id: optional character -> id dictionary; when omitted the
            module-level `char_ids` vocabulary is used

    Returns:
        X: int64 tensor of shape (n_examples, sequence_length) with contexts
        y: int64 tensor of shape (n_examples,) with the id that follows each
           context

    Raises:
        KeyError: if a character is missing from the vocabulary
    '''
    if char_to_id is None:
        char_to_id = char_ids

    contexts = []
    targets = []

    for word in data:
        context = [0] * sequence_length
        for ch in word + '.':
            target = char_to_id[ch]
            contexts.append(context)
            targets.append(target)
            # Slide the window: drop the oldest id, append the newest.
            context = context[1:] + [target]

    # Explicit dtype and shape keep empty input well-formed: it yields a
    # (0, sequence_length) int64 tensor instead of a float tensor of shape (0,).
    X = torch.tensor(contexts, dtype=torch.long).reshape(len(contexts), sequence_length)
    y = torch.tensor(targets, dtype=torch.long)
    return X, y


# Build the full dataset with a context window of 3 character ids.
X, y = make_dataset(words, 3)

In [6]:
def data_split(X, y, trian_size):
    '''
    Split X and y into contiguous train / validation / test partitions.

    The first `trian_size` fraction of the rows (rounded) is the training
    set; the remaining rows are halved (rounded) into validation, and
    whatever is left over is the test set. Rows are not shuffled.

    Args:
        X: tensor of inputs, first dimension indexes the samples
        y: tensor of labels aligned with X
        trian_size: fraction of the samples used for training

    Returns:
        X_train, X_validation, X_test, y_train, y_validation, y_test
    '''
    n_samples = X.size()[0]
    n_train = round(n_samples * trian_size)
    validation_end = n_train + round((n_samples - n_train) * 0.5)

    # The same three row ranges are applied to inputs and labels.
    partitions = (
        slice(None, n_train),
        slice(n_train, validation_end),
        slice(validation_end, None),
    )
    X_parts = tuple(X[rows] for rows in partitions)
    y_parts = tuple(y[rows] for rows in partitions)

    return (*X_parts, *y_parts)


# 80% of the examples go to training; data_split halves the remaining 20%
# into validation and test.
X_train, X_validation, X_test, y_train, y_validation, y_test = data_split(X, y, trian_size=0.8)
print(f"Numebr of train samples: {X_train.size()[0]}")
print(f"Number of validation samples: {X_validation.size()[0]}")
print(f"Number of test samples: {X_test.size()[0]}")

Numebr of train samples: 182517
Number of validation samples: 22814
Number of test samples: 22815


In [7]:
def create_model(input_shape, layer2_neurons, embedding_dim, n_classes):
    '''
    Create the parameter tensors of the MLP, each filled by torch.rand
    (uniform values in [0, 1)).

    Args:
        input_shape: number of context positions fed to the network
        layer2_neurons: width of the hidden layer
        embedding_dim: size of each embedding vector
        n_classes: number of rows of the embedding table and of output logits

    Returns:
        a dictionary with the tensors
            'C'  (n_classes, embedding_dim)               embedding table
            'W1' (input_shape*embedding_dim, layer2_neurons)
            'b1' (layer2_neurons,)
            'W2' (layer2_neurons, n_classes)
            'b2' (n_classes,)
    '''
    # Insertion order matters: tensors are drawn from the RNG in this order.
    shapes = {
        'C': (n_classes, embedding_dim),
        'W1': (input_shape * embedding_dim, layer2_neurons),
        'b1': (layer2_neurons,),
        'W2': (layer2_neurons, n_classes),
        'b2': (n_classes,),
    }
    return {name: torch.rand(shape) for name, shape in shapes.items()}

# Arguments, in order: context length (last dimension of X), hidden width 100,
# embedding size 2, and 27 classes.
parameters = create_model(X.size()[-1], 100, 2, 27)
print(f"Total number of parameters: {sum(v.nelement() for k, v in parameters.items())}")

Total number of parameters: 3481


In [8]:
def train(parameters, X_train, y_train, embedding_dim, batch_size=0, epochs=10, lr=0.1):
    '''
    Train the MLP in place with plain gradient descent and print the loss of
    every batch.

    Args:
        parameters: dictionary with the tensors 'C', 'W1', 'b1', 'W2', 'b2';
            they must already have requires_grad enabled
        X_train: integer tensor of contexts, first dimension indexes samples
        y_train: integer tensor of target ids aligned with X_train
        embedding_dim: size of each embedding vector in parameters['C']
        batch_size: NUMBER OF BATCHES each epoch is cut into (not the number
            of samples per batch). Batches are contiguous slices; the last
            one also takes the rows left over by the integer division.
            A value <= 0 means one full batch, and the count is capped at the
            number of samples so no batch is empty.
        epochs: number of passes over the training data
        lr: learning rate of the gradient-descent step

    Returns:
        the same `parameters` dictionary, updated in place
    '''
    n_samples = X_train.size()[0]
    if n_samples == 0:
        # Nothing to learn from; leave the parameters untouched.
        return parameters

    n_batches = min(batch_size, n_samples) if batch_size > 0 else 1
    rows_per_batch = n_samples // n_batches
    flat_width = X_train.size()[-1] * embedding_dim

    for epoch in range(epochs):
        for batch in range(n_batches):
            start = batch * rows_per_batch
            # The final batch runs to the end so no trailing rows are dropped.
            stop = n_samples if batch == n_batches - 1 else start + rows_per_batch
            data = X_train[start:stop]
            label = y_train[start:stop]

            embeds = parameters['C'][data]
            # Concatenate the embeddings of all context positions per sample.
            layer1 = embeds.view((-1, flat_width))
            # NOTE(review): b1 and b2 are applied as element-wise multipliers,
            # whereas the usual MLP formulation adds the bias. inference()
            # uses the same form, so the two must be changed together.
            h = torch.tanh(layer1 @ parameters['W1'])*parameters['b1']
            logits = (h @ parameters['W2'])*parameters['b2']
            loss = F.cross_entropy(logits, label)
            print(f"Epoch= {epoch+1}, iteration={batch}, loss= {loss.item()}")

            # Reset gradients so they do not accumulate across batches.
            for v in parameters.values():
                v.grad = None

            loss.backward()

            # Update outside of autograd instead of going through `.data`.
            with torch.no_grad():
                for v in parameters.values():
                    if v.grad is not None:
                        v -= lr * v.grad

    return parameters


        
# create_model returns plain tensors, so gradient tracking is switched on here
# before training.
for k,v in parameters.items():
    v.requires_grad = True

# train() uses `batch_size` as the number of batches per epoch and returns the
# same dictionary it was given, so `model` and `parameters` are one object.
model = train(parameters, X_train, y_train, 2, batch_size=8, epochs=10)

Epoch= 1, iteration=0, loss= 10.953051567077637
Epoch= 1, iteration=1, loss= 7.868553638458252
Epoch= 1, iteration=2, loss= 6.843010425567627
Epoch= 1, iteration=3, loss= 5.887442111968994
Epoch= 1, iteration=4, loss= 7.735470771789551
Epoch= 1, iteration=5, loss= 8.262130737304688
Epoch= 1, iteration=6, loss= 5.98015832901001
Epoch= 1, iteration=7, loss= 4.598536491394043
Epoch= 2, iteration=0, loss= 3.715553045272827
Epoch= 2, iteration=1, loss= 3.0508549213409424
Epoch= 2, iteration=2, loss= 3.076706647872925
Epoch= 2, iteration=3, loss= 3.1377391815185547
Epoch= 2, iteration=4, loss= 2.9129652976989746
Epoch= 2, iteration=5, loss= 2.9304914474487305
Epoch= 2, iteration=6, loss= 3.075181007385254
Epoch= 2, iteration=7, loss= 3.0618844032287598
Epoch= 3, iteration=0, loss= 2.968907117843628
Epoch= 3, iteration=1, loss= 2.7882468700408936
Epoch= 3, iteration=2, loss= 2.7739949226379395
Epoch= 3, iteration=3, loss= 2.778329849243164
Epoch= 3, iteration=4, loss= 2.809187412261963
Epoch=

Softmax + negative log-likelihood = cross-entropy

In [13]:
# Peek at the first context (a row of character ids) in the test split.
X_test[0]

tensor([ 1, 14, 14])

In [84]:
def inference(parameters, n_names, id_to_char=None):
    '''
    Sample names from the model one character at a time.

    Each name starts from a context window of zeros (id 0, the same padding
    make_dataset uses). At every step the next id is sampled from the softmax
    of the logits; sampling id 0 ends the name, any other id is decoded,
    appended, and shifted into the window.

    Args:
        parameters: dictionary with the tensors 'C', 'W1', 'b1', 'W2', 'b2'
        n_names: number of names to generate
        id_to_char: optional id -> character dictionary; when omitted the
            module-level `id_chars` mapping is used

    Returns:
        a list with `n_names` entries, each a list of characters (possibly
        empty when the end marker is sampled first)
    '''
    if id_to_char is None:
        id_to_char = id_chars

    embedding_dim = parameters['C'].size()[1]
    # The context length is recovered from W1, whose row count is
    # sequence_length * embedding_dim.
    sequence_length = parameters['W1'].size()[0] // embedding_dim

    names = []

    # Sampling needs no gradients.
    with torch.no_grad():
        for _ in range(n_names):
            # Reset the window for every name; otherwise each name after the
            # first would be conditioned on the tail of the previous one.
            tokens = [0] * sequence_length
            chars = []
            while True:
                embed = parameters['C'][tokens]
                layer1 = embed.view((-1, sequence_length * embedding_dim))
                # Same forward pass as train(): b1 and b2 act as element-wise
                # multipliers there, so they must do so here as well.
                h = torch.tanh(layer1 @ parameters['W1'])*parameters['b1']
                logits = (h @ parameters['W2'])*parameters['b2']
                probs = torch.softmax(logits[0], dim=0)
                next_id = torch.multinomial(probs, 1, replacement=True).item()
                if next_id == 0:
                    break
                chars.append(id_to_char[next_id])

                tokens = tokens[1:] + [next_id]

            names.append(chars)

    return names
    
# Sample 10 names; inference returns lists of characters, joined here into strings.
print([''.join(i) for i in inference(model, 10)])

['iijrashermaaatau', 'hzanelri', 'srnreydl', 'sarcnaie', '', 'noayilaodarnv', 'damnyaiairdmnidondhyshrra', 'erlnoayyinnydoleldiiqioahonsvraimyyalagery', 'rdaiiz', 'cesnateiueiseebaeysnen']


When `view` merges dimensions, it always starts from the innermost (last) dimension.

Cross-entropy only looks at the probability the model assigns to the correct label.

Cross-entropy (`F.cross_entropy`) is preferred.