In [None]:
import torch
import torch.nn.functional as F

In [10]:
# Read the whole names file and split it into one entry per line.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open until garbage collection.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

## Data Preprocessing

In [72]:
def char_to_int(data):
    """Build a character -> integer id lookup from an iterable of strings.

    Every distinct character found in `data` is numbered from 1 upward in
    sorted order. The '.' marker is then assigned id 0 (replacing whatever
    id '.' received if it also occurs in the data).
    """
    alphabet = sorted(set(''.join(data)))
    lookup = {symbol: position for position, symbol in enumerate(alphabet, start=1)}
    lookup['.'] = 0
    return lookup


def int_to_char(data):
    """Return the inverse of the mapping `data`: each value becomes a key
    and each key becomes the corresponding value."""
    inverted = {}
    for char, idx in data.items():
        inverted[idx] = char
    return inverted


# Build the character -> id lookup from the full word list, then invert it
# to get the id -> character lookup.
char_ids = char_to_int(words)
id_char = int_to_char(char_ids)

In [46]:
def make_dataest(data, ids):
    """Turn words into bigram training pairs.

    Each word is wrapped with a leading and trailing '.' marker, and every
    adjacent pair of symbols (current, next) contributes one example.

    Args:
        data: iterable of words.
        ids: mapping from symbol to integer id (must contain '.').

    Returns:
        A pair of tensors (inputs, targets) holding the ids of the current
        and the next symbol of every bigram, in order.
    """
    inputs, targets = [], []

    for word in data:
        tokens = ['.', *word, '.']
        # Inputs are all symbols but the last; targets are shifted by one.
        inputs.extend(ids[tok] for tok in tokens[:-1])
        targets.extend(ids[tok] for tok in tokens[1:])

    return torch.tensor(inputs), torch.tensor(targets)

# Build the bigram (input id, target id) tensors from only the first six words.
data, label = make_dataest(words[:6], char_ids)

## Train

In [101]:
def train(X, y, epochs, num_classes=27, lr=10.0, reg=0.1):
    """Fit a single-layer bigram model with full-batch gradient descent.

    Args:
        X: 1-D integer tensor of input symbol ids.
        y: target symbol ids, one per entry of X.
        epochs: number of gradient-descent steps to run.
        num_classes: vocabulary size; the weight matrix is
            (num_classes, num_classes). Defaults to 27.
        lr: step size used for the weight update. Defaults to 10.0.
        reg: weight of the mean-squared-weight penalty. Defaults to 0.1.

    Returns:
        The trained weight tensor W (requires_grad=True). Row i holds the
        logits for the symbol that follows symbol i.
    """
    W = torch.randn((num_classes, num_classes), requires_grad=True)

    # The one-hot inputs never change between epochs, so encode them once.
    # one_hot only accepts integer tensors; cast to float for the matmul.
    xenc = F.one_hot(X, num_classes).float()
    rows = range(len(X))

    for epoch in range(epochs):
        # Multiplying a one-hot row by W selects the matching row of W, so W
        # plays the same role as the count matrix P in the counting method.
        logits = xenc @ W
        # log_softmax is the log of exp(logits) / sum(exp(logits)), computed
        # with log-sum-exp so large logits cannot overflow to inf/nan.
        log_probs = F.log_softmax(logits, dim=1)
        nll = -log_probs[rows, y].mean()
        # The penalty pulls W toward zero (more uniform predictions), which
        # acts like add-one (N + 1) smoothing of the counts.
        loss = nll + reg * (W ** 2).mean()
        print(f"Epoch: {epoch}, loss: {loss}")

        W.grad = None
        loss.backward()
        # Update the leaf tensor without recording the step in autograd.
        with torch.no_grad():
            W -= lr * W.grad

    return W

# Run 10 gradient-descent steps on the bigram tensors; `model` is the weight
# matrix returned by train().
model = train(data, label, 10)

Epoch: 0, loss: 3.8021974563598633
Epoch: 1, loss: 3.4321272373199463
Epoch: 2, loss: 3.111724615097046
Epoch: 3, loss: 2.8578903675079346
Epoch: 4, loss: 2.661515474319458
Epoch: 5, loss: 2.498030185699463
Epoch: 6, loss: 2.355353593826294
Epoch: 7, loss: 2.229495048522949
Epoch: 8, loss: 2.1183438301086426
Epoch: 9, loss: 2.020134210586548


## Inference

In [147]:
def inference(model, num_words, id_char, generator=None):
    """Sample strings from a trained bigram weight matrix.

    Generation of each string starts from id 0 (the '.' marker) and keeps
    sampling the next id from the softmax of the current id's logits until
    id 0 is drawn again.

    Args:
        model: square weight tensor; row i holds the logits for the symbol
            following symbol i.
        num_words: number of strings to generate.
        id_char: mapping from integer id to its symbol.
        generator: optional torch.Generator passed to torch.multinomial for
            reproducible sampling.

    Returns:
        A list of `num_words` generated strings.
    """
    vocab_size = model.shape[0]
    names = []

    # Sampling needs no gradients, even when `model` has requires_grad=True.
    with torch.no_grad():
        for _ in range(num_words):
            idx = 0  # every string starts from the '.' marker
            pieces = []
            while True:
                x_enc = F.one_hot(torch.tensor([idx]), num_classes=vocab_size).float()
                logits = x_enc @ model
                # multinomial expects probabilities, so normalise the logits.
                p = F.softmax(logits, dim=1)
                # Draw the next id at random in proportion to its probability
                # rather than always taking the argmax, so that repeated
                # runs produce varied output.
                idx = torch.multinomial(
                    p, num_samples=1, replacement=True, generator=generator
                ).item()
                if idx == 0:
                    break
                pieces.append(id_char[idx])

            names.append(''.join(pieces))

    return names


# Sample 10 names from the trained weights and print them.
names = inference(model, 10, id_char)
print(names)

['snxdeooqhnmophyvjcxjxttehisa', 'xeeeloxd', 'kyma', 'e', 'sambuta', 'a', 'mia', 'ymqvnc', 'e', 'egjmfoxbel']


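Smoothing in the neural net (here, the `W**2` regularization term) tries to pull the entries of W close to each other, which pushes the predicted probabilities toward uniform, i.e. it gives every possible next character a more equal weight.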



Diffusion models learn the probability distribution of each pixel based on its neighborhood of other pixels (a spatial context instead of a sequence).