In [58]:
import copy
import random

In [97]:
import numpy as np

In [None]:
words = open('names.txt', 'r').read().splitlines()

In [54]:
type(words)

list

### Train, Validation, Test Split

In [65]:
_NAME_COUNT_ = len(words)

_TRAIN_FRAC_ = 0.80
_HOLD_FRAC_ = 0.2
words_trn = words[:int(_NAME_COUNT_ * _TRAIN_FRAC_)]
words_hold = words[:int(_NAME_COUNT_ * _HOLD_FRAC_)]

In [66]:
random.seed(1337)

words_val = random.sample(words_hold, len(words_hold) // 2)

words_test = list(set(words_hold).difference(set(words_val)))


In [67]:
len(words_hold) // 2

3203

In [68]:
int(_NAME_COUNT_ * _TRAIN_FRAC_)

25626

In [69]:
def split_name_into_trigrams(name: str) -> list[tuple]:

    _START_CHAR_ = '.'
    _END_CHAR_ = _START_CHAR_

    chars = [_START_CHAR_] * 2 + list(name) + [_END_CHAR_]
    trigram_list = []
    
    for ch1, ch2, ch3 in zip(chars, chars[1:], chars[2:]):
        
        trigram = (ch1, ch2, ch3)
        trigram_list.append(trigram)

    return trigram_list

In [70]:
split_name_into_trigrams("nitesh")

[('.', '.', 'n'),
 ('.', 'n', 'i'),
 ('n', 'i', 't'),
 ('i', 't', 'e'),
 ('t', 'e', 's'),
 ('e', 's', 'h'),
 ('s', 'h', '.')]

In [71]:
_START_CHAR_ = "."
_END_CHAR_ = _START_CHAR_

count_dict = {}

for w in words_trn:
    chs = [_START_CHAR_] * 2 + list(w) + [_END_CHAR_]

    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        trigram = (ch1, ch2, ch3)
        count_dict[trigram] = count_dict.get(trigram, 0) + 1


In [72]:
sorted(count_dict.items(), key= lambda kv: -kv[1])

[(('.', '.', 'a'), 3700),
 (('.', '.', 'k'), 2334),
 (('.', '.', 'm'), 2115),
 (('.', '.', 'j'), 1847),
 (('.', '.', 's'), 1668),
 (('n', 'a', '.'), 1654),
 (('a', 'h', '.'), 1595),
 (('.', '.', 'l'), 1330),
 (('.', '.', 'r'), 1319),
 (('.', '.', 'e'), 1278),
 (('.', '.', 'd'), 1260),
 (('.', '.', 'c'), 1248),
 (('.', 'm', 'a'), 1229),
 (('.', 'k', 'a'), 1018),
 (('.', '.', 'b'), 1008),
 (('a', 'n', '.'), 963),
 (('.', '.', 't'), 963),
 (('.', 'j', 'a'), 962),
 (('.', '.', 'n'), 945),
 (('l', 'y', 'n'), 920),
 (('o', 'n', '.'), 910),
 (('i', 'a', '.'), 875),
 (('e', 'n', '.'), 844),
 (('y', 'n', '.'), 811),
 (('i', 'e', '.'), 804),
 (('a', 'r', 'i'), 782),
 (('a', 'n', 'n'), 777),
 (('a', 'n', 'a'), 760),
 (('.', '.', 'z'), 725),
 (('e', 'l', 'l'), 723),
 (('y', 'a', '.'), 696),
 (('.', '.', 'h'), 688),
 (('i', 'a', 'n'), 681),
 (('l', 'a', '.'), 663),
 (('m', 'a', 'r'), 637),
 (('i', 'y', 'a'), 619),
 (('a', 'n', 'i'), 619),
 (('n', 'n', 'a'), 615),
 (('r', 'a', '.'), 614),
 (('n', 'i

### Creating Matrix Representation of 3-Chars Counts

It would be 3 dimension (x1, x2) -> x3

In [73]:
import torch

#### How do encode Character strings into an array?

In [74]:
chars = sorted(list(set(''.join(words))))

## String to Index Mapping ##
### Encoding ###
stoi = {s:i + 1 for i,s in enumerate(chars)}

### adding encoding for special start and end chars. "." is kept for both
stoi[_START_CHAR_] = 0

#### Decoding array into Character Strings

In [75]:
itos = {i:s for s, i in stoi.items()}

### How would matrix represenation of trigram would look like?

In [122]:
N = torch.zeros((len(stoi), len(stoi), len(stoi)), dtype=torch.int32) ## Word Count Matrix using training Set

for w in words_trn:
    chs = [_START_CHAR_] * 2 + list(w) + [_END_CHAR_]

    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        trigram_set = (ch1, ch2, ch3) ## x1,x2 -> x3

        ix1 = stoi.get(ch1) # Encodings (x1)
        ix2 = stoi.get(ch2) # Encodings (x2)
        ix3 = stoi.get(ch3) # Encodings (x3)

        N[ix1, ix2, ix3] += 1

### Visualization Later on

### Explict Approach

In [123]:
N[0, 0, 1]

tensor(3700, dtype=torch.int32)

#### Smoothing, Where Count is Zero

In [145]:
_SMOOTHING_VAL_ = 50

def smooth_count_matrix(N: torch.tensor, smoothing_val) -> torch.tensor:
    N_ = copy.deepcopy(N)

    N_[N_ == 0] = smoothing_val

    return N_

In [146]:
Ns = smooth_count_matrix(N, _SMOOTHING_VAL_)

In [147]:
Ns[stoi.get('e'), stoi.get('n'), stoi.get('.')]

tensor(844, dtype=torch.int32)

In [148]:
P = Ns / Ns.sum(dim=-1, keepdim=True)

In [149]:
ix1 = 0
ix2 = 0
g = torch.Generator().manual_seed(2147483647)

for k in range(15):
    ix1 = 0
    ix2 = 0
    
    out = []
    while True:
        p = P[ix1, ix2]

        ix3 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix3])

        ix1 = ix2
        ix2 = ix3

        if ix3 == 0 :
            break
    print("".join(out))


cexbm.
moglkuikicqzktyhwmvmzlmjttain.
lkfukzkktda.
samiypubjtbhrmgofzx.
moqixqctvujkwptmdtgkkjhmkmmscdgu.
nkbvgyrywftbspmhwcidendtallasw.
jadxxblnwglhpyiw.
isana.
rpfdwnpkwzkm.
der.
jfixmt.
gbikaylquabjvotf.
khyqxqevecmrbxmcwyhrrjnnaxmvpfkmwmghfvjzxobnmymor.
lbptjypxwqegpfwhccfyzfvkszmqmvwbhmiwqmdgzqskmjhgaelxwmmk.
ashcxfmbtlcslhycfpycvazvz.


#### Performance in Validation & Test Split

In [150]:
log_prob_matrix = P.log()

In [151]:
def get_neg_log_likelyhood_score(name: str, log_prob_matrix: torch.tensor) -> float:

    chs = ["."] * 2 + list(name) + ["."]

    iteration_Count = 0
    neg_log_prob_val = 0
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):        
        ix1 = stoi.get(ch1)
        ix2 = stoi.get(ch2)
        ix3 = stoi.get(ch3)

        neg_log_prob = -log_prob_matrix[ix1, ix2, ix3].item()
        iteration_Count += 1
        neg_log_prob_val += neg_log_prob
        
    
    return neg_log_prob_val / iteration_Count


In [152]:
## Train Split
iteration_Count = 0
neg_log_prob_score = 0

for w in words_trn:
     chs = ["."] * 2 + list(w) + ["."]
     
     neg_log_prob_val = 0
     for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        
        trigram_set = (ch1, ch2, ch3) ## x1,x2 -> x3
        
        ix1 = stoi.get(ch1)
        ix2 = stoi.get(ch2)
        ix3 = stoi.get(ch3)

        joint_prob = P[ix1, ix2, ix3].item()
        neg_log_prob = -log_prob_matrix[ix1, ix2, ix3].item()
        iteration_Count += 1
        neg_log_prob_val += neg_log_prob
     neg_log_prob_score += neg_log_prob_val
     
     

print(f"Mean Likelihood Score: {neg_log_prob_score / iteration_Count}")

Mean Likelihood Score: 2.6453503193378483


In [153]:
train_nll_list = [get_neg_log_likelyhood_score(name_trn, log_prob_matrix) for name_trn in words_trn]

print(f"Mean Likelihood Score Train: {np.mean(train_nll_list)} | Smoothing Val : {_SMOOTHING_VAL_}")


Mean Likelihood Score Train: 2.66499390229177 | Smoothing Val : 50


In [154]:
[get_neg_log_likelyhood_score(nm, log_prob_matrix) for nm in ["emily", "emma"]]

[2.0834452708562217, 2.5640096187591555]

In [155]:
## Validation Split
iteration_Count = 0
neg_log_prob_score = 0

for w in words_val:
     chs = ["."] * 2 + list(w) + ["."]
     
     neg_log_prob_val = 0
     for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        
        trigram_set = (ch1, ch2, ch3) ## x1,x2 -> x3
        
        ix1 = stoi.get(ch1)
        ix2 = stoi.get(ch2)
        ix3 = stoi.get(ch3)

        joint_prob = P[ix1, ix2, ix3].item()
        neg_log_prob = -log_prob_matrix[ix1, ix2, ix3].item()
        iteration_Count += 1
        neg_log_prob_val += neg_log_prob
     neg_log_prob_score += neg_log_prob_val
     


print(f"Mean Likelihood Score: {neg_log_prob_score / iteration_Count}")
   

Mean Likelihood Score: 2.469656677465579


'emma'

In [156]:
## Test Split
iteration_Count = 0
neg_log_prob_score = 0

for w in words_test:
     chs = ["."] * 2 + list(w) + ["."]
     
     neg_log_prob_val = 0
     for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        
        trigram_set = (ch1, ch2, ch3) ## x1,x2 -> x3
        
        ix1 = stoi.get(ch1)
        ix2 = stoi.get(ch2)
        ix3 = stoi.get(ch3)

        joint_prob = P[ix1, ix2, ix3].item()
        neg_log_prob = -log_prob_matrix[ix1, ix2, ix3].item()
        iteration_Count += 1
        neg_log_prob_val += neg_log_prob
     neg_log_prob_score += neg_log_prob_val
     


print(f"Mean Likelihood Score: {neg_log_prob_score / iteration_Count}")

Mean Likelihood Score: 2.4882345505656467


### Tuning Smooting Parameter

In [161]:
_SMOOTHING_VAL_LIST_ = [0.1, 0.5, 0.9 ,1, 5, 10]

for _SMOOTHING_VAL_ in _SMOOTHING_VAL_LIST_:

    Ns = smooth_count_matrix(N, _SMOOTHING_VAL_)

    P = Ns / Ns.sum(dim=-1, keepdim=True)

    log_prob_matrix = P.log()

    ## Training ##

    train_nll_list = [get_neg_log_likelyhood_score(name_trn, log_prob_matrix) for name_trn in words_trn]

    print(f"Mean Likelihood Score Train: {np.mean(train_nll_list)} | Smoothing Val : {_SMOOTHING_VAL_}")

    validation_nll_list = [get_neg_log_likelyhood_score(name_valid, log_prob_matrix) for name_valid in words_val]

    print(f"Mean Likelihood Score Validation: {np.mean(validation_nll_list)} | Smoothing Val : {_SMOOTHING_VAL_}")
   



Mean Likelihood Score Train: 2.1635934422815866 | Smoothing Val : 0.1
Mean Likelihood Score Validation: 2.0454016414709444 | Smoothing Val : 0.1
Mean Likelihood Score Train: 2.1635934422815866 | Smoothing Val : 0.5
Mean Likelihood Score Validation: 2.0454016414709444 | Smoothing Val : 0.5
Mean Likelihood Score Train: 2.1635934422815866 | Smoothing Val : 0.9
Mean Likelihood Score Validation: 2.0454016414709444 | Smoothing Val : 0.9
Mean Likelihood Score Train: 2.198252430266894 | Smoothing Val : 1
Mean Likelihood Score Validation: 2.071388509914831 | Smoothing Val : 1
Mean Likelihood Score Train: 2.2810540887906834 | Smoothing Val : 5
Mean Likelihood Score Validation: 2.1400390281611954 | Smoothing Val : 5
Mean Likelihood Score Train: 2.3526064256188253 | Smoothing Val : 10
Mean Likelihood Score Validation: 2.2018263152848845 | Smoothing Val : 10
