In [1]:
words = open('names.txt', 'r').read().splitlines()

In [2]:
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
len(words)

32033

In [4]:
min(len(w) for w in words)

2

In [5]:
max(len(w) for w in words)

15

In [6]:
# 单词拆分
b = {}
for w in words:
    chs = ['<S>'] + list(w) + ['<E>']
    for ch1, ch2 in zip(chs, chs[1:]):
        bigram = (ch1, ch2)
        b[bigram]  = b.get(bigram, 0) + 1

In [7]:
import torch

In [8]:
# 使用tensor矩阵存储对应的字母表
N = torch.zeros((27, 27), dtype=torch.int32)

In [9]:
# 构建字母到index的映射
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [4]:
# 画图
#import matplotlib.pyplot as plt
#plt.imshow(N)

In [5]:
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1

NameError: name 'N' is not defined

In [6]:
itos = {i:s for s,i in stoi.items()}

In [7]:
# 画模型矩阵图
import matplotlib.pyplot as plt

plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
for i in range(27):
    for j in range(27):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, N[i,j].item(), ha="center", va="top", color='gray')
plt.axis('off')

NameError: name 'N' is not defined

<Figure size 1600x1600 with 0 Axes>

In [14]:
N[0, 1:].shape

torch.Size([26])

In [15]:
p = N[0].float()
p /= p.sum()
p

tensor([0.0000, 0.1377, 0.0408, 0.0481, 0.0528, 0.0478, 0.0130, 0.0209, 0.0273,
        0.0184, 0.0756, 0.0925, 0.0491, 0.0792, 0.0358, 0.0123, 0.0161, 0.0029,
        0.0512, 0.0642, 0.0408, 0.0024, 0.0117, 0.0096, 0.0042, 0.0167, 0.0290])

In [17]:
g = torch.Generator().manual_seed(2147483647)
ix = torch.multinomial(p, num_samples=1,replacement=True,generator=g).item()
itos[ix]

'm'

In [18]:
p.sum()

tensor(1.)

In [19]:
g = torch.Generator().manual_seed(2147483647)
p = torch.rand(3, generator=g)
p = p / p.sum()
p

tensor([0.6064, 0.3033, 0.0903])

In [32]:
torch.multinomial(p, num_samples=20,replacement=True,generator=g)

tensor([12, 13, 16,  2,  9, 14, 12, 14, 19, 21,  9,  5,  8, 18, 14,  0,  0, 25,
         4,  0])

In [58]:
# 计算模型概率
# 为了使模型更加的平滑，给所有的count + 1，这样就不会出现log无限大的值
P = (N+1).float()
P /= P.sum(1, keepdim=True)

In [41]:
P[10].sum()

tensor(1.)

In [65]:
# 尝试根据概率参数输出单词
g = torch.Generator().manual_seed(2147483647)

for i in range(5):
    
    out = []
    ix = 0
    while True:
        p = P[ix]
        ix = torch.multinomial(p, num_samples=1,replacement=True,generator=g).item()
        out.append(itos[ix])
        if ix == 0:
            break

    print(''.join(out))

mor.
axx.
minaymoryles.
kondlaisah.
anchshizarie.


In [54]:
# 计算模型的概率相似性 Products of likelihoods
# https://en.wikipedia.org/wiki/Likelihood_function

log_likelihood = 0.0
n = 0

for w in ["jackq"]:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        prob = P[ix1,ix2]
        logprob = torch.log(prob)
        # log(a*b*c) = log(a) + log(b) + log(c)
        log_likelihood += logprob
        n += 1
        print(f'{ch1}{ch2}: {prob:.4f} {logprob:.4f}')

print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')

.j: 0.0756 -2.5826
ja: 0.5036 -0.6860
ac: 0.0139 -4.2767
ck: 0.0891 -2.4183
kq: 0.0002 -8.5305
q.: 0.0970 -2.3331
log_likelihood=tensor(-20.8273)
nll=tensor(20.8273)
3.4712085723876953


In [62]:
# log(a*b*c) = log(a) + log(b) + log(c)
# Goal: maxmize likelihood of the data with ehe model parameters (likelihood equal inner product)
# equal to maximizing the log likelihood (because log is monotonic)
# equal to minimizing the negative log likelihood
# euqal to minimizing the average negative log likelihood
# 也就是说我们需要训练一个模型，其参数的概率值和数据中的概率值相似度最高，这样就能产生类似的数据

In [70]:
# 我们的目的是训练一个模型，当输入一个字母时，能够根据概率预测下一个字母可能是什么

# create the training set of bigrams (x, y) x为当前的字母index, y为下一个字母的index
xs, ys = [], []

for w in words[:1]:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        print(ch1, ch2)
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

. e
e m
m m
m a
a .


In [68]:
xs

tensor([ 0,  5, 13, 13,  1])

In [69]:
ys

tensor([ 5, 13, 13,  1,  0])

In [129]:
# 初始化27个神经元，每个神经元接受27个inputs (one hot的shape为27)
g = torch.Generator().manual_seed(2147483647 + 1)
W = torch.randn((27, 27), generator=g)

In [130]:
# 将xs转化为one hot
import torch.nn.functional as F
xenc = F.one_hot(xs, num_classes=27).float() # 输入 one hot 编码
# (5, 27) @ (27, 27) = (5, 27) 
logits = (xenc @ W) # 预测 log-counts
# softmax
counts = logits.exp() # counts equals to N
probs = counts / counts.sum(1, keepdims=True) # 下一个字符的概率

In [None]:
probs.shape

tensor(1., grad_fn=<SumBackward0>)

In [8]:
nlls = torch.zeros(5)
for i in range(5):
    # i-th bigram
    x = xs[i].item() # input character index
    y = ys[i].item() # label character index
    print('------')
    print(f'bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x}, {y})')
    print('input to the neural net:', x)
    print('output probabilities from the neural net:', probs[i])
    print('label (actual next character):', y)
    p = probs[i, y]
    print('probability assigned by the net to the correct character: ', p.item())
    logp = torch.log(p)
    print('log lokelihood:', logp.item())
    nll = -logp
    print('negative log likelihood:', nll.item())
    nlls[i] = nll

print('=========')
print('average negative log likelihood, i.e loss = ', nll.mean().item())

NameError: name 'torch' is not defined

In [133]:
# plt.imshow(xenc)

In [134]:
xs

tensor([ 0,  5, 13, 13,  1])

In [135]:
ys

tensor([ 5, 13, 13,  1,  0])

In [168]:
# initial
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

In [205]:
# forward pass 
xenc = F.one_hot(xs, num_classes=27).float() # 输入 one hot 编码
logits = xenc @ W # 预测 log-counts
counts = logits.exp() # counts equals to N
probs = counts / counts.sum(1, keepdims=True) # 下一个字符的概率
loss = -probs[torch.arange(5), ys].log().mean()

In [206]:
print(loss.item())

3.531052350997925


In [207]:
# backward oass
W.grad = None # set to zero
loss.backward()
W.data += -0.1 * W.grad

In [215]:
# 最终版本
xs, ys = [], []

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# initial
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

number of examples:  228146


In [9]:
# gradient descent
for k in range(100):
    # forward pass 
    xenc = F.one_hot(xs, num_classes=27).float() # 输入 one hot 编码
    logits = xenc @ W # 预测 log-counts
    counts = logits.exp() # counts equals to N
    probs = counts / counts.sum(1, keepdims=True) # 下一个字符的概率
    loss = -probs[torch.arange(num), ys].log().mean()
    print(loss.item())
    
    # backward oass
    W.grad = None # set to zero
    loss.backward()
    
    # update
    W.data += -10 * W.grad

NameError: name 'F' is not defined

In [229]:
# finally, sample from the neural net
g = torch.Generator().manual_seed(2147483647)

for i in range(5):
    
    out = []
    ix = 0
    while True:
        
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float() # 输入 one hot 编码
        logits = xenc @ W # 预测 log-counts
        counts = logits.exp() # counts equals to N
        p = counts / counts.sum(1, keepdims=True) # 下一个字符的概率
        # p = P[ix]
        ix = torch.multinomial(p, num_samples=1,replacement=True,generator=g).item()
        out.append(itos[ix])
        if ix == 0:
            break

    print(''.join(out))
        

mor.
axx.
minaymoryles.
kondlaisah.
anchshizarie.
