使用向量空间映射字母，训练神经网络调整映射

In [1]:
import torch
from matplotlib import pyplot as plt

In [2]:
# Load the dataset: one entry per line of names.txt, newlines stripped.
# The context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
with open("names.txt") as f:
    words = f.read().splitlines()

In [3]:
# Vocabulary: every distinct character that occurs in the dataset, sorted.
char = sorted({ch for word in words for ch in word})
# Character -> integer id. Ids start at 1 because 0 is reserved for the
# '.' marker, which is assigned last.
stoi = dict(zip(char, range(1, len(char) + 1)))
stoi['.'] = 0
# Inverse lookup: integer id -> character.
itos = {idx: ch for ch, idx in stoi.items()}

In [4]:
# Shuffle the word list in place. The seed is fixed so the resulting order
# is the same every time this cell runs on the same list.
import random
random.seed(42)
random.shuffle(words)

In [22]:
block_size = 8  # number of preceding characters used to predict the next one


def make_data(words):
    """Turn a list of words into (context, target) training tensors.

    Each word is terminated with '.', then walked character by character.
    For every character one example is emitted: the ids of the previous
    ``block_size`` characters (left-padded with 0) as the input row, and the
    id of the character itself as the target.

    Args:
        words: iterable of strings whose characters are all keys of ``stoi``.

    Returns:
        A pair ``(dataX, dataY)`` of tensors built from the collected
        context rows and target ids respectively.
    """
    contexts, targets = [], []
    for word in words:
        # Every word starts from an all-zero (all-padding) context.
        window = [0] * block_size
        for token in (stoi[ch] for ch in word + '.'):
            contexts.append(window)
            targets.append(token)
            # Slide the window: drop the oldest id, append the newest.
            # A new list is built, so rows already stored are not mutated.
            window = [*window[1:], token]
    return torch.tensor(contexts), torch.tensor(targets)

In [24]:
# Split boundaries: first 80% of the words for training, the next 10% for
# dev, the remaining 10% for test.
n1, n2 = (int(len(words) * fraction) for fraction in (0.8, 0.9))

# Build the three datasets in train, dev, test order.
(Xtr, Ytr), (Xdev, Ydev), (Xte, Yte) = (
    make_data(subset) for subset in (words[:n1], words[n1:n2], words[n2:])
)
Xtr.shape, Ytr.shape

(torch.Size([182625, 8]), torch.Size([182625]))

In [101]:
class Linear:
    """Affine layer: ``x @ weigth`` plus an optional bias.

    The weight attribute is spelled ``weigth``; other cells access it under
    that name, so the spelling is part of the interface.
    """

    def __init__(self, fan_in, fan_out, bias=True):
        """Draw a (fan_in, fan_out) weight and, if requested, a bias from N(0, 1)."""
        # The weight is drawn before the bias; keep this order so a seeded
        # run produces the same values.
        self.weigth = torch.randn((fan_in, fan_out))
        self.bias = None
        if bias:
            self.bias = torch.randn((fan_out))

    def __call__(self, x):
        """Apply the layer to ``x``, cache the result in ``self.out`` and return it."""
        result = x @ self.weigth
        if self.bias is not None:
            result += self.bias
        self.out = result
        return result

    def parameters(self):
        """Return the trainable tensors: the weight, then the bias if present."""
        params = [self.weigth]
        if self.bias is not None:
            params.append(self.bias)
        return params

    def cuda(self):
        """Replace the layer's tensors with copies on the CUDA device."""
        self.weigth = self.weigth.cuda()
        if self.bias is None:
            return
        self.bias = self.bias.cuda()

class BatchNorm1d:
    """Batch normalisation over the last dimension of a 2-D or 3-D input.

    While ``self.training`` is true the layer normalises with the statistics
    of the current batch and folds them into running estimates; otherwise it
    normalises with the running estimates.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        """Create the scale/shift parameters and the running statistics for ``dim`` features."""
        self.training = True
        self.momentum = momentum
        self.eps = eps
        # Learnable affine transform: gamma * xhat + beta.
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # Running estimates of the mean and variance, used when not training.
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        """Normalise ``x``, cache the result in ``self.out`` and return it."""
        if self.training:
            # Reduce over every dimension except the last (feature) one:
            # dim 0 for 2-D input, dims (0, 1) otherwise.
            reduce_dims = 0 if x.ndim == 2 else (0, 1)
            mean = x.mean(reduce_dims, keepdim=True)
            var = x.var(reduce_dims, keepdim=True)
        else:
            mean, var = self.running_mean, self.running_var

        # eps keeps the division finite when the variance is zero.
        normalized = (x - mean) / torch.sqrt(var + self.eps)
        self.out = self.gamma * normalized + self.beta

        if self.training:
            # Exponential moving average of the batch statistics; done
            # without gradient tracking.
            with torch.no_grad():
                keep = 1 - self.momentum
                self.running_mean = self.running_mean * keep + mean * self.momentum
                self.running_var = self.running_var * keep + var * self.momentum
        return self.out

    def parameters(self):
        """Return the trainable tensors: gamma, then beta."""
        return [self.gamma, self.beta]

    def cuda(self):
        """Replace the parameters and running statistics with copies on the CUDA device."""
        self.gamma = self.gamma.cuda()
        self.beta = self.beta.cuda()
        self.running_mean = self.running_mean.cuda()
        self.running_var = self.running_var.cuda()

class Tanh:
    """Element-wise hyperbolic tangent activation; holds no parameters."""

    def __call__(self, x):
        """Apply tanh to ``x``, cache the result in ``self.out`` and return it."""
        activated = torch.tanh(x)
        self.out = activated
        return activated

    def parameters(self):
        """Return an empty list: this layer has nothing to train."""
        return []

    def cuda(self):
        """No-op: there are no tensors to move."""
        return None

class Embedding:
    """Lookup table mapping integer ids to rows of a weight matrix."""

    def __init__(self, num_embeddings, embedding_dim):
        """Draw a (num_embeddings, embedding_dim) table from N(0, 1)."""
        table_shape = (num_embeddings, embedding_dim)
        self.weight = torch.randn(table_shape)

    def __call__(self, IX):
        """Index the table with ``IX``, cache the result in ``self.out`` and return it."""
        looked_up = self.weight[IX]
        self.out = looked_up
        return looked_up

    def parameters(self):
        """Return the trainable tensors: just the table."""
        return [self.weight]

    def cuda(self):
        """Replace the table with a copy on the CUDA device."""
        self.weight = self.weight.cuda()

class FlattenConsecutive:
    """Merge every ``n`` consecutive positions of a 3-D input into one.

    An input of shape (B, T, C) is viewed as (B, T // n, C * n). When only
    one position remains, that dimension is squeezed away and the result is
    2-D.
    """

    def __init__(self, n):
        """Store the number of consecutive positions to merge."""
        self.n = n

    def __call__(self, x):
        """Regroup ``x``, cache the result in ``self.out`` and return it."""
        batch, steps, channels = x.shape
        merged = x.view(batch, steps // self.n, channels * self.n)
        if merged.shape[1] == 1:
            # A single remaining position: drop the now-redundant dimension.
            merged = merged.squeeze(1)
        self.out = merged
        return self.out

    def parameters(self):
        """Return an empty list: this layer has nothing to train."""
        return []

    def cuda(self):
        """No-op: there are no tensors to move."""
        return None

class Sequential:
    """Container that applies a list of layers one after another."""

    def __init__(self, layers):
        """Store the layers; they are applied in list order."""
        self.layers = layers

    def __call__(self, x):
        """Feed ``x`` through every layer, cache the result in ``self.out`` and return it."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        self.out = activation
        return activation

    def parameters(self):
        """Return the parameters of all layers as one flat list, in layer order."""
        collected = []
        for layer in self.layers:
            collected.extend(layer.parameters())
        return collected

    def cuda(self):
        """Call ``cuda()`` on every layer."""
        for layer in self.layers:
            layer.cuda()

In [120]:
n_embd = 24    # size of each character embedding vector
n_hidden = 128 # width of every hidden layer

# Embedding, then three identical stages (merge 2 neighbouring positions ->
# bias-free linear -> batch norm -> tanh), then the output projection onto
# the 27 character ids. Layers are constructed in the order they appear.
model = Sequential([
    Embedding(27, n_embd),
    *(
        layer
        for fan_in in (n_embd * 2, n_hidden * 2, n_hidden * 2)
        for layer in (
            FlattenConsecutive(2),
            Linear(fan_in, n_hidden, bias=False),
            BatchNorm1d(n_hidden),
            Tanh(),
        )
    ),
    Linear(n_hidden, 27),
])
model.cuda()

# Shrink the output layer's weights by 10x.
with torch.no_grad():
    output_layer = model.layers[-1]
    output_layer.weigth *= 0.1

parameters = model.parameters()
print(sum(p.nelement() for p in parameters))

# Gradient tracking is switched on only after the tensors have been moved
# and rescaled above.
for p in parameters:
    p.requires_grad = True

76579


In [144]:
batchsize = 100

# The evaluation cell sets `training = False` on every layer and nothing
# sets it back. Restore training mode here so that re-running this cell
# after an evaluation normalises with batch statistics and keeps updating
# the BatchNorm running estimates, instead of silently training in
# inference mode.
for layer in model.layers:
    layer.training = True

for i in range(100000):
    # Sample a random minibatch (with replacement) and move it to the GPU.
    index = torch.randint(0, Xtr.shape[0], (batchsize,))
    X = Xtr[index].cuda()
    Y = Ytr[index].cuda()

    # Forward pass: logits, then cross-entropy against the target ids.
    yi = model(X)
    loss = torch.nn.functional.cross_entropy(yi, Y)

    # Backward pass: clear the old gradients, then backpropagate.
    for p in parameters:
        p.grad = None
    loss.backward()

    # Plain SGD step with a fixed learning rate of 0.01; done under
    # no_grad so the update itself is not tracked.
    with torch.no_grad():
        for p in parameters:
            p -= 0.01 * p.grad

In [145]:
loss

tensor(2.2990, device='cuda:0', grad_fn=<NllLossBackward0>)

In [146]:
# Put every layer into inference mode (BatchNorm1d reads this flag to
# switch to its running statistics).
for layer in model.layers:
    layer.training = False

# Print the full-dataset loss for the test split first, then for the
# training split. No gradients are needed for evaluation.
with torch.no_grad():
    for split_x, split_y in ((Xte, Yte), (Xtr, Ytr)):
        X = split_x.cuda()
        Y = split_y.cuda()
        yi = model(X)
        loss = torch.nn.functional.cross_entropy(yi, Y)
        print(loss.item())

2.1144723892211914
2.0808932781219482


In [None]:
2.1345136165618896
2.1082568168640137

2.1144723892211914
2.0808932781219482