# import the library

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# process the data

In [2]:
data = []
with open("data.txt", "r") as f:
    for line in f:
        inner_data = list(line.strip().split())
        for idx in range(len(inner_data)):
            if inner_data[idx].isdigit():
                inner_data[idx] = int(inner_data[idx])
        data.append(inner_data)


In [3]:
import random  
  
import numpy as np  
import torch  
from torch.utils.data import Dataset,DataLoader  

random.seed(0)
np.random.seed(0)
  
# 定义字典  
words_x = '<PAD>,1,2,3,4,5,6,7,8,9,0,<SOS>,<EOS>,+'  
vocab_x = {word: i for i, word in enumerate(words_x.split(','))}  
vocab_xr = [k for k, v in vocab_x.items()] #反查词典  
  
words_y = '<PAD>,1,2,3,4,5,6,7,8,9,0,<SOS>,<EOS>'  
vocab_y = {word: i for i, word in enumerate(words_y.split(','))}  
vocab_yr = [k for k, v in vocab_y.items()] #反查词典  
#两数相加数据集  
def get_data():  
    # 定义词集合  
    words = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']  
  
    # 每个词被选中的概率  
    p = np.array([7, 5, 5, 7, 6, 5, 7, 6, 5, 7])  
    p = p / p.sum()  
  
    # 随机采样n1个词作为s1  
    n1 = random.randint(10, 20)  
    s1 = np.random.choice(words, size=n1, replace=True, p=p)  
    s1 = s1.tolist()  

    # 随机采样n2个词作为s2  
    n2 = random.randint(10, 20)  
    s2 = np.random.choice(words, size=n2, replace=True, p=p)  
    s2 = s2.tolist()  
  
    # x等于s1和s2字符上的相加  
    x = s1 + ['+'] + s2  
      
    # y等于s1和s2数值上的相加  
    y = int(''.join(s1)) + int(''.join(s2))  
    y = list(str(y))  
      
    # 加上首尾符号  
    x = ['<SOS>'] + x + ['<EOS>']  
    y = ['<SOS>'] + y + ['<EOS>']  
  
    # 补pad到固定长度  
    x = x + ['<PAD>'] * 50  
    y = y + ['<PAD>'] * 51  
    x = x[:50]  
    y = y[:51]  
  
    # 编码成token  
    token_x = [vocab_x[i] for i in x]  
    token_y = [vocab_y[i] for i in y]  
  
    # 转tensor  
    tensor_x = torch.LongTensor(token_x)  
    tensor_y = torch.LongTensor(token_y)  
    return tensor_x, tensor_y  
  
  
def show_data(tensor_x,tensor_y) ->"str":  
    words_x = "".join([vocab_xr[i] for i in tensor_x.tolist()])  
    words_y = "".join([vocab_yr[i] for i in tensor_y.tolist()])  
    return words_x,words_y  
  
  
x,y = get_data()   
print(x.shape,y.shape)
print(x,y)



torch.Size([50]) torch.Size([51])
tensor([11,  5,  7,  6,  5,  4,  6,  4,  9,  9,  3,  7,  5,  5,  9, 10, 10, 13,
        10,  8,  7,  8,  9,  7,  4,  7,  1,  6,  1,  9,  5,  4,  2,  7, 12,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]) tensor([11,  6,  6,  4,  4,  4,  3,  9,  7, 10,  9,  9,  5,  1,  3,  2,  7, 12,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])


In [4]:
# 定义数据集  
class TwoSumDataset(torch.utils.data.Dataset):  
    def __init__(self,size = 100000):  
        super(Dataset, self).__init__()  
        self.size = size  
  
    def __len__(self):  
        return self.size  
  
    def __getitem__(self, i):  
        return get_data()  
      
ds_train = TwoSumDataset(size = 100000)  
ds_val = TwoSumDataset(size = 10000)  
  
  
# 数据加载器  
dl_train = DataLoader(dataset=ds_train,  
         batch_size=200,  
         drop_last=True,  
         shuffle=True)  
  
dl_val = DataLoader(dataset=ds_val,  
         batch_size=200,  
         drop_last=True,  
         shuffle=False)  
  
for src,tgt in dl_train:  
    print(src.shape)  
    print(tgt.shape)  
    break   


torch.Size([200, 50])
torch.Size([200, 51])


# model

In [6]:
class PositionEncoding(nn.Module):  
    "Implement the PE function."  
    def __init__(self, d_model, dropout, max_len=5000):  
        super(PositionEncoding, self).__init__()  
        self.dropout = nn.Dropout(p=dropout)  
          
        # Compute the positional encodings once in log space.  
        pe = torch.zeros(max_len, d_model)  
        position = torch.arange(0, max_len).unsqueeze(1)  
        div_term = torch.exp(torch.arange(0, d_model, 2) *  
                             -(math.log(10000.0) / d_model))  
        pe[:, 0::2] = torch.sin(position * div_term)  
        pe[:, 1::2] = torch.cos(position * div_term)  
        pe = pe.unsqueeze(0)  
        self.register_buffer('pe', pe)  
          
    def forward(self, x):  
        x = x + self.pe[:, :x.size(1)]  
        return self.dropout(x)  

In [20]:
class TransformerModel(nn.Transformer):
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.1):
        super(TransformerModel, self).__init__(d_model = ninp, nhead = nhead, num_encoder_layers = nlayers,
                 num_decoder_layers = nlayers, dim_feedforward = nhid, device=None)
        self.pe = PositionEncoding(ninp,dropout)
        self.embed = nn.Embedding(ntoken, ninp)
        self.nhead = nhead
        self.nhid = nhid
        self.nlayers = nlayers
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)
    

    def forward(self, src, tgt, src_mask=None):
        src = self.embed(src)
        src = self.pe(src)
        output = self.encoder(src, src_mask)
        output = self.decoder(output, tgt)
        return output



In [21]:
model = TransformerModel(ntoken = len(vocab_x), ninp = 32, nhead = 2, nhid = 512, nlayers=4)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)




In [23]:
for epoch in range(100):
    for i in range(100):
        input_data, target_data = get_data()
        target_data = target_data[:-1]
        optimizer.zero_grad()
        output = model(input_data, target_data)
        loss = criterion(output, target_data)
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

RuntimeError: The size of tensor a (50) must match the size of tensor b (32) at non-singleton dimension 1

In [None]:
# test
input_data, target_data = get_data()
output = model(input_data)
print(input_data)
print(target_data)
print(torch.argmax(output, dim=-1))

tensor([11,  3, 10,  5,  7,  7,  6,  9,  5,  5,  9,  1,  6,  2,  5,  7, 13,  9,
         1,  6,  6,  4,  1, 10,  8,  8,  3,  3, 10, 12,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
tensor([11,  3, 10,  6,  6,  9,  3,  5,  9,  7, 10, 10,  4,  5,  8,  7, 12,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
tensor([11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 12,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
