# 运用yield 和(中文切词pkuseg)

In [1]:
import torch
import torchtext
import pkuseg
seg=pkuseg.pkuseg()

# Generators that yield one token list per line of the corpus
def yield_cn():
    """Yield the segmented Chinese sentence of every line in ``cmn.txt``.

    Each line is split on tab characters and its second field is passed to
    the module-level pkuseg segmenter ``seg``.  Whatever ``seg.cut`` returns
    is yielded unchanged, one item per input line.
    """
    with open('cmn.txt', encoding='utf-8') as corpus:
        for row in corpus:
            fields = row.split('\t')
            yield seg.cut(fields[1])
            
def yield_en():
    """Yield the lower-cased English tokens of every line in ``cmn.txt``.

    The first tab-separated field of each line is lower-cased and split on
    whitespace.  A trailing punctuation character is detached from the last
    word and emitted as a token of its own, e.g. ``"Be nice."`` becomes
    ``['be', 'nice', '.']``.

    Exactly one list is yielded per input line so the output stays
    index-aligned with ``yield_cn``; a line whose English field is empty
    yields ``[]``.
    """
    with open('cmn.txt', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading it whole with readlines().
        for line in f:
            sentence = line.split('\t')[0].lower()
            if not sentence:
                # Nothing before the first tab: indexing sentence[-1] would
                # raise IndexError, so emit an empty token list instead.
                yield []
            elif sentence[-1].isalnum():
                # No terminal punctuation: keep the final word intact rather
                # than splitting off its last letter.
                yield sentence.split()
            else:
                yield sentence[:-1].split() + [sentence[-1]]

  from .autonotebook import tqdm as notebook_tqdm


# 运用torchtext.vocab.Vocab

In [2]:
# Both vocabularies share the same special tokens.
_specials = ['<pad>', "<bos>", "<eos>", '<unk>']
cn_vocab = torchtext.vocab.build_vocab_from_iterator(yield_cn(), specials=_specials)
en_vocab = torchtext.vocab.build_vocab_from_iterator(yield_en(), specials=_specials)

# A torchtext Vocab without a default index raises RuntimeError when asked for
# a token it has never seen; route such tokens to '<unk>' instead.
cn_vocab.set_default_index(cn_vocab['<unk>'])
en_vocab.set_default_index(en_vocab['<unk>'])

In [3]:
# Materialise both generators so the sentences can be measured with len()
# and handed to the transforms as plain lists.
cn_sents = list(yield_cn())
en_sents = list(yield_en())

# 运用torchtext.transforms

In [4]:
max_seq_len = 50


def _make_pipeline(vocab, with_bos):
    """Build a pipeline turning token lists into padded index tensors.

    The steps are: truncate, add the special tokens, map tokens to indices
    with ``vocab``, convert to a tensor and pad to ``max_seq_len``.  The
    truncation length leaves room for the tokens added afterwards: one slot
    for "<eos>", plus one for "<bos>" when ``with_bos`` is true.
    """
    reserved = 2 if with_bos else 1
    pad_index = vocab['<pad>']

    steps = [torchtext.transforms.Truncate(max_seq_len - reserved)]
    if with_bos:
        steps.append(torchtext.transforms.AddToken("<bos>", begin=True))
    steps.append(torchtext.transforms.AddToken("<eos>", begin=False))
    steps.append(torchtext.transforms.VocabTransform(vocab))
    steps.append(torchtext.transforms.ToTensor(padding_value=pad_index))
    # PadTransform requires a tensor as input, so it must follow ToTensor.
    steps.append(
        torchtext.transforms.PadTransform(max_length=max_seq_len, pad_value=pad_index)
    )
    return torchtext.transforms.Sequential(*steps)


# English side: tokens + "<eos>".
en_transform = _make_pipeline(en_vocab, with_bos=False)

# Chinese side, two variants of the same sentences: one framed by
# "<bos>" ... "<eos>", one with only the trailing "<eos>".
# NOTE(review): presumably decoder input and training target -- confirm
# against the model code.
cn_transform_x = _make_pipeline(cn_vocab, with_bos=True)
cn_transform_y = _make_pipeline(cn_vocab, with_bos=False)

en_data = en_transform(en_sents)
cn_x = cn_transform_x(cn_sents)
cn_y = cn_transform_y(cn_sents)

# 运用torch.utils.data.Dataset 和torch.utils.data.DataLoader

## 必须在 `__getitem__` 返回之前就把数据转为 tensor；否则 DataLoader 默认的 collate 会把 list 按位置逐元素拆开再拼接，得到的 batch 与预期的形状不一致
## 另注：单个标签（标量）不需要转为 Tensor，默认的 collate 会自动把它们拼成 tensor（事先转换反而麻烦）

In [5]:
class CMNDataset(torch.utils.data.Dataset):
    """Dataset over three parallel, index-aligned collections.

    Item ``i`` is the tuple ``(endata[i], cndata[i], cny[i])``.  The
    collections are stored as given; no copy or conversion is made.
    """

    def __init__(self, endata, cndata, cny):
        """Store the three collections after checking that they line up.

        Args:
            endata: indexable collection of the first component.
            cndata: indexable collection of the second component.
            cny: indexable collection of the third component.

        Raises:
            ValueError: if the three collections differ in length.
        """
        # __len__ reports len(endata) only, so a length mismatch would either
        # silently ignore trailing items or fail with IndexError mid-epoch.
        if not (len(endata) == len(cndata) == len(cny)):
            raise ValueError(
                "endata, cndata and cny must have the same length, got "
                f"{len(endata)}, {len(cndata)} and {len(cny)}"
            )
        self.endata = endata
        self.cndata = cndata
        self.cny = cny

    def __getitem__(self, index):
        """Return the ``index``-th item of each collection as a 3-tuple."""
        return self.endata[index], self.cndata[index], self.cny[index]

    def __len__(self):
        """Return the number of aligned items."""
        return len(self.endata)

In [6]:
# 60 % / 20 % / remainder split, taken positionally from the corpus order.
# NOTE(review): no shuffling happens before the split; if cmn.txt is ordered
# (e.g. by sentence length) the three subsets will differ in distribution --
# confirm this is intended.
tot = len(en_sents)
train_size = int(tot * 0.6)
eval_size = int(tot * 0.2)
test_size = tot - train_size - eval_size

_train_span = slice(None, train_size)
_valid_span = slice(train_size, train_size + eval_size)
_test_span = slice(train_size + eval_size, None)

train, valid, test = (
    CMNDataset(en_data[span], cn_x[span], cn_y[span])
    for span in (_train_span, _valid_span, _test_span)
)

# All three loaders use the same settings: shuffled batches of 32, with the
# final incomplete batch dropped.
_loader_options = dict(batch_size=32, shuffle=True, drop_last=True)
train_loader = torch.utils.data.DataLoader(train, **_loader_options)
valid_loader = torch.utils.data.DataLoader(valid, **_loader_options)
test_loader = torch.utils.data.DataLoader(test, **_loader_options)

# 验证 运用torchtext.vocab.Vocab

In [7]:
# Sanity check: for every training batch, map the first 10 indices of the
# first 10 sentences back to tokens and print them, one section per tensor.
for en, cnx, cny in train_loader:
    sections = (
        ("en", en, en_vocab),
        ("cn_x", cnx, cn_vocab),
        ("cn_y", cny, cn_vocab),
    )
    for position, (title, batch, vocab) in enumerate(sections):
        if position:
            # Separator goes between sections, not before the first one.
            print("-" * 30)
        print(title)
        for sent in batch[:10]:
            print(vocab.lookup_tokens(sent[:10].tolist()))

en
["i'm", 'fed', 'up', 'with', 'this', '.', '<eos>', '<pad>', '<pad>', '<pad>']
['the', 'child', 'will', 'be', 'six', '.', '<eos>', '<pad>', '<pad>', '<pad>']
["i'm", 'not', 'tired', 'right', 'now', '.', '<eos>', '<pad>', '<pad>', '<pad>']
["i'll", 'be', 'back', 'at', 'seven', "o'clock", '.', '<eos>', '<pad>', '<pad>']
['getting', 'started', 'was', 'difficult', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>']
['prices', 'have', 'dropped', 'recently', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>']
['be', 'nice', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
["what's", 'that', 'building', '?', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['we', 'bought', 'a', 'pound', 'of', 'tea', '.', '<eos>', '<pad>', '<pad>']
["i'm", 'afraid', 'you', "can't", 'marry', 'her', '.', '<eos>', '<pad>', '<pad>']
------------------------------
cn_x
['<bos>', '这', '我', '受够', '了', '。', '<eos>', '<pad>', '<pad>', '<pad>']
['<bos>', '那', '孩子', '要', '六', '歲', '了', '。', '<eos>',

en
['tom', 'has', 'no', 'siblings', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>']
['plastic', 'does', 'not', 'burn', 'easily', '.', '<eos>', '<pad>', '<pad>', '<pad>']
['i', 'caught', 'him', 'stealing', 'the', 'money', '.', '<eos>', '<pad>', '<pad>']
['have', 'you', 'ever', 'written', 'a', 'blog', '?', '<eos>', '<pad>', '<pad>']
['we', 'work', 'together', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['then', 'what', '?', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['the', 'police', 'held', 'him', 'in', 'custody', '.', '<eos>', '<pad>', '<pad>']
["i'm", 'still', 'angry', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['you', 'are', 'wanted', 'on', 'the', 'phone', '.', '<eos>', '<pad>', '<pad>']
['i', 'fell', 'asleep', 'while', 'watching', 'tv', '.', '<eos>', '<pad>', '<pad>']
------------------------------
cn_x
['<bos>', 'Tom', '沒有', '兄弟', '姊妹', '。', '<eos>', '<pad>', '<pad>', '<pad>']
['<bos>', '塑料', '不易', '燃烧', '。', '<eos>', '<pad>', '<

['<bos>', '汤姆', '嫉妒', '吗', '？', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>']
['<bos>', '他', '從銀', '行', '得到', '了', '貸款', '。', '<eos>', '<pad>']
['<bos>', 'John', '比', '我', '大', '两', '岁', '。', '<eos>', '<pad>']
['<bos>', '我', '很', '餓', '。', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>']
['<bos>', '你', '最', '喜欢', '的', 'iPad', '应用', '是', '什么', '?']
['<bos>', '玛丽', '，', '我', '喜欢', '你', '！', '<eos>', '<pad>', '<pad>']
['<bos>', '你', '明天', '去', '學校', '。', '<eos>', '<pad>', '<pad>', '<pad>']
['<bos>', '比較', '你', '和', '湯姆', '的', '答案', '。', '<eos>', '<pad>']
['<bos>', '那', '是', '他家', '。', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>']
['<bos>', '请', '穿', '衣服', '。', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>']
------------------------------
cn_y
['汤姆', '嫉妒', '吗', '？', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['他', '從銀', '行', '得到', '了', '貸款', '。', '<eos>', '<pad>', '<pad>']
['John', '比', '我', '大', '两', '岁', '。', '<eos>', '<pad>', '<pad>']
['我', '很', '餓', '。', '<eos>', '<pad>', '<pad>', '<

# 模型的循环结构
### 用nn.ModuleList()即可,和list一样直接用索引调用,之前感觉一直只有一个model在里面,是因为我设置的层数确实为1