## 数据集纠错处理
本示例用到的数据文件来源于http://statmt.org/wmt18/translation-task.html#download 中的News Commentary v13
由于中文会部分出现乱码，需要将对应的数据丢弃

数据集是gbk的，在读写操作时需要转成utf-8

In [99]:
# 处理问题数据
# data_en = []
# data_cn = []
# with open("(origin)news-commentary-v13.zh-en.en","r",encoding='utf-8') as f:
#     data_en = f.read().split('\n')

# with open("(origin)news-commentary-v13.zh-en.zh","r",encoding='utf-8') as f:
#     data_cn = f.read().split('\n')

In [100]:
#   将没问题的数据单独挑出来
# data_cn_corrected = []
# data_en_corrected = []
# for i in range(data_cn.__len__()):
#     if data_cn[i].__contains__("�"):
#         continue
#     data_en_corrected.append(data_en[i])
#     data_cn_corrected.append(data_cn[i])

In [101]:
#   将处理过的数据存进新文件
# with open("news-commentary-v13.zh-en.en","w",encoding='utf-8') as f:
#     for i in data_en_corrected:
#         f.write(i+'\n')


# with open("news-commentary-v13.zh-en.cn","w",encoding='utf-8') as f:
#     for i in data_cn_corrected:
#         f.write(i+'\n')


## Word Embedding词嵌入
### 本节可能涉及的知识点：
>获取上下文关系<br>
>   &emsp;Skip-Gram(本例采用)<br>
>   &emsp;CBOW<br>
>因词库太大而不能直接使用softmax输出，对应的处理方式<br>
>   &emsp;基于Huffman tree的Hierarchical Softmax方法<br>
>   &emsp;负采样negative sampling(本例采用)<br>
>分词<br>

In [102]:
# 复现文章reference:
# Distributed Representations of Words and Phrases and their Compositionality
import torch
import torch.nn as nn
import torch.nn.functional as fun
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import numpy as np
import random
import math
import jieba #中文分词工具
import pandas as pd

### 一些全局配置变量

In [103]:
# --- Runtime environment ---
# True when PyTorch reports a usable CUDA device.
USE_CUDA = torch.cuda.is_available()

# --- Skip-gram sampling ---
# Number of context words taken on each side of the centre word.
contextWindow = 5
# Number of negative samples drawn per positive (context) word.
K = 100

# --- Optimisation hyper-parameters ---
EMBEDDING_SIZE = 120  # length of each word vector
LEARNING_RATE = 0.1
BATCH_SIZE = 128
NUM_EPOCHS = 10

### 分别对原始数据中的中文和英文进行分词处理

In [104]:
# Read both sides of the cleaned parallel corpus fully into memory.
with open("news-commentary-v13.zh-en.cn", "r", encoding='utf-8') as cn_file:
    text_cn = cn_file.read()
with open("news-commentary-v13.zh-en.en", "r", encoding='utf-8') as en_file:
    text_en = en_file.read()

# Chinese text has no word delimiters, so segment it with jieba
# (cut_all=False selects jieba's accurate mode).
text_cn = jieba.lcut(text_cn, cut_all=False)
# str.split() without arguments splits on any run of whitespace, which is
# enough to tokenise the English side.
text_en = text_en.split()

# token -> number of occurrences, stored as plain dicts.
vocab_cn = dict(Counter(text_cn))
vocab_en = dict(Counter(text_en))

分别为中文和英文词汇表创建“序号→单词”和“单词→序号”两组映射

In [105]:
# index -> word: a word's position in the list is its integer id
# (ids follow the iteration order of the vocabulary dicts).
mappingCN_index2word = list(vocab_cn)
mappingEN_index2word = list(vocab_en)

# word -> index: the inverse lookup of the lists above.
mappingCN_word2index = dict(
    zip(mappingCN_index2word, range(len(mappingCN_index2word)))
)
mappingEN_word2index = dict(
    zip(mappingEN_index2word, range(len(mappingEN_index2word)))
)

获取每个词语出现的频率的3/4次幂，再归一化

In [106]:
# Raw occurrence counts, in the iteration order of the vocabulary dicts
# (the same order the index<->word mappings are built from).
cn_word_counts = np.array(list(vocab_cn.values()), dtype=np.float32)
en_word_counts = np.array(list(vocab_en.values()), dtype=np.float32)

# Noise distribution for negative sampling: unigram frequency raised to the
# 3/4 power, then renormalised so the probabilities sum to 1.
# The parentheses around the division are essential: `**` binds tighter than
# `/`, so `counts / counts.sum() ** 0.75` only rescales the counts by a
# constant and the final normalisation would cancel the exponent entirely.
cn_word_freq = (cn_word_counts / np.sum(cn_word_counts)) ** (3. / 4.)
cn_word_freq = cn_word_freq / np.sum(cn_word_freq)

en_word_freq = (en_word_counts / np.sum(en_word_counts)) ** (3. / 4.)
en_word_freq = en_word_freq / np.sum(en_word_freq)

### 构建嵌入模型

In [107]:
class EmbeddingModel(nn.Module):
    """Skip-gram word-embedding model trained with negative sampling.

    Two separate embedding tables are kept: ``in_embed`` is looked up for the
    centre word, ``out_embed`` for the context and negative-sample words.
    """

    def __init__(self, vocab_size, embed_size):
        """Create both embedding tables.

        Args:
            vocab_size: number of rows (words) in each embedding table.
            embed_size: length of each embedding vector.
        """
        super().__init__()
        self.vocab_size, self.embed_size = vocab_size, embed_size

        self.in_embed = nn.Embedding(vocab_size, embed_size)
        self.out_embed = nn.Embedding(vocab_size, embed_size)

    def forward(self, input_labels, pos_labels, neg_labels):
        """Return the negative-sampling loss for every sample in the batch.

        Args:
            input_labels: centre-word ids, shape [batch].
            pos_labels: context-word ids, shape [batch, n_pos].
            neg_labels: negative-sample ids, shape [batch, n_neg].

        Returns:
            Tensor of shape [batch]: the negated sum of
            ``logsigmoid(context . centre)`` over the context words and
            ``logsigmoid(-(negative . centre))`` over the negative samples.
        """
        # Centre vectors as column matrices so they can be batch-multiplied:
        # [batch, embed] -> [batch, embed, 1].
        centre_col = self.in_embed(input_labels).unsqueeze(2)
        context_vecs = self.out_embed(pos_labels)  # [batch, n_pos, embed]
        noise_vecs = self.out_embed(neg_labels)    # [batch, n_neg, embed]

        # torch.bmm is a batched matrix product; here it yields one dot
        # product per (context word, centre word) pair.
        pos_scores = torch.bmm(context_vecs, centre_col).squeeze(2)
        # The centre vector is negated so that low similarity to a negative
        # sample gives a high log-sigmoid value.
        neg_scores = torch.bmm(noise_vecs, -centre_col).squeeze(2)

        log_likelihood = (
            fun.logsigmoid(pos_scores).sum(1)
            + fun.logsigmoid(neg_scores).sum(1)
        )
        return -log_likelihood


### 定义英文/中文词汇数据集

In [108]:
class WordEmbeddingDataset(Dataset):
    """Skip-gram training samples drawn from a tokenised corpus.

    Each item is ``(centre_word, context_words, negative_words)``, all given
    as vocabulary indices.
    """

    def __init__(self, text, word_to_idx, idx_to_word, word_freqs, word_counts,
                 *, context_window=None, neg_ratio=None):
        """Encode the corpus and store the sampling tables.

        Args:
            text: sequence of tokens making up the corpus.
            word_to_idx: mapping token -> vocabulary index.
            idx_to_word: sequence mapping vocabulary index -> token.
            word_freqs: per-index sampling weights for negative sampling.
            word_counts: per-index raw counts (stored, not used here).
            context_window: context words taken on each side of the centre
                word; ``None`` falls back to the module-level
                ``contextWindow``.
            neg_ratio: negative samples drawn per context word; ``None``
                falls back to the module-level ``K``.

        Raises:
            KeyError: if a token of ``text`` is missing from ``word_to_idx``.
        """
        super().__init__()
        # Index directly instead of .get(): a missing token would otherwise
        # become None and only fail later as an opaque tensor TypeError.
        try:
            encoded = [word_to_idx[word] for word in text]
        except KeyError as err:
            raise KeyError(
                f"token {err.args[0]!r} is missing from word_to_idx"
            ) from None
        self.text_encoded = torch.LongTensor(encoded)
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_freqs = torch.Tensor(word_freqs)
        self.word_counts = torch.Tensor(word_counts)
        self.context_window = context_window
        self.neg_ratio = neg_ratio

    def __len__(self):
        """Return the number of tokens in the encoded corpus."""
        return len(self.text_encoded)

    def __getitem__(self, idx):
        """Return ``(centre_word, pos_words, neg_words)`` for position ``idx``.

        ``pos_words`` holds the ``2 * window`` neighbouring tokens; positions
        past either end of the corpus wrap around to the other end.
        ``neg_words`` holds ``ratio * len(pos_words)`` indices sampled with
        replacement according to ``word_freqs``. The sampled negatives are
        not filtered against the centre or context words.
        """
        # Resolve the defaults lazily so the module-level settings are read
        # at access time, exactly as before the overrides existed.
        window = contextWindow if self.context_window is None else self.context_window
        ratio = K if self.neg_ratio is None else self.neg_ratio

        total = len(self.text_encoded)
        center_word = self.text_encoded[idx]

        offsets = list(range(-window, 0)) + list(range(1, window + 1))
        # The modulo wraps out-of-range positions around the corpus.
        pos_indices = [(idx + offset) % total for offset in offsets]
        pos_words = self.text_encoded[pos_indices]

        neg_words = torch.multinomial(
            self.word_freqs, ratio * pos_words.shape[0], True
        )
        return center_word, pos_words, neg_words


In [109]:
# ---- Chinese: dataset -> loader -> model -> optimizer ----
dataset_cn = WordEmbeddingDataset(
    text_cn, mappingCN_word2index, mappingCN_index2word,
    cn_word_freq, cn_word_counts,
)
dataloader_cn = DataLoader(dataset_cn, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
# The Chinese model is created before the English one, as in the original
# cell, so the parameter-initialisation order is unchanged.
model_cn = EmbeddingModel(len(vocab_cn), EMBEDDING_SIZE)

# ---- English: dataset -> loader -> model ----
dataset_en = WordEmbeddingDataset(
    text_en, mappingEN_word2index, mappingEN_index2word,
    en_word_freq, en_word_counts,
)
dataloader_en = DataLoader(dataset_en, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
model_en = EmbeddingModel(len(vocab_en), EMBEDDING_SIZE)

# Move both models to the GPU when one is available; the optimizers are
# created afterwards from the (possibly moved) models' parameters.
if USE_CUDA:
    model_cn = model_cn.cuda()
    model_en = model_en.cuda()

optimizer_cn = torch.optim.SGD(model_cn.parameters(), lr=LEARNING_RATE)
optimizer_en = torch.optim.SGD(model_en.parameters(), lr=LEARNING_RATE)


### 训练并保存模型
最好分开两个文件训练，能快点

In [110]:
def train_one_epoch(model, optimizer, dataloader, device, tag, epoch):
    """Run one pass over ``dataloader``, updating ``model`` in place.

    Args:
        model: callable mapping (input, pos, neg) index tensors to a
            per-sample loss tensor.
        optimizer: optimizer that owns the model's parameters.
        dataloader: iterable of (input_labels, pos_labels, neg_labels).
        device: device each batch is moved to before the forward pass.
        tag: label printed at the start of every progress line.
        epoch: epoch number printed in every progress line.
    """
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        input_labels = input_labels.long().to(device)
        pos_labels = pos_labels.long().to(device)
        neg_labels = neg_labels.long().to(device)

        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()
        loss.backward()
        optimizer.step()

        # Report progress every 100 batches.
        if i % 100 == 0:
            print(tag, epoch, "iteration", i, loss.item())


if __name__ == '__main__':
    # Batches must live on the same device as the models, which are moved to
    # the GPU only when USE_CUDA is true; an unconditional .cuda() would
    # crash on CPU-only machines.
    device = torch.device("cuda" if USE_CUDA else "cpu")

    for e in range(NUM_EPOCHS):
        train_one_epoch(model_cn, optimizer_cn, dataloader_cn, device, "cn_epoch", e)
        # The English pass is tagged "en_epoch"; it was previously logged
        # with the Chinese label by mistake.
        train_one_epoch(model_en, optimizer_en, dataloader_en, device, "en_epoch", e)

    torch.save(model_cn, "./model_cn.pth")
    torch.save(model_en, "./model_en.pth")

cn_epoch 0 iteration 0 4493.7255859375
cn_epoch 0 iteration 100 1563.85888671875
cn_epoch 0 iteration 200 1259.671142578125
cn_epoch 0 iteration 300 1009.2308959960938
cn_epoch 0 iteration 400 853.2382202148438
cn_epoch 0 iteration 500 774.4933471679688
cn_epoch 0 iteration 600 603.8095703125
cn_epoch 0 iteration 700 728.233154296875
cn_epoch 0 iteration 800 638.7422485351562
cn_epoch 0 iteration 900 628.965576171875
cn_epoch 0 iteration 1000 515.0325927734375
cn_epoch 0 iteration 1100 611.810546875
cn_epoch 0 iteration 1200 603.5675048828125
cn_epoch 0 iteration 1300 469.0502014160156
cn_epoch 0 iteration 1400 431.205810546875
cn_epoch 0 iteration 1500 397.4700927734375
cn_epoch 0 iteration 1600 410.4333190917969
cn_epoch 0 iteration 1700 427.3876037597656
cn_epoch 0 iteration 1800 323.62945556640625
cn_epoch 0 iteration 1900 372.12274169921875
cn_epoch 0 iteration 2000 311.14697265625
cn_epoch 0 iteration 2100 349.5462646484375
cn_epoch 0 iteration 2200 369.9750061035156
cn_epoch 0 i

In [111]:
# Save each model's input and output embedding layers to their own files,
# Chinese layers first, then English.
for _layer, _path in (
    (model_cn.in_embed, "./cn_in_embed.pth"),
    (model_cn.out_embed, "./cn_out_embed.pth"),
    (model_en.in_embed, "./en_in_embed.pth"),
    (model_en.out_embed, "./en_out_embed.pth"),
):
    torch.save(_layer, _path)