# 词向量
- 学习词向量的概念
- 用skip-thought模型训练词向量
- 学习使用pytorch dataset和dataloader
- 学习定义pytorch模型
- 学习torch.nn 中常见的Module
    - embedding
- 学习常见的pytorch operations
   - bmm
   - logsigmoid
- 保存和读取模型

## Distributed Representations of words and Phrases and their Compositionality
Skip-gram 模型，使用论文中的noice contrastive sampling的目标函数，没有实现subsampling：论文section2.3

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud

from collections import Counter
import numpy as np
import random
import math 

import pandas as pd
import scipy
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

USE_CUDA = torch.cuda.is_available()
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
if USE_CUDA:
    torch.cuda.manual_seed(1)
    
    
# 定义一些超参数
C = 3 # context window 周围三个单词
K = 100 # number of negatuve samples 每出现一个正确的词要出现100个错误的词。


BATCH_SIZE=128
NUM_EPOCHS=2
learning_rate = 0.2
MAX_VOCAB_SIZE = 30000
EMBEDDING_SIZE = 100

In [2]:
def wird_tokenize(text):
    return text.split()

- 从文本文件中读取所有文字，通过这些文本创建一个vocabulary
- 由于单词数量可能太大，我们只选取常见的MAX_VOCAB_SIZE个单词
- 添加一个UNK单词表示所有不常见的单词
- 需要记录单词到index的mapping，以及index到单词的mapping，单词的count，单词的(normalizaed) frequncy,以及单词总数

In [3]:
with open('data/text8.train.txt','r') as fin:
    text = fin.read()
    
text[:100]

'anarchism originated as a term of abuse first used against early working class radicals including th'

In [4]:
text = text.split()
vocab = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))
vocab['<unk>'] = len(text) - np.sum(list(vocab.values()))

In [5]:
idx_to_word = [word for word in vocab.keys()]
word_to_idx = {word:i for i,word in enumerate(idx_to_word)}

In [6]:
word_counts = np.array([count for count in vocab.values()], dtype=np.float32)
word_freqs = word_counts/np.sum(word_counts)

In [7]:
# 把概率提高到四分之三次方再 重新normalize
# 3/4次方之后，会将高概率的单词的概率值，分一部分给低概率的单词，因为相同的操作，对高概率单词的概率值影响更大
word_freqs = word_freqs**(3./4.)
# 归一化
word_freqs = word_freqs/np.sum(word_freqs)

VOCAB_SIZE = len(idx_to_word)

In [8]:
word_freqs

array([1.6247008e-02, 1.0514009e-02, 8.0499463e-03, ..., 5.0115582e-06,
       5.0115582e-06, 1.1669193e-02], dtype=float32)

# 实现Dataloader
一个dataloader 需要以下内容：
- 把所有text编码成数字，然后用subsampling预处理这些文字
- 保存vocabulary，单词count，normalized word frequency
- 每个iteration sample 一个中心词
- 根据当前的中心词返回context单词
- 根据中心词sample一些negative单词
- 返回单词的counts

为了使用dataloader 需要定义两个function:
- __len__ 需要返回整个数据集中有多少个item
- __get__ 根据给定的index返回一个item

有了dataloader后，可以轻松随机打乱整个数据集，拿到一个batch数据等等

In [9]:
# 创建一个dataset
class WordEmbeddingDataset(tud.Dataset):
    def __init__(self, text, word_to_idx, idx_to_word, word_freqs, word_counts):
        super(WordEmbeddingDataset,self).__init__()
        self.text_encoded = [word_to_idx.get(word,word_to_idx['<unk>']) for word in text]
        self.text_encoded = torch.LongTensor(self.text_encoded)
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_freqs = torch.Tensor(word_freqs)
        self.word_counts = torch.Tensor(word_counts)
    
    def __len__(self):
        # 这个数据集一共有多少个item
        return len(self.text_encoded)
        
        
    def __getitem__(self,idx):
#         center_word = self.text_encoded[idx]
#         pos_indexes = list(range(idx-C, idx)) + list(range(idx+1, idx+C+1)) 
#         pos_indices = [i%len(self.text_encoded) for i in pos_indices] 
#         pos_words = self.text_encoded[pos_indexes] 
#         neg_words = torch.multinomial(self.word_freqs, K * pos_words.shape[0],True) # 傅立采样单词
#         
    
        center_word = self.text_encoded[idx]
        pos_indices = list(range(idx-C, idx)) + list(range(idx+1, idx+C+1))# window内单词的index
        pos_indices = [i%len(self.text_encoded) for i in pos_indices]# 取余，防止超出text长度
        pos_words = self.text_encoded[pos_indices] # 周围单词
        neg_words = torch.multinomial(self.word_freqs, K * pos_words.shape[0], True)# 傅立采样单词
        return center_word,pos_words,neg_words

In [10]:
dataset = WordEmbeddingDataset(text, word_to_idx, idx_to_word, word_freqs, word_counts)
dataloader = tud.DataLoader(dataset,batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

In [11]:
# next(iter(dataloader))

In [12]:
class EmbeddingModel(nn.Module):
    # 差embedding multinomial bmm
    def __init__(self,vocab_size,embed_size):
        ''' 初始化输出和输出embedding
        '''
        super(EmbeddingModel,self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        
        initrange = 0.5 / self.embed_size
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=False)
        self.out_embed.weight.data.uniform_(-initrange, initrange)
        
        
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=False)
        self.in_embed.weight.data.uniform_(-initrange, initrange)
        
    def forward(self,input_labels,pos_labels,neg_labels):
        # input_label:[batch_size]
        # pos_labels:[batch_size,(window_size * 2)]
        # neg_labels:[batch_size,(window_size*2*K)]
        '''
        input_labels: 中心词, [batch_size]
        pos_labels: 中心词周围 context window 出现过的单词 [batch_size * (window_size * 2)]
        neg_labelss: 中心词周围没有出现过的单词，从 negative sampling 得到 [batch_size, (window_size * 2 * K)]
        
        return: loss, [batch_size]
        '''
        batch_size = input_labels.size(0)
        input_embedding = self.in_embed(input_labels) # [batch_size,embed_size]
        pos_embedding = self.out_embed(pos_embedding) # [batch_size,(window_size*2)]
        neg_embedding = self.out_embed(neg_embedding) # [batch_size,(window_size*2*k),embed_size]
        
        input_embedding = input_embedding.unsquuze(2) # [batch_size,embed_size,1]
        
        log_pos = torch.bmm(pos_embedding, input_embedding.unsqueeze(2)).squeeze() # B * (2*C)
        log_neg = torch.bmm(neg_embedding, -input_embedding.unsqueeze(2)).squeeze() # B * (2*C*K)

        log_pos = F.logsigmoid(log_pos).sum(1)
        log_neg = F.logsigmoid(log_neg).sum(1) # batch_size
       
        loss = log_pos + log_neg
        
        return -loss

IndentationError: expected an indented block (<ipython-input-12-87a8fc0fea77>, line 4)