In [1]:
import collections
import math
import random
import sys
import time
import os
import numpy as np
import torch
from torch import nn
import torch.utils.data as Data

sys.path.append('..')
import d2lzh_pytorch as d2l

In [9]:
def load_data_from_file(file_path):
    with open(file_path, 'r') as fr:
        lines = fr.readlines()
    raw_dataset = [st.split() for st in lines]
    print('sentence: %d' % len(raw_dataset))
    return raw_dataset

def load_data_from_text(text):
    text = 'We are about to study the idea of a computational process.Computational processes are abstract beings that inhabit computers.As they evolve processes manipulate other abstract things called data'
    sts = text.split('.')
    raw_dataset = [st.split() for st in sts]
    return raw_dataset

    
def token_index(raw_dataset):
    counter = collections.Counter([tk for st in raw_dataset for tk in st])
    counter = dict(filter(lambda x: x[1] >=1, counter.items()))
    idx_to_token = [tk for tk,_ in counter.items()]
    token_to_idx = {tk: idx for idx,tk in enumerate(idx_to_token)}
    dataset = [[token_to_idx[tk] for tk in st if tk in token_to_idx] for st in raw_dataset]
    num_tokens = sum([len(st) for st in dataset])
    print('tokens: % d' % num_tokens)
    return dataset, counter, idx_to_token, token_to_idx, num_tokens

def discard(dataset, counter, num_tokens, idx_to_token, prop=1e-4):
    '''
    idx: 单词的数字索引
    prop: 超参数，默认为1e-4
    random.uniform(0,1)会随机产生0-1之间的数，当f_wi远大于prop，
    即该单词出现次数很高，它与总词数之比远大于prop，
    此时 1 - math.sqrt(prop / f_wi)近乎为1，
    不等式成立，返回True,该词被丢弃，反之返回False,保留。
    '''
    subsampled_dataset = []
    for st in dataset:
        subsampled_st = []
        for tk in st:
            f_wi = counter[idx_to_token[tk]] / num_tokens
            if not random.uniform(0,1) < 1 - math.sqrt(prop / f_wi):
                subsampled_st.append(tk)
        subsampled_dataset.append(subsampled_st)
    sub_num_tokens = sum([len(st) for st in subsampled_dataset])
    return subsampled_dataset, sub_num_tokens

def get_centers_and_contexts(dataset, max_window_size):
    centers, contexts = [], []
    for st in dataset:
        if len(st) < 2:
            continue
        centers += st
        for center_i in range(len(st)):
            window_size = random.randint(1, max_window_size)
            indices = list(range(max(0, center_i - window_size), min(len(st), center_i + window_size + 1)))
            indices.remove(center_i) # 去掉中心词
            contexts.append([st[idx] for idx in indices])
    return centers, contexts

def get_negative(all_contexts, sampling_weights, K):
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        negatives = []
        while len(negatives) < len(contexts) * K:
            if i == len(neg_candidates):
                i = 0
                # 从population中按照权重sampling_weights随机选取k(100000)个索引
                neg_candidates = random.choices(population, sampling_weights, k=int(1e5))

            neg = neg_candidates[i]
            i += 1
            if neg not in set(contexts): # 噪声词不能是背景词
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

def load_all_data(file_path):
    raw_dataset = load_data_from_file(file_path)
    dataset, counter, idx_to_token, token_to_idx, num_tokens = token_index(raw_dataset)
    subsampled_dataset, sub_num_tokens = discard(dataset, counter, num_tokens, idx_to_token, prop=1e-4)
    all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)
    sampling_weights = [(counter[w] / sub_num_tokens)**0.75 for w in idx_to_token]
    all_negatives = get_negative(all_contexts, sampling_weights, 5)
    return all_centers, all_contexts, all_negatives, idx_to_token, token_to_idx

def load_all_data2(text):
    raw_dataset = load_data_from_text(text)
    dataset, counter, idx_to_token, token_to_idx, num_tokens = token_index(raw_dataset)
    all_centers, all_contexts = get_centers_and_contexts(dataset, 5)
    weights = [(counter[w] / num_tokens)**0.75 for w in idx_to_token]
    all_negatives = get_negative(all_contexts, dataset, 5)
    return all_centers, all_contexts, all_negatives, idx_to_token, token_to_idx

def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    v = embed_v(center)
    u = embed_u(contexts_and_negatives)
    pred = torch.bmm(v, u.permute(0, 2, 1)) 
    return pred

class MyDataset(torch.utils.data.Dataset):
    def __init__(self, centers, contexts, negatives):
        assert len(centers) == len(contexts) == len(negatives)
        self.centers = centers
        self.contexts = contexts
        self.negatives = negatives
    def __getitem__(self, index):
        return (self.centers[index], self.contexts[index], self.negatives[index])
    def __len__(self):
        return len(self.centers)

def batch_data(data):
    """
    用作DataLoader的参数collate_fn
    data: 长为batch_size的list，list中的每个元素都是Dataset类调用__getitem__得到结果
    """
    max_len = max(len(c) + len(n) for _, c, n in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        cur_len = len(context) + len(negative)
        centers += [center]
        contexts_negatives += [context + negative + [0] * (max_len - cur_len)]
        masks += [[1]*cur_len + [0]*(max_len - cur_len)]
        labels += [[1]*len(context) + [0]*(max_len - len(context))]
    return (torch.tensor(centers).view(-1,1),
           torch.tensor(contexts_negatives),
           torch.tensor(masks), torch.tensor(labels))    

class SigmoidBinaryCrossEntropyLoss(nn.Module):
    def __init__(self):
        super(SigmoidBinaryCrossEntropyLoss, self).__init__()
    def forward(self, inputs, targets, mask=None):
        '''
        input：predict， Tensor shape of (batch_size, len)
        target：truth label，  Tensor of the same shape as input
        mask: 用于指定batch中参与损失函数计算的部分预测值和标签
        当掩码为1时，相应位置的预测值和标签将参与损失函数的计算；当掩码为0时，不参与计算
        '''
        inputs, targets, mask = inputs.float(), targets.float(), mask.float()
        
        res = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none", weight=mask)
        return res.mean(dim=1) * mask.shape[1] /mask.float().sum(dim=1)
    
def train(net, data_iter, loss, optimizer, num_epochs):
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("train on", device)
    net = net.to(device)
    
    for epoch in range(num_epochs):
        start, l_sum, n = time.time(), 0.0, 0
        for batch in data_iter:
            center, context_negative, mask, label = [d.to(device) for d in batch]
            pred = skip_gram(center, context_negative, net[0], net[1])
            l = loss(pred.view(label.shape), label, mask).mean() # 一个batch的平均loss
            optimizer.zero_grad() #梯度清零
            l.backward() # 计算梯度
            optimizer.step() # 权值更新
            l_sum += l.cpu().item()
            n += 1
        print('epoch %d, loss %.2f, time %.2fs' % (epoch + 1, l_sum / n, time.time() - start))
        
def train2(net, data_iter, loss, optimizer, num_epochs):
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("train on", device)
    net = net.to(device)
    
    for epoch in range(num_epochs):
        start, l_sum, n = time.time(), 0.0, 0
        for batch in data_iter:
            center, context_negative, mask, label = [d.to(device) for d in batch]
            pred = skip_gram(center, context_negative, net[0], net[1])
            l = loss(pred.view(label.shape), label, mask).mean() # 一个batch的平均loss
            optimizer.zero_grad() #梯度清零
            l.backward() # 计算梯度
            optimizer.step() # 权值更新
            l_sum += l.cpu().item()
            n += 1
        print('epoch %d, loss %.2f, time %.2fs' % (epoch + 1, l_sum / n, time.time() - start))
        

In [11]:
batch_size = 512
num_workers = 0
lr = 0.01
num_epochs = 100
# 读取数据
file_path = r'E:\ly\Code\Jupyter\Pytorch\Dive-into-DL-Pytorch\Datasets\ptb\ptb.train.txt'
all_centers, all_contexts, all_negatives, idx_to_token, token_to_idx = load_all_data(file_path)
dataset = MyDataset(all_centers, all_contexts, all_negatives)
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True, collate_fn=batch_data, num_workers=num_workers)
# 定义模型
embed_size = 100
net = nn.Sequential(
        nn.Embedding(num_embeddings=len(token_to_idx), embedding_dim=embed_size),
        nn.Embedding(num_embeddings=len(token_to_idx), embedding_dim=embed_size)
)
# 定义损失函数
loss = SigmoidBinaryCrossEntropyLoss()
# 定义优化函数
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
# 进行训练
train(net, data_iter, loss, optimizer, num_epochs)

sentence: 42068
tokens:  887521
train on cuda
epoch 1, loss 1.97, time 17.64s
epoch 2, loss 0.63, time 17.80s
epoch 3, loss 0.45, time 17.82s
epoch 4, loss 0.40, time 17.79s
epoch 5, loss 0.37, time 17.82s
epoch 6, loss 0.35, time 17.86s
epoch 7, loss 0.34, time 17.90s
epoch 8, loss 0.33, time 17.86s
epoch 9, loss 0.32, time 17.79s
epoch 10, loss 0.32, time 17.78s
epoch 11, loss 0.31, time 17.83s
epoch 12, loss 0.31, time 17.82s
epoch 13, loss 0.30, time 17.85s
epoch 14, loss 0.30, time 17.84s
epoch 15, loss 0.30, time 17.63s
epoch 16, loss 0.29, time 17.85s
epoch 17, loss 0.29, time 17.83s
epoch 18, loss 0.29, time 17.83s
epoch 19, loss 0.29, time 17.80s
epoch 20, loss 0.28, time 17.83s
epoch 21, loss 0.28, time 17.87s
epoch 22, loss 0.28, time 17.83s
epoch 23, loss 0.28, time 17.82s
epoch 24, loss 0.28, time 17.65s
epoch 25, loss 0.28, time 18.01s
epoch 26, loss 0.28, time 17.71s
epoch 27, loss 0.28, time 17.93s
epoch 28, loss 0.28, time 17.90s
epoch 29, loss 0.27, time 17.91s
epoch 

In [16]:
def get_similar_tokens(query_token, k, embed):
    W = embed.weight.data
    x = W[token_to_idx[query_token]]
    cos = torch.matmul(W, x) / (torch.sum(W * W, dim=1)
                                * torch.sum(x * x) + 1e-9).sqrt()
    _, topk = torch.topk(cos, k=k+1)
    topk = topk.cpu().numpy()
    for i in topk[1:]:
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))



cosine sim=0.416: candy
cosine sim=0.404: claimants
cosine sim=0.395: performed
cosine sim=0.387: consistent
cosine sim=0.386: labeled
cosine sim=0.384: borrowings
cosine sim=0.383: per
cosine sim=0.381: pilot
cosine sim=0.380: laid
cosine sim=0.378: threats


In [22]:
get_similar_tokens('chip', 10, net[0])

cosine sim=0.449: microsoft
cosine sim=0.435: hurdles
cosine sim=0.432: micro
cosine sim=0.407: retailer
cosine sim=0.406: mimic
cosine sim=0.403: users
cosine sim=0.397: intel
cosine sim=0.393: fast-growing
cosine sim=0.393: heights
cosine sim=0.382: radar


In [23]:
get_similar_tokens('asbestos', 10, net[0])

cosine sim=0.416: candy
cosine sim=0.404: claimants
cosine sim=0.395: performed
cosine sim=0.387: consistent
cosine sim=0.386: labeled
cosine sim=0.384: borrowings
cosine sim=0.383: per
cosine sim=0.381: pilot
cosine sim=0.380: laid
cosine sim=0.378: threats


In [21]:
W1 = net[0].weight.data
W2 = net[1].weight.data
x = W1[token_to_idx['asbestos']]
print(x.shape)
print(W2.shape)
pred = torch.matmul(x,W2.view(W2.shape[1], -1))
print(pred.shape)

_, topk = torch.topk(pred, 10)
topk = topk.cpu().numpy()
print(topk)

for i in topk[1:]:
    print(i,idx_to_token[i])

torch.Size([100])
torch.Size([9999, 100])
torch.Size([9999])
[5779 8932 9617 4615 5972 2661 6173 3049 3751 6307]
8932 divisive
9617 micro
4615 hired
5972 affecting
2661 lying
6173 fleet
3049 adding
3751 microwave
6307 honduras


#### 处理数据集

In [19]:
with open(r'E:\ly\Code\Jupyter\Pytorch\Dive-into-DL-Pytorch\Datasets\ptb\ptb.train.txt', 'r') as fr:
    lines = fr.readlines()
    raw_dataset = [st.split() for st in lines]
print('sentence: %d' % len(raw_dataset))

sentence: 42068


#### 建立词语索引

In [None]:
counter = collections.Counter([tk for st in raw_dataset for tk in st])
# 只保留在数据集中出现5次及以上的单词
counter = dict(filter(lambda x: x[1] >= 5, counter.items()))
# 将词映射到整数索引
idx_to_token = [tk for tk,_ in counter.items()]
token_to_idx = {tk: idx for idx,tk in enumerate(idx_to_token)}
dataset = [[token_to_idx[tk] for tk in st if tk in token_to_idx] for st in raw_dataset]
num_tokens = sum([len(st) for st in dataset])
print('tokens: % d' % num_tokens)

#### 二次采样
-  文本中一般会出现一些高频词，如"the",'a'等，这些词的高频出现对模型不利，
- 因此进行二次采样来丢弃这些高频词, 每个单词被丢弃的概率为：
- P(wi) = max(1-sqrt(t/f(wi), 0)

In [None]:
def discard(idx, prop=1e-4):
    '''
    idx: 单词的数字索引
    prop: 超参数，默认为1e-4
    random.uniform(0,1)会随机产生0-1之间的数，当f_wi远大于prop，
    即该单词出现次数很高，它与总词数之比远大于prop，
    此时 1 - math.sqrt(prop / f_wi)近乎为1，
    不等式成立，返回True,该词被丢弃，反之返回False,保留。
    '''
    f_wi  = counter[idx_to_token[idx]] / num_tokens
    return random.uniform(0,1) < 1 - math.sqrt(prop / f_wi)

In [None]:
subsampled_dataset = [[tk for tk in st if not discard(tk)] for st in dataset]
sub_num_tokens = sum([len(st) for st in subsampled_dataset])
print('subsampled_tokens: %d' % sub_num_tokens)

In [None]:
# 比较一个词在二次采样前后出现在数据集中的次数
def compare_counts(token):
    num_before = sum([st.count(token_to_idx[token]) for st in dataset])
    num_after = sum([st.count(token_to_idx[token]) for st in subsampled_dataset])
    return '%s: before=%d, after=%d' % (token, num_before, num_after)

compare_counts('the')

#### 提取中心词和背景词
在1-max_window_size之间随机选择一个整数作为窗口半径

In [None]:
def get_centers_and_contexts(dataset, max_window_size):
    centers, contexts = [], []
    for st in dataset:
        if len(st) < 2:
            continue
        centers += st
        for center_i in range(len(st)):
            window_size = random.randint(1, max_window_size)
            indices = list(range(max(0, center_i - window_size), min(len(st), center_i + window_size + 1)))
            indices.remove(center_i) # 去掉中心词
            contexts.append([st[idx] for idx in indices])
    return centers, contexts

In [None]:
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)

#### negative sampling 负采样
- 对于一对中心词和背景词，随机采样K个噪声词
- 噪声词采样率P(w)设为w词频与总词频之比的0.75次方

In [None]:
def get_negative(all_contexts, sampling_weights, K):
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        negatives = []
        while len(negatives) < len(contexts) * K:
            if i == len(neg_candidates):
                i = 0
                # 从population中按照权重sampling_weights随机选取K个索引
                neg_candidates = random.choices(population, sampling_weights, k=int(1e5))

            neg = neg_candidates[i]
            i += 1
            if neg not in set(contexts): # 噪声词不能是背景词
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives
sampling_weights = [(counter[w] / sub_num_tokens)**0.75 for w in idx_to_token]
all_negatives = get_negative(all_contexts, sampling_weights, 5)

#### 读取数据
- 在一个batch中，每个样本都包括一个中心词、ni个背景词 + mi个噪声词,"
- contexts_negatives: 因为ni + mi不一样大，因此在后面填充0来统一为长度max(ni+mi)，
- mask: contexts_negatives中为填充项时，mask中对应为0，其他为1
- labels: contexts_negatives中为contexts时，labels中对应位置为1，其他为0

In [None]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, centers, contexts, negatives):
        assert len(centers) == len(contexts) == len(negatives)
        self.centers = centers
        self.contexts = contexts
        self.negatives = negatives
    def __getitem__(self, index):
        return (self.centers[index], self.contexts[index], self.negatives[index])
    def __len__(self):
        return len(self.centers)

In [None]:
def batch_data(data):
    """
    用作DataLoader的参数collate_fn
    data: 长为batch_size的list，list中的每个元素都是Dataset类调用__getitem__得到结果
    """
    max_len = max(len(c) + len(n) for _, c, n in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        cur_len = len(context) + len(negative)
        centers += [center]
        contexts_negatives += [context + negative + [0] * (max_len - cur_len)]
        masks += [[1]*cur_len + [0]*(max_len - cur_len)]
        labels += [[1]*len(context) + [0]*(max_len - len(context))]
    return (torch.tensor(centers).view(-1,1),
           torch.tensor(contexts_negatives),
           torch.tensor(masks), torch.tensor(labels))

### Skip-gram Model

#### 小批量乘法
- 使用小批量乘法运算bmm对两个小批量中的矩阵一一做乘法
- (batch_size,1,embedding_dim) * (batch_size, max_len, embedding_dim) = (batch_size, 1, max_len)
- torch.bmm(X,Y)
- permut(将行列转置) (batch_size, max_len, embedding_dim) -> (batch_size, embedding_dim, max_len)

#### Skip-gram Forward Propagation

In [None]:
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    v = embed_v(center)
    u = embed_u(contexts_and_negatives)
    pred = torch.bmm(v, u.permute(0, 2, 1)) 
    return pred

#### 二元交叉熵损失函数

In [None]:
class SigmoidBinaryCrossEntropyLoss(nn.Module):
    def __init__(self):
        super(SigmoidBinaryCrossEntropyLoss, self).__init__()
    def forward(self, inputs, targets, mask=None):
        '''
        input：predict， Tensor shape of (batch_size, len)
        target：truth label，  Tensor of the same shape as input
        mask: 用于指定batch中参与损失函数计算的部分预测值和标签
        当掩码为1时，相应位置的预测值和标签将参与损失函数的计算；当掩码为0时，不参与计算
        '''
        inputs, targets, mask = inputs.float(), targets.float(), mask.float()
        
        res = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none", weight=mask)
        return res.mean(dim=1) * mask.shape[1] /mask.float().sum(dim=1)
    
loss = SigmoidBinaryCrossEntropyLoss()

#### 初始化模型参数
##### 嵌入层
- 用来获取词向量的层，通过创建nn.Embedding实例得到
- 嵌入层的权重是(num_embedding, embedding_dim)的矩阵，行数为词典大小，列数为每个词向量的维度
- 输入：词的索引i
- 输出：返回权重矩阵的第i行作为其词向量

In [None]:
embed_size = 100
net = nn.Sequential(
        nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size),
        nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size)
)

#### 定义训练函数

In [None]:
def train(net, lr, num_epochs):
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("train on", device)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    for epoch in range(num_epochs):
        start, l_sum, n = time.time(), 0.0, 0
        for batch in data_iter:
            center, context_negative, mask, label = [d.to(device) for d in batch]
            pred = skip_gram(center, context_negative, net[0], net[1])
            l = loss(pred.view(label.shape), label, mask).mean() # 一个batch的平均loss
            optimizer.zero_grad() #梯度清零
            l.backward() # 计算梯度
            optimizer.step() # 权值更新
            l_sum += l.cpu().item()
            n += 1
        print('epoch %d, loss %.2f, time %.2fs' % (epoch + 1, l_sum / n, time.time() - start))

In [None]:
batch_size = 512
num_workers = 0
dataset = MyDataset(all_centers, all_contexts, all_negatives)
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True, collate_fn=batch_data, num_workers=num_workers)
train(net, 0.01, 10)

#### 应用词嵌入模型
- 根据两个词向量的余弦相似度表示词与词之间在语义上的相似度

In [None]:
def get_similar_tokens(query_token, k, embed):
    W = embed.weight.data
    x = W[token_to_idx[query_token]]
    cos = torch.matmul(W, x) / (torch.sum(W * W, dim=1)
                                * torch.sum(x * x) + 1e-9).sqrt()
    _, topk = torch.topk(cos, k=k+1)
    topk = topk.cpu().numpy()
    for i in topk[1:]:
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))

get_similar_tokens('government', 5, net[0])

#### 测试模型