In [1]:
# 查看当前挂载的数据集目录
!ls /home/kesci/input/

ptb_train1020


In [None]:
# 查看个人持久化工作区文件
!ls /home/kesci/work/

In [None]:
# 查看当前kernerl下的package
!pip list --format=columns

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
import math
import numpy as np
import random
import collections
import time

In [2]:
with open('/home/kesci/input/ptb_train1020/ptb.train.txt', 'r') as f:
    corpus = f.readlines()
    # print(corpus[:3])
    corpus = [line.strip().split(' ') for line in corpus]
    # print(corpus[:3])

def get_vocab(corpus, min_freq):
    tokens = [token for sentence in corpus for token in sentence]
    counter = collections.Counter(tokens)
    token_freq = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    idx2token = [token for token, freq in token_freq if freq >= min_freq]
    token2idx = {}
    for index, token in enumerate(idx2token):
        token2idx[token] = index
    vocab_size = len(idx2token)
    return counter, idx2token, token2idx, vocab_size

counter, idx2token, token2idx, vocab_size = get_vocab(corpus, 5)
vocab_size

    

9858

In [3]:
dataset = [[token2idx[token] for token in sentence if token in idx2token] for sentence in corpus]
num_tokens = sum([len(sentence) for sentence in dataset])

In [4]:
num_tokens

887100

In [5]:
dataset[:3]

[[],
 [8568, 1, 2, 71, 392, 32, 2115, 0, 145, 18, 5, 8569, 274, 406, 2],
 [22, 1, 12, 140, 3, 1, 5277, 0, 3054, 1580, 95]]

In [6]:
## 降采样
def discard(idx):
    return np.random.uniform(0, 1) < 1 - math.sqrt(1e-4 / counter[idx2token[idx]]* num_tokens)

subsampling_dataset = [[token for token in sentence if not discard(token)] for sentence in dataset]
subsampling_dataset[:3]

[[], [8568, 71, 392, 2115, 8569, 406], [140, 5277, 3054, 1580]]

In [7]:
def compare_num_tokens(token):
    before = sum([sentence.count(token2idx[token]) for sentence in dataset])
    after = sum([sentence.count(token2idx[token]) for sentence in subsampling_dataset])
    print('before subsampling, the number of %s is %s'%(token, before))
    print('after subsampling, the number of %s is %s'%(token, after))
compare_num_tokens('the')
compare_num_tokens('<unk>')
    

before subsampling, the number of the is 50770
after subsampling, the number of the is 2150
before subsampling, the number of <unk> is 45020
after subsampling, the number of <unk> is 2030


In [8]:
def get_context(sentence, center, max_window_size):
    window_size = np.random.randint(1, max_window_size + 1)
    start_idx = max(0, center - max_window_size)
    end_idx = min(len(sentence), center + max_window_size)
    return sentence[start_idx: center] + sentence[center + 1: end_idx + 1]

def get_contexts(dataset, max_window_size):
    contexts, centers = [], []
    for sentence in dataset:
        if len(sentence) < 2:
            continue
        centers += sentence
        for center_idx in range(len(sentence)):
            context = get_context(sentence, center_idx, max_window_size)
            contexts.append(context)
    return centers, contexts

all_centers, all_contexts = get_contexts(subsampling_dataset, 5)
print(len(all_centers)==len(all_contexts))

# 测试get_contexts函数
# tiny = [list(range(4,10)), list(range(3,8))]
# print(tiny)
# a, b = get_contexts(tiny, 2)
# for center, context in zip(a,b):
#     print('center %s -- context %s'%(center, context))



True


In [9]:
##trick  考虑到挑选出negative词之后还要舍去在contexts中的词，因此可以先选出很多negative，
##       然后按顺序放进negative列表中。

def get_negatives(contexts, sampling_weights, K):
    negatives, neg_candidates = [], []
    i = 0
    for context in contexts:
        negative = []
        while len(negative) < len(context) * K:
            if i == len(neg_candidates):
                neg_candidates = random.choices(list(range(len(sampling_weights))), sampling_weights, k=int(1e5))
                i = 0
            if neg_candidates[i] not in context:
                negative.append(neg_candidates[i])
            i += 1
        negatives.append(negative)
    return negatives

sampling_weights = [counter[token]**0.75 for token in idx2token]
all_negatives = get_negatives(all_contexts, sampling_weights, 5)

        
        
    
    

In [10]:
print(subsampling_dataset[1])
print(all_centers[0])
print(all_contexts[0], len(all_contexts[0]))
print(all_negatives[0], len(all_negatives[0]))
print(len(all_contexts) == len(all_negatives))

[8568, 71, 392, 2115, 8569, 406]
8568
[71, 392, 2115, 8569, 406] 5
[90, 8730, 0, 2904, 8, 4260, 358, 6343, 226, 18, 513, 2125, 1264, 1614, 29, 108, 4674, 2, 1153, 2832, 1425, 1725, 8468, 450, 5172] 25
True


In [11]:
def get_contexts_and_negatives(contexts, negatives):
    max_len = max([len(context)+len(negative) for context, negative in zip(contexts, negatives)])
    contexts_negatives, labels, masks = [], [], []
    for context, negative in zip(contexts, negatives):
        current_len = len(context) + len(negative)
        context_negative = context + negative + [0] * (max_len - current_len)
        label = [1] * len(context) + [0] * (max_len - len(context))
        mask = [1] * current_len + [0] * (max_len - current_len)
        contexts_negatives.append(context_negative)
        labels.append(label)
        masks.append(mask)
    return torch.tensor(contexts_negatives), torch.tensor(labels), torch.tensor(masks)

contexts_negatives, labels, masks = get_contexts_and_negatives(all_contexts, all_negatives)

# print(all_centers[1])
# print(all_contexts[1])
# print(all_negatives[1])
# print(contexts_negatives[1])
# print(labels[1])
# print(masks[1])
    

In [12]:
data = Data.TensorDataset(torch.tensor(all_centers), contexts_negatives, labels, masks)
data_iter = Data.DataLoader(data, batch_size=512, shuffle=True)

for batch in data_iter:
    print(batch[0].size(), batch[1].size(), batch[2].size(), batch[3].size())
    break
    

torch.Size([512]) torch.Size([512, 60]) torch.Size([512, 60]) torch.Size([512, 60])


In [13]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(SkipGram, self).__init__()
        self.input_embedding = nn.Embedding(vocab_size, embed_size)
        self.output_embedding = nn.Embedding(vocab_size, embed_size)
    def forward(self, center, target):
        center_embedding = self.input_embedding(center) #[batch_size, 1, embed_size]
        target_embedding = self.output_embedding(target) #[batch_size, K, embed_size]
        weight = torch.bmm(center_embedding, target_embedding.transpose(1, 2)) #[batch_size, 1, K]
        return weight

In [14]:
def train_skip_gram(num_epochs, net, optimizer, loss_fn):
    for epoch in range(num_epochs):
        train_loss = 0
        n = 0
        start = time.time()
        for center, context_negative, label, mask in data_iter:
            optimizer.zero_grad()
            weight = net(center.view(-1, 1), context_negative).squeeze(1)
            masked_weight = torch.mul(weight, mask)
            loss = (loss_fn(masked_weight.float(), label.float()).sum(dim=1)/mask.float().sum(dim=1)).mean()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            n += 1
        print('Epoch %s , Train Loss %.5f, Used Time %s'%(epoch, train_loss/n, time.time()-start))
            

In [30]:
net = SkipGram(vocab_size, 100)
loss_fn = nn.BCEWithLogitsLoss(reduction='none')
optimizer = optim.Adam(net.parameters(), lr=0.01)
train_skip_gram(15, net, optimizer, loss_fn)

Epoch 0 , Train Loss 2.21965, Used Time 68.63986110687256
Epoch 1 , Train Loss 1.05805, Used Time 69.09787106513977
Epoch 2 , Train Loss 0.93120, Used Time 68.80610537528992
Epoch 3 , Train Loss 0.89106, Used Time 68.78967523574829
Epoch 4 , Train Loss 0.87243, Used Time 69.08850312232971
Epoch 5 , Train Loss 0.86118, Used Time 69.01385641098022
Epoch 6 , Train Loss 0.85283, Used Time 68.99070835113525
Epoch 7 , Train Loss 0.84597, Used Time 68.91781497001648
Epoch 8 , Train Loss 0.84019, Used Time 69.29813933372498
Epoch 9 , Train Loss 0.83522, Used Time 69.03603482246399
Epoch 10 , Train Loss 0.83102, Used Time 68.72866415977478
Epoch 11 , Train Loss 0.82750, Used Time 68.41314888000488
Epoch 12 , Train Loss 0.82441, Used Time 69.18851280212402
Epoch 13 , Train Loss 0.82176, Used Time 69.12681889533997
Epoch 14 , Train Loss 0.81948, Used Time 69.20515251159668


In [36]:
def get_similar_words(token, embed, k):
    token_embedding = embed[token2idx[token]].view(-1, 1)
    weight = torch.matmul(embed, token_embedding).squeeze(1)/torch.sqrt(torch.sum(torch.mul(embed, embed), dim=1)*torch.sum(torch.mul(token_embedding, token_embedding))+1e-9)
    values, topks = torch.topk(weight, k+1)
    for i in range(k+1):
        print('cosine similarity = %s, word is %s'%(values[i], idx2token[topks[i]]))
        

In [37]:
embed = net.input_embedding.weight.data
get_similar_words('chip', embed, 5)

cosine similarity = tensor(1.), word is chip
cosine similarity = tensor(0.5681), word is intel
cosine similarity = tensor(0.5476), word is chips
cosine similarity = tensor(0.4699), word is microprocessor
cosine similarity = tensor(0.4669), word is bugs
cosine similarity = tensor(0.4593), word is tasks
