In [1]:
import os
import random
import io 
import sys 
import requests
from collections import OrderedDict 
import math 
import numpy as np 
import paddle

In [2]:
def download():
    """Download the text8 corpus and store it as ``./text8.txt``.

    The file is fetched over HTTP with ``requests`` and written in binary
    mode, overwriting any existing ``./text8.txt``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeouts.
    """
    corpus_url='https://dataset.bj.bcebos.com/word2vec/text8.txt'
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    web_requests=requests.get(corpus_url,timeout=60)
    # Fail loudly rather than saving an HTTP error page as the corpus.
    web_requests.raise_for_status()
    corpus=web_requests.content
    # The with-statement closes the file; no explicit close() is needed.
    with open('./text8.txt','wb') as f:
        f.write(corpus)
    

In [3]:
# Fetch the corpus; the file ./text8.txt is (re)written on every run of this cell.
download()

In [4]:
def load_text8():
    """Read ``./text8.txt`` and return its content as one string.

    Leading and trailing newline characters are stripped; other whitespace
    (such as a leading space) is kept.

    Returns:
        str: the corpus text.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open('./text8.txt',encoding='utf-8') as f:
        return f.read().strip('\n')

In [5]:
# Load the whole corpus into memory as a single string.
corpus=load_text8()

In [6]:
# Peek at the first 500 characters of the raw text.
corpus[:500]

' anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philoso'

In [7]:
def data_preprocess(corpus):
    """Lower-case the raw text and split it into a list of word tokens.

    Splitting is done on any run of whitespace, so repeated spaces, tabs or
    newlines never produce empty-string tokens, and an empty input yields an
    empty list.

    Args:
        corpus: the raw corpus text as a single string.

    Returns:
        list[str]: the lower-cased tokens in their original order.
    """
    # split() without arguments also discards leading/trailing whitespace.
    return corpus.lower().split()

In [8]:
# Rebinds corpus from the raw string to a list of lower-cased tokens.
corpus=data_preprocess(corpus)

In [9]:
# Show the first 50 tokens.
corpus[:50]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against',
 'early',
 'working',
 'class',
 'radicals',
 'including',
 'the',
 'diggers',
 'of',
 'the',
 'english',
 'revolution',
 'and',
 'the',
 'sans',
 'culottes',
 'of',
 'the',
 'french',
 'revolution',
 'whilst',
 'the',
 'term',
 'is',
 'still',
 'used',
 'in',
 'a',
 'pejorative',
 'way',
 'to',
 'describe',
 'any',
 'act',
 'that',
 'used',
 'violent',
 'means',
 'to',
 'destroy',
 'the']

## 构造词典

In [10]:
def build_dict(corpus):
    """Build frequency-ranked vocabulary tables from a token sequence.

    Tokens are counted and then ranked by descending count; tokens with equal
    counts keep the order in which they first appeared. Ids are the rank
    positions, so id 0 is the most frequent token.

    Args:
        corpus: iterable of hashable tokens.

    Returns:
        tuple: ``(id2freq, word2id, id2word)`` where ``id2freq`` maps id to
        count, ``word2id`` maps token to id and ``id2word`` maps id to token.
    """
    counts={}
    for token in corpus:
        counts[token]=counts.get(token,0)+1

    # Stable sort: ties stay in first-seen order.
    ranked=sorted(counts.items(),key=lambda item:item[1],reverse=True)

    word2id={}
    id2freq={}
    id2word={}
    for token_id,(token,count) in enumerate(ranked):
        word2id[token]=token_id
        id2freq[token_id]=count
        id2word[token_id]=token

    return id2freq,word2id,id2word

In [11]:
# Build the frequency-ranked vocabulary tables from the tokenised corpus.
id2freq,word2id,id2word=build_dict(corpus)

In [12]:
# Number of distinct tokens (id2freq holds one entry per token id).
vocab_size=len(id2freq)

In [13]:
# Display the vocabulary size.
vocab_size

253854

In [14]:
# Print the first 8 vocabulary entries. Zipping with range(8) stops the
# iteration after 8 items; word2id was filled in descending-frequency order,
# so these are the 8 most frequent words.
for _,(word,word_id) in zip(range(8),word2id.items()):
    print(f'word:{word},its id:{word_id},its freq:{id2freq[word_id]}')

word:the,its id:0,its freq:1061396
word:of,its id:1,its freq:593677
word:and,its id:2,its freq:416629
word:one,its id:3,its freq:411764
word:in,its id:4,its freq:372201
word:a,its id:5,its freq:325873
word:to,its id:6,its freq:316376
word:zero,its id:7,its freq:264975


## 转化为id序列

In [15]:
# Replace every token with its integer id.
# NOTE: this rebinds corpus from a list of strings to a list of ints, so
# re-running this cell on its own would look up ints in word2id; re-run the
# preprocessing cell first.
corpus=[word2id[word] for word in corpus]

In [16]:
# First 8 token ids.
corpus[:8]

[5233, 3080, 11, 5, 194, 1, 3133, 45]

## 二次采样

In [17]:
def subsampling(corpus,id2freq):
    """Randomly drop tokens, dropping frequent ones more often.

    Each token is discarded with probability
    ``1 - sqrt(1e-4 / (id2freq[token] / len(corpus)))``. When that value is
    zero or negative, the token is always kept.

    Args:
        corpus: list of token ids.
        id2freq: mapping from token id to its count.

    Returns:
        list: the surviving token ids in their original order.
    """
    def keep(token_id):
        # One random draw per token, compared against its discard probability.
        drop_prob=1-math.sqrt(1e-4/id2freq[token_id]*len(corpus))
        draw=random.uniform(0,1)
        return not (draw<drop_prob)

    survivors=[]
    for token_id in corpus:
        if keep(token_id):
            survivors.append(token_id)
    return survivors

In [18]:
# Randomly thin out the corpus. The result depends on the state of the
# `random` module, and corpus is rebound, so re-running this cell subsamples
# the already-subsampled list again.
corpus=subsampling(corpus,id2freq)
# Report how many tokens survived and show the first few.
print(len(corpus))
print(corpus[:8])

8744358
[5233, 3080, 194, 3133, 741, 10571, 133, 27349]


## 构造数据集

In [19]:
def build_data(corpus,word2id,id2freq,max_window_size=3,negative_sample_num=4):
    """Build positive and negative training samples from a corpus of token ids.

    For each visited center position a window size is drawn uniformly from
    ``[1, max_window_size]``. Every other token in that window yields one
    positive sample ``(center, context, 1)``, followed by
    ``negative_sample_num`` negative samples ``(center, random_id, 0)``. A
    negative id is drawn uniformly from the vocabulary and is never equal to
    the context word of its positive sample.

    After a position is processed the center advances by the drawn window
    size, not by one, so not every position is used as a center. The last
    position only ends the loop and is never a center.

    Args:
        corpus: list of integer token ids.
        word2id: token-to-id mapping (not used by this function).
        id2freq: id-to-count mapping; its length is the vocabulary size.
        max_window_size: largest window radius that can be drawn.
        negative_sample_num: negatives generated per positive sample.

    Returns:
        list[tuple]: ``(center_id, other_id, label)`` triples.
    """
    # Derive the vocabulary size from the arguments, not from a notebook global.
    vocab_size=len(id2freq)
    corpus_len=len(corpus)
    dataset=[]
    center_word_idx=0

    while center_word_idx<corpus_len:
        window_size=random.randint(1,max_window_size)
        positive_word=corpus[center_word_idx]

        # Clamp the window to the corpus bounds and exclude the center itself.
        window_start=max(0,center_word_idx-window_size)
        window_end=min(corpus_len-1,center_word_idx+window_size)
        context_word_candidates=[corpus[idx] for idx in range(window_start,window_end+1) if idx!=center_word_idx]

        for context_word in context_word_candidates:
            dataset.append((positive_word,context_word,1))
            i=0
            while i<negative_sample_num:
                negative_word_candidate=random.randint(0,vocab_size-1)
                # Compare by value: `is not` tests object identity, which is
                # true for equal ints outside the small-int cache.
                if negative_word_candidate!=context_word:
                    dataset.append((positive_word,negative_word_candidate,0))
                    i+=1

        # Jump ahead by the window size; reaching the last index ends the loop.
        center_word_idx=min(corpus_len-1,center_word_idx+window_size)
        if center_word_idx==corpus_len-1:
            center_word_idx+=1
        # Progress is printed only when the index lands exactly on a multiple of 100000.
        if center_word_idx%100000==0:
            print(center_word_idx)
    return dataset
    

In [20]:
# Build the (center, other, label) training samples; build_data prints the
# center index whenever it lands exactly on a multiple of 100000.
dataset=build_data(corpus,word2id,id2freq)

100000
300000
400000
500000
600000
700000
1000000
1100000
1300000
1600000
1700000
2100000
2300000
2400000
2500000
2600000
2700000
2800000
3200000
3400000
3600000
3700000
3800000
4200000
4400000
4600000
4700000
4900000
5000000
5300000
5400000
5600000
5700000
5800000
5900000
6000000
6100000
6200000
6400000
6500000
6700000
6900000
7100000
7300000
7400000
7600000
7700000
8000000
8600000


In [21]:
# Print the first 11 samples; zipping with range(11) stops the iteration early.
for _,data in zip(range(11),dataset):
    print(data)

(5233, 3080, 1)
(5233, 19129, 0)
(5233, 210594, 0)
(5233, 211012, 0)
(5233, 119091, 0)
(5233, 194, 1)
(5233, 41339, 0)
(5233, 185695, 0)
(5233, 127233, 0)
(5233, 221628, 0)
(194, 5233, 1)


## 构造数据生成器

In [22]:
def build_batch(dataset,batch_size,epoch_num):
    """Yield mini-batches of samples as NumPy arrays for several epochs.

    ``dataset`` is shuffled in place at the start of every epoch. Samples are
    accumulated until ``batch_size`` is reached and then yielded. A partially
    filled batch is carried over into the next epoch and only flushed once,
    after the final epoch.

    Args:
        dataset: list of ``(first_id, second_id, label)`` triples; it is
            reordered by this function.
        batch_size: number of samples per yielded batch.
        epoch_num: number of passes over ``dataset``.

    Yields:
        tuple: two int64 arrays of shape ``(n, 1)`` holding the first and
        second ids, and a float32 array of shape ``(n,)`` holding the labels.
    """
    def pack(first_ids,second_ids,labels):
        # Convert the accumulated Python lists into typed arrays.
        return (np.array(first_ids).astype('int64'),
                np.array(second_ids).astype('int64'),
                np.array(labels).astype('float32'))

    first_ids,second_ids,labels=[],[],[]

    for _ in range(epoch_num):
        random.shuffle(dataset)
        for sample in dataset:
            first_id,second_id,flag=sample
            first_ids.append([first_id])
            second_ids.append([second_id])
            labels.append(flag)

            if len(first_ids)==batch_size:
                yield pack(first_ids,second_ids,labels)
                first_ids,second_ids,labels=[],[],[]

    # Flush whatever is left after the last epoch.
    if first_ids:
        yield pack(first_ids,second_ids,labels)
            

## 模型配置

In [23]:
class CBOW(paddle.nn.Layer):
    """Two-table embedding model scored by a sigmoid of a dot product.

    The model keeps two embedding tables of identical shape. ``forward`` looks
    up one vector in each table per sample, takes their dot product and
    trains it against a binary label.

    Args:
        vocab_size: number of rows in each embedding table.
        embedding_size: dimensionality of each embedding vector.
        init_scale: accepted for interface compatibility but not used; both
            tables are initialised uniformly in
            ``[-0.5/embedding_size, 0.5/embedding_size]``.
    """
    def __init__(self,vocab_size,embedding_size,init_scale=0.1):
        super(CBOW,self).__init__()
        self.vocab_size=vocab_size
        self.embedding_size=embedding_size
        # Table used for the first id tensor passed to forward().
        self.embedding=self._make_embedding('embedding_para114514')
        # Table used for the second id tensor passed to forward().
        self.embedding_out=self._make_embedding('embedding_out_para114514')

    def _make_embedding(self,param_name):
        """Create one uniformly initialised embedding table with the given parameter name."""
        bound=0.5/self.embedding_size
        return paddle.nn.Embedding(
            self.vocab_size,
            self.embedding_size,
            weight_attr=paddle.ParamAttr(
                name=param_name,
                initializer=paddle.nn.initializer.Uniform(low=-bound,high=bound)
            )
        )

    def forward(self,context_words,target_words,label):
        """Score id pairs and compute the mean binary cross-entropy loss.

        Args:
            context_words: integer id tensor looked up in ``self.embedding``
                (assumed shape (batch, 1) as produced by the batch generator
                -- TODO confirm).
            target_words: integer id tensor looked up in ``self.embedding_out``.
            label: float tensor of 0/1 targets, one per sample.

        Returns:
            tuple: ``(pred, loss)`` where ``pred`` is the flattened sigmoid
            score per sample and ``loss`` is the mean-reduced loss.
        """
        context_words_emb=self.embedding(context_words)
        target_words_emb=self.embedding_out(target_words)
        # Dot product over the embedding axis, flattened to one logit per sample.
        word_sim=paddle.sum(paddle.multiply(context_words_emb,target_words_emb),axis=-1)
        word_sim=paddle.reshape(word_sim,shape=[-1])
        pred=paddle.nn.functional.sigmoid(word_sim)
        # Compute the loss from the logits rather than from the sigmoid output,
        # which stays numerically stable when the sigmoid saturates. The
        # default reduction is already 'mean', so no extra paddle.mean is needed.
        loss=paddle.nn.functional.binary_cross_entropy_with_logits(word_sim,label)
        return pred,loss


## 训练

In [24]:
# Training configuration.
batch_size=512  # samples per mini-batch passed to build_batch
epoch_num=3  # passes over the dataset
embedding_size=200  # dimensionality of each word vector
step=0  # global step counter, incremented once per batch in the training loop
learning_rate=0.001  # learning rate handed to the Adam optimizer

def get_cos(query1_token,query2_token,embed):
    """Print and return the cosine similarity between two word vectors.

    The rows of ``embed`` are selected through the module-level ``word2id``
    mapping. A small constant (1e-9) is added under the square root so
    all-zero vectors do not cause a division by zero.

    Args:
        query1_token: first word; must be a key of ``word2id``.
        query2_token: second word; must be a key of ``word2id``.
        embed: 2-D NumPy array indexed by word id along the first axis.

    Returns:
        The cosine similarity as a NumPy scalar.

    Raises:
        KeyError: if either token is missing from ``word2id``.
    """
    x=embed[word2id[query1_token]]
    y=embed[word2id[query2_token]]
    cos=np.dot(x,y)/np.sqrt(np.sum(y*y)*np.sum(x*x)+1e-9)
    print(f"{query1_token}和{query2_token}的cos结果为{cos}")
    # Returned as well as printed so callers can log or compare the value.
    return cos

In [25]:
# Instantiate the model and an Adam optimizer over all of its parameters.
skip_gram_model=CBOW(vocab_size,embedding_size)
adam=paddle.optimizer.Adam(learning_rate=learning_rate,parameters=skip_gram_model.parameters())

In [26]:

# Training loop: one optimizer update per mini-batch.
for context_words,target_words,label in build_batch(dataset,batch_size,epoch_num):
    context_words_var=paddle.to_tensor(context_words)
    target_words_var=paddle.to_tensor(target_words)
    label_var=paddle.to_tensor(label)

    pred,loss=skip_gram_model(context_words_var,target_words_var,label_var)

    # Back-propagate, apply the update, then reset gradients for the next batch.
    loss.backward()
    adam.minimize(loss)
    skip_gram_model.clear_gradients()

    step+=1
    if step%100==0:
        # ndarray.item() works for a shape-[1] array and for a 0-D array,
        # whereas indexing with [0] fails on a 0-D loss.
        print(f'step {step},loss {loss.numpy().item()}')

    if step%2000==0:
        # Checkpoint the input embedding table and print a few similarity probes.
        embedding_matrix=skip_gram_model.embedding.weight.numpy()
        np.save('./embedding',embedding_matrix)
        for query1_token,query2_token in (
            ("king","queen"),
            ('she','her'),
            ('topic','theme'),
            ('woman','game'),
            ('one','name'),
        ):
            get_cos(query1_token,query2_token,embedding_matrix)

step 100,loss 0.6931471228599548
step 200,loss 0.6931614875793457
step 300,loss 0.6929924488067627
step 400,loss 0.6926132440567017
step 500,loss 0.6902071237564087
step 600,loss 0.6881276369094849
step 700,loss 0.6812677383422852
step 800,loss 0.6705127358436584
step 900,loss 0.6613295078277588
step 1000,loss 0.6545624732971191
step 1100,loss 0.6315697431564331
step 1200,loss 0.6074579954147339
step 1300,loss 0.582335352897644
step 1400,loss 0.5852804780006409
step 1500,loss 0.5580507516860962
step 1600,loss 0.5468419194221497
step 1700,loss 0.4965466558933258
step 1800,loss 0.4820517897605896
step 1900,loss 0.47042879462242126
step 2000,loss 0.48551705479621887
king和queen的cos结果为0.9099274142551904
she和her的cos结果为0.9551867465448861
topic和theme的cos结果为0.9095779579951178
woman和game的cos结果为0.9152326551774839
one和name的cos结果为0.9502887990502381
step 2100,loss 0.4493476450443268
step 2200,loss 0.4242601990699768
step 2300,loss 0.42638760805130005
step 2400,loss 0.4001958668231964
step 2500,loss 

KeyboardInterrupt: 