In [1]:
import collections
import math
import os
import random
import zipfile
import numpy as np
import urllib
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
url='http://mattmahoney.net/dc/'

def maybe_download(filename,expected_bytes):
    if not os.path.exists(filename):
        filename,_=urllib.request.urlretrieve(url+filename,filename)
    statinfo=os.stat(filename)
    if statinfo.st_size==expected_bytes:
        print('File existed and verified ',filename)
    else:
        print(statinfo.st_size)
        raise Exception('Failed to verify '+filename)
    return filename

filename=maybe_download('text8.zip',31344016)

File and verified  text8.zip


In [3]:
def read_data(filename):
    with zipfile.ZipFile(filename) as fin:
        data=tf.compat.as_str(fin.read(fin.namelist()[0])).split()
    return data

In [4]:
words=read_data(filename)
print('Data size',len(words))

Data size 17005207


In [7]:
words

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against',
 'early',
 'working',
 'class',
 'radicals',
 'including',
 'the',
 'diggers',
 'of',
 'the',
 'english',
 'revolution',
 'and',
 'the',
 'sans',
 'culottes',
 'of',
 'the',
 'french',
 'revolution',
 'whilst',
 'the',
 'term',
 'is',
 'still',
 'used',
 'in',
 'a',
 'pejorative',
 'way',
 'to',
 'describe',
 'any',
 'act',
 'that',
 'used',
 'violent',
 'means',
 'to',
 'destroy',
 'the',
 'organization',
 'of',
 'society',
 'it',
 'has',
 'also',
 'been',
 'taken',
 'up',
 'as',
 'a',
 'positive',
 'label',
 'by',
 'self',
 'defined',
 'anarchists',
 'the',
 'word',
 'anarchism',
 'is',
 'derived',
 'from',
 'the',
 'greek',
 'without',
 'archons',
 'ruler',
 'chief',
 'king',
 'anarchism',
 'as',
 'a',
 'political',
 'philosophy',
 'is',
 'the',
 'belief',
 'that',
 'rulers',
 'are',
 'unnecessary',
 'and',
 'should',
 'be',
 'abolished',
 'although',
 'there',
 'are',
 'differing',
 '

In [9]:
len(set(words))

253854

In [16]:
vocabulary_size=50000
def build_dataset(words):
    count=[['UNK',-1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size-1))
    dictionary=dict()
    for word,_ in count:
        dictionary[word]=len(dictionary)
    data=list()
    unk_count=0
    for word in words:
        if word in dictionary:
            index=dictionary[word]
        else:
            index=0
            unk_count+=1
        data.append(index)  # 出现次数越高，序号越小，data为words每个词对应的序号
    count[0][1]=unk_count
    reverse_dictionary=dict(zip(dictionary.values(),dictionary.keys()))
    
    return data,count,dictionary,reverse_dictionary

In [17]:
x=[1,2,2,3,3,3]
collections.Counter(x).most_common(3)

[(3, 3), (2, 2), (1, 1)]

In [18]:
data,count,dictionary,reverse_dictionary=build_dataset(words)

In [10]:
data

[5234,
 3081,
 12,
 6,
 195,
 2,
 3134,
 46,
 59,
 156,
 128,
 742,
 477,
 10572,
 134,
 1,
 27350,
 2,
 1,
 103,
 855,
 3,
 1,
 15068,
 0,
 2,
 1,
 151,
 855,
 3581,
 1,
 195,
 11,
 191,
 59,
 5,
 6,
 10713,
 215,
 7,
 1325,
 105,
 455,
 20,
 59,
 2732,
 363,
 7,
 3673,
 1,
 709,
 2,
 372,
 27,
 41,
 37,
 54,
 540,
 98,
 12,
 6,
 1424,
 2758,
 19,
 568,
 687,
 7089,
 1,
 248,
 5234,
 11,
 1053,
 28,
 1,
 321,
 249,
 44612,
 2878,
 793,
 187,
 5234,
 12,
 6,
 201,
 603,
 11,
 1,
 1135,
 20,
 2622,
 26,
 8984,
 3,
 280,
 32,
 4148,
 142,
 60,
 26,
 6438,
 4187,
 2,
 154,
 33,
 363,
 5234,
 37,
 1138,
 7,
 448,
 345,
 1819,
 20,
 4861,
 1,
 6754,
 2,
 7574,
 1775,
 567,
 1,
 94,
 1,
 248,
 11065,
 12,
 52,
 7089,
 90,
 27,
 271,
 38,
 5949,
 4862,
 20300,
 29,
 0,
 42,
 318,
 6,
 25637,
 528,
 7574,
 372,
 5,
 259,
 2,
 154,
 26,
 1207,
 12,
 7574,
 201,
 1577,
 3,
 15201,
 333,
 1775,
 7089,
 4861,
 345,
 765,
 161,
 407,
 5691,
 756,
 2,
 4106,
 1132,
 4332,
 1537,
 3,
 568,
 8118,
 99

In [13]:
dictionary

{'UNK': 0,
 'the': 1,
 'of': 2,
 'and': 3,
 'one': 4,
 'in': 5,
 'a': 6,
 'to': 7,
 'zero': 8,
 'nine': 9,
 'two': 10,
 'is': 11,
 'as': 12,
 'eight': 13,
 'for': 14,
 's': 15,
 'five': 16,
 'three': 17,
 'was': 18,
 'by': 19,
 'that': 20,
 'four': 21,
 'six': 22,
 'seven': 23,
 'with': 24,
 'on': 25,
 'are': 26,
 'it': 27,
 'from': 28,
 'or': 29,
 'his': 30,
 'an': 31,
 'be': 32,
 'this': 33,
 'which': 34,
 'at': 35,
 'he': 36,
 'also': 37,
 'not': 38,
 'have': 39,
 'were': 40,
 'has': 41,
 'but': 42,
 'other': 43,
 'their': 44,
 'its': 45,
 'first': 46,
 'they': 47,
 'some': 48,
 'had': 49,
 'all': 50,
 'more': 51,
 'most': 52,
 'can': 53,
 'been': 54,
 'such': 55,
 'many': 56,
 'who': 57,
 'new': 58,
 'used': 59,
 'there': 60,
 'after': 61,
 'when': 62,
 'into': 63,
 'american': 64,
 'time': 65,
 'these': 66,
 'only': 67,
 'see': 68,
 'may': 69,
 'than': 70,
 'world': 71,
 'i': 72,
 'b': 73,
 'would': 74,
 'd': 75,
 'no': 76,
 'however': 77,
 'between': 78,
 'about': 79,
 'over': 80

In [14]:
reverse_dictionary

{0: 'UNK',
 1: 'the',
 2: 'of',
 3: 'and',
 4: 'one',
 5: 'in',
 6: 'a',
 7: 'to',
 8: 'zero',
 9: 'nine',
 10: 'two',
 11: 'is',
 12: 'as',
 13: 'eight',
 14: 'for',
 15: 's',
 16: 'five',
 17: 'three',
 18: 'was',
 19: 'by',
 20: 'that',
 21: 'four',
 22: 'six',
 23: 'seven',
 24: 'with',
 25: 'on',
 26: 'are',
 27: 'it',
 28: 'from',
 29: 'or',
 30: 'his',
 31: 'an',
 32: 'be',
 33: 'this',
 34: 'which',
 35: 'at',
 36: 'he',
 37: 'also',
 38: 'not',
 39: 'have',
 40: 'were',
 41: 'has',
 42: 'but',
 43: 'other',
 44: 'their',
 45: 'its',
 46: 'first',
 47: 'they',
 48: 'some',
 49: 'had',
 50: 'all',
 51: 'more',
 52: 'most',
 53: 'can',
 54: 'been',
 55: 'such',
 56: 'many',
 57: 'who',
 58: 'new',
 59: 'used',
 60: 'there',
 61: 'after',
 62: 'when',
 63: 'into',
 64: 'american',
 65: 'time',
 66: 'these',
 67: 'only',
 68: 'see',
 69: 'may',
 70: 'than',
 71: 'world',
 72: 'i',
 73: 'b',
 74: 'would',
 75: 'd',
 76: 'no',
 77: 'however',
 78: 'between',
 79: 'about',
 80: 'over'

In [4]:
np.ndarray(shape=(10,1),dtype=np.int32)

array([[ 889555688],
       [       514],
       [1583115840],
       [         0],
       [1583115968],
       [         0],
       [         0],
       [         0],
       [         0],
       [         0]])

In [5]:
np.ndarray(shape=(10),dtype=np.int32)

array([ 886692752,        514, 1583115840,          0, 1583115968,
                0, 1583116096,          0, 1029034952,        514])

In [21]:
data_index=0

def generate_batch(batch_size,num_skips,skip_window):
    '''
    skip_window: 单词最远可联系的距离
    num_skips: 对每个单词生成样本数量，num_skips<=2*skip_window
    batch_size应是num_skips的整数倍
    span是某个单词创建相关样本时使用的单词个数，包括目标单词本身和其前后的单词，因此span=2*skip_window+1
    '''
    global data_index
    assert batch_size%num_skips==0
    assert num_skips<=2*skip_window
    batch=np.ndarray(shape=(batch_size),dtype=np.int32)
    labels=np.ndarray(shape=(batch_size,1),dtype=np.int32)
    span=2*skip_window+1
    buffer=collections.deque(maxlen=span)
    
    for _ in range(span):
        buffer.append(data[data_index])
        data_index=(data_index+1)%len(data)
    for i in range(batch_size//num_skips):
        target=skip_window
        targets_to_avoid=[skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target=random.randint(0,span-1)
            targets_to_avoid.append(target)
            batch[i*num_skips+j]=buffer[skip_window]  # feature, 目标单词
            labels[i*num_skips+j,0]=buffer[target]  # label， 语境单词/目标单词上下文
        buffer.append(data[data_index])
        data_index=(data_index+1)%len(data)
        
    return batch,labels

In [23]:
batch,labels=generate_batch(batch_size=8,num_skips=2,skip_window=1)
for i in range(8):
    print(batch[i],reverse_dictionary[batch[i]],'->',labels[i,0],reverse_dictionary[labels[i,0]])

59 used -> 46 first
59 used -> 156 against
156 against -> 128 early
156 against -> 59 used
128 early -> 742 working
128 early -> 156 against
742 working -> 477 class
742 working -> 128 early


In [27]:
batch_size=128
embedding_size=128
skip_window=1
num_skips=2

valid_size=16
valid_window=100
valid_examples=np.random.choice(valid_window,valid_size,replace=False)
num_sampled=64

In [26]:
np.random.choice(10,3,replace=False)

array([8, 0, 3])

In [30]:
graph=tf.Graph()
with graph.as_default():
    train_inputs=tf.placeholder(tf.int32,shape=[batch_size])
    train_labels=tf.placeholder(tf.int32,shape=[batch_size,1])
    valid_dataset=tf.constant(valid_examples,dtype=tf.int32)
    
    with tf.device('/cpu:0'):
        embeddings=tf.Variable(tf.random_uniform([vocabulary_size,embedding_size],-1.,1.))
        embed=tf.nn.embedding_lookup(embeddings,train_inputs)
        nce_weights=tf.Variable(tf.truncated_normal([vocabulary_size,embedding_size],stddev=1./math.sqrt(embedding_size)))
        nce_biases=tf.Variable(tf.zeros([vocabulary_size]))
        
    loss=tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                      biases=nce_biases,
                                      labels=train_labels,
                                      inputs=embed,
                                      num_sampled=num_sampled,
                                      num_classes=vocabulary_size))
    optimizer=tf.train.GradientDescentOptimizer(1.).minimize(loss)
    
    norm=tf.sqrt(tf.reduce_mean(tf.square(embeddings),axis=1,keep_dims=True))
    normalized_embeddings=embeddings/norm
    valid_embeddings=tf.nn.embedding_lookup(normalized_embeddings,valid_dataset)
    similarity=tf.matmul(valid_embeddings,normalized_embeddings,transpose_b=True)
    
    init=tf.global_variables_initializer()
    num_steps=10001
    
    with tf.Session(graph=graph) as sess:
        init.run()
        print('initialized...')
        
        average_loss=0
        for step in range(num_steps):
            batch_inputs,batch_labels=generate_batch(batch_size,num_skips,skip_window)
            feed_dict={train_inputs:batch_inputs,train_labels:batch_labels}
            
            _,loss_val=sess.run([optimizer,loss],feed_dict=feed_dict)
            average_loss+=loss_val
            
            if step%200==0:
                if step>0:
                    average_loss/=200
                print(f'Step {step}-average loss: {average_loss}')
                average_loss=0
            
            if step%1000==0:
                sim=similarity.eval()
                for i in range(valid_size):
                    valid_word=reverse_dictionary[valid_examples[i]]
                    top_k=8
                    nearest=(-sim[i,:]).argsort()[1:top_k+1]
                    log_str=f'Nearest to {valid_word}'
                    
                    for k in range(top_k):
                        close_word=reverse_dictionary[nearest[k]]
                        log_str=f'{log_str} {close_word},'
                    print(log_str)
                    
        final_embeddings=normalized_embeddings.eval()

initialized...
Step 0-average loss: 247.48025512695312
Nearest to is maison, montezuma, interlude, proceeding, preclear, motorized, pharmacies, bucaram,
Nearest to some wary, attention, crystal, cisneros, weidman, ansi, sacramental, dares,
Nearest to d tattoo, exonerated, compressibility, demo, gch, shutter, vacant, merwara,
Nearest to when algeria, subgenus, disestablished, demolish, hastings, lner, caenorhabditis, autocrat,
Nearest to in gs, desi, zhou, lexeme, scranton, archimedes, prost, bomber,
Nearest to also made, santana, coolant, stimulating, serine, absorber, maximin, exponentiation,
Nearest to four madonna, previously, indictment, composition, combating, russification, sahih, saucers,
Nearest to more depreciation, aristocrats, anarchists, gift, stoneman, loan, imperatoribus, attached,
Nearest to no demigod, timeout, robeson, jeet, customers, kul, judith, parsimony,
Nearest to can freed, gfdl, barnet, nathanael, frankel, volk, smelter, frontiersman,
Nearest to states idf, hol

Step 5200-average loss: 32.93812355041504
Step 5400-average loss: 31.89384117603302
Step 5600-average loss: 31.969724855422974
Step 5800-average loss: 29.610390934944153
Step 6000-average loss: 27.717304821014405
Nearest to is processors, was, in, s, agave, are, ada, andrei,
Nearest to some attention, geometric, ansi, emissions, seeing, the, crystal, quite,
Nearest to d gch, weber, gland, against, preston, modern, story, hindi,
Nearest to when algeria, rotate, best, demolish, america, place, violent, flourished,
Nearest to in and, of, by, to, cardinality, on, gland, at,
Nearest to also made, vs, morning, allies, prominent, video, employer, donald,
Nearest to four composition, rotate, previously, asterism, vs, zero, nine, agave,
Nearest to more attached, anarchists, depreciation, rotate, astronauts, democratic, bottom, gift,
Nearest to no robeson, that, until, customers, backfield, transform, molecular, continued,
Nearest to can freed, making, ran, alphabetical, crew, nathanael, moralit

In [31]:
def plot_with_labels(low_dim_embs,labels,filename='tsne.png'):
    assert low_dim_embs.shape[0]>=len(labels),'low_dim_embs should greater than len(labels)'
    plt.figure(figsize=(18,18))
    for i,label in enumerate(labels):
        x,y=low_dim_embs[i,:]
        plt.scatter(x,y)
        plt.annotate(label,
                    xy=(x,y),
                    xytext=(5,2),
                    textcoords='offset points',
                    ha='right',
                    va='bottom')
    plt.savefig(filename)

In [33]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
tsne=TSNE(perplexity=30,n_components=2,init='pca',n_iter=5000)
plot_only=500
low_dim_embs=tsne.fit_transform(final_embeddings[:plot_only,:])
labels=[reverse_dictionary[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs,labels)