# chapter 11 自然语言

In [3]:
## 词袋模型：给定电影评论中的词汇，预测其情绪
# 一个单词，一个维度。
# 已知：句子和对应的评分。

import numpy as np

#为每个单词，创建向量
onehots={}
onehots['cat']=np.array([1,0,0,0])
onehots['the']=np.array([0,1,0,0])
onehots['dog']=np.array([0,0,1,0])
onehots['sat']=np.array([0,0,0,1])

def word2hot(word):
    return onehots[word];

sentence=['the','cat','sat']
x=word2hot(sentence[0]) + \
    word2hot(sentence[1]) + \
    word2hot(sentence[2])
print("Sent Encoding:", x)

# 这样，the cat sat 就被编码成了向量 [1 1 0 1]

Sent Encoding: [1 1 0 1]


> 如果一个单词出现多次，如"cat cat cat"，则可以求和，得到[3,0,0,0]; 或者只取一次[1,0,0,0]。后者对NLP更合适。

In [None]:
## https://github.com/iamtrask/Grokking-Deep-Learning
## 电影评论和标签
## https://github.com/iamtrask/Grokking-Deep-Learning/tree/master/tasksv11

## load data

In [4]:
# 从github下载数据，就是2个文本文件。一个评论，一个标签。

def pretty_print_review_and_label(i):
   print(labels[i] + "\t:\t" + reviews[i][:80] + "...")

g = open('dataset/reviews.txt','r') # What we know!
reviews = list(map(lambda x:x[:-1],g.readlines()))
g.close()

g = open('dataset/labels.txt','r') # What we WANT to know!
labels = list(map(lambda x:x[:-1].upper(),g.readlines()))
g.close()

In [5]:
pretty_print_review_and_label(0)

POSITIVE	:	bromwell high is a cartoon comedy . it ran at the same time as some other progra...


### Predicting Movie Reviews

In [7]:
import sys

f = open('dataset/reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open('dataset/labels.txt')
raw_labels = f.readlines()
f.close()

tokens = list(  map(  lambda x:set(x.split(" ")) ,  raw_reviews)  )

In [9]:
len(raw_reviews)

25000

In [10]:
len(tokens) #每句话一个list，里面是出现过的单词，出现多次只统计一个。

25000

In [15]:
raw_reviews[0:2]

['bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   \n',
 'story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience

In [19]:
rs=list(map(  lambda x:set(x.split(" ")) ,  raw_reviews[0:2]))
len(rs)

2

In [105]:
## 等价写法
wd=set()
for i in range(len(raw_reviews[0:2])):
    x=raw_reviews[i]
    rs=set(x.split(" "))
    print('---%d\n' % i,x, rs)

---0
 bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   
 {'', 'than', 'me', 'some', 'insightful', 'other', 'when', 'schools', 'much', 'saw', 'think', 'through', 'students', 'down', 'can', 'repeatedly', 'who', 'survive

In [106]:
## 获得全文单词总数。相同单词计一个。
vocab = set()
for sent in tokens:
    for word in sent:
        if(len(word)>0):
            vocab.add(word)
vocab = list(vocab)
print(len(vocab), vocab[1:10]) #74074 不重复的单词个数7.4万

74074 ['madcat', 'ba', 'dolce', 'imperialists', 'reactionary', 'joyrides', 'pong', 'budget', 'avoidances']


In [107]:
# 每个单词在词典中的位置
word2index = {}
for i,word in enumerate(vocab):
    if i<6:
        print("---%d %s\n" % (i, word),word2index)
    word2index[word]=i

print(len(word2index))

---0 believably
 {}
---1 madcat
 {'believably': 0}
---2 ba
 {'believably': 0, 'madcat': 1}
---3 dolce
 {'believably': 0, 'madcat': 1, 'ba': 2}
---4 imperialists
 {'believably': 0, 'madcat': 1, 'ba': 2, 'dolce': 3}
---5 reactionary
 {'believably': 0, 'madcat': 1, 'ba': 2, 'dolce': 3, 'imperialists': 4}
74074


In [34]:
## 
i=0

input_dataset = list()
for sent in tokens: #每个句子
    i+=1
    sent_indices = list()
    for word in sent: #每个单词，把其在词典中的编号id添加到 sent_indices 变量中
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices))) #句子单词id唯一化，添加到list中
    
    if i<4:
        print("\n---%d (%d)(%d)\nsent_indices=" %(i, len(sent_indices), len(input_dataset)), \
              sent_indices,"\n\ninput_dataset=",input_dataset)


---1 (93)(1)
sent_indices= [12780, 13085, 45506, 63687, 25955, 45808, 41999, 46388, 18059, 22214, 47187, 42602, 46931, 9660, 16630, 66297, 45556, 50693, 8552, 73531, 72926, 55650, 21961, 6583, 70690, 53660, 55102, 2368, 6346, 51950, 81, 6893, 57460, 23448, 22901, 14067, 54861, 64060, 44762, 73898, 26359, 24042, 38889, 71329, 36339, 24348, 31566, 69338, 19388, 64407, 47909, 54023, 30148, 60402, 31024, 31902, 16785, 4999, 28767, 18236, 15858, 18245, 45089, 36670, 34346, 65912, 71385, 20911, 39261, 31661, 10972, 49756, 47692, 40167, 23027, 61071, 47703, 53267, 42815, 33205, 61372, 38446, 23045, 66840, 62796, 52984, 16282, 8187, 4219, 15648, 21001, 554, 8799] 

input_dataset= [[50693, 23045, 21001, 41999, 53267, 45089, 70690, 34346, 554, 38446, 64060, 47692, 54861, 81, 47187, 47703, 49756, 28767, 8799, 42602, 57460, 4219, 18059, 61071, 31902, 71329, 73898, 22214, 63687, 6346, 71385, 44762, 69338, 10972, 72926, 40167, 6893, 51950, 45808, 14067, 16630, 26359, 52984, 66297, 54023, 66840, 243

In [40]:
# 25000 句话，每句话的单词id(唯一化)
for i in range(len(input_dataset)):
    if i<3:
        print("\n",i, input_dataset[i])
    pass
print(i)


 0 [50693, 23045, 21001, 41999, 53267, 45089, 70690, 34346, 554, 38446, 64060, 47692, 54861, 81, 47187, 47703, 49756, 28767, 8799, 42602, 57460, 4219, 18059, 61071, 31902, 71329, 73898, 22214, 63687, 6346, 71385, 44762, 69338, 10972, 72926, 40167, 6893, 51950, 45808, 14067, 16630, 26359, 52984, 66297, 54023, 66840, 24348, 13085, 15648, 47909, 31024, 46388, 73531, 18236, 55102, 36670, 2368, 42815, 18245, 62796, 31566, 46931, 39261, 55650, 25955, 8552, 22901, 65912, 4999, 16785, 64407, 23448, 16282, 53660, 31661, 20911, 33205, 6583, 61372, 9660, 19388, 45506, 30148, 21961, 38889, 24042, 12780, 60402, 36339, 45556, 15858, 23027, 8187]

 1 [21001, 46616, 37401, 71706, 24094, 63531, 38446, 45635, 22088, 38472, 6739, 72282, 49756, 23134, 36962, 66152, 12905, 39016, 54384, 4748, 61071, 52393, 70326, 52922, 69309, 23742, 32959, 22214, 54470, 23239, 6346, 10446, 71385, 11482, 67806, 71396, 13545, 27895, 66297, 47874, 15619, 66825, 54030, 18194, 37149, 60719, 31024, 73010, 307, 64825, 21820, 51

In [41]:
# 把结论编码成 否定0和 肯定1
i=0
target_dataset = list()
for label in raw_labels:
    i+=1
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)
print(i)

25000


In [58]:
# 神经网络，找输入和输出的关系

import time, numpy as np 
np.random.seed(1)

start=time.time()

# 值域是(0,1)
def sigmoid(x): 
    return 1/(1 + np.exp(-x)) 

alpha, iterations = (0.01, 2) 
hidden_size = 100  #隐藏层有100个节点

#随机化权重
weights_0_1 = 0.2*np.random.random((len(vocab),hidden_size)) - 0.1 #输入单词是总单词个数
weights_1_2 = 0.2*np.random.random((hidden_size,1)) - 0.1 #输出只有一个位点，0 or 1

correct,total = (0,0) 

for iter in range(iterations): 
    # train on first 24,000 
    for i in range(len(input_dataset)-1000): 
        x,y = (input_dataset[i],target_dataset[i])
        
        # 引入嵌入层，
        layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0)) #embed + sigmoid 
        layer_2 = sigmoid(np.dot(layer_1,weights_1_2))  # linear + softmax // todo? 没有使用softmax
        
        layer_2_delta = layer_2 - y # compare pred with truth 
        
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)  #backprop 
        
        weights_0_1[x] -= layer_1_delta * alpha 
        weights_1_2 -= np.outer(layer_1,layer_2_delta) * alpha 
        
        if(np.abs(layer_2_delta) < 0.5): 
            correct += 1 
            total += 1
        
        #测试
        if( (i!=0 and i % 100 == 0) or i== (len(input_dataset)-1000 -1)  ): 
            progress = str(i/float(len(input_dataset))) 
            ##sys.stdout.write
            info=('\rIter:'+str(iter) + " I:"+str(i) \
                 +' Progress:'+progress[2:4]+'.'+progress[4:6] + "%" \
                 +' Training Accuracy: %0.6f' % (correct/float(total)) ) 

            correct,total = (0,0) 
            # 后1000个做测试
            for i2 in range( len(input_dataset)-1000 ,len(input_dataset) ):
                x = input_dataset[i2] 
                y = target_dataset[i2]
                
                layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0)) 
                layer_2 = sigmoid(np.dot(layer_1,weights_1_2))
                
                if(np.abs(layer_2 - y) < 0.5): 
                    correct += 1 
                total += 1
            print(info, "\tTest Accuracy:" + str(correct / float(total)), " \tTime:%0.2f" % (time.time()-start))

Iter:0 I:100 Progress:00.4% Training Accuracy: 1.000000 	Test Accuracy:0.525  	Time:0.40
Iter:0 I:200 Progress:00.8% Training Accuracy: 0.537938 	Test Accuracy:0.541  	Time:0.52
Iter:0 I:300 Progress:01.2% Training Accuracy: 0.559501 	Test Accuracy:0.574  	Time:0.64
Iter:0 I:400 Progress:01.6% Training Accuracy: 0.592344 	Test Accuracy:0.645  	Time:0.75
Iter:0 I:500 Progress:02.% Training Accuracy: 0.657336 	Test Accuracy:0.645  	Time:0.86
Iter:0 I:600 Progress:02.4% Training Accuracy: 0.662548 	Test Accuracy:0.687  	Time:0.98
Iter:0 I:700 Progress:02.8% Training Accuracy: 0.706929 	Test Accuracy:0.686  	Time:1.10
Iter:0 I:800 Progress:03.2% Training Accuracy: 0.705717 	Test Accuracy:0.717  	Time:1.22
Iter:0 I:900 Progress:03.6% Training Accuracy: 0.735267 	Test Accuracy:0.725  	Time:1.34
Iter:0 I:1000 Progress:04.% Training Accuracy: 0.744186 	Test Accuracy:0.728  	Time:1.45
Iter:0 I:1100 Progress:04.4% Training Accuracy: 0.747681 	Test Accuracy:0.744  	Time:1.57
Iter:0 I:1200 Progres

Iter:0 I:9300 Progress:37.2% Training Accuracy: 0.861624 	Test Accuracy:0.824  	Time:11.37
Iter:0 I:9400 Progress:37.6% Training Accuracy: 0.838086 	Test Accuracy:0.848  	Time:11.48
Iter:0 I:9500 Progress:38.% Training Accuracy: 0.859779 	Test Accuracy:0.834  	Time:11.62
Iter:0 I:9600 Progress:38.4% Training Accuracy: 0.847005 	Test Accuracy:0.823  	Time:11.73
Iter:0 I:9700 Progress:38.8% Training Accuracy: 0.838208 	Test Accuracy:0.813  	Time:11.84
Iter:0 I:9800 Progress:39.2% Training Accuracy: 0.827808 	Test Accuracy:0.823  	Time:11.96
Iter:0 I:9900 Progress:39.6% Training Accuracy: 0.837017 	Test Accuracy:0.834  	Time:12.08
Iter:0 I:10000 Progress:4.% Training Accuracy: 0.847005 	Test Accuracy:0.839  	Time:12.20
Iter:0 I:10100 Progress:40.4% Training Accuracy: 0.851476 	Test Accuracy:0.839  	Time:12.31
Iter:0 I:10200 Progress:40.8% Training Accuracy: 0.851750 	Test Accuracy:0.838  	Time:12.43
Iter:0 I:10300 Progress:41.2% Training Accuracy: 0.851103 	Test Accuracy:0.817  	Time:12.5

Iter:0 I:18300 Progress:73.2% Training Accuracy: 0.843119 	Test Accuracy:0.844  	Time:22.28
Iter:0 I:18400 Progress:73.6% Training Accuracy: 0.855019 	Test Accuracy:0.844  	Time:22.42
Iter:0 I:18500 Progress:74.% Training Accuracy: 0.856749 	Test Accuracy:0.842  	Time:22.54
Iter:0 I:18600 Progress:74.4% Training Accuracy: 0.854109 	Test Accuracy:0.845  	Time:22.68
Iter:0 I:18700 Progress:74.8% Training Accuracy: 0.858447 	Test Accuracy:0.835  	Time:22.81
Iter:0 I:18800 Progress:75.2% Training Accuracy: 0.848346 	Test Accuracy:0.843  	Time:22.94
Iter:0 I:18900 Progress:75.6% Training Accuracy: 0.855566 	Test Accuracy:0.85  	Time:23.07
Iter:0 I:19000 Progress:76.% Training Accuracy: 0.861240 	Test Accuracy:0.842  	Time:23.20
Iter:0 I:19100 Progress:76.4% Training Accuracy: 0.855576 	Test Accuracy:0.848  	Time:23.34
Iter:0 I:19200 Progress:76.8% Training Accuracy: 0.859779 	Test Accuracy:0.843  	Time:23.47
Iter:0 I:19300 Progress:77.2% Training Accuracy: 0.856490 	Test Accuracy:0.851  	Ti

Iter:1 I:3300 Progress:13.2% Training Accuracy: 0.867593 	Test Accuracy:0.858  	Time:33.11
Iter:1 I:3400 Progress:13.6% Training Accuracy: 0.869365 	Test Accuracy:0.858  	Time:33.23
Iter:1 I:3500 Progress:14.% Training Accuracy: 0.869365 	Test Accuracy:0.856  	Time:33.35
Iter:1 I:3600 Progress:14.4% Training Accuracy: 0.867769 	Test Accuracy:0.856  	Time:33.46
Iter:1 I:3700 Progress:14.8% Training Accuracy: 0.867769 	Test Accuracy:0.823  	Time:33.57
Iter:1 I:3800 Progress:15.2% Training Accuracy: 0.837912 	Test Accuracy:0.857  	Time:33.69
Iter:1 I:3900 Progress:15.6% Training Accuracy: 0.868445 	Test Accuracy:0.85  	Time:33.80
Iter:1 I:4000 Progress:16.% Training Accuracy: 0.862259 	Test Accuracy:0.848  	Time:33.91
Iter:1 I:4100 Progress:16.4% Training Accuracy: 0.860933 	Test Accuracy:0.835  	Time:34.02
Iter:1 I:4200 Progress:16.8% Training Accuracy: 0.848485 	Test Accuracy:0.854  	Time:34.14
Iter:1 I:4300 Progress:17.2% Training Accuracy: 0.866178 	Test Accuracy:0.802  	Time:34.26
It

Iter:1 I:12500 Progress:5.% Training Accuracy: 0.867338 	Test Accuracy:0.825  	Time:43.65
Iter:1 I:12600 Progress:50.4% Training Accuracy: 0.840037 	Test Accuracy:0.813  	Time:43.77
Iter:1 I:12700 Progress:50.8% Training Accuracy: 0.828440 	Test Accuracy:0.847  	Time:43.88
Iter:1 I:12800 Progress:51.2% Training Accuracy: 0.860274 	Test Accuracy:0.809  	Time:44.00
Iter:1 I:12900 Progress:51.6% Training Accuracy: 0.824125 	Test Accuracy:0.853  	Time:44.11
Iter:1 I:13000 Progress:52.% Training Accuracy: 0.865261 	Test Accuracy:0.851  	Time:44.23
Iter:1 I:13100 Progress:52.4% Training Accuracy: 0.863303 	Test Accuracy:0.85  	Time:44.34
Iter:1 I:13200 Progress:52.8% Training Accuracy: 0.862511 	Test Accuracy:0.85  	Time:44.46
Iter:1 I:13300 Progress:53.2% Training Accuracy: 0.862385 	Test Accuracy:0.849  	Time:44.57
Iter:1 I:13400 Progress:53.6% Training Accuracy: 0.861213 	Test Accuracy:0.847  	Time:44.69
Iter:1 I:13500 Progress:54.% Training Accuracy: 0.860274 	Test Accuracy:0.845  	Time:

Iter:1 I:21500 Progress:86.% Training Accuracy: 0.856486 	Test Accuracy:0.849  	Time:54.90
Iter:1 I:21600 Progress:86.4% Training Accuracy: 0.862100 	Test Accuracy:0.83  	Time:55.02
Iter:1 I:21700 Progress:86.8% Training Accuracy: 0.844465 	Test Accuracy:0.839  	Time:55.15
Iter:1 I:21800 Progress:87.2% Training Accuracy: 0.851339 	Test Accuracy:0.842  	Time:55.27
Iter:1 I:21900 Progress:87.6% Training Accuracy: 0.855311 	Test Accuracy:0.838  	Time:55.40
Iter:1 I:22000 Progress:88.% Training Accuracy: 0.851920 	Test Accuracy:0.842  	Time:55.54
Iter:1 I:22100 Progress:88.4% Training Accuracy: 0.854646 	Test Accuracy:0.846  	Time:55.67
Iter:1 I:22200 Progress:88.8% Training Accuracy: 0.859361 	Test Accuracy:0.847  	Time:55.79
Iter:1 I:22300 Progress:89.2% Training Accuracy: 0.859890 	Test Accuracy:0.836  	Time:55.93
Iter:1 I:22400 Progress:89.6% Training Accuracy: 0.848987 	Test Accuracy:0.844  	Time:56.06
Iter:1 I:22500 Progress:9.% Training Accuracy: 0.856881 	Test Accuracy:0.848  	Time

In [108]:
input_dataset[0]

[7,
 7,
 34468,
 41613,
 7,
 49761,
 65920,
 61092,
 33194,
 13348,
 19331,
 54874,
 12882,
 7,
 43613,
 54874,
 72040,
 38888,
 22642,
 28290,
 7,
 20909,
 5706,
 54874,
 66472,
 31557,
 23024,
 35330,
 64414,
 40192,
 59910,
 20909,
 67085,
 50012,
 64414,
 46130,
 65920,
 12125,
 54642,
 5248,
 9491,
 51964,
 28996,
 7,
 70074,
 45806,
 23024,
 61092,
 49761,
 48321,
 10980,
 65920,
 65048,
 12846,
 7,
 45226,
 23024,
 59725,
 49761,
 36329,
 65204,
 54779,
 36329,
 23024,
 67616,
 64414,
 28635,
 10980,
 23024,
 40222,
 31557,
 62292,
 40192,
 36742,
 69785,
 10980,
 67085,
 36179,
 7,
 61078,
 30335,
 31557,
 38888,
 23024,
 19142,
 64414,
 40824,
 27179,
 7,
 7,
 65920,
 63000,
 19320,
 7,
 7,
 23512,
 64414,
 67085,
 36179,
 7,
 61078,
 53759,
 7,
 20909,
 67085,
 39597,
 7,
 40192,
 7134,
 71389,
 32110,
 67085,
 50012,
 65920,
 16280,
 31557,
 7,
 57840,
 7,
 7,
 7,
 57840,
 7,
 7,
 45775,
 36329,
 10980,
 28269,
 34397,
 71389,
 39256,
 3423,
 7,
 28058,
 5589,
 72935,
 17686

## Comparing Word Embeddings

In [59]:
# 查找最相近的单词，
# 就是从输出给隐藏层的神经元的权重。计算两两的欧氏距离。
from collections import Counter
import math 

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word,index in word2index.items():
        #欧氏距离：差，平方，开方
        raw_difference = weights_0_1[index] - (weights_0_1[target_index]) 
        squared_difference = raw_difference * raw_difference
        
        scores[word] = -math.sqrt(sum(squared_difference))

    return scores.most_common(10)

#
similar(target='beautiful')

[('beautiful', -0.0),
 ('genius', -0.7110520152787959),
 ('surprisingly', -0.7177760116137191),
 ('impact', -0.7374382913645546),
 ('heart', -0.7422223339181497),
 ('innocent', -0.7466205535017229),
 ('superbly', -0.7599955577340626),
 ('masterpiece', -0.7607324172678727),
 ('expecting', -0.7616315121803886),
 ('atmosphere', -0.7664133774712679)]

In [60]:
similar(target='terrible')

[('terrible', -0.0),
 ('lacks', -0.7370482731293628),
 ('dull', -0.7440112815293795),
 ('avoid', -0.7526276528133371),
 ('badly', -0.7538178915306346),
 ('poor', -0.7891280012839721),
 ('lame', -0.8138783546589667),
 ('worse', -0.8252921707345071),
 ('annoying', -0.835288406213905),
 ('fails', -0.8470321419037599)]

In [63]:
similar(target='boring')

[('boring', -0.0),
 ('avoid', -0.8288264601862085),
 ('annoying', -0.8306772538792903),
 ('horrible', -0.8313355499023517),
 ('dull', -0.8454160540439589),
 ('lacks', -0.8468324798113056),
 ('disappointing', -0.8539429863008574),
 ('poorly', -0.8616707471942846),
 ('terrible', -0.8620474664239852),
 ('mess', -0.8697475409287163)]

## Filling in the Blank

In [65]:
# 哪些单词经常在一起？分成5个单词的一组的短语，删除一个单词，尝试训练一个网络，利用去掉单词之后的剩余部分，预测去掉的单词。

import sys,random,math
from collections import Counter
import numpy as np

np.random.seed(1)
random.seed(1)
f = open('dataset/reviews.txt')
raw_reviews = f.readlines()
f.close()

#每个句子分成单词，放到集合中。一个句子一个list。
tokens = list(map(lambda x:(x.split(" ")),raw_reviews))


#单词出现的频数
wordcnt = Counter()
for sent in tokens:
    for word in sent:
        wordcnt[word] -= 1
#单词按照词频排序
vocab = list(set(map(lambda x:x[0],wordcnt.most_common())))

#单词位置为单词id，越小出现频率越高
word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i
#

concatenated = list()
input_dataset = list()
for sent in tokens: # 一句话
    sent_indices = list()
    for word in sent: #每个单词
        try:
            sent_indices.append(word2index[word]) #本句单词转id
            concatenated.append(word2index[word]) #全文的单词转id
        except:
            ""
    input_dataset.append(sent_indices)

# 全文单词id
concatenated = np.array(concatenated)

#句子单词id，一个句子一个数组，打乱句子
random.shuffle(input_dataset)

print('==done==')

==done==


In [72]:
len(vocab)

74075

In [74]:
vocab[0:10]

['madcat',
 'believably',
 'ba',
 'dolce',
 'joyrides',
 'imperialists',
 'reactionary',
 '',
 'pong',
 'avoidances']

In [77]:
#神经网络 2213s
start=time.time()

alpha, iterations = (0.05, 2)
hidden_size,window,negative = (50,2,5)

weights_0_1 = (np.random.rand(len(vocab),hidden_size) - 0.5) * 0.2

weights_1_2 = np.random.rand(len(vocab),hidden_size)*0 #//todo 为什么是0


layer_2_target = np.zeros(negative+1) ####//todo? 不知道啥意思
layer_2_target[0] = 1


def similar(target='beautiful'):
    target_index = word2index[target]

    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

def sigmoid(x):
    return 1/(1 + np.exp(-x))

for rev_i,review in enumerate(input_dataset * iterations):
    for target_i in range(len(review)):
        # since it's really expensive to predict every vocabulary
        # we're only going to predict a random subset
        target_samples = [review[target_i]]+list(concatenated[(np.random.rand(negative)*len(concatenated)).astype('int').tolist()])

        left_context = review[max(0,target_i-window):target_i]
        right_context = review[target_i+1:min(len(review),target_i+window)]

        layer_1 = np.mean(weights_0_1[left_context+right_context],axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])

        weights_0_1[left_context+right_context] -= layer_1_delta * alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta,layer_1)*alpha

    if(rev_i % 250 == 0):
        sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)  *iterations)) + "   " + str(similar('terrible')))
        print("\t%0.2f seconds"% (time.time()-start))
    sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)*iterations)))


#
print(similar('terrible'))

Progress:0.0   [('terrible', -0.0), ('regimental', -0.34566542186539917), ('pheiffer', -0.374364744471376), ('bierce', -0.39020443695728774), ('pea', -0.3906727622415326), ('theese', -0.40010102234132305), ('granola', -0.400767102481998), ('tenfold', -0.4010564281215047), ('luster', -0.4023327589553107), ('satiate', -0.40271719019475083)]	1.46 seconds
Progress:0.005   [('terrible', -0.0), ('performed', -0.5216891221155678), ('pieces', -0.5306834044943081), ('emotionally', -0.5495941848390269), ('begun', -0.5522231920984119), ('roger', -0.5527812205888545), ('amongst', -0.5547302290968112), ('articles', -0.5610357410823145), ('proud', -0.5640334674870774), ('eyeing', -0.5640355863436103)]	12.59 seconds
Progress:0.01   [('terrible', -0.0), ('inspired', -0.5700105539140355), ('fans', -0.5886480918063788), ('pieces', -0.6098678101321166), ('sheer', -0.616386039263599), ('lauren', -0.6227787249903806), ('emotionally', -0.6271127024102597), ('everybody', -0.6328923307157198), ('marty', -0.63

Progress:0.115   [('terrible', -0.0), ('horrible', -2.088757988213651), ('terrific', -2.2778780714172506), ('brilliant', -2.329387465054196), ('superb', -2.374021709196357), ('sympathetic', -2.4074334862691047), ('remarkable', -2.415099049318502), ('strictly', -2.4385508747241795), ('promising', -2.4474487561459495), ('clever', -2.4603711587356254)]	259.69 seconds
Progress:0.12   [('terrible', -0.0), ('horrible', -2.272902535569674), ('brilliant', -2.301096562194009), ('dreadful', -2.40253395496484), ('sympathetic', -2.408735543033682), ('watchable', -2.425619895300193), ('thin', -2.4287524507574076), ('promising', -2.434684789744282), ('terrific', -2.440798090552718), ('fantastic', -2.456237750336771)]	270.02 seconds
Progress:0.125   [('terrible', -0.0), ('brilliant', -2.2121079519223916), ('horrible', -2.2865601397603856), ('terrific', -2.452756562920238), ('fantastic', -2.503918792274928), ('remarkable', -2.5144108145573805), ('wonderfully', -2.5151975185274793), ('superb', -2.52891

Progress:0.23   [('terrible', -0.0), ('brilliant', -2.638903867285671), ('horrible', -2.8319037648469103), ('mediocre', -3.1461239338617975), ('fascinating', -3.241630231674369), ('lame', -3.2455864664764285), ('remarkable', -3.2489353985549907), ('dreadful', -3.2621622671593804), ('spectacular', -3.272998966693576), ('pathetic', -3.3596875011859786)]	514.24 seconds
Progress:0.235   [('terrible', -0.0), ('brilliant', -2.6879790204610186), ('horrible', -2.739037084223775), ('mediocre', -3.310682694991091), ('fascinating', -3.359341196488673), ('remarkable', -3.40154553212725), ('spectacular', -3.414408491045197), ('superb', -3.427232151530836), ('dreadful', -3.4492950377035716), ('fantastic', -3.4786139720330267)]	525.79 seconds
Progress:0.24   [('terrible', -0.0), ('horrible', -2.708154843365541), ('brilliant', -2.7084826358100003), ('superb', -3.272188843848006), ('fascinating', -3.3643977491744526), ('remarkable', -3.3822745018216493), ('spectacular', -3.419967106889232), ('fantastic

Progress:0.345   [('terrible', -0.0), ('horrible', -2.9954221275274446), ('brilliant', -3.2910024195759573), ('dreadful', -3.608092557019234), ('lame', -3.6507463304850925), ('pathetic', -3.7098931983798766), ('fantastic', -3.7158881903803818), ('stupid', -3.717196546481616), ('ridiculous', -3.8348556231741675), ('terrific', -3.83880142940571)]	770.39 seconds
Progress:0.35   [('terrible', -0.0), ('horrible', -2.833668944153181), ('brilliant', -3.2265841383655243), ('lame', -3.4826113244666144), ('pathetic', -3.537308841498491), ('dreadful', -3.55450844927869), ('terrific', -3.622849708333857), ('weak', -3.6412690090540667), ('fantastic', -3.6494200960717538), ('stupid', -3.6914899460739763)]	781.81 seconds
Progress:0.355   [('terrible', -0.0), ('horrible', -2.8779816317544418), ('brilliant', -3.304245979106132), ('pathetic', -3.450059713894054), ('lame', -3.4554400264114538), ('dreadful', -3.6209835667635946), ('stupid', -3.645276443361425), ('weak', -3.65759341905786), ('ridiculous', 

Progress:0.46   [('terrible', -0.0), ('horrible', -2.9373000897102863), ('brilliant', -3.585891903431411), ('great', -3.697754880879036), ('pathetic', -3.7280796347187537), ('fantastic', -3.818974632379081), ('bad', -3.8816413362899964), ('superb', -3.9025897956571245), ('weak', -3.917658135350571), ('lame', -3.9277219470965754)]	1028.57 seconds
Progress:0.465   [('terrible', -0.0), ('horrible', -3.0306987662318225), ('brilliant', -3.511494056765962), ('pathetic', -3.77284549960116), ('fantastic', -3.823526708986264), ('bad', -3.8449478200080556), ('superb', -3.935835456626538), ('great', -3.952201531947618), ('dreadful', -4.003762959408115), ('weak', -4.019346381824978)]	1040.05 seconds
Progress:0.47   [('terrible', -0.0), ('horrible', -2.929929566028003), ('brilliant', -3.392697954950734), ('bad', -3.5208421281884243), ('fantastic', -3.707271698983779), ('great', -3.7824774340628142), ('good', -3.8284739438085156), ('pathetic', -3.8729129629905463), ('weak', -3.926742330543328), ('dr

Progress:0.58   [('terrible', -0.0), ('horrible', -3.107671725129604), ('brilliant', -3.240099610925125), ('fantastic', -3.5824438827839575), ('dreadful', -3.8337606665495207), ('wonderful', -3.842058462035976), ('magnificent', -3.9852077559187338), ('pathetic', -3.994942051716587), ('superb', -4.019523591414505), ('lame', -4.050411884316963)]	1292.23 seconds
Progress:0.585   [('terrible', -0.0), ('horrible', -3.035207390215298), ('brilliant', -3.329093389407336), ('fantastic', -3.4461511220699954), ('wonderful', -3.8024042818915125), ('dreadful', -3.8173625555962496), ('magnificent', -3.8427037490529194), ('pathetic', -3.871677612447527), ('poor', -3.9358427054815137), ('fabulous', -4.0731086383439985)]	1303.66 seconds
Progress:0.59   [('terrible', -0.0), ('horrible', -3.128437266413973), ('brilliant', -3.1798540543758262), ('fantastic', -3.3885749804380043), ('magnificent', -3.8178307752323657), ('poor', -3.8183160311952538), ('dreadful', -3.8555073750867197), ('pathetic', -3.9489806

Progress:0.695   [('terrible', -0.0), ('horrible', -2.9328896312374013), ('brilliant', -4.048468717400298), ('wonderful', -4.049435759204722), ('magnificent', -4.104583198295198), ('pathetic', -4.217072278824231), ('lame', -4.230482327388978), ('fantastic', -4.2594882996997425), ('superb', -4.265849265777586), ('dreadful', -4.32807553065753)]	1548.32 seconds
Progress:0.7   [('terrible', -0.0), ('horrible', -2.843504419889355), ('pathetic', -4.1592075273026365), ('magnificent', -4.224853983241633), ('lame', -4.228414898509148), ('dreadful', -4.23496757473528), ('brilliant', -4.23794322855757), ('fantastic', -4.278029436296625), ('wonderful', -4.351101190439004), ('laughable', -4.404969569588731)]	1559.20 seconds
Progress:0.705   [('terrible', -0.0), ('horrible', -2.839580760492336), ('dreadful', -4.152146973344446), ('lame', -4.162224029794165), ('pathetic', -4.308642203739182), ('wonderful', -4.313454679527806), ('brilliant', -4.326576123796189), ('superb', -4.359779341266414), ('magni

Progress:0.81   [('terrible', -0.0), ('horrible', -3.383323457870449), ('bad', -3.7338904345185884), ('dreadful', -3.7914955274120787), ('fantastic', -3.9070014915427924), ('phenomenal', -3.9470062122142604), ('wonderful', -4.107404468327039), ('brilliant', -4.1759285829334365), ('fabulous', -4.189943480135038), ('mediocre', -4.223728494764497)]	1802.72 seconds
Progress:0.815   [('terrible', -0.0), ('horrible', -3.297898215117045), ('fantastic', -3.8379698298437477), ('bad', -3.9218601439562675), ('phenomenal', -3.9527506760647206), ('dreadful', -3.977548291217256), ('wonderful', -4.05780081584928), ('magnificent', -4.092029715224029), ('brilliant', -4.163375734424872), ('mediocre', -4.213109270768967)]	1813.54 seconds
Progress:0.82   [('terrible', -0.0), ('horrible', -3.3114197186280956), ('dreadful', -4.030662861962268), ('phenomenal', -4.139371129915543), ('magnificent', -4.158516045719053), ('fantastic', -4.207790619736105), ('brilliant', -4.262550711493452), ('bad', -4.27129757565

Progress:0.925   [('terrible', -0.0), ('brilliant', -3.281489976804876), ('horrible', -3.3811960885197223), ('great', -4.160010936810392), ('superb', -4.16841071320425), ('wonderful', -4.1838039169933285), ('mediocre', -4.22107557220905), ('fantastic', -4.2330535363642054), ('weak', -4.237178889206568), ('phenomenal', -4.242772361957771)]	2059.90 seconds
Progress:0.93   [('terrible', -0.0), ('brilliant', -3.3943626067703603), ('horrible', -3.448467916029251), ('great', -4.068114327731297), ('weak', -4.21715376872695), ('bad', -4.243390601286484), ('fantastic', -4.258166928736217), ('mediocre', -4.288823068771708), ('phenomenal', -4.2961664968819475), ('lame', -4.310801773256773)]	2070.38 seconds
Progress:0.935   [('terrible', -0.0), ('brilliant', -3.432926089744047), ('horrible', -3.507494437362461), ('superb', -4.080722820716531), ('bad', -4.102065781122613), ('fantastic', -4.213713091698553), ('phenomenal', -4.246350180244463), ('marvelous', -4.329376955096027), ('pathetic', -4.33387

## King - Man + Woman ~= Queen

In [78]:
def analogy(positive=['terrible','good'],negative=['bad']):
    
    norms = np.sum(weights_0_1 * weights_0_1,axis=1)
    norms.resize(norms.shape[0],1)
    
    normed_weights = weights_0_1 * norms
    
    query_vect = np.zeros(len(weights_0_1[0]))
    for word in positive:
        query_vect += normed_weights[word2index[word]]
    for word in negative:
        query_vect -= normed_weights[word2index[word]]
    
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - query_vect
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
        
    return scores.most_common(10)[1:]

In [79]:
analogy(['terrible','good'],['bad'])

[('worth', -223.8510410305487),
 ('terrific', -224.05775269898055),
 ('superb', -224.10975525188263),
 ('fine', -224.27145554506822),
 ('decent', -224.3098708663416),
 ('perfect', -224.44106959836273),
 ('terrible', -224.44132308525516),
 ('brilliant', -224.57545003784148),
 ('solid', -224.57835213298148)]

In [80]:
analogy(['elizabeth','he'],['she'])

[('lee', -168.72957322647775),
 ('ms', -169.09991914654248),
 ('mrs', -169.31095649436327),
 ('mr', -169.40027099711355),
 ('been', -169.4418128875975),
 ('j', -169.4426405845735),
 ('victor', -169.5136677687955),
 ('daniel', -169.54664449330875),
 ('walken', -169.61648930669278)]

In [100]:
analogy(['sea','fire'],['water'])

[('fire', -199.95869506106186),
 ('\n', -200.0281991435193),
 ('shame', -200.33725067772926),
 ('laugh', -200.81964736829653),
 ('night', -200.85201806053854),
 ('sea', -200.95428732775616),
 ('smile', -200.98108707474302),
 ('window', -201.1759381975858),
 ('m', -201.1968831388168)]

In [101]:
analogy(['day','moon'],['sun']) #厉害！

[('night', -302.4213462759662),
 ('hour', -302.46031969334877),
 ('br', -302.6302377163057),
 ('\n', -302.93773025693343),
 ('day', -302.9741215842857),
 ('e', -303.094886570874),
 ('year', -303.104988312938),
 ('years', -303.1493299824768),
 ('world', -303.4468045338719)]