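"""Skip-gram word2vec trained with negative sampling, written against the
TensorFlow 0.x graph API (placeholders, Session.run, initialize_all_variables)."""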
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import math
import random
import numpy as np
import tensorflow as tf
from collections import Counter
class Word2Vec(object):
def __init__(self, config, sess):
self.sess = sess
self.alpha = config['alpha']
self.embed_size = config['embed_size']
self.neg_sample_size = config['neg_sample_size']
self.min_frequency = config['min_frequency']
self.window = config['window']
self.lr = config['lr']
self.min_lr = config['min_lr']
self.table_size = config['table_size'] # unigram table size
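
    # Read the corpus once, count word frequencies, and give every word that
    # occurs at least min_frequency times an integer index; everything rarer
    # maps to 'UNK' at index 0.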
def build_vocab(self, filename):
start_time = time.time()
with open(filename) as f:
words = [word for line in f.readlines() for word in line.split()]
self.total_count = len(words)
self.counter = [['UNK', 0]]
        self.counter.extend([list(item) for item in Counter(words).most_common()
                             if item[1] >= self.min_frequency])
self.vocab_size = len(self.counter)
word2idx = dict()
for word, _ in self.counter:
word2idx[word] = len(word2idx)
data = list()
unk_count = 0
for word in words:
if word in word2idx:
idx = word2idx[word]
else:
idx = 0 # word2idx['UNK']
unk_count = unk_count + 1
data.append(idx)
self.counter[0][1] = unk_count
idx2word = dict(zip(word2idx.values(), word2idx.keys()))
duration = time.time() - start_time
print("%d words processed in %.2f seconds" % (self.total_count, duration))
print("Vocab size after eliminating words occuring less than %d times: %d" % (self.min_frequency, self.vocab_size))
self.data = data
self.words = words
self.word2idx = word2idx
self.idx2word = idx2word
        self.decay = (self.min_lr - self.lr) / (self.total_count * self.window)
        # Targets for each training pair: the positive context, then neg_sample_size negatives.
        self.labels = np.zeros([1, 1 + self.neg_sample_size], dtype=np.float32)
        self.labels[0][0] = 1.0
        self.contexts = np.ndarray(1 + self.neg_sample_size, dtype=np.int32)
self.build_model()
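
    # The graph scores one centre word against 1 + neg_sample_size contexts as
    # the dot product between its embedding and each context's output vector.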
def build_model(self):
self.x = tf.placeholder(tf.int32, [1], name='pos_x')
        self.y = tf.placeholder(tf.int32, [1 + self.neg_sample_size], name='pos_y')
init_width = 0.5 / self.embed_size
self.embed = tf.Variable(tf.random_uniform([self.vocab_size, self.embed_size], -init_width, init_width), name='embed')
self.w = tf.Variable(tf.truncated_normal([self.vocab_size, self.embed_size], stddev=1.0 / math.sqrt(self.embed_size)), name='w')
self.x_embed = tf.nn.embedding_lookup(self.embed, self.x, name='pos_embed')
        self.y_w = tf.nn.embedding_lookup(self.w, self.y, name='context_w')
self.mul = tf.matmul(self.x_embed, self.y_w, transpose_b=True)
self.p = tf.nn.sigmoid(self.mul)
        # Raw scores go in as logits: one positive context in slot 0, negatives after it.
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(self.mul, self.labels))
        # The linearly decayed learning rate is fed in at every step.
        self.lr_ph = tf.placeholder(tf.float32, [], name='lr')
        self.train = tf.train.GradientDescentOptimizer(self.lr_ph).minimize(self.loss)
        self.sess.run(tf.initialize_all_variables())
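
    # One SGD step on a single (word, contexts) pair.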
    def train_pair(self, word_idx, contexts):
        self.sess.run(self.train,
                      feed_dict={self.x: [word_idx], self.y: contexts, self.lr_ph: self.lr})
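
    # Unigram table for negative sampling: each word fills a share of slots
    # proportional to count**alpha, as in the original word2vec (alpha is
    # commonly 0.75).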
    def build_table(self):
        start_time = time.time()
        total_count_pow = 0
        for _, count in self.counter:
            total_count_pow += math.pow(count, self.alpha)
        word_idx = 1  # start past index 0 ('UNK')
        self.table = np.zeros([self.table_size], dtype=np.int32)
        word_prob = math.pow(self.counter[word_idx][1], self.alpha) / total_count_pow
        for idx in range(self.table_size):
            self.table[idx] = word_idx
            if idx / self.table_size > word_prob:
                word_idx += 1
                if word_idx >= self.vocab_size:  # clamp before indexing past the vocab
                    word_idx = self.vocab_size - 1
                word_prob += math.pow(self.counter[word_idx][1], self.alpha) / total_count_pow
        print("Done in %.2f seconds." % (time.time() - start_time))
    def sample_contexts(self, context):
        self.contexts[0] = context  # slot 0 is the positive context
        idx = 0
        while idx < self.neg_sample_size:
            neg_context = self.table[random.randrange(self.table_size)]
            if context != neg_context:
                self.contexts[idx + 1] = neg_context
                idx += 1
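
    # Stream over the corpus: every in-window (word, context) pair gets fresh
    # negative samples and one SGD step, with the learning rate decayed
    # linearly toward min_lr.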
    def train_stream(self, filename):
        print("Training...")
        start_time = time.time()
        c = 0
        with open(filename) as f:
            words = [word for line in f.readlines() for word in line.split()]
        for idx, word in enumerate(words):
            if word not in self.word2idx:
                continue
            word_idx = self.word2idx[word]
            # Shrink the context window to a random size in [1, window].
            reduced_window = random.randrange(self.window) + 1
            start = max(0, idx - reduced_window)
            end = min(len(words), idx + reduced_window + 1)
            for jdx in range(start, end):
                if jdx == idx:
                    continue
                context = words[jdx]
                if context not in self.word2idx:
                    continue
                self.sample_contexts(self.word2idx[context])
                self.train_pair(word_idx, self.contexts)
                self.lr = max(self.min_lr, self.lr + self.decay)  # decay is negative
                c += 1
                if c % 100000 == 0:
                    loss = self.sess.run(self.loss,
                                         feed_dict={self.x: [word_idx], self.y: self.contexts})
                    print("%d pairs trained in %.2f seconds. Learning rate: %.4f, Loss: %.4f"
                          % (c, time.time() - start_time, self.lr, float(loss)))
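
    # Nearest neighbours of the given words by cosine similarity in the
    # learned embedding space.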
    def get_sim_words(self, idxs, k):
        if isinstance(idxs[0], str):
            idxs = np.array([self.word2idx[word] for word in idxs])
        else:
            idxs = np.array(idxs)
        # Cosine similarity of each query embedding against the whole vocabulary.
        embed = self.sess.run(self.embed)
        normed = embed / np.linalg.norm(embed, axis=1, keepdims=True)
        for i in range(len(idxs)):
            sims = normed.dot(normed[idxs[i]])
            nearest = (-sims).argsort()[:k]
            print(self.idx2word[idxs[i]])
            print()
            for neighbor in nearest:
                print("%-20s %6.4f" % (self.idx2word[neighbor], sims[neighbor]))
def train_model(self, corpus):
self.train_stream(corpus)
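

# A minimal usage sketch. The config keys mirror what __init__ reads; the
# hyperparameter values and the corpus path 'corpus.txt' are illustrative
# assumptions, not settings that ship with this file.
if __name__ == '__main__':
    config = {
        'alpha': 0.75,           # smoothing power for the unigram table
        'embed_size': 100,
        'neg_sample_size': 5,
        'min_frequency': 5,
        'window': 5,
        'lr': 0.025,
        'min_lr': 0.0001,
        'table_size': int(1e8),
    }
    with tf.Session() as sess:
        model = Word2Vec(config, sess)
        model.build_vocab('corpus.txt')   # also builds the graph
        model.build_table()
        model.train_model('corpus.txt')
        model.get_sim_words(['king'], 10)  # assumes 'king' appears in the corpus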