/
Word2VecFeatureGenerator.py
156 lines (128 loc) · 6.17 KB
/
Word2VecFeatureGenerator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from FeatureGenerator import *
import pandas as pd
import numpy as np
import cPickle
import gensim
from sklearn.preprocessing import normalize
from helpers import *
class Word2VecFeatureGenerator(FeatureGenerator):
    """Generate word2vec-based features for headline / article-body pairs.

    ``process`` builds, for every row of the input frame, one document
    vector for the headline and one for the body (the sum of the
    pre-trained word vectors of their tokens, then row-normalized), plus
    the cosine similarity between the two.  The three feature matrices
    are pickled to ``<split>.<kind>.word2vec.pkl`` files in the current
    working directory; ``read`` loads them back.
    """

    # Pre-trained embeddings loaded by process(); the path is relative to
    # the current working directory.
    MODEL_PATH = 'GoogleNews-vectors-negative300.bin'

    # Fallback embedding width, used only if the loaded model does not
    # expose ``vector_size``.
    VECTOR_SIZE = 300

    # Feature kinds, in the order read() returns them.
    FEATURE_KEYS = ('headline', 'body', 'sim')

    def __init__(self, name='word2vecFeatureGenerator'):
        """Register the generator under ``name`` with the base class."""
        super(Word2VecFeatureGenerator, self).__init__(name)

    @staticmethod
    def _sum_word_vectors(token_lists, model, dim):
        """Return an (n_docs, dim) float64 matrix of summed word vectors.

        Tokens that are not in ``model`` are skipped; a document with no
        known token keeps an all-zero row.  Accumulation is done in
        float64, one token at a time, in token order.
        """
        doc_vecs = np.zeros((len(token_lists), dim), dtype=np.float64)
        for row, tokens in enumerate(token_lists):
            for token in tokens:
                if token in model:
                    doc_vecs[row] += model[token]
        return doc_vecs

    @staticmethod
    def _save_split(features, n_train, n_test, feature_key, label):
        """Pickle the train rows (and test rows, if any) of ``features``.

        The split is positional: the first ``n_train`` rows go to
        ``train.<feature_key>.word2vec.pkl`` and, when ``n_test > 0``,
        the remaining rows go to ``test.<feature_key>.word2vec.pkl``.
        """
        train_path = "train.%s.word2vec.pkl" % feature_key
        with open(train_path, "wb") as outfile:
            # -1 selects the highest pickle protocol available.
            cPickle.dump(features[:n_train], outfile, -1)
        print('%s features of training set saved in %s' % (label, train_path))

        if n_test > 0:
            # test set is available
            test_path = "test.%s.word2vec.pkl" % feature_key
            with open(test_path, "wb") as outfile:
                cPickle.dump(features[n_train:], outfile, -1)
            print('%s features of test set saved in %s' % (label, test_path))

    def process(self, df):
        """Compute and persist word2vec features for every row of ``df``.

        Args:
            df: DataFrame with ``Headline``, ``articleBody`` and
                ``target`` columns.  Rows whose ``target`` is null are
                treated as the test set.  NOTE: the train/test split is
                positional, so all labelled rows are assumed to precede
                the unlabelled ones -- confirm against the caller.

        Side effects:
            Adds ``Headline_unigram_vec`` and ``articleBody_unigram_vec``
            columns to ``df`` and writes the feature pickles to the
            current working directory.

        Returns:
            1, unconditionally.
        """
        print('generating word2vec features')
        # preprocess_data is not defined in this file (presumably provided
        # by the star-imported helpers module); it is expected to return
        # an iterable of tokens -- TODO confirm.
        df["Headline_unigram_vec"] = df["Headline"].map(
            lambda x: preprocess_data(x, exclude_stopword=False, stem=False))
        df["articleBody_unigram_vec"] = df["articleBody"].map(
            lambda x: preprocess_data(x, exclude_stopword=False, stem=False))

        unlabeled = df['target'].isnull()
        n_train = df[~unlabeled].shape[0]
        print('Word2VecFeatureGenerator: n_train: %s' % n_train)
        n_test = df[unlabeled].shape[0]
        print('Word2VecFeatureGenerator: n_test: %s' % n_test)

        # 1). document vector built by summing all the word vectors,
        # using Google's pre-trained word vectors
        model = gensim.models.KeyedVectors.load_word2vec_format(
            self.MODEL_PATH, binary=True)
        print('model loaded')
        # Take the width from the model so other embedding files work too.
        dim = getattr(model, 'vector_size', self.VECTOR_SIZE)

        Headline_unigram_array = df['Headline_unigram_vec'].values
        print('Headline_unigram_array:')
        print(Headline_unigram_array)
        print(Headline_unigram_array.shape)
        print(type(Headline_unigram_array))

        # Open question from the original author: word vectors weighted by
        # normalized tf-idf coefficient?
        headlineVec = self._sum_word_vectors(Headline_unigram_array, model, dim)
        print('headlineVec:')
        print(headlineVec)
        print('type(headlineVec)')
        print(type(headlineVec))
        # sklearn's normalize defaults to row-wise L2 scaling.
        headlineVec = normalize(headlineVec)
        print('headlineVec')
        print(headlineVec)
        print(headlineVec.shape)

        self._save_split(headlineVec, n_train, n_test,
                         'headline', 'headline word2vec')
        print('headine done')

        Body_unigram_array = df['articleBody_unigram_vec'].values
        print('Body_unigram_array:')
        print(Body_unigram_array)
        print(Body_unigram_array.shape)
        bodyVec = self._sum_word_vectors(Body_unigram_array, model, dim)
        bodyVec = normalize(bodyVec)
        print('bodyVec')
        print(bodyVec)
        print(bodyVec.shape)

        self._save_split(bodyVec, n_train, n_test, 'body', 'body word2vec')
        print('body done')

        # compute cosine similarity between headline/body word2vec features;
        # cosine_sim is not defined in this file (presumably from helpers).
        simVec = np.asarray(
            [cosine_sim(h, b) for h, b in zip(headlineVec, bodyVec)]
        )[:, np.newaxis]
        print('simVec.shape:')
        print(simVec.shape)

        self._save_split(simVec, n_train, n_test, 'sim', 'word2vec sim.')

        return 1

    def read(self, header='train'):
        """Load the pickled features written by ``process``.

        Args:
            header: file-name prefix of the split to load (``'train'`` or
                ``'test'``).

        Returns:
            ``[headlineVec, bodyVec, simVec]`` as unpickled from
            ``<header>.headline.word2vec.pkl``,
            ``<header>.body.word2vec.pkl`` and
            ``<header>.sim.word2vec.pkl``.
        """
        loaded = []
        for feature_key in self.FEATURE_KEYS:
            filename = "%s.%s.word2vec.pkl" % (header, feature_key)
            # NOTE(security): unpickling executes arbitrary code; only load
            # files produced by process() from a trusted location.
            with open(filename, "rb") as infile:
                loaded.append(cPickle.load(infile))
        headlineVec, bodyVec, simVec = loaded

        print('headlineVec.shape:')
        print(headlineVec.shape)
        print('bodyVec.shape:')
        print(bodyVec.shape)
        print('simVec.shape:')
        print(simVec.shape)

        return [headlineVec, bodyVec, simVec]
# Copyright 2017 Cisco Systems, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.