from __future__ import print_function
import pandas as pd
from os.path import splitext
import string
import re
import argparse
from collections import Counter, defaultdict
import numpy as np
__author__ = 'david_torrejon'
"""
TODO
implement the option to download data direct from snli webpage + uncompress...
http://nlp.stanford.edu/projects/snli/snli_1.0.zip
CLEAN the code
AS OF 25/02 the coocurrence matrix works properly but its useless the text that is given to be generated.
"""
"""
# all the functions here are prepared to deal with snli corpus provided, hence a list of strings, not a whole text.
Simple Implementation of the GloVe model:
Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation.
http://nlp.stanford.edu/projects/glove/
X -> co-ocurrence matrix
Xij = number of times j appears in context of i
Xi = SUMk(Xik)
Pij = P(j|i)= Xij/Xi Probability of j appear in i
Context -> sentence
"""
# Sample (truncated) records from the SNLI .jsonl files:
'''
"sentence1_parse": "(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP are) (VP (VBG embracing) (SBAR (IN while) (S (NP (VBG holding)) (VP (TO to) (VP (VB go) (NP (NNS packages)))))))) (. .)))", "sentence2": "The sisters are hugging goodbye while holding to go packages after just eating lunch.", "sentence2_binary_parse": "( ( The sisters ) ( ( are ( ( hugging goodbye ) ( while ( holding ( to ( ( go packages ) ( after ( just ( eating lunch ) ) ) ) ) ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NNS sisters)) (VP (VBP are) (VP (VBG hugging) (NP (UH goodbye)) (PP (IN while) (S (VP (VBG holding) (S (VP (TO to) (VP (VB go) (NP (NNS packages)) (PP (IN after) (S (ADVP (RB just)) (VP (VBG eating) (NP (NN lunch))))))))))))) (. .)))"}
"sentence2_parse": "(ROOT (S (NP (CD Two) (NN woman)) (VP (VBP are) (VP (VBG holding) (NP (NNS packages)))) (. .)))"}
"sentence2_parse": "(ROOT (S (NP (DT The) (NNS men)) (VP (VBP are) (VP (VBG fighting) (PP (IN outside) (NP (DT a) (NNS deli))))) (. .)))"}
'''
parser = argparse.ArgumentParser(description='Get file to process with GloVe')
parser.add_argument('-f', metavar='file_name', type=str, nargs=1, help='name of the file')
# parameters
default_folder = 'snli_1.0/'
ndimensions = 300
zero = 0.0
# stopword-like tokens that are skipped when counting
non_useful_tokens = ['a', 'the', 'in', 'and', 'on', 'with', 'of', 'to', 'an', 'at',
                     'is', 'while']

# functions
def test_init():
    glove_init()

def tokenize_sentence(sentence):
    """Lowercase a sentence, strip punctuation, and split on whitespace,
    e.g. 'Two women are embracing.' -> ['two', 'women', 'are', 'embracing']."""
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    sentence = regex.sub('', sentence).lower()
    # split() rather than split(" "), so repeated spaces do not yield '' tokens
    tokenized_sentence = sentence.split()
    return tokenized_sentence

def token_count(corpus, ndim):
    # ndim is currently unused; kept for a future cap on the vocabulary size
    tokens_dict = defaultdict(int)  # a dict gives faster lookups than membership checks on a list
    word2idx = {}
    idx = 1  # index 0 is reserved for the header row/column of the matrix
    for sentence in corpus:
        tokens_sentence = tokenize_sentence(sentence)
        for token in tokens_sentence:
            #print (token)
            if token not in non_useful_tokens:
                tokens_dict[token] += 1
                if token not in word2idx:
                    word2idx[token] = idx
                    #print (token, idx)
                    idx += 1
    #tokens_list = sorted(tokens_dict.items(), key=itemgetter(1), reverse=True)
    c = Counter(tokens_dict)
    common_tokens = c.most_common()
    print(common_tokens[:10])
    #print (sum(c.values()))
    # returns all tokens sorted by frequency; could return only the ndim most
    # common instead, or pick the best 300 after building the matrix?
    return common_tokens, word2idx, idx

def convert_sentence_index(sentence, word2id):
    """Map a sentence to the list of vocabulary indices of its useful tokens."""
    sindex = []
    tokens_sentence = tokenize_sentence(sentence)
    for token in tokens_sentence:
        if token not in non_useful_tokens:
            sindex.append(word2id[token])
    return sindex

def get_coocu_matrix(corpus, word2idx, nrows, test=True):
    # a numpy matrix is fine for the computations, but to inspect it we still
    # need the word -> index mapping (see the test block below)
    cooc_m = np.zeros((nrows+1, nrows+1))
    #prob_m = np.zeros((nrows+1, nrows+1))
    print('Matrix of size', cooc_m.size, 'created...')
    print('Generating co-occurrence matrix...')
    """
    Layout: the first row and first column hold the word indexes, and the last
    column holds the row sum:
          1    2    3    SUM
      1   x    y    z    x+y+z
      2   x1   y1   z1   x1+y1+z1
      3   x2   y2   z2   x2+y2+z2
    which makes the probabilities easy to compute.
    """
    for sentence in corpus:
        idx_sent = convert_sentence_index(sentence, word2idx)
        for idx_s in idx_sent:
            cooc_m[idx_s][0] = idx_s  # header column
            for idx_w in idx_sent:
                cooc_m[0][idx_w] = idx_w  # header row
                cooc_m[idx_s][idx_w] += 1  # note: each token also co-occurs with itself
    # compute the row sums Xi (the probabilities come afterwards)
    first_row = True
    for row in cooc_m:
        if not first_row:
            row[nrows] = (np.sum(row)) - row[0]  # subtract the header index in column 0
        first_row = False
    print('Counting done...')
    print('Generating probabilities...')
    #copyto(dst, src)
    prob_m = np.zeros((nrows+1, nrows+1))
    # rows 1..nrows-1 hold words; row nrows is empty, so skip it to avoid 0/0
    for i in range(1, nrows):
        if i % 500 == 0:
            print('Updated', i, 'probabilities...')
        for j in range(nrows+1):
            if j > 0:
                prob_m[i][j] = cooc_m[i][j] / cooc_m[i][nrows]  # Pij = Xij/Xi
            else:
                prob_m[i][j] = cooc_m[i][j]  # copy the header column
    if test:
        test_word = 'policeman'  # any word known to be in the corpus works here
        if test_word in word2idx:
            value_test = word2idx[test_word]
            array_test = prob_m[value_test].copy()
            array_test[0] = 0.0      # column 0 holds the word index, not a probability
            array_test[nrows] = 0.0  # the sums column is always 1, not a co-occurrence
            top_n_values = -15  # negative so argsort picks the largest values
            top_ten_ndarray = array_test.argsort()[top_n_values:][::-1]
            for key, value in word2idx.items():
                if value == value_test:
                    print('looking for co-occurrences of:', key, 'with id', value)
            top_ten = top_ten_ndarray.tolist()
            if value_test in top_ten:  # drop the word's co-occurrence with itself
                top_ten.remove(value_test)
            for element in top_ten:
                for key, value in word2idx.items():
                    if value == element:
                        print(key, '(', value, ') with probability:', prob_m[value_test][value])
    print('Probabilities generated')
    return cooc_m, prob_m
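
# A quick sanity check of the pipeline on a toy corpus (hypothetical example,
# not part of the original file). With the stopword 'a' filtered out, the
# vocabulary is {dog: 1, runs: 2, fast: 3, barks: 4}:
#   corpus = ['A dog runs fast.', 'A dog barks.']
#   tokens, w2i, n = token_count(corpus, ndimensions)
#   cooc, prob = get_coocu_matrix(corpus, w2i, n, test=False)
#   prob[w2i['dog']][w2i['runs']]  # = 1/5: 'dog' co-occurs once with 'runs'
#                                  #   out of Xi = 5 (self co-occurrences included)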

def glove_init():
    try:
        args = parser.parse_args()
        file_name = default_folder + args.f[0]
        print("Reading file", file_name, "...")
        file_path, file_ext = splitext(file_name)
        # the .jsonl file holds one JSON object per line; wrap the lines into one JSON array
        with open(file_name, 'r') as f:  # text mode, so the lines join as str under Python 3
            data = f.readlines()
        data_json_str = "[" + ','.join(data) + "]"
        data_df = pd.read_json(data_json_str)
        print(file_ext, "loaded...")
        # keep only the sentences and their parses
        keep_columns = ['sentence1', 'sentence2', 'sentence1_parse', 'sentence2_parse']
        data_df = data_df[keep_columns]
        sentences = data_df['sentence1'].tolist() + data_df['sentence2'].tolist()
        '''
        Unsure whether to deduplicate sentence1: every premise appears three
        times, once per label (s1-s2 E, s1-s2 N, s1-s2 C, where s1 is always
        the same sentence).
        '''
        # build dictionary?
        print("Counting word appearances...")
        tokens, word2index, nidx = token_count(sentences, ndimensions)
        coocurrence_matrix, probability_matrix = get_coocu_matrix(sentences, word2index, nidx)
    except Exception as e:  # Exception rather than BaseException, so Ctrl-C still works
        print(e)

if __name__ == '__main__':
    test_init()
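
# Usage example (assuming the zip has been extracted so the .jsonl files sit
# under snli_1.0/; the dev-split file name is from the SNLI release):
#   python glove.py -f snli_1.0_dev.jsonl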