# Counter-Fitting GoogleNews Embeddings

In [66]:
# Import packages.
from io import BytesIO
from tensorflow.python.lib.io import file_io
import msgpack
import numpy as np
import pandas as pd

In [20]:
# Load the id-to-word dict that was created while generating the original word embeddings.
# The map is keyed by integer ids; msgpack >= 1.0 rejects non-str/bytes map keys unless
# strict_map_key=False is passed, so it is set explicitly to keep this cell loadable.
f = BytesIO(file_io.read_file_to_string('GoogleNews_id2word.bin', binary_mode=True))
id2word = msgpack.unpack(f, raw=False, strict_map_key=False)
# NOTE: despite its name, the 'id' column holds the words; the numeric ids form the index.
id2word_df = pd.DataFrame.from_dict(id2word, orient='index', columns=['id'])
id2word_df.head()

Unnamed: 0,id
1,in
2,for
3,that
4,is
5,on


In [21]:
# Run counter-fitting.
# This produces the counter-fitted embeddings, which then need to be turned back into a numpy array of floats.

In [22]:
# Read the counter-fitted vectors back in: one row per word, with the word in column 0
# followed by its vector components, separated by single spaces.
# keep_default_na=False stops pandas from converting vocabulary entries such as
# "null", "nan" or "NA" into NaN. A word parsed as NaN can no longer be matched against
# the id2word mapping in the later merge and would be silently dropped with the NaN rows.
counter_fitted_vocab = pd.read_csv(
    'GN_counter_fitted_vectors.txt',
    sep=" ",
    header=None,
    keep_default_na=False,
)

# Check the length of the vocab.
print('vocab size:', len(counter_fitted_vocab))
counter_fitted_vocab.head()

vocab size: 56537


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,fawn,0.089176,0.121832,-0.067196,0.047728,-0.013659,-0.067196,0.064056,-0.033127,-0.036424,...,-0.008831,0.057776,-0.018918,0.016877,0.04082,-0.040506,0.055264,-0.048042,-0.027789,0.087292
1,vang,-0.056641,0.074099,-0.059357,-0.103196,-0.025023,-0.024635,0.036274,-0.001376,-0.01998,...,0.1063,0.068668,-0.029291,-0.001158,0.043063,-0.084574,-0.036468,0.031812,-0.027351,-0.00965
2,nunnery,0.097154,-0.010809,0.021109,0.048323,0.06282,0.104784,-0.045271,0.024289,0.008202,...,-0.018439,0.009728,-0.104276,-0.051629,0.028231,0.030647,-0.091559,0.013098,0.041202,0.135304
3,deferment,0.009257,0.014527,0.033095,-0.001165,0.007889,-0.015363,0.029022,-0.087086,0.078894,...,-0.011647,0.005508,-0.082832,-0.064062,0.016054,0.052927,-0.131125,0.0264,-0.038047,0.019191
4,vani,-0.034083,0.031514,0.035967,0.045216,-0.034939,0.033056,0.017042,0.002954,-0.052067,...,0.054464,0.01353,0.00167,-0.067139,-0.006637,0.061658,0.003361,-0.036823,-0.006423,0.102078


In [23]:
# The counter-fitting output is not in the original vocabulary order.
# Step one of restoring it: move the word column (column 0) into the index.
counter_fitted_vocab_reset = counter_fitted_vocab.set_index(keys=0)
counter_fitted_vocab_reset.head(n=10)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
fawn,0.089176,0.121832,-0.067196,0.047728,-0.013659,-0.067196,0.064056,-0.033127,-0.036424,0.005652,...,-0.008831,0.057776,-0.018918,0.016877,0.04082,-0.040506,0.055264,-0.048042,-0.027789,0.087292
vang,-0.056641,0.074099,-0.059357,-0.103196,-0.025023,-0.024635,0.036274,-0.001376,-0.01998,0.093497,...,0.1063,0.068668,-0.029291,-0.001158,0.043063,-0.084574,-0.036468,0.031812,-0.027351,-0.00965
nunnery,0.097154,-0.010809,0.021109,0.048323,0.06282,0.104784,-0.045271,0.024289,0.008202,0.018312,...,-0.018439,0.009728,-0.104276,-0.051629,0.028231,0.030647,-0.091559,0.013098,0.041202,0.135304
deferment,0.009257,0.014527,0.033095,-0.001165,0.007889,-0.015363,0.029022,-0.087086,0.078894,0.061559,...,-0.011647,0.005508,-0.082832,-0.064062,0.016054,0.052927,-0.131125,0.0264,-0.038047,0.019191
vani,-0.034083,0.031514,0.035967,0.045216,-0.034939,0.033056,0.017042,0.002954,-0.052067,0.046586,...,0.054464,0.01353,0.00167,-0.067139,-0.006637,0.061658,0.003361,-0.036823,-0.006423,0.102078
woods,0.03569,-0.003673,-0.065534,0.021383,0.007884,0.060304,-0.088609,-0.083071,0.089225,0.034921,...,-0.129837,-0.004384,-0.024614,0.027383,-0.031382,-0.036613,-0.055688,-0.014153,0.097224,0.054458
clotted,-0.02536,0.072456,0.026265,0.064003,-0.135855,0.047398,0.014718,-0.006868,0.003359,0.040757,...,0.00483,-0.034719,0.045285,0.030945,-0.053738,0.01668,0.005321,-0.00083,-0.019095,-0.025209
spiders,0.036379,0.0243,-0.0648,-0.015703,0.001332,-0.000123,-0.009521,-0.006075,-0.011582,0.115958,...,-0.098905,0.090379,-0.061389,0.036379,-0.029984,-0.052579,0.059116,0.012221,0.005258,0.033253
hanging,0.031847,0.048808,-0.052962,0.03202,0.012548,-0.01601,0.005625,-0.021808,0.082039,0.114924,...,-0.060231,0.117001,0.054001,-0.037039,0.018779,-0.110078,0.073386,-0.089309,-0.011769,0.006058
woody,0.024411,0.130576,0.021955,0.032933,-0.074533,0.017478,-0.015094,0.026866,-0.07511,0.063555,...,-0.020078,0.065866,0.017261,0.014083,-0.00845,-0.035389,-0.071933,-0.019066,-0.036255,0.07511


In [24]:
# Next, attach the ids from id2word_df (the same mapping that was used to create the
# initial embeddings in text format). The word index on the left is matched against the
# 'id' column on the right; the outer join keeps rows that exist on only one side.
counter_fitted_vocab_reset_merged = counter_fitted_vocab_reset.merge(
    id2word_df,
    how='outer',
    left_index=True,
    right_on='id',
)
counter_fitted_vocab_reset_merged.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,292,293,294,295,296,297,298,299,300,id
30319,0.089176,0.121832,-0.067196,0.047728,-0.013659,-0.067196,0.064056,-0.033127,-0.036424,0.005652,...,0.057776,-0.018918,0.016877,0.04082,-0.040506,0.055264,-0.048042,-0.027789,0.087292,fawn
96657,-0.056641,0.074099,-0.059357,-0.103196,-0.025023,-0.024635,0.036274,-0.001376,-0.01998,0.093497,...,0.068668,-0.029291,-0.001158,0.043063,-0.084574,-0.036468,0.031812,-0.027351,-0.00965,vang
51180,0.097154,-0.010809,0.021109,0.048323,0.06282,0.104784,-0.045271,0.024289,0.008202,0.018312,...,0.009728,-0.104276,-0.051629,0.028231,0.030647,-0.091559,0.013098,0.041202,0.135304,nunnery
29131,0.009257,0.014527,0.033095,-0.001165,0.007889,-0.015363,0.029022,-0.087086,0.078894,0.061559,...,0.005508,-0.082832,-0.064062,0.016054,0.052927,-0.131125,0.0264,-0.038047,0.019191,deferment
92381,-0.034083,0.031514,0.035967,0.045216,-0.034939,0.033056,0.017042,0.002954,-0.052067,0.046586,...,0.01353,0.00167,-0.067139,-0.006637,0.061658,0.003361,-0.036823,-0.006423,0.102078,vani


In [25]:
# Sanity check: the word stored under id 2 should be the same before and after the merge.
word_before_merge = id2word_df.loc[2, 'id']
word_after_merge = counter_fitted_vocab_reset_merged.loc[2, 'id']
print(word_before_merge + ' = ' + word_after_merge)

for = for


In [26]:
# Put the rows back in ascending order of the numerical index.
counter_fitted_vocab_reset_merged_sorted = counter_fitted_vocab_reset_merged.sort_index(
    axis=0,
    ascending=True,
)
counter_fitted_vocab_reset_merged_sorted.head(n=20)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,292,293,294,295,296,297,298,299,300,id
1,0.052956,0.06546,0.066195,0.047072,0.052221,-0.082009,-0.061414,-0.116209,0.015629,0.099293,...,-0.066931,-0.060679,0.048911,0.046153,-0.035672,-0.044314,-0.035856,0.010895,-0.047072,in
2,-0.008512,-0.034224,0.032284,0.045868,-0.013143,-0.046221,-0.000948,-0.052219,0.046574,0.062451,...,0.00269,-0.059628,0.058923,0.005733,0.000345,0.013319,0.051513,-0.025227,0.017465,for
3,-0.012361,-0.02223,0.06554,0.039477,-0.08662,0.024913,-0.011163,-0.070522,0.092369,0.092752,...,-0.012265,-0.026254,-0.016193,-0.015235,0.050209,0.01581,0.00539,0.047909,-0.116515,that
4,0.01373,-0.030795,0.08387,-0.001946,-0.065557,0.095346,0.052906,-0.065769,0.02716,0.009564,...,-0.0139,-0.070325,0.095192,0.011897,0.07717,-0.004127,0.01682,0.083955,0.045543,is
5,0.016307,-0.026439,0.024721,0.122059,-0.025086,-0.009884,-0.016358,-0.135099,0.044917,-0.006176,...,0.01949,0.034068,0.004034,-0.060803,-0.003982,-0.091673,-0.080525,-0.016883,0.002533,on
6,-0.01597,0.014091,-0.022702,0.087678,0.01049,-0.017927,-0.015578,-0.089557,0.036793,0.039925,...,0.001977,-0.08893,-0.015657,-0.059809,-0.030531,-0.048536,0.006928,-0.012056,-0.044152,with
7,-0.0016,-0.04649,0.058165,-0.015237,-0.039997,0.047824,0.131768,-0.089942,0.03556,-0.015955,...,0.004469,-0.036935,0.029057,-0.068684,-0.01042,0.058132,0.067589,0.093866,-0.023766,said
8,0.017632,0.003188,0.085649,-0.032106,0.005213,-0.058289,-0.007549,-0.046817,0.041078,0.022407,...,-0.019473,-0.094513,0.11165,0.009629,0.057765,-0.045319,-0.053164,0.105328,-0.016318,was
9,0.074685,0.097911,0.046451,0.049866,-0.062845,-0.112483,0.032789,-0.110662,0.040986,0.028121,...,-0.028121,-0.012125,0.015256,-0.017077,0.013833,0.004668,0.003415,0.044401,-0.064211,the
10,-0.03536,-0.022689,0.043905,0.06571,0.040075,0.029319,0.076613,-0.035065,0.041548,0.054513,...,0.031824,-0.068952,0.113152,0.001786,-0.001657,-0.028435,-0.065121,-0.052156,-0.019301,at


In [28]:
# Collect every row that has at least one missing value and compare its size
# with the size of the full merged frame.
rows_with_nan = counter_fitted_vocab_reset_merged_sorted.isna().any(axis='columns')
df_nans = counter_fitted_vocab_reset_merged_sorted[rows_with_nan]
print('len(counter_fitted_vocab_reset_merged_sorted):', len(counter_fitted_vocab_reset_merged_sorted))
print('len(df_nans):', len(df_nans))
df_nans.head()

len(counter_fitted_vocab_reset_merged_sorted): 155062
len(df_nans): 98527


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,292,293,294,295,296,297,298,299,300,id
77,,,,,,,,,,,...,,,,,,,,,,percent
147,,,,,,,,,,,...,,,,,,,,,,play
150,,,,,,,,,,,...,,,,,,,,,,know
212,,,,,,,,,,,...,,,,,,,,,,lead
320,,,,,,,,,,,...,,,,,,,,,,tax


In [49]:
# Keep only the rows without any missing value; copy so later edits act on an
# independent frame rather than on a view of the merged one.
complete_rows = counter_fitted_vocab_reset_merged_sorted.dropna(axis=0, how='any')
counter_fitted_vocab_reset_merged_sorted_dropped_na = complete_rows.copy()
print('len(counter_fitted_vocab_reset_merged_sorted_dropped_na):', len(counter_fitted_vocab_reset_merged_sorted_dropped_na))
counter_fitted_vocab_reset_merged_sorted_dropped_na.head()

len(counter_fitted_vocab_reset_merged_sorted_dropped_na): 56535


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,292,293,294,295,296,297,298,299,300,id
1,0.052956,0.06546,0.066195,0.047072,0.052221,-0.082009,-0.061414,-0.116209,0.015629,0.099293,...,-0.066931,-0.060679,0.048911,0.046153,-0.035672,-0.044314,-0.035856,0.010895,-0.047072,in
2,-0.008512,-0.034224,0.032284,0.045868,-0.013143,-0.046221,-0.000948,-0.052219,0.046574,0.062451,...,0.00269,-0.059628,0.058923,0.005733,0.000345,0.013319,0.051513,-0.025227,0.017465,for
3,-0.012361,-0.02223,0.06554,0.039477,-0.08662,0.024913,-0.011163,-0.070522,0.092369,0.092752,...,-0.012265,-0.026254,-0.016193,-0.015235,0.050209,0.01581,0.00539,0.047909,-0.116515,that
4,0.01373,-0.030795,0.08387,-0.001946,-0.065557,0.095346,0.052906,-0.065769,0.02716,0.009564,...,-0.0139,-0.070325,0.095192,0.011897,0.07717,-0.004127,0.01682,0.083955,0.045543,is
5,0.016307,-0.026439,0.024721,0.122059,-0.025086,-0.009884,-0.016358,-0.135099,0.044917,-0.006176,...,0.01949,0.034068,0.004034,-0.060803,-0.003982,-0.091673,-0.080525,-0.016883,0.002533,on


In [50]:
# Build the word/id lookup frame for the counter-fitted vocabulary.
# Dropping the NaN rows left gaps in the original ids, while the embedding matrix saved
# at the end of this notebook is renumbered contiguously with row 0 reserved for padding.
# The ids are therefore reassigned here as 1..N in the same (sorted) row order, so the
# saved dictionaries address the correct rows of that matrix.
id2word_df = counter_fitted_vocab_reset_merged_sorted_dropped_na.reset_index(drop=True)
id2word_df.index = id2word_df.index + 1  # id 0 is reserved for the padding row
id2word_df = id2word_df.reset_index()  # the new ids become the 'index' column

# The 'id' column holds the words; use them as the index.
id2word_df = id2word_df.set_index('id')

id2word_df[['index']].head()

Unnamed: 0_level_0,index
id,Unnamed: 1_level_1
in,1
for,2
that,3
is,4
on,5


In [51]:
# Create dictionary of just IDs to words.
# id2word_df is indexed by word and its 'index' column holds the numeric id, so reading
# that column directly yields word -> id pairs; flip each pair to obtain id -> word.
# int() guarantees plain Python ints as keys, which msgpack can serialize.
id2word_dict = {
    int(word_id): word
    for word, word_id in id2word_df['index'].items()
}

In [52]:
# Invert the mapping built in the previous cell: every value becomes a key and every
# key its value. The result is kept as word2id_dict, which is needed to encode examples.
word2id_dict = dict(zip(id2word_dict.values(), id2word_dict.keys()))

In [64]:
# Serialize both lookup dicts to disk with msgpack: word2id first, then id2word.
for target_path, mapping in (
    ('GoogleNews_CF_word2id.bin', word2id_dict),
    ('GoogleNews_CF_id2word.bin', id2word_dict),
):
    with open(target_path, 'wb') as f:
        msgpack.pack(mapping, f)

In [54]:
# Re-check the size and the first rows of the NaN-free frame before reshaping it further.
n_rows = len(counter_fitted_vocab_reset_merged_sorted_dropped_na)
print('len(counter_fitted_vocab_reset_merged_sorted_dropped_na):', n_rows)
counter_fitted_vocab_reset_merged_sorted_dropped_na.head(n=5)

len(counter_fitted_vocab_reset_merged_sorted_dropped_na): 56535


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,292,293,294,295,296,297,298,299,300,id
1,0.052956,0.06546,0.066195,0.047072,0.052221,-0.082009,-0.061414,-0.116209,0.015629,0.099293,...,-0.066931,-0.060679,0.048911,0.046153,-0.035672,-0.044314,-0.035856,0.010895,-0.047072,in
2,-0.008512,-0.034224,0.032284,0.045868,-0.013143,-0.046221,-0.000948,-0.052219,0.046574,0.062451,...,0.00269,-0.059628,0.058923,0.005733,0.000345,0.013319,0.051513,-0.025227,0.017465,for
3,-0.012361,-0.02223,0.06554,0.039477,-0.08662,0.024913,-0.011163,-0.070522,0.092369,0.092752,...,-0.012265,-0.026254,-0.016193,-0.015235,0.050209,0.01581,0.00539,0.047909,-0.116515,that
4,0.01373,-0.030795,0.08387,-0.001946,-0.065557,0.095346,0.052906,-0.065769,0.02716,0.009564,...,-0.0139,-0.070325,0.095192,0.011897,0.07717,-0.004127,0.01682,0.083955,0.045543,is
5,0.016307,-0.026439,0.024721,0.122059,-0.025086,-0.009884,-0.016358,-0.135099,0.044917,-0.006176,...,0.01949,0.034068,0.004034,-0.060803,-0.003982,-0.091673,-0.080525,-0.016883,0.002533,on


In [55]:
# Remove the word column ('id') so that only the vector components remain.
counter_fitted_vocab_reset_merged_sorted_dropped_na = (
    counter_fitted_vocab_reset_merged_sorted_dropped_na.drop(columns=['id'])
)

In [57]:
# Replace the index with a fresh 0-based one and discard the old index values.
counter_fitted_vocab_reset_merged_sorted_dropped_na = (
    counter_fitted_vocab_reset_merged_sorted_dropped_na.reset_index(drop=True)
)
counter_fitted_vocab_reset_merged_sorted_dropped_na.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,0.052956,0.06546,0.066195,0.047072,0.052221,-0.082009,-0.061414,-0.116209,0.015629,0.099293,...,-0.127242,-0.066931,-0.060679,0.048911,0.046153,-0.035672,-0.044314,-0.035856,0.010895,-0.047072
1,-0.008512,-0.034224,0.032284,0.045868,-0.013143,-0.046221,-0.000948,-0.052219,0.046574,0.062451,...,-0.016318,0.00269,-0.059628,0.058923,0.005733,0.000345,0.013319,0.051513,-0.025227,0.017465
2,-0.012361,-0.02223,0.06554,0.039477,-0.08662,0.024913,-0.011163,-0.070522,0.092369,0.092752,...,-0.008863,-0.012265,-0.026254,-0.016193,-0.015235,0.050209,0.01581,0.00539,0.047909,-0.116515
3,0.01373,-0.030795,0.08387,-0.001946,-0.065557,0.095346,0.052906,-0.065769,0.02716,0.009564,...,-0.143228,-0.0139,-0.070325,0.095192,0.011897,0.07717,-0.004127,0.01682,0.083955,0.045543
4,0.016307,-0.026439,0.024721,0.122059,-0.025086,-0.009884,-0.016358,-0.135099,0.044917,-0.006176,...,0.051144,0.01949,0.034068,0.004034,-0.060803,-0.003982,-0.091673,-0.080525,-0.016883,0.002533


In [59]:
# Put an all-zero padding vector back in as the first row (index 0).
padding = [0.0 for _ in range(300)]
# Append the padding under the temporary label -1 ...
counter_fitted_vocab_reset_merged_sorted_dropped_na.loc[-1] = padding
# ... shift every label up by one so the padding row becomes 0 ...
counter_fitted_vocab_reset_merged_sorted_dropped_na.index += 1
# ... and sort by label so the padding row moves to the top.
counter_fitted_vocab_reset_merged_sorted_dropped_na = (
    counter_fitted_vocab_reset_merged_sorted_dropped_na.sort_index(axis=0)
)
counter_fitted_vocab_reset_merged_sorted_dropped_na.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.052956,0.06546,0.066195,0.047072,0.052221,-0.082009,-0.061414,-0.116209,0.015629,0.099293,...,-0.127242,-0.066931,-0.060679,0.048911,0.046153,-0.035672,-0.044314,-0.035856,0.010895,-0.047072
2,-0.008512,-0.034224,0.032284,0.045868,-0.013143,-0.046221,-0.000948,-0.052219,0.046574,0.062451,...,-0.016318,0.00269,-0.059628,0.058923,0.005733,0.000345,0.013319,0.051513,-0.025227,0.017465
3,-0.012361,-0.02223,0.06554,0.039477,-0.08662,0.024913,-0.011163,-0.070522,0.092369,0.092752,...,-0.008863,-0.012265,-0.026254,-0.016193,-0.015235,0.050209,0.01581,0.00539,0.047909,-0.116515
4,0.01373,-0.030795,0.08387,-0.001946,-0.065557,0.095346,0.052906,-0.065769,0.02716,0.009564,...,-0.143228,-0.0139,-0.070325,0.095192,0.011897,0.07717,-0.004127,0.01682,0.083955,0.045543


In [60]:
# Display every row after the first one (the padding row).
counter_fitted_vocab_reset_merged_sorted_dropped_na.iloc[1:, :]

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
1,0.052956,0.065460,0.066195,0.047072,0.052221,-0.082009,-0.061414,-0.116209,0.015629,0.099293,...,-0.127242,-0.066931,-0.060679,0.048911,0.046153,-0.035672,-0.044314,-0.035856,0.010895,-0.047072
2,-0.008512,-0.034224,0.032284,0.045868,-0.013143,-0.046221,-0.000948,-0.052219,0.046574,0.062451,...,-0.016318,0.002690,-0.059628,0.058923,0.005733,0.000345,0.013319,0.051513,-0.025227,0.017465
3,-0.012361,-0.022230,0.065540,0.039477,-0.086620,0.024913,-0.011163,-0.070522,0.092369,0.092752,...,-0.008863,-0.012265,-0.026254,-0.016193,-0.015235,0.050209,0.015810,0.005390,0.047909,-0.116515
4,0.013730,-0.030795,0.083870,-0.001946,-0.065557,0.095346,0.052906,-0.065769,0.027160,0.009564,...,-0.143228,-0.013900,-0.070325,0.095192,0.011897,0.077170,-0.004127,0.016820,0.083955,0.045543
5,0.016307,-0.026439,0.024721,0.122059,-0.025086,-0.009884,-0.016358,-0.135099,0.044917,-0.006176,...,0.051144,0.019490,0.034068,0.004034,-0.060803,-0.003982,-0.091673,-0.080525,-0.016883,0.002533
6,-0.015970,0.014091,-0.022702,0.087678,0.010490,-0.017927,-0.015578,-0.089557,0.036793,0.039925,...,-0.013308,0.001977,-0.088930,-0.015657,-0.059809,-0.030531,-0.048536,0.006928,-0.012056,-0.044152
7,-0.001600,-0.046490,0.058165,-0.015237,-0.039997,0.047824,0.131768,-0.089942,0.035560,-0.015955,...,-0.070590,0.004469,-0.036935,0.029057,-0.068684,-0.010420,0.058132,0.067589,0.093866,-0.023766
8,0.017632,0.003188,0.085649,-0.032106,0.005213,-0.058289,-0.007549,-0.046817,0.041078,0.022407,...,-0.154638,-0.019473,-0.094513,0.111650,0.009629,0.057765,-0.045319,-0.053164,0.105328,-0.016318
9,0.074685,0.097911,0.046451,0.049866,-0.062845,-0.112483,0.032789,-0.110662,0.040986,0.028121,...,-0.066488,-0.028121,-0.012125,0.015256,-0.017077,0.013833,0.004668,0.003415,0.044401,-0.064211
10,-0.035360,-0.022689,0.043905,0.065710,0.040075,0.029319,0.076613,-0.035065,0.041548,0.054513,...,-0.019890,0.031824,-0.068952,0.113152,0.001786,-0.001657,-0.028435,-0.065121,-0.052156,-0.019301


In [61]:
# Persist the final embedding matrix (padding row included) as a NumPy .npy file.
embedding_matrix = counter_fitted_vocab_reset_merged_sorted_dropped_na.values
np.save('GoogleNews-CF-embeddings', embedding_matrix)