In [1]:
# Import packages.
from io import BytesIO
from tensorflow.python.lib.io import file_io
import msgpack
import numpy as np
import pandas as pd

In [2]:
# Read in the word embedding.
f = BytesIO(file_io.read_file_to_string('wikimedia-PA-300-embeddings.npy', binary_mode=True))
vocab = np.load(f)

# Later we can use this as a check that we haven't added any rows or
# changed any indexes, ensuring the integrity of the embeddings has
# been preserved.
print('vocab size:', len(vocab))

# The first row is padding so the vocab indexes start at 1.
word_embeddings_df = pd.DataFrame(data=vocab)
word_embeddings_df.head()

vocab size: 36995


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.2703,0.323896,0.237319,0.109343,0.850202,-0.207198,0.36029,0.311746,0.251624,0.407885,...,-0.667598,-0.048573,0.153635,-0.931134,0.778837,0.358085,0.455149,0.213865,-0.010973,0.120303
2,0.226789,0.041741,0.354136,0.043368,0.829325,0.685373,-0.908815,1.092019,0.025413,0.008967,...,-0.04064,0.336631,0.655913,-0.781886,0.014642,1.230519,0.232964,0.027527,0.047553,-0.185841
3,-0.236558,0.234574,0.260735,-0.769586,0.669529,-0.097754,-0.032788,-0.101215,0.96121,0.283292,...,-0.720961,0.062782,0.062898,-0.163448,-0.073236,0.39468,0.690109,-0.624584,0.603089,0.1561
4,0.212198,0.738136,0.309266,0.207942,-0.090359,-0.120615,0.546719,0.553092,0.024477,1.39102,...,0.056193,-0.125345,0.30219,-1.06465,0.6199,0.361829,0.256977,-0.155599,-0.022368,0.761006


In [3]:
# Get id-to-word dict previously created while generating word embeddings.
f = BytesIO(file_io.read_file_to_string('id2word.bin', binary_mode=True))
id2word = msgpack.unpack(f, raw=False)
id2word_df = pd.DataFrame.from_dict(id2word, orient='index', columns=['id'])
id2word_df.head()

Unnamed: 0,id
14,this
9,is
16,not
3450,creative
1,.


In [4]:
# Merge the id-to-word dict with word embeddings, then set words as index, 
# which allows the lexicons to be easily merged since both have the words 
# as the index already.
word_emb_merged = pd.merge(word_embeddings_df, id2word_df, left_index=True, right_index=True, how='outer')
word_emb_merged = word_emb_merged.set_index('id')
word_emb_merged.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.,0.2703,0.323896,0.237319,0.109343,0.850202,-0.207198,0.36029,0.311746,0.251624,0.407885,...,-0.667598,-0.048573,0.153635,-0.931134,0.778837,0.358085,0.455149,0.213865,-0.010973,0.120303
the,0.226789,0.041741,0.354136,0.043368,0.829325,0.685373,-0.908815,1.092019,0.025413,0.008967,...,-0.04064,0.336631,0.655913,-0.781886,0.014642,1.230519,0.232964,0.027527,0.047553,-0.185841
to,-0.236558,0.234574,0.260735,-0.769586,0.669529,-0.097754,-0.032788,-0.101215,0.96121,0.283292,...,-0.720961,0.062782,0.062898,-0.163448,-0.073236,0.39468,0.690109,-0.624584,0.603089,0.1561
and,0.212198,0.738136,0.309266,0.207942,-0.090359,-0.120615,0.546719,0.553092,0.024477,1.39102,...,0.056193,-0.125345,0.30219,-1.06465,0.6199,0.361829,0.256977,-0.155599,-0.022368,0.761006


In [5]:
# Reset the index to add the indexes as a col of values.
word_emb_merged_reset = word_emb_merged.reset_index()
word_emb_merged_reset.head()

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,.,0.2703,0.323896,0.237319,0.109343,0.850202,-0.207198,0.36029,0.311746,0.251624,...,-0.667598,-0.048573,0.153635,-0.931134,0.778837,0.358085,0.455149,0.213865,-0.010973,0.120303
2,the,0.226789,0.041741,0.354136,0.043368,0.829325,0.685373,-0.908815,1.092019,0.025413,...,-0.04064,0.336631,0.655913,-0.781886,0.014642,1.230519,0.232964,0.027527,0.047553,-0.185841
3,to,-0.236558,0.234574,0.260735,-0.769586,0.669529,-0.097754,-0.032788,-0.101215,0.96121,...,-0.720961,0.062782,0.062898,-0.163448,-0.073236,0.39468,0.690109,-0.624584,0.603089,0.1561
4,and,0.212198,0.738136,0.309266,0.207942,-0.090359,-0.120615,0.546719,0.553092,0.024477,...,0.056193,-0.125345,0.30219,-1.06465,0.6199,0.361829,0.256977,-0.155599,-0.022368,0.761006


In [6]:
# Save values of df as a formatted text file using np.
np.savetxt('wikimedia-PA-Gao-300-keyed-embeddings.txt', word_emb_merged_reset.values, delimiter=' ', fmt='%s')

In [9]:
# Run python retrofit.py -i embeddings.txt -l lexicons/ppdb-xl.txt -n 10 -o out_vec.txt.
# This produces the enhanced embeddings, which then need to turned back into a numpy array of floats.

In [7]:
# Read in the enhanced word embeddings but skip first col (the words as strings).
vocab = np.loadtxt('wikimedia-PA-Gao-300-retrofitted-keyed-embeddings.txt', usecols=range(1, 201))

# Check the length of the vocab.
print('vocab size:', len(vocab))

# Reset the first row to zeros for padding to index 1.
vocab[0] = 0

# Create df.
word_embeddings_df = pd.DataFrame(data=vocab)
word_embeddings_df.head()

vocab size: 36994


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0312,0.0374,0.0274,0.0126,0.0982,-0.0239,0.0416,0.036,0.0291,0.0471,...,0.0128,-0.006,-0.0581,-0.0126,-0.0883,-0.0015,-0.0567,-0.1139,0.0188,0.0241
2,0.0145,0.0009,0.0387,0.0288,0.0419,0.0629,-0.0667,0.0765,-0.0227,-0.024,...,0.0151,0.0286,0.0744,-0.0104,0.0708,-0.0041,0.0061,-0.0294,0.031,-0.0257
3,-0.0145,0.0127,0.0257,-0.0529,0.0298,0.0018,-0.0024,0.0054,0.0403,-0.0009,...,0.008,-0.0012,0.0192,-0.018,-0.0325,-0.0411,0.0048,-0.0321,0.0213,-0.0063
4,0.0127,0.0428,0.0316,0.0294,-0.0094,0.0024,0.0246,0.04,-0.0208,0.0685,...,0.0098,0.0009,0.0348,0.039,-0.0211,-0.0168,-0.0236,-0.0395,0.0455,0.0163


In [8]:
# Write the damn thing to file.
np.save('wikimedia-PA-Gao-300-retrofitted-embeddings', vocab)