# Generate Plain-Text Keyed Embeddings from NumPy Array

In [2]:
# Import packages.
from io import BytesIO
from tensorflow.python.lib.io import file_io
import msgpack
import numpy as np
import pandas as pd

In [9]:
# Pull the serialized embedding matrix into memory as raw bytes, then let
# NumPy parse it from an in-memory buffer.
embeddings_bytes = file_io.read_file_to_string(
    'PA-Gao-300-NC-Lex-V4-AR-Embeddings.npy',
    binary_mode=True,
)
f = BytesIO(embeddings_bytes)
vocab = np.load(f)

# Report the row count now; the same number can be compared against later
# to confirm that no rows were added and no indexes were shifted, i.e. that
# the embeddings are still intact.
print('vocab size:', len(vocab))

# Row 0 is the padding row, so the real vocab indexes begin at 1.
word_embeddings_df = pd.DataFrame(vocab)
word_embeddings_df.head()

vocab size: 36993


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,295,296,297,298,299,300,301,302,303,304
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.03122,0.03741,0.02741,0.012629,0.098199,-0.023932,0.041614,0.036007,0.029063,0.047111,...,0.041359,0.05257,0.024702,-0.001267,0.013895,0.0,0.0,0.0,0.0,0.0
2,0.026165,0.004816,0.040857,0.005003,0.095679,0.079071,-0.10485,0.125986,0.002932,0.001035,...,0.141965,0.026877,0.003176,0.005486,-0.02144,0.0,0.0,0.0,0.0,0.0
3,-0.027357,0.027127,0.030152,-0.088998,0.077427,-0.011305,-0.003792,-0.011705,0.111158,0.032761,...,0.045642,0.079807,-0.072229,0.069743,0.018052,0.0,0.0,0.0,0.0,0.0
4,0.024326,0.08462,0.035454,0.023838,-0.010359,-0.013827,0.062676,0.063406,0.002806,0.159467,...,0.04148,0.02946,-0.017838,-0.002564,0.087242,0.0,0.11464,0.0,0.0,0.0


In [10]:
# Load the id-to-word dict that was saved while generating the word embeddings.
f = BytesIO(file_io.read_file_to_string('Gao_300_NC_Lex_V4_PA_AR_id2word.bin', binary_mode=True))

# The keys of this dict appear to be integer ids (they are later matched
# against the embedding row numbers). msgpack >= 1.0 rejects map keys that
# are not str/bytes unless strict_map_key=False is passed, so allow them
# explicitly. raw=False decodes the packed strings to str.
id2word = msgpack.unpack(f, raw=False, strict_map_key=False)

# One row per id, with the id as the index. NOTE: despite its name, the
# 'id' column holds the word strings, not the numeric ids.
id2word_df = pd.DataFrame.from_dict(id2word, orient='index', columns=['id'])
id2word_df.head()

Unnamed: 0,id
1,.
2,the
3,to
4,and
5,of


In [11]:
# Attach each word to its embedding row by joining the two frames on their
# indexes, then promote the word column ('id') to be the index. Keying the
# frame by word makes it easy to merge in the lexicons, which already use
# the words as their index.
word_emb_merged = (
    word_embeddings_df
    .merge(id2word_df, how='outer', left_index=True, right_index=True)
    .set_index('id')
)
word_emb_merged.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,295,296,297,298,299,300,301,302,303,304
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.,0.03122,0.03741,0.02741,0.012629,0.098199,-0.023932,0.041614,0.036007,0.029063,0.047111,...,0.041359,0.05257,0.024702,-0.001267,0.013895,0.0,0.0,0.0,0.0,0.0
the,0.026165,0.004816,0.040857,0.005003,0.095679,0.079071,-0.10485,0.125986,0.002932,0.001035,...,0.141965,0.026877,0.003176,0.005486,-0.02144,0.0,0.0,0.0,0.0,0.0
to,-0.027357,0.027127,0.030152,-0.088998,0.077427,-0.011305,-0.003792,-0.011705,0.111158,0.032761,...,0.045642,0.079807,-0.072229,0.069743,0.018052,0.0,0.0,0.0,0.0,0.0
and,0.024326,0.08462,0.035454,0.023838,-0.010359,-0.013827,0.062676,0.063406,0.002806,0.159467,...,0.04148,0.02946,-0.017838,-0.002564,0.087242,0.0,0.11464,0.0,0.0,0.0


In [12]:
# Move the word index back into an ordinary column so that the words are
# part of the frame's values.
word_emb_merged_reset = word_emb_merged.reset_index(drop=False)
word_emb_merged_reset.head()

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,295,296,297,298,299,300,301,302,303,304
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,.,0.03122,0.03741,0.02741,0.012629,0.098199,-0.023932,0.041614,0.036007,0.029063,...,0.041359,0.05257,0.024702,-0.001267,0.013895,0.0,0.0,0.0,0.0,0.0
2,the,0.026165,0.004816,0.040857,0.005003,0.095679,0.079071,-0.10485,0.125986,0.002932,...,0.141965,0.026877,0.003176,0.005486,-0.02144,0.0,0.0,0.0,0.0,0.0
3,to,-0.027357,0.027127,0.030152,-0.088998,0.077427,-0.011305,-0.003792,-0.011705,0.111158,...,0.045642,0.079807,-0.072229,0.069743,0.018052,0.0,0.0,0.0,0.0,0.0
4,and,0.024326,0.08462,0.035454,0.023838,-0.010359,-0.013827,0.062676,0.063406,0.002806,...,0.04148,0.02946,-0.017838,-0.002564,0.087242,0.0,0.11464,0.0,0.0,0.0


In [13]:
# Dump the frame's values to a space-delimited text file via NumPy,
# formatting every cell with '%s'.
np.savetxt(
    fname='PA-Gao-300-NC-Lex-V4-AR-keyed-embeddings.txt',
    X=word_emb_merged_reset.values,
    fmt='%s',
    delimiter=' ',
)

In [9]:
# Run: python retrofit.py -i embeddings.txt -l lexicons/ppdb-xl.txt -n 10 -o out_vec.txt
# This produces the enhanced embeddings, which then need to be turned back into a NumPy array of floats.

In [24]:
# Read the enhanced word embeddings back in as floats.
# The first field of each line is the word itself (a string), so it is
# skipped. The number of fields is taken from the file's first line rather
# than hard-coded, so vectors of any width are read in full instead of
# being cut off after 200 values.
enhanced_path = 'out_vec.txt'
with open(enhanced_path, 'rb') as enhanced_file:
    n_fields = len(enhanced_file.readline().split())
if n_fields < 2:
    raise ValueError(
        'expected a word followed by at least one vector component per line in %s'
        % enhanced_path
    )

# comments=None disables comment handling: by default np.loadtxt treats '#'
# as the start of a comment, which would drop or truncate the line of any
# token containing '#' and shift every following row index.
vocab = np.loadtxt(enhanced_path, usecols=range(1, n_fields), comments=None)

# Check the length of the vocab.
print('vocab size:', len(vocab))

# Reset the first row to zeros so it serves as padding (real indexes start at 1).
# NOTE(review): this assumes the retrofitting step kept the rows in their
# original order with the padding row first -- confirm against out_vec.txt.
vocab[0] = 0

# Create df.
word_embeddings_df = pd.DataFrame(data=vocab)
word_embeddings_df.head()

vocab size: 36994


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.062,0.0217,0.0165,-0.0669,-0.0349,-0.0018,-0.0024,0.1546,0.0611,-0.1022,...,-0.0114,-0.079,-0.0909,-0.0997,-0.1524,0.0419,-0.0098,0.1719,0.1107,0.0456
2,0.0211,0.0059,0.0055,0.0126,0.0767,-0.0247,0.0594,0.0652,-0.0192,-0.0852,...,0.0314,0.0318,0.0009,-0.0341,0.0279,0.0825,-0.0489,-0.0186,0.007,0.0147
3,0.0497,0.0264,-0.0977,0.0085,0.0952,0.0237,0.0006,-0.0041,0.0464,0.0158,...,-0.0192,0.0144,-0.0339,-0.1108,-0.0246,0.002,-0.0825,-0.0178,0.0009,-0.0263
4,0.0043,0.0509,-0.1038,-0.0105,0.0438,0.0003,0.0537,0.0824,0.0184,-0.1108,...,-0.0507,0.0019,-0.061,-0.0162,-0.0205,0.0615,-0.0391,0.075,-0.0121,-0.0072


In [25]:
# Persist the enhanced embedding matrix to disk in NumPy's binary format.
np.save(file='enhanced-embeddings', arr=vocab)