In [2]:
# Import packages.
from io import BytesIO
from tensorflow.python.lib.io import file_io
import msgpack
import numpy as np
import pandas as pd

In [3]:
# Read in the word embedding.
f = BytesIO(file_io.read_file_to_string('wikimedia-personal-attacks-200-embeddings.npy', binary_mode=True))
vocab = np.load(f)

# Later we can use this as a check that we haven't added any rows or
# changed any indexes, ensuring the integrity of the embeddings has
# been preserved.
print('vocab size:', len(vocab))

# The first row is padding so the vocab indexes start at 1.
word_embeddings_df = pd.DataFrame(data=vocab)
word_embeddings_df.head()

vocab size: 36995


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.438141,0.153445,0.116399,-0.473077,-0.246776,-0.012694,-0.016794,1.093161,0.432185,-0.722196,...,-0.080715,-0.558688,-0.642753,-0.704572,-1.077608,0.296168,-0.069523,1.215208,0.782416,0.322148
2,0.069291,0.075758,0.509854,0.206986,0.684989,-0.362959,0.501617,0.962356,-0.141415,-0.774441,...,0.499325,0.36411,0.38025,-0.357254,0.173331,0.957385,-0.219006,-0.618507,-0.093829,0.137752
3,0.359084,0.249134,-0.888367,0.263359,0.803612,0.131747,-0.250332,0.020726,0.618335,0.398759,...,-0.132253,0.067955,-0.098798,-1.293878,-0.39063,0.082776,-0.796724,-0.456788,-0.044439,-0.478273
4,-0.016234,0.54535,-0.95214,-0.091608,0.494948,-0.023566,0.724107,1.011107,0.302508,-1.160711,...,-0.452454,-0.100608,-0.445299,-0.029212,-0.514012,0.70092,-0.304741,0.697078,-0.131612,-0.194109


In [4]:
# Get id-to-word dict previously created while generating word embeddings.
f = BytesIO(file_io.read_file_to_string('id2word.bin', binary_mode=True))
id2word = msgpack.unpack(f, raw=False)
id2word_df = pd.DataFrame.from_dict(id2word, orient='index', columns=['id'])
id2word_df.head()

Unnamed: 0,id
14,this
9,is
16,not
3450,creative
1,.


In [5]:
# Merge the id-to-word dict with word embeddings, then set words as index, 
# which allows the lexicons to be easily merged since both have the words 
# as the index already.
word_emb_merged = pd.merge(word_embeddings_df, id2word_df, left_index=True, right_index=True, how='outer')
word_emb_merged = word_emb_merged.set_index('id')
word_emb_merged.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.,0.438141,0.153445,0.116399,-0.473077,-0.246776,-0.012694,-0.016794,1.093161,0.432185,-0.722196,...,-0.080715,-0.558688,-0.642753,-0.704572,-1.077608,0.296168,-0.069523,1.215208,0.782416,0.322148
the,0.069291,0.075758,0.509854,0.206986,0.684989,-0.362959,0.501617,0.962356,-0.141415,-0.774441,...,0.499325,0.36411,0.38025,-0.357254,0.173331,0.957385,-0.219006,-0.618507,-0.093829,0.137752
to,0.359084,0.249134,-0.888367,0.263359,0.803612,0.131747,-0.250332,0.020726,0.618335,0.398759,...,-0.132253,0.067955,-0.098798,-1.293878,-0.39063,0.082776,-0.796724,-0.456788,-0.044439,-0.478273
and,-0.016234,0.54535,-0.95214,-0.091608,0.494948,-0.023566,0.724107,1.011107,0.302508,-1.160711,...,-0.452454,-0.100608,-0.445299,-0.029212,-0.514012,0.70092,-0.304741,0.697078,-0.131612,-0.194109


In [6]:
# Reset the index to add the indexes as a col of values.
word_emb_merged_reset = word_emb_merged.reset_index()
word_emb_merged_reset.head()

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,.,0.438141,0.153445,0.116399,-0.473077,-0.246776,-0.012694,-0.016794,1.093161,0.432185,...,-0.080715,-0.558688,-0.642753,-0.704572,-1.077608,0.296168,-0.069523,1.215208,0.782416,0.322148
2,the,0.069291,0.075758,0.509854,0.206986,0.684989,-0.362959,0.501617,0.962356,-0.141415,...,0.499325,0.36411,0.38025,-0.357254,0.173331,0.957385,-0.219006,-0.618507,-0.093829,0.137752
3,to,0.359084,0.249134,-0.888367,0.263359,0.803612,0.131747,-0.250332,0.020726,0.618335,...,-0.132253,0.067955,-0.098798,-1.293878,-0.39063,0.082776,-0.796724,-0.456788,-0.044439,-0.478273
4,and,-0.016234,0.54535,-0.95214,-0.091608,0.494948,-0.023566,0.724107,1.011107,0.302508,...,-0.452454,-0.100608,-0.445299,-0.029212,-0.514012,0.70092,-0.304741,0.697078,-0.131612,-0.194109


In [8]:
# Save values of df as a formatted text file using np.
np.savetxt(r'embeddings.txt', word_emb_merged_reset.values, delimiter=' ', fmt='%s')

In [9]:
# Run python retrofit.py -i embeddings.txt -l lexicons/ppdb-xl.txt -n 10 -o out_vec.txt.
# This produces the enhanced embeddings, which then need to turned back into a numpy array of floats.

In [24]:
# Read in the enhanced word embeddings but skip first col (the words as strings).
vocab = np.loadtxt('out_vec.txt', usecols=range(1, 201))

# Check the length of the vocab.
print('vocab size:', len(vocab))

# Reset the first row to zeros for padding to index 1.
vocab[0] = 0

# Create df.
word_embeddings_df = pd.DataFrame(data=vocab)
word_embeddings_df.head()

vocab size: 36994


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.062,0.0217,0.0165,-0.0669,-0.0349,-0.0018,-0.0024,0.1546,0.0611,-0.1022,...,-0.0114,-0.079,-0.0909,-0.0997,-0.1524,0.0419,-0.0098,0.1719,0.1107,0.0456
2,0.0211,0.0059,0.0055,0.0126,0.0767,-0.0247,0.0594,0.0652,-0.0192,-0.0852,...,0.0314,0.0318,0.0009,-0.0341,0.0279,0.0825,-0.0489,-0.0186,0.007,0.0147
3,0.0497,0.0264,-0.0977,0.0085,0.0952,0.0237,0.0006,-0.0041,0.0464,0.0158,...,-0.0192,0.0144,-0.0339,-0.1108,-0.0246,0.002,-0.0825,-0.0178,0.0009,-0.0263
4,0.0043,0.0509,-0.1038,-0.0105,0.0438,0.0003,0.0537,0.0824,0.0184,-0.1108,...,-0.0507,0.0019,-0.061,-0.0162,-0.0205,0.0615,-0.0391,0.075,-0.0121,-0.0072


In [25]:
# Write the damn thing to file.
np.save('enhanced-embeddings', vocab)