# Prep Wikimedia Data for Post-Specialized AR Embeddings

In [1]:
# Import packages.
import csv
from io import BytesIO

import msgpack
import numpy as np
import pandas as pd
from tensorflow.python.lib.io import file_io

In [2]:
# # Get id-to-word dict previously created while generating word embeddings.
# f = BytesIO(file_io.read_file_to_string('PA-300-id2word.bin', binary_mode=True))
# Gao_PA_id2word = msgpack.unpack(f, raw=False)
# Gao_PA_id2word_df = pd.DataFrame.from_dict(Gao_PA_id2word, orient='index', columns=['id'])
# Gao_PA_id2word_df.head()

In [3]:
# Run attract-repel script.
# This produces the retrofitted embeddings, which then need to be turned back into a numpy array of floats.

In [39]:
# Read the result back in: one row per word, with the token in column 0
# followed by the embedding components.
#
# The parser options below keep every token exactly as written in the file:
# - dtype={0: str} and keep_default_na=False: with pandas' defaults, tokens such
#   as "nan", "null" or "n/a" are parsed as missing values (NaN) instead of
#   words. NOTE(review): that would be consistent with the word dictionaries
#   built further down ending up one entry smaller than the row count printed
#   here -- confirm against the file.
# - quoting=csv.QUOTE_NONE: a token that contains a double quote must not be
#   treated as the start of a quoted field.
post_spec_AR_vocab = pd.read_csv(
    'wikimedia-PA-Gao-AR-PPDB-300-post-specialized.txt',
    sep=" ",
    header=None,
    dtype={0: str},
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
)

# Check the length of the vocab.
print('post_spec_AR_vocab size:', len(post_spec_AR_vocab))
post_spec_AR_vocab.head()

post_spec_AR_vocab size: 183870


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,biennials,-0.007001,0.034904,-0.006476,-0.022211,0.023702,0.005626,-0.062335,0.130678,0.079644,...,0.00929,-0.101324,0.002901,0.04894,-0.155127,-0.017552,-0.005853,-0.029875,-0.009073,-0.106691
1,tripolitan,-0.056535,0.000427,-0.02293,-0.024883,0.030431,-0.022631,0.040052,-0.05398,-0.042047,...,0.046784,-0.070465,-0.019298,-0.000591,-0.030715,-0.017091,-0.005564,-0.032029,-0.00545,0.031682
2,tsukino,0.006473,0.004586,-0.033621,-0.029989,0.071586,-0.014516,-0.032025,-0.009592,0.003382,...,-0.035731,-0.069168,0.051958,0.049571,-0.112113,-0.071534,-0.030837,-0.058374,0.050012,0.01431
3,nunnery,0.127087,0.017128,0.023523,-0.006482,-0.07861,0.004704,0.022335,0.054162,0.068886,...,-0.018985,-0.123621,0.053752,-0.004124,-0.126023,0.081543,0.011255,-0.046931,0.041655,-0.016432
4,schwarzburg-rudolstadt,0.074724,-0.01541,-5.6e-05,-0.032053,-0.008711,-0.03005,-0.035028,0.007226,0.111881,...,-0.033774,-0.038186,0.021029,0.038031,-0.067227,-0.019451,0.002031,-0.070298,0.037921,-0.001279


In [40]:
# Promote the word column (label 0) to the row index, so that only the
# embedding components remain as columns.
post_spec_AR_vocab = post_spec_AR_vocab.set_index(0)
post_spec_AR_vocab.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
biennials,-0.007001,0.034904,-0.006476,-0.022211,0.023702,0.005626,-0.062335,0.130678,0.079644,0.020641,...,0.00929,-0.101324,0.002901,0.04894,-0.155127,-0.017552,-0.005853,-0.029875,-0.009073,-0.106691
tripolitan,-0.056535,0.000427,-0.02293,-0.024883,0.030431,-0.022631,0.040052,-0.05398,-0.042047,-0.004923,...,0.046784,-0.070465,-0.019298,-0.000591,-0.030715,-0.017091,-0.005564,-0.032029,-0.00545,0.031682
tsukino,0.006473,0.004586,-0.033621,-0.029989,0.071586,-0.014516,-0.032025,-0.009592,0.003382,0.008644,...,-0.035731,-0.069168,0.051958,0.049571,-0.112113,-0.071534,-0.030837,-0.058374,0.050012,0.01431
nunnery,0.127087,0.017128,0.023523,-0.006482,-0.07861,0.004704,0.022335,0.054162,0.068886,-0.002589,...,-0.018985,-0.123621,0.053752,-0.004124,-0.126023,0.081543,0.011255,-0.046931,0.041655,-0.016432
schwarzburg-rudolstadt,0.074724,-0.01541,-5.6e-05,-0.032053,-0.008711,-0.03005,-0.035028,0.007226,0.111881,-0.018725,...,-0.033774,-0.038186,0.021029,0.038031,-0.067227,-0.019451,0.002031,-0.070298,0.037921,-0.001279


In [41]:
# Move the words back out of the index into column 0; the frame gets a fresh
# 0-based integer index in exchange.
post_spec_AR_vocab = post_spec_AR_vocab.reset_index()

# Shift index by 1 for masking of zero.
post_spec_AR_vocab.index += 1
post_spec_AR_vocab.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
1,biennials,-0.007001,0.034904,-0.006476,-0.022211,0.023702,0.005626,-0.062335,0.130678,0.079644,...,0.00929,-0.101324,0.002901,0.04894,-0.155127,-0.017552,-0.005853,-0.029875,-0.009073,-0.106691
2,tripolitan,-0.056535,0.000427,-0.02293,-0.024883,0.030431,-0.022631,0.040052,-0.05398,-0.042047,...,0.046784,-0.070465,-0.019298,-0.000591,-0.030715,-0.017091,-0.005564,-0.032029,-0.00545,0.031682
3,tsukino,0.006473,0.004586,-0.033621,-0.029989,0.071586,-0.014516,-0.032025,-0.009592,0.003382,...,-0.035731,-0.069168,0.051958,0.049571,-0.112113,-0.071534,-0.030837,-0.058374,0.050012,0.01431
4,nunnery,0.127087,0.017128,0.023523,-0.006482,-0.07861,0.004704,0.022335,0.054162,0.068886,...,-0.018985,-0.123621,0.053752,-0.004124,-0.126023,0.081543,0.011255,-0.046931,0.041655,-0.016432
5,schwarzburg-rudolstadt,0.074724,-0.01541,-5.6e-05,-0.032053,-0.008711,-0.03005,-0.035028,0.007226,0.111881,...,-0.033774,-0.038186,0.021029,0.038031,-0.067227,-0.019451,0.002031,-0.070298,0.037921,-0.001279


In [42]:
# Copy the current (1-based) row ids into a regular column named 'index' so
# they can be paired with the words; the frame itself returns to a 0-based index.
post_spec_AR_vocab = post_spec_AR_vocab.reset_index()
post_spec_AR_vocab.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,291,292,293,294,295,296,297,298,299,300
0,1,biennials,-0.007001,0.034904,-0.006476,-0.022211,0.023702,0.005626,-0.062335,0.130678,...,0.00929,-0.101324,0.002901,0.04894,-0.155127,-0.017552,-0.005853,-0.029875,-0.009073,-0.106691
1,2,tripolitan,-0.056535,0.000427,-0.02293,-0.024883,0.030431,-0.022631,0.040052,-0.05398,...,0.046784,-0.070465,-0.019298,-0.000591,-0.030715,-0.017091,-0.005564,-0.032029,-0.00545,0.031682
2,3,tsukino,0.006473,0.004586,-0.033621,-0.029989,0.071586,-0.014516,-0.032025,-0.009592,...,-0.035731,-0.069168,0.051958,0.049571,-0.112113,-0.071534,-0.030837,-0.058374,0.050012,0.01431
3,4,nunnery,0.127087,0.017128,0.023523,-0.006482,-0.07861,0.004704,0.022335,0.054162,...,-0.018985,-0.123621,0.053752,-0.004124,-0.126023,0.081543,0.011255,-0.046931,0.041655,-0.016432
4,5,schwarzburg-rudolstadt,0.074724,-0.01541,-5.6e-05,-0.032053,-0.008711,-0.03005,-0.035028,0.007226,...,-0.033774,-0.038186,0.021029,0.038031,-0.067227,-0.019451,0.002031,-0.070298,0.037921,-0.001279


In [43]:
# Word -> id lookup table: index the rows by the word column (label 0) and
# keep nothing but the 'index' column holding the ids.
word2id_df = post_spec_AR_vocab.set_index(0)[['index']]
word2id_df.head()

Unnamed: 0_level_0,index
0,Unnamed: 1_level_1
biennials,1
tripolitan,2
tsukino,3
nunnery,4
schwarzburg-rudolstadt,5


In [44]:
# Map each word to its id, taken from the 'index' column of word2id_df.
word2id_dict = word2id_df['index'].to_dict()

# Invert the mapping to obtain id -> word.
# Needed to encode examples.
id2word_dict = dict(zip(word2id_dict.values(), word2id_dict.keys()))

In [47]:
# Persist the word -> id dictionary to disk in msgpack format.
with open('Gao_300_PA_AR_Post_Spec_word2id.bin', 'wb') as out_file:
    out_file.write(msgpack.packb(word2id_dict))

In [48]:
# Persist the id -> word dictionary to disk in msgpack format.
# NOTE(review): the keys of this map are integers; recent msgpack releases
# reject non-str/bytes map keys when unpacking unless strict_map_key=False is
# passed -- confirm that whatever reads this file does so.
with open('Gao_300_PA_AR_Post_Spec_id2word.bin', 'wb') as out_file:
    out_file.write(msgpack.packb(id2word_dict))

In [62]:
# Spot-check that the two dictionaries are inverses of each other for one
# known word/id pair, then compare their sizes.
probe_word, probe_id = 'jawbone', 183868
print('jawbone:', word2id_dict[probe_word])
print('183868:', id2word_dict[probe_id])

# Both sizes should match the number of vocabulary rows; a smaller value means
# several rows collapsed onto the same dictionary key.
print('len(word2id_dict):', len(word2id_dict))
print('len(id2word_dict):', len(id2word_dict))

jawbone: 183868
183868: jawbone
len(word2id_dict): 183869
len(id2word_dict): 183869


In [68]:
# Look up the word stored under id 183870 (ids are 1-based, and the vocabulary
# file was reported to have 183870 rows, so this is the last word id).
id2word_dict[183870]

'18c'

In [22]:
# Next, merge the ids from the id2word dict (same one used to create the initial embeddings in text format).
# attract_repel_vocab_reset_merged = pd.merge(attract_repel_vocab_reset, Gao_PA_id2word_df, left_index=True, right_on='id', how='outer')
# attract_repel_vocab_reset_merged.head()

In [23]:
# Verify ids and word are correctly matched.
# print(Gao_PA_id2word_df.loc[2, 'id'] + ' = ' + attract_repel_vocab_reset_merged.loc[2, 'id'])

In [28]:
# Restore order by numerical index.
# attract_repel_vocab_reset_merged_sorted = attract_repel_vocab_reset_merged.sort_index()
# attract_repel_vocab_reset_merged_sorted.head()

In [24]:
# Drop NaNs.
# attract_repel_vocab_reset_merged_sorted_dropped_na = attract_repel_vocab_reset_merged_sorted.dropna().copy()
# print('len(attract_repel_vocab_reset_merged_sorted_dropped_na):', len(attract_repel_vocab_reset_merged_sorted_dropped_na))
# attract_repel_vocab_reset_merged_sorted_dropped_na.head()

In [47]:
# Reset numerical index: the previous index is moved into an 'index' column
# and the rows are renumbered from 0.
# NOTE(review): in the visible notebook this frame is only assigned inside
# commented-out cells, so on a fresh kernel this cell raises NameError.
attract_repel_vocab_reset_merged_sorted_dropped_na = (
    attract_repel_vocab_reset_merged_sorted_dropped_na.reset_index()
)
attract_repel_vocab_reset_merged_sorted_dropped_na.head()

Unnamed: 0,index,1,2,3,4,5,6,7,8,9,...,292,293,294,295,296,297,298,299,300,id
0,1,0.03122,0.03741,0.02741,0.012629,0.098199,-0.023932,0.041614,0.036007,0.029063,...,-0.00561,0.017745,-0.107547,0.089956,0.041359,0.05257,0.024702,-0.001267,0.013895,.
1,2,0.026165,0.004816,0.040857,0.005003,0.095679,0.079071,-0.10485,0.125986,0.002932,...,0.038837,0.075673,-0.090206,0.001689,0.141965,0.026877,0.003176,0.005486,-0.02144,the
2,3,-0.027357,0.027127,0.030152,-0.088998,0.077427,-0.011305,-0.003792,-0.011705,0.111158,...,0.00726,0.007274,-0.018902,-0.008469,0.045642,0.079807,-0.072229,0.069743,0.018052,to
3,4,0.024488,0.085182,0.03569,0.023997,-0.010428,-0.013919,0.063092,0.063827,0.002825,...,-0.014465,0.034873,-0.122861,0.071537,0.041755,0.029655,-0.017956,-0.002581,0.087821,and
4,5,0.060285,0.077495,-0.027674,-0.069554,0.018359,0.040705,0.000747,0.02012,-0.00283,...,0.07124,0.038645,-0.12752,0.017034,0.046079,0.033626,0.035553,-0.020441,0.089884,of


In [48]:
# Drop old index, then shift new index by 1 to accommodate padding.
attract_repel_vocab_reset_merged_sorted_dropped_na = (
    attract_repel_vocab_reset_merged_sorted_dropped_na.drop(columns='index')
)
attract_repel_vocab_reset_merged_sorted_dropped_na.index += 1  # ids now start at 1
attract_repel_vocab_reset_merged_sorted_dropped_na.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,292,293,294,295,296,297,298,299,300,id
1,0.03122,0.03741,0.02741,0.012629,0.098199,-0.023932,0.041614,0.036007,0.029063,0.047111,...,-0.00561,0.017745,-0.107547,0.089956,0.041359,0.05257,0.024702,-0.001267,0.013895,.
2,0.026165,0.004816,0.040857,0.005003,0.095679,0.079071,-0.10485,0.125986,0.002932,0.001035,...,0.038837,0.075673,-0.090206,0.001689,0.141965,0.026877,0.003176,0.005486,-0.02144,the
3,-0.027357,0.027127,0.030152,-0.088998,0.077427,-0.011305,-0.003792,-0.011705,0.111158,0.032761,...,0.00726,0.007274,-0.018902,-0.008469,0.045642,0.079807,-0.072229,0.069743,0.018052,to
4,0.024488,0.085182,0.03569,0.023997,-0.010428,-0.013919,0.063092,0.063827,0.002825,0.160525,...,-0.014465,0.034873,-0.122861,0.071537,0.041755,0.029655,-0.017956,-0.002581,0.087821,and
5,0.060285,0.077495,-0.027674,-0.069554,0.018359,0.040705,0.000747,0.02012,-0.00283,0.079162,...,0.07124,0.038645,-0.12752,0.017034,0.046079,0.033626,0.035553,-0.020441,0.089884,of


In [51]:
# Drop vectors to get id2word and word2id dicts: expose the shifted index as
# an 'index' column, then index the rows by the word column ('id') and keep
# only the ids.
attract_repel_vocab_reset_merged_sorted_dropped_na = (
    attract_repel_vocab_reset_merged_sorted_dropped_na.reset_index()
)

word2id_df = attract_repel_vocab_reset_merged_sorted_dropped_na.set_index('id')[['index']]
word2id_df.head()

Unnamed: 0_level_0,index
id,Unnamed: 1_level_1
.,1
the,2
to,3
and,4
of,5


In [52]:
# Map each word to its id, taken from the 'index' column of word2id_df.
# Note: this rebinds word2id_dict and id2word_dict, replacing whatever an
# earlier cell stored under those names.
word2id_dict = word2id_df['index'].to_dict()

# Invert the mapping to obtain id -> word.
# Needed to encode examples.
id2word_dict = dict(zip(word2id_dict.values(), word2id_dict.keys()))

In [55]:
# Persist the word -> id dictionary to disk in msgpack format.
with open('Gao_300_PA_AR_word2id.bin', 'wb') as out_file:
    out_file.write(msgpack.packb(word2id_dict))

In [56]:
# Persist the id -> word dictionary to disk in msgpack format.
with open('Gao_300_PA_AR_id2word.bin', 'wb') as out_file:
    out_file.write(msgpack.packb(id2word_dict))

In [49]:
# Drop the id column ('index') and the word column (0), leaving only the
# embedding components.
post_spec_AR_vocab = post_spec_AR_vocab.drop(columns=['index', 0])
post_spec_AR_vocab.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,-0.007001,0.034904,-0.006476,-0.022211,0.023702,0.005626,-0.062335,0.130678,0.079644,0.020641,...,0.00929,-0.101324,0.002901,0.04894,-0.155127,-0.017552,-0.005853,-0.029875,-0.009073,-0.106691
1,-0.056535,0.000427,-0.02293,-0.024883,0.030431,-0.022631,0.040052,-0.05398,-0.042047,-0.004923,...,0.046784,-0.070465,-0.019298,-0.000591,-0.030715,-0.017091,-0.005564,-0.032029,-0.00545,0.031682
2,0.006473,0.004586,-0.033621,-0.029989,0.071586,-0.014516,-0.032025,-0.009592,0.003382,0.008644,...,-0.035731,-0.069168,0.051958,0.049571,-0.112113,-0.071534,-0.030837,-0.058374,0.050012,0.01431
3,0.127087,0.017128,0.023523,-0.006482,-0.07861,0.004704,0.022335,0.054162,0.068886,-0.002589,...,-0.018985,-0.123621,0.053752,-0.004124,-0.126023,0.081543,0.011255,-0.046931,0.041655,-0.016432
4,0.074724,-0.01541,-5.6e-05,-0.032053,-0.008711,-0.03005,-0.035028,0.007226,0.111881,-0.018725,...,-0.033774,-0.038186,0.021029,0.038031,-0.067227,-0.019451,0.002031,-0.070298,0.037921,-0.001279


In [50]:
# Restore padding as first row: an all-zero vector is placed at row 0 and every
# word vector moves down by one, so row i holds the embedding of word id i.
# The padding width is taken from the frame itself rather than hard-coded, so
# it always matches the number of embedding columns.
padding = [0.0] * post_spec_AR_vocab.shape[1]
post_spec_AR_vocab.loc[-1] = padding  # adding a row (temporary label -1)
post_spec_AR_vocab.index = post_spec_AR_vocab.index + 1  # shifting index: -1 -> 0, 0 -> 1, ...
post_spec_AR_vocab = post_spec_AR_vocab.sort_index()  # sorting by index puts the padding row first
post_spec_AR_vocab.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.007001,0.034904,-0.006476,-0.022211,0.023702,0.005626,-0.062335,0.130678,0.079644,0.020641,...,0.00929,-0.101324,0.002901,0.04894,-0.155127,-0.017552,-0.005853,-0.029875,-0.009073,-0.106691
2,-0.056535,0.000427,-0.02293,-0.024883,0.030431,-0.022631,0.040052,-0.05398,-0.042047,-0.004923,...,0.046784,-0.070465,-0.019298,-0.000591,-0.030715,-0.017091,-0.005564,-0.032029,-0.00545,0.031682
3,0.006473,0.004586,-0.033621,-0.029989,0.071586,-0.014516,-0.032025,-0.009592,0.003382,0.008644,...,-0.035731,-0.069168,0.051958,0.049571,-0.112113,-0.071534,-0.030837,-0.058374,0.050012,0.01431
4,0.127087,0.017128,0.023523,-0.006482,-0.07861,0.004704,0.022335,0.054162,0.068886,-0.002589,...,-0.018985,-0.123621,0.053752,-0.004124,-0.126023,0.081543,0.011255,-0.046931,0.041655,-0.016432


In [53]:
# Add additional word embedding for unknown words, appended as the last row.
# The vector is drawn from a seeded generator so that the saved matrix is
# identical across runs, and its width follows the existing matrix instead of
# a hard-coded 300. From here on post_spec_AR_vocab is a NumPy array.
# NOTE(review): the values are uniform in [0, 1), while the embedding
# components displayed above are mostly well below 0.2 in magnitude -- consider
# whether the unknown-word vector should be scaled to match.
# NOTE(review): re-running this cell appends one more row each time.
UNK_SEED = 0
unk_embedding = np.random.RandomState(UNK_SEED).rand(1, post_spec_AR_vocab.shape[1])
post_spec_AR_vocab = np.concatenate((post_spec_AR_vocab, unk_embedding))

In [63]:
# Number of rows in the embedding matrix (padding row + words + unknown-word row).
post_spec_AR_vocab.shape[0]

183872

In [56]:
# Save the final embedding matrix to disk (np.save appends the '.npy' suffix).
np.save(file='wikimedia-PA-AR-Post-Spec-Gao-300-embeddings', arr=post_spec_AR_vocab)