# Prep Wikimedia Data for Attract-Repel Gao Embeddings

In [1]:
# Import packages.
from io import BytesIO
from tensorflow.python.lib.io import file_io
import msgpack
import numpy as np
import pandas as pd

In [2]:
# Get id-to-word dict previously created while generating word embeddings.
f = BytesIO(file_io.read_file_to_string('PA-300-id2word.bin', binary_mode=True))
Gao_PA_id2word = msgpack.unpack(f, raw=False)
Gao_PA_id2word_df = pd.DataFrame.from_dict(Gao_PA_id2word, orient='index', columns=['id'])
Gao_PA_id2word_df.head()

Unnamed: 0,id
14,this
9,is
16,not
3450,creative
1,.


In [4]:
# Run attract-repel script.
# This produces the retrofitted embeddings, which then need to turned back into a numpy array of floats.

In [3]:
# Read the result back in.
attract_repel_vocab = pd.read_csv('wikimedia-PA-AR-Gao-300-Naive-Concat-Lex-V4-keyed-embeddings.txt', sep=" ", header=None)

# Check the length of the vocab.
print('attract_repel_vocab size:', len(attract_repel_vocab))
attract_repel_vocab.head()

attract_repel_vocab size: 36994


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,296,297,298,299,300,301,302,303,304,305
0,belligerence,-0.026939,0.072659,-0.008875,-0.003287,-0.054435,-0.054765,0.058507,0.032344,-0.016284,...,0.052049,-0.040802,0.072702,-0.037551,0.116691,0.0,-0.113936,-0.113936,0.0,0.001223
1,sonja,0.035939,0.037134,0.064545,0.030803,-0.013767,0.044446,-0.078485,0.048729,-0.044431,...,0.042409,-0.028483,0.087471,-0.062463,-0.012743,-1.3e-05,0.000933,-4.8e-05,-3.3e-05,-0.000149
2,vani,-0.002509,-0.006056,-0.009332,-0.079897,-0.004147,-0.058535,0.025435,-0.040276,-0.044595,...,0.065885,-0.020091,0.067758,-0.061037,0.071566,0.0,0.0,0.0,0.0,0.0
3,woods,-0.011645,-0.01146,0.100203,0.128036,-0.029959,0.165865,-0.064846,0.023041,-0.047391,...,-0.0033,-0.016624,0.095704,-0.122935,-0.031837,0.0,-0.114612,0.0,0.0,0.0
4,hanging,-0.06682,0.021019,0.016909,0.076224,-0.006307,0.011889,-0.066888,-0.075904,-0.035833,...,-0.011223,-0.042921,0.085932,0.034645,0.104189,0.0,-0.11392,0.0,-0.11392,0.0


In [4]:
# Output is not in same order, use words as index to restore order.
# First, set words as index.
attract_repel_vocab_reset = attract_repel_vocab.set_index(0)
attract_repel_vocab_reset.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,296,297,298,299,300,301,302,303,304,305
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
belligerence,-0.026939,0.072659,-0.008875,-0.003287,-0.054435,-0.054765,0.058507,0.032344,-0.016284,0.002657,...,0.052049,-0.040802,0.072702,-0.037551,0.116691,0.0,-0.113936,-0.113936,0.0,0.001223
sonja,0.035939,0.037134,0.064545,0.030803,-0.013767,0.044446,-0.078485,0.048729,-0.044431,-0.074024,...,0.042409,-0.028483,0.087471,-0.062463,-0.012743,-1.3e-05,0.000933,-4.8e-05,-3.3e-05,-0.000149
vani,-0.002509,-0.006056,-0.009332,-0.079897,-0.004147,-0.058535,0.025435,-0.040276,-0.044595,0.020883,...,0.065885,-0.020091,0.067758,-0.061037,0.071566,0.0,0.0,0.0,0.0,0.0
woods,-0.011645,-0.01146,0.100203,0.128036,-0.029959,0.165865,-0.064846,0.023041,-0.047391,-0.071785,...,-0.0033,-0.016624,0.095704,-0.122935,-0.031837,0.0,-0.114612,0.0,0.0,0.0
hanging,-0.06682,0.021019,0.016909,0.076224,-0.006307,0.011889,-0.066888,-0.075904,-0.035833,0.067021,...,-0.011223,-0.042921,0.085932,0.034645,0.104189,0.0,-0.11392,0.0,-0.11392,0.0


In [5]:
# Next, merge the ids from the id2word dict (same one used the create the initial embeddings in text format).
attract_repel_vocab_reset_merged = pd.merge(attract_repel_vocab_reset, Gao_PA_id2word_df, left_index=True, right_on='id', how='outer')
attract_repel_vocab_reset_merged.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,297,298,299,300,301,302,303,304,305,id
33435,-0.026939,0.072659,-0.008875,-0.003287,-0.054435,-0.054765,0.058507,0.032344,-0.016284,0.002657,...,-0.040802,0.072702,-0.037551,0.116691,0.0,-0.113936,-0.113936,0.0,0.001223,belligerence
12848,0.035939,0.037134,0.064545,0.030803,-0.013767,0.044446,-0.078485,0.048729,-0.044431,-0.074024,...,-0.028483,0.087471,-0.062463,-0.012743,-1.3e-05,0.000933,-4.8e-05,-3.3e-05,-0.000149,sonja
36921,-0.002509,-0.006056,-0.009332,-0.079897,-0.004147,-0.058535,0.025435,-0.040276,-0.044595,0.020883,...,-0.020091,0.067758,-0.061037,0.071566,0.0,0.0,0.0,0.0,0.0,vani
7405,-0.011645,-0.01146,0.100203,0.128036,-0.029959,0.165865,-0.064846,0.023041,-0.047391,-0.071785,...,-0.016624,0.095704,-0.122935,-0.031837,0.0,-0.114612,0.0,0.0,0.0,woods
6017,-0.06682,0.021019,0.016909,0.076224,-0.006307,0.011889,-0.066888,-0.075904,-0.035833,0.067021,...,-0.042921,0.085932,0.034645,0.104189,0.0,-0.11392,0.0,-0.11392,0.0,hanging


In [6]:
# Verify ids and word are correctly matched.
print(Gao_PA_id2word_df.loc[2, 'id'] + ' = ' + attract_repel_vocab_reset_merged.loc[2, 'id'])

the = the


In [7]:
# Restore order by numerical index.
attract_repel_vocab_reset_merged_sorted = attract_repel_vocab_reset_merged.sort_index()
attract_repel_vocab_reset_merged_sorted.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,297,298,299,300,301,302,303,304,305,id
1,0.03122,0.03741,0.02741,0.012629,0.098199,-0.023932,0.041614,0.036007,0.029063,0.047111,...,0.05257,0.024702,-0.001267,0.013895,0.0,0.0,0.0,0.0,0.0,.
2,0.026165,0.004816,0.040857,0.005003,0.095679,0.079071,-0.10485,0.125986,0.002932,0.001035,...,0.026877,0.003176,0.005486,-0.02144,0.0,0.0,0.0,0.0,0.0,the
3,-0.027357,0.027127,0.030152,-0.088998,0.077427,-0.011305,-0.003792,-0.011705,0.111158,0.032761,...,0.079807,-0.072229,0.069743,0.018052,0.0,0.0,0.0,0.0,0.0,to
4,0.024326,0.08462,0.035454,0.023838,-0.010359,-0.013827,0.062676,0.063406,0.002806,0.159467,...,0.02946,-0.017838,-0.002564,0.087242,0.0,0.11464,0.0,0.0,0.0,and
5,0.060285,0.077495,-0.027674,-0.069554,0.018359,0.040705,0.000747,0.02012,-0.00283,0.079162,...,0.033626,0.035553,-0.020441,0.089884,0.0,0.0,0.0,0.0,0.0,of


In [8]:
# Drop NaNs.
attract_repel_vocab_reset_merged_sorted_dropped_na = attract_repel_vocab_reset_merged_sorted.dropna().copy()
print('len(attract_repel_vocab_reset_merged_sorted_dropped_na):', len(attract_repel_vocab_reset_merged_sorted_dropped_na))
attract_repel_vocab_reset_merged_sorted_dropped_na.head()

len(attract_repel_vocab_reset_merged_sorted_dropped_na): 36992


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,297,298,299,300,301,302,303,304,305,id
1,0.03122,0.03741,0.02741,0.012629,0.098199,-0.023932,0.041614,0.036007,0.029063,0.047111,...,0.05257,0.024702,-0.001267,0.013895,0.0,0.0,0.0,0.0,0.0,.
2,0.026165,0.004816,0.040857,0.005003,0.095679,0.079071,-0.10485,0.125986,0.002932,0.001035,...,0.026877,0.003176,0.005486,-0.02144,0.0,0.0,0.0,0.0,0.0,the
3,-0.027357,0.027127,0.030152,-0.088998,0.077427,-0.011305,-0.003792,-0.011705,0.111158,0.032761,...,0.079807,-0.072229,0.069743,0.018052,0.0,0.0,0.0,0.0,0.0,to
4,0.024326,0.08462,0.035454,0.023838,-0.010359,-0.013827,0.062676,0.063406,0.002806,0.159467,...,0.02946,-0.017838,-0.002564,0.087242,0.0,0.11464,0.0,0.0,0.0,and
5,0.060285,0.077495,-0.027674,-0.069554,0.018359,0.040705,0.000747,0.02012,-0.00283,0.079162,...,0.033626,0.035553,-0.020441,0.089884,0.0,0.0,0.0,0.0,0.0,of


In [9]:
# Reset numerical index.
attract_repel_vocab_reset_merged_sorted_dropped_na = attract_repel_vocab_reset_merged_sorted_dropped_na.reset_index()
attract_repel_vocab_reset_merged_sorted_dropped_na.head()

Unnamed: 0,index,1,2,3,4,5,6,7,8,9,...,297,298,299,300,301,302,303,304,305,id
0,1,0.03122,0.03741,0.02741,0.012629,0.098199,-0.023932,0.041614,0.036007,0.029063,...,0.05257,0.024702,-0.001267,0.013895,0.0,0.0,0.0,0.0,0.0,.
1,2,0.026165,0.004816,0.040857,0.005003,0.095679,0.079071,-0.10485,0.125986,0.002932,...,0.026877,0.003176,0.005486,-0.02144,0.0,0.0,0.0,0.0,0.0,the
2,3,-0.027357,0.027127,0.030152,-0.088998,0.077427,-0.011305,-0.003792,-0.011705,0.111158,...,0.079807,-0.072229,0.069743,0.018052,0.0,0.0,0.0,0.0,0.0,to
3,4,0.024326,0.08462,0.035454,0.023838,-0.010359,-0.013827,0.062676,0.063406,0.002806,...,0.02946,-0.017838,-0.002564,0.087242,0.0,0.11464,0.0,0.0,0.0,and
4,5,0.060285,0.077495,-0.027674,-0.069554,0.018359,0.040705,0.000747,0.02012,-0.00283,...,0.033626,0.035553,-0.020441,0.089884,0.0,0.0,0.0,0.0,0.0,of


In [10]:
# Drop old index, then shift new index by 1 to accomodate padding.
attract_repel_vocab_reset_merged_sorted_dropped_na.drop(['index'], axis=1, inplace=True)
attract_repel_vocab_reset_merged_sorted_dropped_na.index = attract_repel_vocab_reset_merged_sorted_dropped_na.index + 1  # shifting index
attract_repel_vocab_reset_merged_sorted_dropped_na.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,297,298,299,300,301,302,303,304,305,id
1,0.03122,0.03741,0.02741,0.012629,0.098199,-0.023932,0.041614,0.036007,0.029063,0.047111,...,0.05257,0.024702,-0.001267,0.013895,0.0,0.0,0.0,0.0,0.0,.
2,0.026165,0.004816,0.040857,0.005003,0.095679,0.079071,-0.10485,0.125986,0.002932,0.001035,...,0.026877,0.003176,0.005486,-0.02144,0.0,0.0,0.0,0.0,0.0,the
3,-0.027357,0.027127,0.030152,-0.088998,0.077427,-0.011305,-0.003792,-0.011705,0.111158,0.032761,...,0.079807,-0.072229,0.069743,0.018052,0.0,0.0,0.0,0.0,0.0,to
4,0.024326,0.08462,0.035454,0.023838,-0.010359,-0.013827,0.062676,0.063406,0.002806,0.159467,...,0.02946,-0.017838,-0.002564,0.087242,0.0,0.11464,0.0,0.0,0.0,and
5,0.060285,0.077495,-0.027674,-0.069554,0.018359,0.040705,0.000747,0.02012,-0.00283,0.079162,...,0.033626,0.035553,-0.020441,0.089884,0.0,0.0,0.0,0.0,0.0,of


In [11]:
# Drop vectors to get id2word and word2id dicts.
attract_repel_vocab_reset_merged_sorted_dropped_na.reset_index(inplace=True)

attract_repel_vocab_reset_merged_sorted_dropped_na.head()

word2id_df = attract_repel_vocab_reset_merged_sorted_dropped_na.set_index('id')

word2id_df = word2id_df[['index']]
word2id_df.head()

Unnamed: 0_level_0,index
id,Unnamed: 1_level_1
.,1
the,2
to,3
and,4
of,5


In [13]:
# Create dictionary of just IDs to words.
word2id_dict = word2id_df.to_dict()
word2id_dict = word2id_dict['index']

# Switch keys/values and store word2id dictionary.
# Needed to encode examples.
id2word_dict = {y: x for x, y in word2id_dict.items()}

In [16]:
# Write word2id list to disk.
with open('Gao_300_NC_Lex_V4_PA_AR_word2id.bin', 'wb') as f:
    msgpack.pack(word2id_dict, f)

In [15]:
# Write id2word list to disk.
with open('Gao_300_NC_Lex_V4_PA_AR_id2word.bin', 'wb') as f:
    msgpack.pack(id2word_dict, f)

In [17]:
# Drop col of words.
attract_repel_vocab_reset_merged_sorted_dropped_na.drop(['index', 'id'], axis=1, inplace=True)
attract_repel_vocab_reset_merged_sorted_dropped_na.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,296,297,298,299,300,301,302,303,304,305
0,0.03122,0.03741,0.02741,0.012629,0.098199,-0.023932,0.041614,0.036007,0.029063,0.047111,...,0.041359,0.05257,0.024702,-0.001267,0.013895,0.0,0.0,0.0,0.0,0.0
1,0.026165,0.004816,0.040857,0.005003,0.095679,0.079071,-0.10485,0.125986,0.002932,0.001035,...,0.141965,0.026877,0.003176,0.005486,-0.02144,0.0,0.0,0.0,0.0,0.0
2,-0.027357,0.027127,0.030152,-0.088998,0.077427,-0.011305,-0.003792,-0.011705,0.111158,0.032761,...,0.045642,0.079807,-0.072229,0.069743,0.018052,0.0,0.0,0.0,0.0,0.0
3,0.024326,0.08462,0.035454,0.023838,-0.010359,-0.013827,0.062676,0.063406,0.002806,0.159467,...,0.04148,0.02946,-0.017838,-0.002564,0.087242,0.0,0.11464,0.0,0.0,0.0
4,0.060285,0.077495,-0.027674,-0.069554,0.018359,0.040705,0.000747,0.02012,-0.00283,0.079162,...,0.046079,0.033626,0.035553,-0.020441,0.089884,0.0,0.0,0.0,0.0,0.0


In [18]:
# Restore padding as first row.
padding = [0.0] * 305
attract_repel_vocab_reset_merged_sorted_dropped_na.loc[-1] = padding  # adding a row
attract_repel_vocab_reset_merged_sorted_dropped_na.index = attract_repel_vocab_reset_merged_sorted_dropped_na.index + 1  # shifting index
attract_repel_vocab_reset_merged_sorted_dropped_na = attract_repel_vocab_reset_merged_sorted_dropped_na.sort_index()  # sorting by index
attract_repel_vocab_reset_merged_sorted_dropped_na.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,296,297,298,299,300,301,302,303,304,305
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.03122,0.03741,0.02741,0.012629,0.098199,-0.023932,0.041614,0.036007,0.029063,0.047111,...,0.041359,0.05257,0.024702,-0.001267,0.013895,0.0,0.0,0.0,0.0,0.0
2,0.026165,0.004816,0.040857,0.005003,0.095679,0.079071,-0.10485,0.125986,0.002932,0.001035,...,0.141965,0.026877,0.003176,0.005486,-0.02144,0.0,0.0,0.0,0.0,0.0
3,-0.027357,0.027127,0.030152,-0.088998,0.077427,-0.011305,-0.003792,-0.011705,0.111158,0.032761,...,0.045642,0.079807,-0.072229,0.069743,0.018052,0.0,0.0,0.0,0.0,0.0
4,0.024326,0.08462,0.035454,0.023838,-0.010359,-0.013827,0.062676,0.063406,0.002806,0.159467,...,0.04148,0.02946,-0.017838,-0.002564,0.087242,0.0,0.11464,0.0,0.0,0.0


In [19]:
# Add additional word embedding for unknown words.
# Gao_300_PA_AR_vocab = np.concatenate((attract_repel_vocab_reset_merged_sorted_dropped_na, np.random.rand(1, 300)))
Gao_300_PA_AR_vocab

NameError: name 'Gao_300_PA_AR_vocab' is not defined

In [20]:
# Write the damn thing to file.
np.save('PA-Gao-300-NC-Lex-V4-AR-Embeddings', attract_repel_vocab_reset_merged_sorted_dropped_na)

In [22]:
# Decode comment for validation.
len(id2word_dict)

36992