# Prep Wikimedia Data for CF Gao Embeddings

In [119]:
# Import packages.
from io import BytesIO
from tensorflow.python.lib.io import file_io
import msgpack
import numpy as np
import pandas as pd

In [126]:
# Get id-to-word dict previously created while generating word embeddings.
f = BytesIO(file_io.read_file_to_string('Gao_PA_id2word.bin', binary_mode=True))
Gao_PA_id2word = msgpack.unpack(f, raw=False)
Gao_PA_id2word_df = pd.DataFrame.from_dict(id2word, orient='index', columns=['id'])
Gao_PA_id2word_df.head()

Unnamed: 0,id
14,this
9,is
16,not
3450,creative
1,.


In [120]:
# Run counter-fitting.
# This produces the enhanced embeddings, which then need to turned back into a numpy array of floats.

In [127]:
# Read the result back in.
counter_fitted_vocab = pd.read_csv('wikimedia-personal-attacks-200-keyed-counter_fitted_vectors.txt', sep=" ", header=None)

# Check the length of the vocab.
print('vocab size:', len(vocab))
counter_fitted_vocab.head()

vocab size: 36995


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,200
0,sonja,-0.005808,-0.026572,-0.103046,-0.060719,0.094757,-0.007691,0.037145,-0.09773,-0.081288,...,-0.025076,0.008198,-0.134271,-0.001052,0.035114,0.008318,-0.180306,0.064044,0.058231,0.02613
1,vani,0.034789,-0.088009,-0.047511,-0.016943,0.148968,-0.04916,0.029211,-0.036494,0.043149,...,-0.027269,0.004287,-0.180869,-0.111541,0.045441,-0.095603,-0.081328,0.038352,0.128825,0.019857
2,woods,0.066142,-0.012744,-0.059821,-0.017741,0.049048,0.096224,0.112517,-0.062495,-0.10101,...,0.016214,0.017011,-0.107387,-0.047012,0.116589,-0.039114,-0.174756,0.132353,0.06219,0.119502
3,hanging,0.000443,-0.004933,-0.005245,-0.060956,0.063518,-0.019249,-0.053869,-0.026201,-0.075231,...,0.091328,-0.047149,-0.136224,-0.053713,0.034217,0.015259,-0.123624,0.048912,0.15385,0.025916
4,woody,0.067918,-0.001583,-0.149499,-0.037716,0.139094,0.028135,0.069009,-0.13057,0.003022,...,-0.003675,0.068206,-0.136547,-0.05059,0.044669,-0.04829,-0.090706,0.085735,0.020854,0.026444


In [128]:
# Output is not in same order, use words as index to restore order.
# First, set words as index.
counter_fitted_vocab_reset = counter_fitted_vocab.set_index(0)
counter_fitted_vocab_reset.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sonja,-0.005808,-0.026572,-0.103046,-0.060719,0.094757,-0.007691,0.037145,-0.09773,-0.081288,-0.100188,...,-0.025076,0.008198,-0.134271,-0.001052,0.035114,0.008318,-0.180306,0.064044,0.058231,0.02613
vani,0.034789,-0.088009,-0.047511,-0.016943,0.148968,-0.04916,0.029211,-0.036494,0.043149,-0.067964,...,-0.027269,0.004287,-0.180869,-0.111541,0.045441,-0.095603,-0.081328,0.038352,0.128825,0.019857
woods,0.066142,-0.012744,-0.059821,-0.017741,0.049048,0.096224,0.112517,-0.062495,-0.10101,-0.197875,...,0.016214,0.017011,-0.107387,-0.047012,0.116589,-0.039114,-0.174756,0.132353,0.06219,0.119502
hanging,0.000443,-0.004933,-0.005245,-0.060956,0.063518,-0.019249,-0.053869,-0.026201,-0.075231,-0.088585,...,0.091328,-0.047149,-0.136224,-0.053713,0.034217,0.015259,-0.123624,0.048912,0.15385,0.025916
woody,0.067918,-0.001583,-0.149499,-0.037716,0.139094,0.028135,0.069009,-0.13057,0.003022,-0.074498,...,-0.003675,0.068206,-0.136547,-0.05059,0.044669,-0.04829,-0.090706,0.085735,0.020854,0.026444


In [130]:
# Next, merge the ids from the id2word dict (same one used the create the initial embeddings in text format).
counter_fitted_vocab_reset_merged = pd.merge(counter_fitted_vocab_reset, Gao_PA_id2word_df, left_index=True, right_on='id', how='outer')
counter_fitted_vocab_reset_merged.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,192,193,194,195,196,197,198,199,200,id
12848,-0.005808,-0.026572,-0.103046,-0.060719,0.094757,-0.007691,0.037145,-0.09773,-0.081288,-0.100188,...,0.008198,-0.134271,-0.001052,0.035114,0.008318,-0.180306,0.064044,0.058231,0.02613,sonja
36921,0.034789,-0.088009,-0.047511,-0.016943,0.148968,-0.04916,0.029211,-0.036494,0.043149,-0.067964,...,0.004287,-0.180869,-0.111541,0.045441,-0.095603,-0.081328,0.038352,0.128825,0.019857,vani
7405,0.066142,-0.012744,-0.059821,-0.017741,0.049048,0.096224,0.112517,-0.062495,-0.10101,-0.197875,...,0.017011,-0.107387,-0.047012,0.116589,-0.039114,-0.174756,0.132353,0.06219,0.119502,woods
6017,0.000443,-0.004933,-0.005245,-0.060956,0.063518,-0.019249,-0.053869,-0.026201,-0.075231,-0.088585,...,-0.047149,-0.136224,-0.053713,0.034217,0.015259,-0.123624,0.048912,0.15385,0.025916,hanging
13123,0.067918,-0.001583,-0.149499,-0.037716,0.139094,0.028135,0.069009,-0.13057,0.003022,-0.074498,...,0.068206,-0.136547,-0.05059,0.044669,-0.04829,-0.090706,0.085735,0.020854,0.026444,woody


In [132]:
# Verify ids and word are correctly matched.
print(Gao_PA_id2word_df.loc[2, 'id'] + ' = ' + counter_fitted_vocab_reset_merged.loc[2, 'id'])

the = the


In [133]:
# Restore order by numerical index.
counter_fitted_vocab_reset_merged_sorted = counter_fitted_vocab_reset_merged.sort_index()
counter_fitted_vocab_reset_merged_sorted.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,192,193,194,195,196,197,198,199,200,id
1,,,,,,,,,,,...,,,,,,,,,,.
2,0.009807,0.010722,0.072162,0.029296,0.09695,-0.051371,0.070996,0.136207,-0.020015,-0.10961,...,0.051534,0.053819,-0.050564,0.024532,0.135503,-0.030997,-0.08754,-0.01328,0.019497,the
3,0.050796,0.035242,-0.125667,0.037255,0.113678,0.018637,-0.035412,0.002932,0.087469,0.056408,...,0.009613,-0.013976,-0.183031,-0.055258,0.011709,-0.112704,-0.064617,-0.006286,-0.067656,to
4,-0.002296,0.077134,-0.13467,-0.012957,0.070005,-0.003333,0.102417,0.14301,0.042786,-0.16417,...,-0.01423,-0.062983,-0.004132,-0.072701,0.099137,-0.043102,0.098594,-0.018615,-0.027455,and
5,0.136606,-0.043707,-0.008443,-0.099816,0.047012,0.100151,0.118435,0.053565,0.074816,-0.08706,...,0.038696,0.106887,-0.035146,-0.010763,0.079028,-0.058162,0.050052,0.057121,-0.001675,of


In [134]:
# Drop NaNs.
CF_vocab_reset_merged_sorted_dropped_na = counter_fitted_vocab_reset_merged_sorted.dropna().copy()
print('len(CF_vocab_reset_merged_sorted_dropped_na):', len(CF_vocab_reset_merged_sorted_dropped_na))
CF_vocab_reset_merged_sorted_dropped_na.head()

len(CF_vocab_reset_merged_sorted_dropped_na): 27420


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,192,193,194,195,196,197,198,199,200,id
2,0.009807,0.010722,0.072162,0.029296,0.09695,-0.051371,0.070996,0.136207,-0.020015,-0.10961,...,0.051534,0.053819,-0.050564,0.024532,0.135503,-0.030997,-0.08754,-0.01328,0.019497,the
3,0.050796,0.035242,-0.125667,0.037255,0.113678,0.018637,-0.035412,0.002932,0.087469,0.056408,...,0.009613,-0.013976,-0.183031,-0.055258,0.011709,-0.112704,-0.064617,-0.006286,-0.067656,to
4,-0.002296,0.077134,-0.13467,-0.012957,0.070005,-0.003333,0.102417,0.14301,0.042786,-0.16417,...,-0.01423,-0.062983,-0.004132,-0.072701,0.099137,-0.043102,0.098594,-0.018615,-0.027455,and
5,0.136606,-0.043707,-0.008443,-0.099816,0.047012,0.100151,0.118435,0.053565,0.074816,-0.08706,...,0.038696,0.106887,-0.035146,-0.010763,0.079028,-0.058162,0.050052,0.057121,-0.001675,of
6,-0.01358,-0.079276,-0.126072,-0.022334,0.069643,0.016661,0.039508,0.096936,0.023874,-0.094374,...,0.004717,0.030693,0.018407,0.042755,0.010373,0.082392,-0.073676,0.020993,-0.076747,a


In [135]:
# Reset numerical index.
CF_vocab_reset_merged_sorted_dropped_na_reset = CF_vocab_reset_merged_sorted_dropped_na.reset_index()
CF_vocab_reset_merged_sorted_dropped_na_reset.head()

Unnamed: 0,index,1,2,3,4,5,6,7,8,9,...,192,193,194,195,196,197,198,199,200,id
0,2,0.009807,0.010722,0.072162,0.029296,0.09695,-0.051371,0.070996,0.136207,-0.020015,...,0.051534,0.053819,-0.050564,0.024532,0.135503,-0.030997,-0.08754,-0.01328,0.019497,the
1,3,0.050796,0.035242,-0.125667,0.037255,0.113678,0.018637,-0.035412,0.002932,0.087469,...,0.009613,-0.013976,-0.183031,-0.055258,0.011709,-0.112704,-0.064617,-0.006286,-0.067656,to
2,4,-0.002296,0.077134,-0.13467,-0.012957,0.070005,-0.003333,0.102417,0.14301,0.042786,...,-0.01423,-0.062983,-0.004132,-0.072701,0.099137,-0.043102,0.098594,-0.018615,-0.027455,and
3,5,0.136606,-0.043707,-0.008443,-0.099816,0.047012,0.100151,0.118435,0.053565,0.074816,...,0.038696,0.106887,-0.035146,-0.010763,0.079028,-0.058162,0.050052,0.057121,-0.001675,of
4,6,-0.01358,-0.079276,-0.126072,-0.022334,0.069643,0.016661,0.039508,0.096936,0.023874,...,0.004717,0.030693,0.018407,0.042755,0.010373,0.082392,-0.073676,0.020993,-0.076747,a


In [136]:
# Drop old index, then shift new index by 1 to accomodate padding.
CF_vocab_reset_merged_sorted_dropped_na_reset.drop(['index'], axis=1, inplace=True)
CF_vocab_reset_merged_sorted_dropped_na_reset.index = CF_vocab_reset_merged_sorted_dropped_na_reset.index + 1  # shifting index
CF_vocab_reset_merged_sorted_dropped_na_reset.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,192,193,194,195,196,197,198,199,200,id
1,0.009807,0.010722,0.072162,0.029296,0.09695,-0.051371,0.070996,0.136207,-0.020015,-0.10961,...,0.051534,0.053819,-0.050564,0.024532,0.135503,-0.030997,-0.08754,-0.01328,0.019497,the
2,0.050796,0.035242,-0.125667,0.037255,0.113678,0.018637,-0.035412,0.002932,0.087469,0.056408,...,0.009613,-0.013976,-0.183031,-0.055258,0.011709,-0.112704,-0.064617,-0.006286,-0.067656,to
3,-0.002296,0.077134,-0.13467,-0.012957,0.070005,-0.003333,0.102417,0.14301,0.042786,-0.16417,...,-0.01423,-0.062983,-0.004132,-0.072701,0.099137,-0.043102,0.098594,-0.018615,-0.027455,and
4,0.136606,-0.043707,-0.008443,-0.099816,0.047012,0.100151,0.118435,0.053565,0.074816,-0.08706,...,0.038696,0.106887,-0.035146,-0.010763,0.079028,-0.058162,0.050052,0.057121,-0.001675,of
5,-0.01358,-0.079276,-0.126072,-0.022334,0.069643,0.016661,0.039508,0.096936,0.023874,-0.094374,...,0.004717,0.030693,0.018407,0.042755,0.010373,0.082392,-0.073676,0.020993,-0.076747,a


In [140]:
# Drop vectors to get id2word and word2id dicts.
# word2id_df = CF_vocab_reset_merged_sorted_dropped_na_reset.reset_index(inplace=True)
# word2id_df = CF_vocab_reset_merged_sorted_dropped_na_reset.reset_index()

word2id_df = CF_vocab_reset_merged_sorted_dropped_na_reset.set_index('id')

word2id_df = word2id_df[['index']]
word2id_df

Unnamed: 0_level_0,index
id,Unnamed: 1_level_1
the,1
to,2
and,3
of,4
a,5
you,6
i,7
is,8
that,9
in,10


In [146]:
# Create dictionary of just IDs to words.
word2id_dict = word2id_df.to_dict()
word2id_dict = word2id_dict['index']

# Switch keys/values and store word2id dictionary.
# Needed to encode examples.
id2word_dict = {y: x for x, y in id2word_dict.items()}

In [142]:
# Write word2id list to disk.
with open('Gao_PA_CF_word2id.bin', 'wb') as f:
    msgpack.pack(word2id_dict, f)

In [148]:
# Write id2word list to disk.
with open('Gao_PA_CF_id2word.bin', 'wb') as f:
    msgpack.pack(id2word_dict, f)

In [99]:
# Drop col of words.
CF_vocab_reset_merged_sorted_dropped_na_reset.drop(['index', 'id'], axis=1, inplace=True)
CF_vocab_reset_merged_sorted_dropped_na_reset.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
0,0.009807,0.010722,0.072162,0.029296,0.09695,-0.051371,0.070996,0.136207,-0.020015,-0.10961,...,0.070672,0.051534,0.053819,-0.050564,0.024532,0.135503,-0.030997,-0.08754,-0.01328,0.019497
1,0.050796,0.035242,-0.125667,0.037255,0.113678,0.018637,-0.035412,0.002932,0.087469,0.056408,...,-0.018708,0.009613,-0.013976,-0.183031,-0.055258,0.011709,-0.112704,-0.064617,-0.006286,-0.067656
2,-0.002296,0.077134,-0.13467,-0.012957,0.070005,-0.003333,0.102417,0.14301,0.042786,-0.16417,...,-0.063995,-0.01423,-0.062983,-0.004132,-0.072701,0.099137,-0.043102,0.098594,-0.018615,-0.027455
3,0.136606,-0.043707,-0.008443,-0.099816,0.047012,0.100151,0.118435,0.053565,0.074816,-0.08706,...,0.077654,0.038696,0.106887,-0.035146,-0.010763,0.079028,-0.058162,0.050052,0.057121,-0.001675
4,-0.01358,-0.079276,-0.126072,-0.022334,0.069643,0.016661,0.039508,0.096936,0.023874,-0.094374,...,0.06096,0.004717,0.030693,0.018407,0.042755,0.010373,0.082392,-0.073676,0.020993,-0.076747


In [101]:
# Restore padding as first row.
padding = [0.0] * 200
CF_vocab_reset_merged_sorted_dropped_na_reset.loc[-1] = padding  # adding a row
CF_vocab_reset_merged_sorted_dropped_na_reset.index = CF_vocab_reset_merged_sorted_dropped_na_reset.index + 1  # shifting index
CF_vocab_reset_merged_sorted_dropped_na_reset = CF_vocab_reset_merged_sorted_dropped_na_reset.sort_index()  # sorting by index
CF_vocab_reset_merged_sorted_dropped_na_reset.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.009807,0.010722,0.072162,0.029296,0.09695,-0.051371,0.070996,0.136207,-0.020015,-0.10961,...,0.070672,0.051534,0.053819,-0.050564,0.024532,0.135503,-0.030997,-0.08754,-0.01328,0.019497
2,0.050796,0.035242,-0.125667,0.037255,0.113678,0.018637,-0.035412,0.002932,0.087469,0.056408,...,-0.018708,0.009613,-0.013976,-0.183031,-0.055258,0.011709,-0.112704,-0.064617,-0.006286,-0.067656
3,-0.002296,0.077134,-0.13467,-0.012957,0.070005,-0.003333,0.102417,0.14301,0.042786,-0.16417,...,-0.063995,-0.01423,-0.062983,-0.004132,-0.072701,0.099137,-0.043102,0.098594,-0.018615,-0.027455
4,0.136606,-0.043707,-0.008443,-0.099816,0.047012,0.100151,0.118435,0.053565,0.074816,-0.08706,...,0.077654,0.038696,0.106887,-0.035146,-0.010763,0.079028,-0.058162,0.050052,0.057121,-0.001675


In [107]:
# Add additional word embedding for unknown words.
CF_Gao_vocab = np.concatenate((vocab, np.random.rand(1, 200)))
CF_Gao_vocab

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.4381411 ,  0.1534448 ,  0.11639864, ...,  1.21520825,
         0.78241566,  0.32214786],
       [ 0.06929084,  0.07575765,  0.5098544 , ..., -0.61850654,
        -0.09382881,  0.13775178],
       ...,
       [ 0.97458988, -0.67156247, -0.50628575, ...,  0.32488224,
         1.00972745,  0.02257891],
       [ 0.09095229,  0.87264827,  0.07317016, ...,  0.86094169,
         0.86700748,  0.37922492],
       [ 0.23703329,  0.36283082,  0.71383202, ...,  0.7631847 ,
         0.83938365,  0.93016998]])

In [106]:
# Write the damn thing to file.
np.save('wikimedia-PA-CF-200-embeddings', CF_Gao_vocab)

In [147]:
# Decode comment for validation.
id2word_dict

{1: 'the',
 2: 'to',
 3: 'and',
 4: 'of',
 5: 'a',
 6: 'you',
 7: 'i',
 8: 'is',
 9: 'that',
 10: 'in',
 11: 'it',
 12: 'this',
 13: 'for',
 14: 'not',
 15: 'on',
 16: 'be',
 17: 'as',
 18: 'are',
 19: 'have',
 20: 'your',
 21: 'with',
 22: 'was',
 23: 'if',
 24: 'article',
 25: 'or',
 26: 'but',
 27: 'my',
 28: 'an',
 29: 'wikipedia',
 30: 'by',
 31: 'page',
 32: 'do',
 33: 'from',
 34: 'me',
 35: 'at',
 36: 'about',
 37: 'so',
 38: 'what',
 39: 'all',
 40: 'can',
 41: 'there',
 42: 'no',
 43: 'will',
 44: 'he',
 45: 'has',
 46: 'like',
 47: 'its',
 48: 'they',
 49: 'one',
 50: 'just',
 51: 'would',
 52: 'dont',
 53: 'which',
 54: 'please',
 55: 'who',
 56: 'any',
 57: 'been',
 58: 'we',
 59: 'should',
 60: 'his',
 61: 'more',
 62: 'some',
 63: 'other',
 64: 'talk',
 65: 'here',
 66: 'am',
 67: 'because',
 68: 'think',
 69: 'see',
 70: 'also',
 71: 'fuck',
 72: 'people',
 73: 'im',
 74: 'why',
 75: 'up',
 76: 'how',
 77: 'only',
 78: 'were',
 79: 'out',
 80: 'when',
 81: 'edit',
 82: 