# Naive Concatenation of Word Embeddings with Lexicons V4

V4 adds an expanded lexicon of abusive words.

## Import packages, read in embeddings and lexicons.

In [2]:
# Import packages.
from io import BytesIO
from tensorflow.python.lib.io import file_io
import msgpack
import numpy as np
import pandas as pd

In [3]:
# Set paths.
# word_embeddings_path: .npy file holding the word embedding matrix.
# id2word_dict_path: msgpack file holding the id-to-word dict for that matrix.
word_embeddings_path = 'wikimedia-PA-AR-Post-Spec-Gao-300-embeddings.npy'
id2word_dict_path = 'Gao_300_PA_AR_Post_Spec_id2word.bin'

In [4]:
# Load the word embedding matrix from its .npy file.
embedding_bytes = file_io.read_file_to_string(word_embeddings_path, binary_mode=True)
vocab = np.load(BytesIO(embedding_bytes))

# Record the row count now: it is compared again after the lexicon merge
# to make sure no rows were added or dropped and no indexes changed.
print('vocab size:', len(vocab))

# Row 0 is padding, so the real vocab indexes start at 1.
word_embeddings_df = pd.DataFrame(data=vocab)
word_embeddings_df.head()

vocab size: 183872


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.007001,0.034904,-0.006476,-0.022211,0.023702,0.005626,-0.062335,0.130678,0.079644,0.020641,...,0.00929,-0.101324,0.002901,0.04894,-0.155127,-0.017552,-0.005853,-0.029875,-0.009073,-0.106691
2,-0.056535,0.000427,-0.02293,-0.024883,0.030431,-0.022631,0.040052,-0.05398,-0.042047,-0.004923,...,0.046784,-0.070465,-0.019298,-0.000591,-0.030715,-0.017091,-0.005564,-0.032029,-0.00545,0.031682
3,0.006473,0.004586,-0.033621,-0.029989,0.071586,-0.014516,-0.032025,-0.009592,0.003382,0.008644,...,-0.035731,-0.069168,0.051958,0.049571,-0.112113,-0.071534,-0.030837,-0.058374,0.050012,0.01431
4,0.127087,0.017128,0.023523,-0.006482,-0.07861,0.004704,0.022335,0.054162,0.068886,-0.002589,...,-0.018985,-0.123621,0.053752,-0.004124,-0.126023,0.081543,0.011255,-0.046931,0.041655,-0.016432


In [5]:
# Read in the python dictionaries of the lexicons
# from which to build the lexicon embeddings.
def _read_msgpack(path):
    """Read the msgpack file at `path` through file_io and return the unpacked object."""
    packed_bytes = file_io.read_file_to_string(path, binary_mode=True)
    return msgpack.unpack(BytesIO(packed_bytes), raw=False)


lex_afinn = _read_msgpack('AFINN-96-lex.bin')
lex_msol = _read_msgpack('MSOL-June15-09-numeric.bin')
lex_bing = _read_msgpack('bing-liu-opinion-lex.bin')
lex_emolex = _read_msgpack('NRC-EmoLex-polarity.bin')
lex_abusive = _read_msgpack('abusive-words-lex-first-occ.bin')

In [6]:
# AFINN as a one-column frame: words form the index, polarity is the value.
lex_afinn_df = pd.DataFrame.from_dict(
    lex_afinn,
    orient='index',
    columns=['afinn_polarity'],
)
print(lex_afinn_df.shape[0])
lex_afinn_df.head()

1468


Unnamed: 0,afinn_polarity
abandon,-0.4
abandons,-0.4
abandoned,-0.4
absentee,-0.2
absentees,-0.2


In [7]:
# MSOL as a one-column frame: words form the index, polarity is the value.
lex_msol_df = pd.DataFrame.from_dict(
    lex_msol,
    orient='index',
    columns=['msol_polarity'],
)
print(lex_msol_df.shape[0])
lex_msol_df.head()

76400


Unnamed: 0,msol_polarity
10cc,1.0
12-16-18-foot_skiff,1.0
"2,4,5-t",-1.0
"2,4-d",-1.0
3-d,1.0


In [8]:
# Bing Liu opinion lexicon as a one-column frame: words form the index,
# polarity is the value.
lex_bing_df = pd.DataFrame.from_dict(
    lex_bing,
    orient='index',
    columns=['bing_polarity'],
)
print(lex_bing_df.shape[0])
lex_bing_df.head()

6786


Unnamed: 0,bing_polarity
a+,1.0
abound,1.0
abounds,1.0
abundance,1.0
abundant,1.0


In [9]:
# NRC EmoLex polarity as a one-column frame: words form the index,
# polarity is the value.
lex_emolex_df = pd.DataFrame.from_dict(
    lex_emolex,
    orient='index',
    columns=['emolex_polarity'],
)
print(lex_emolex_df.shape[0])
lex_emolex_df.head()

5555


Unnamed: 0,emolex_polarity
abandon,-1.0
abandoned,-1.0
abandonment,-1.0
abba,1.0
abduction,-1.0


In [10]:
# Abusive-words lexicon as a frame. Unlike the other lexicons, no orient or
# column name is passed: the dict's own top-level key supplies the column
# ('abusive_lex' in the output below) and the words become the index.
lex_abusive_df = pd.DataFrame(lex_abusive)
lex_abusive_df.head()

Unnamed: 0,abusive_lex
187,0.328716
1984,-0.043314
419,0.247129
86,0.082784
abandon,-0.097071


In [11]:
# Outer-join the five lexicons on their word index, so every word that
# appears in any lexicon gets a row; scores a lexicon lacks come out as NaN.
merged_lex_df = lex_afinn_df
for other_lex_df in (lex_msol_df, lex_bing_df, lex_emolex_df, lex_abusive_df):
    merged_lex_df = merged_lex_df.merge(
        other_lex_df, left_index=True, right_index=True, how='outer'
    )
merged_lex_df.head(10)

Unnamed: 0,afinn_polarity,msol_polarity,bing_polarity,emolex_polarity,abusive_lex
10cc,,1.0,,,
12-16-18-foot_skiff,,1.0,,,
187,,,,,0.328716
1984,,,,,-0.043314
"2,4,5-t",,-1.0,,,
"2,4-d",,-1.0,,,
2-faced,,,-1.0,,
2-faces,,,-1.0,,
3-d,,1.0,,,
419,,,,,0.247129


## Prep lexicons to be joined with the word embeddings.

In [12]:
# Each lexicon contains words that the others lack, so the merged table is
# treated as sparse data: a missing score (NaN) is replaced with zero.
merged_lex_df = merged_lex_df.fillna(0)
merged_lex_df.head()

Unnamed: 0,afinn_polarity,msol_polarity,bing_polarity,emolex_polarity,abusive_lex
10cc,0.0,1.0,0.0,0.0,0.0
12-16-18-foot_skiff,0.0,1.0,0.0,0.0,0.0
187,0.0,0.0,0.0,0.0,0.328716
1984,0.0,0.0,0.0,0.0,-0.043314
"2,4,5-t",0.0,-1.0,0.0,0.0,0.0


In [13]:
# Now merge the lexicon dimensions with the word embeddings using the vocab.

# Load the id-to-word dict that was created while generating the word embeddings.
id2word_bytes = file_io.read_file_to_string(id2word_dict_path, binary_mode=True)
id2word = msgpack.unpack(BytesIO(id2word_bytes), raw=False)

# The dict keys (vocab ids) become the index; the single column, which is
# named 'id', holds the words.
id2word_df = pd.DataFrame.from_dict(
    id2word,
    orient='index',
    columns=['id'],
)
id2word_df.head()

Unnamed: 0,id
1,biennials
2,tripolitan
3,tsukino
4,nunnery
5,schwarzburg-rudolstadt


In [14]:
# Attach each embedding row's word via the id-to-word frame, then make the
# word (column 'id') the index. The lexicon frame is already indexed by
# word, so the two can then be joined directly on their indexes.
word_emb_merged = (
    word_embeddings_df
    .merge(id2word_df, left_index=True, right_index=True, how='outer')
    .set_index('id')
)
word_emb_merged.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
biennials,-0.007001,0.034904,-0.006476,-0.022211,0.023702,0.005626,-0.062335,0.130678,0.079644,0.020641,...,0.00929,-0.101324,0.002901,0.04894,-0.155127,-0.017552,-0.005853,-0.029875,-0.009073,-0.106691
tripolitan,-0.056535,0.000427,-0.02293,-0.024883,0.030431,-0.022631,0.040052,-0.05398,-0.042047,-0.004923,...,0.046784,-0.070465,-0.019298,-0.000591,-0.030715,-0.017091,-0.005564,-0.032029,-0.00545,0.031682
tsukino,0.006473,0.004586,-0.033621,-0.029989,0.071586,-0.014516,-0.032025,-0.009592,0.003382,0.008644,...,-0.035731,-0.069168,0.051958,0.049571,-0.112113,-0.071534,-0.030837,-0.058374,0.050012,0.01431
nunnery,0.127087,0.017128,0.023523,-0.006482,-0.07861,0.004704,0.022335,0.054162,0.068886,-0.002589,...,-0.018985,-0.123621,0.053752,-0.004124,-0.126023,0.081543,0.011255,-0.046931,0.041655,-0.016432


In [15]:
# Outer-join the lexicon columns onto the embedding columns by word.
# Lexicon words that are not in the vocab come out with NaN embeddings, and
# vocab words that are in no lexicon come out with NaN lexicon values.
word_lex_emb_merged = word_emb_merged.merge(
    merged_lex_df,
    left_index=True,
    right_index=True,
    how='outer',
)
word_lex_emb_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,295,296,297,298,299,afinn_polarity,msol_polarity,bing_polarity,emolex_polarity,abusive_lex
',0.024148,-0.034054,-0.058848,-0.016646,0.055086,-0.016343,0.009924,0.024387,0.025746,-0.001198,...,-0.044417,0.00094,-0.055176,0.124942,-0.045313,,,,,
'',-0.03427,-0.03514,-0.06671,-0.007767,0.02697,-0.050708,0.010494,0.026598,0.043573,-0.002888,...,-0.091963,-0.000332,-0.091182,0.067562,0.011579,,,,,
'd,0.050227,-0.014655,-0.036504,-0.020528,0.084165,-0.030685,-0.02479,0.050698,0.032149,-0.048966,...,-0.064785,0.085282,-0.040399,0.006608,-0.121758,,,,,
'll,0.053706,-0.083291,-0.059433,-0.032715,0.091138,-0.027493,-0.033934,0.004242,0.057772,0.041439,...,-0.043243,0.025084,-0.030115,0.033786,-0.102064,,,,,
'm,0.07483,-0.081202,-0.055662,-0.019256,0.070749,-0.037944,0.015033,0.019028,-0.023651,-0.032119,...,-0.075279,0.021702,-0.058172,0.068852,-0.024139,,,,,


In [16]:
# Replace NaN's in only the lexicon embedding columns with 0,
# (so rows with NaN's in the word embeddings can be dropped after).
#
# The fill is done with a single DataFrame.fillna call and a per-column dict
# rather than `df[col].fillna(..., inplace=True)`: an in-place fill on a
# selected column is chained assignment, which pandas warns about and which
# leaves the parent frame unchanged under Copy-on-Write. If the fill were
# silently skipped, the dropna() that follows would discard every vocab word
# that has no lexicon entry.
lexicon_columns = [
    'afinn_polarity',
    'msol_polarity',
    'bing_polarity',
    'emolex_polarity',
    'abusive_lex',
]
word_lex_emb_merged = word_lex_emb_merged.fillna(
    {column: 0.0 for column in lexicon_columns}
)
word_lex_emb_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,295,296,297,298,299,afinn_polarity,msol_polarity,bing_polarity,emolex_polarity,abusive_lex
',0.024148,-0.034054,-0.058848,-0.016646,0.055086,-0.016343,0.009924,0.024387,0.025746,-0.001198,...,-0.044417,0.00094,-0.055176,0.124942,-0.045313,0.0,0.0,0.0,0.0,0.0
'',-0.03427,-0.03514,-0.06671,-0.007767,0.02697,-0.050708,0.010494,0.026598,0.043573,-0.002888,...,-0.091963,-0.000332,-0.091182,0.067562,0.011579,0.0,0.0,0.0,0.0,0.0
'd,0.050227,-0.014655,-0.036504,-0.020528,0.084165,-0.030685,-0.02479,0.050698,0.032149,-0.048966,...,-0.064785,0.085282,-0.040399,0.006608,-0.121758,0.0,0.0,0.0,0.0,0.0
'll,0.053706,-0.083291,-0.059433,-0.032715,0.091138,-0.027493,-0.033934,0.004242,0.057772,0.041439,...,-0.043243,0.025084,-0.030115,0.033786,-0.102064,0.0,0.0,0.0,0.0,0.0
'm,0.07483,-0.081202,-0.055662,-0.019256,0.070749,-0.037944,0.015033,0.019028,-0.023651,-0.032119,...,-0.075279,0.021702,-0.058172,0.068852,-0.024139,0.0,0.0,0.0,0.0,0.0


In [17]:
# Lexicon-only words still have NaN in their word embedding columns.
# Dropping every row that contains a NaN removes them and brings the table
# back to the original vocabulary.
word_lex_emb_merged = word_lex_emb_merged.dropna()
word_lex_emb_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,295,296,297,298,299,afinn_polarity,msol_polarity,bing_polarity,emolex_polarity,abusive_lex
',0.024148,-0.034054,-0.058848,-0.016646,0.055086,-0.016343,0.009924,0.024387,0.025746,-0.001198,...,-0.044417,0.00094,-0.055176,0.124942,-0.045313,0.0,0.0,0.0,0.0,0.0
'',-0.03427,-0.03514,-0.06671,-0.007767,0.02697,-0.050708,0.010494,0.026598,0.043573,-0.002888,...,-0.091963,-0.000332,-0.091182,0.067562,0.011579,0.0,0.0,0.0,0.0,0.0
'd,0.050227,-0.014655,-0.036504,-0.020528,0.084165,-0.030685,-0.02479,0.050698,0.032149,-0.048966,...,-0.064785,0.085282,-0.040399,0.006608,-0.121758,0.0,0.0,0.0,0.0,0.0
'll,0.053706,-0.083291,-0.059433,-0.032715,0.091138,-0.027493,-0.033934,0.004242,0.057772,0.041439,...,-0.043243,0.025084,-0.030115,0.033786,-0.102064,0.0,0.0,0.0,0.0,0.0
'm,0.07483,-0.081202,-0.055662,-0.019256,0.070749,-0.037944,0.015033,0.019028,-0.023651,-0.032119,...,-0.075279,0.021702,-0.058172,0.068852,-0.024139,0.0,0.0,0.0,0.0,0.0


In [18]:
# Confirm we're back to original vocab size.
# The size is printed for the reader and also asserted against the loaded
# embedding matrix, so a mismatch stops the notebook instead of relying on
# someone comparing two printed numbers by eye.
merged_vocab_size = len(word_lex_emb_merged)
print('vocab size:', merged_vocab_size)
assert merged_vocab_size == len(vocab), (
    'merged table has %d rows but the embedding matrix has %d'
    % (merged_vocab_size, len(vocab))
)

vocab size: 183872


In [19]:
# Last, restore the original row order using the id2word dict, so that
# row i of the output array holds the vector for vocab id i.

# Flip keys and values of id2word dict (word -> integer vocab id).
word2id = {word: int(word_id) for word_id, word in id2word.items()}

# Attach each word's vocab id as a helper column. assign() returns a new
# frame, so word_lex_emb_merged is not modified and this cell can be re-run.
word_lex_emb_with_idx = word_lex_emb_merged.assign(
    idx=word_lex_emb_merged.index.map(word2id.get)
)

# The padding row has no word, so word2id.get() finds no id for it and its
# idx is missing. Remove it by that property rather than by a fixed row
# position: sort_values() places missing keys last, so deleting a hard-coded
# position in the middle of the array would remove a real word's row and
# shift every later row by one.
word_lex_emb_words_only = word_lex_emb_with_idx.dropna(subset=['idx'])
word_lex_emb_merged_sorted = word_lex_emb_words_only.sort_values(by=['idx'])

# Reset index, dropping the words.
word_lex_emb_merged_sorted_reset = word_lex_emb_merged_sorted.reset_index(drop=True)

# Drop unneeded idx column.
word_lex_emb_merged_sorted_reset_dropped = word_lex_emb_merged_sorted_reset.drop(['idx'], axis=1)

# Convert to np array, add zero row back at index 0, so index again
# matches 1-indexing of the vocab. The row width is taken from the data
# instead of being hard-coded.
word_lex_emb_arr = word_lex_emb_merged_sorted_reset_dropped.values
row_zero = np.zeros((1, word_lex_emb_arr.shape[1]))
word_lex_emb_arr = np.insert(word_lex_emb_arr, 0, row_zero, axis=0)

# One padding row was removed and one zero row was added, so the row count
# must be unchanged. Any other count means rows are misaligned with the
# vocab ids and the array must not be saved.
assert word_lex_emb_arr.shape[0] == len(word_lex_emb_merged), (
    'expected %d rows after restoring vocab order, got %d'
    % (len(word_lex_emb_merged), word_lex_emb_arr.shape[0])
)

# Visually confirm.
pd.DataFrame(word_lex_emb_arr)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,295,296,297,298,299,300,301,302,303,304
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
1,-0.007001,0.034904,-0.006476,-0.022211,0.023702,0.005626,-0.062335,0.130678,0.079644,0.020641,...,-0.017552,-0.005853,-0.029875,-0.009073,-0.106691,0.0,0.0,0.0,0.0,0.0
2,-0.056535,0.000427,-0.022930,-0.024883,0.030431,-0.022631,0.040052,-0.053980,-0.042047,-0.004923,...,-0.017091,-0.005564,-0.032029,-0.005450,0.031682,0.0,0.0,0.0,0.0,0.0
3,0.006473,0.004586,-0.033621,-0.029989,0.071586,-0.014516,-0.032025,-0.009592,0.003382,0.008644,...,-0.071534,-0.030837,-0.058374,0.050012,0.014310,0.0,0.0,0.0,0.0,0.0
4,0.127087,0.017128,0.023523,-0.006482,-0.078610,0.004704,0.022335,0.054162,0.068886,-0.002589,...,0.081543,0.011255,-0.046931,0.041655,-0.016432,0.0,-1.0,0.0,0.0,0.0
5,0.074724,-0.015410,-0.000056,-0.032053,-0.008711,-0.030050,-0.035028,0.007226,0.111881,-0.018725,...,-0.019451,0.002031,-0.070298,0.037921,-0.001279,0.0,0.0,0.0,0.0,0.0
6,0.013112,-0.004902,-0.045273,-0.040804,0.065461,-0.017439,0.044821,-0.062670,0.045787,-0.034323,...,-0.045235,0.073545,-0.030448,0.020594,-0.057343,0.0,0.0,0.0,0.0,0.0
7,0.049698,0.015223,-0.043774,-0.045762,0.031101,-0.006538,0.018239,-0.067973,-0.025177,-0.052420,...,-0.079866,0.003996,-0.037560,0.014547,-0.029501,0.0,0.0,0.0,0.0,0.0
8,0.017523,0.026443,0.011831,-0.008102,0.136289,0.003914,0.058256,0.017257,0.033446,0.079077,...,-0.056801,0.014523,-0.017413,0.042509,-0.092218,0.0,0.0,0.0,0.0,0.0
9,0.068496,-0.024607,-0.022963,-0.029806,0.025425,-0.020681,0.010901,0.048066,0.057736,0.052636,...,0.002967,0.020327,-0.038351,0.035187,-0.049014,0.0,0.0,0.0,0.0,0.0


In [20]:
# Count the rows whose value in column 303 is exactly -1.
is_minus_one = word_lex_emb_arr[:, 303] == -1
len(word_lex_emb_arr[is_minus_one])

3234

In [27]:
# Write new word-lex naive concatenated embeddings.
# Dimensions 0 to 299 are word embeddings and 300 to 304 are lexicon values
# (afinn, msol, bing, emolex, abusive, in that order).
np.save('PA-Gao-300-AR-PostSpec-naive-concat-word-lex-V4-305', word_lex_emb_arr)

In [23]:
# Keep just the five lexicon columns (300 through 304) so they can be written
# to their own file for use in the multichannel and parallel CNNs.
lexicon_column_span = slice(300, 305)
lexicons_only = word_lex_emb_arr[:, lexicon_column_span]
lexicons_only

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [25]:
# Write the lexicon-only array to disk.
np.save('PA-Gao-300-PostSpec-AR-lex-V4', lexicons_only)