# Naive Concatenation of Word Embeddings with Lexicons V4

V4 adds expanded lexicon of abusive words.

## Import packages, read in embeddings and lexicons.

In [1]:
# Import packages.
from io import BytesIO
from tensorflow.python.lib.io import file_io
import msgpack
import numpy as np
import pandas as pd

In [2]:
# Read in the word embedding.
f = BytesIO(file_io.read_file_to_string('wikimedia-PA-Gao-300-embeddings.npy', binary_mode=True))
vocab = np.load(f)

# Later we can use this as a check that we haven't added any rows or
# changed any indexes, ensuring the integrity of the embeddings has
# been preserved.
print('vocab size:', len(vocab))

# The first row is padding so the vocab indexes start at 1.
word_embeddings_df = pd.DataFrame(data=vocab)
word_embeddings_df.head()

vocab size: 36995


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.2703,0.323896,0.237319,0.109343,0.850202,-0.207198,0.36029,0.311746,0.251624,0.407885,...,-0.667598,-0.048573,0.153635,-0.931134,0.778837,0.358085,0.455149,0.213865,-0.010973,0.120303
2,0.226789,0.041741,0.354136,0.043368,0.829325,0.685373,-0.908815,1.092019,0.025413,0.008967,...,-0.04064,0.336631,0.655913,-0.781886,0.014642,1.230519,0.232964,0.027527,0.047553,-0.185841
3,-0.236558,0.234574,0.260735,-0.769586,0.669529,-0.097754,-0.032788,-0.101215,0.96121,0.283292,...,-0.720961,0.062782,0.062898,-0.163448,-0.073236,0.39468,0.690109,-0.624584,0.603089,0.1561
4,0.212198,0.738136,0.309266,0.207942,-0.090359,-0.120615,0.546719,0.553092,0.024477,1.39102,...,0.056193,-0.125345,0.30219,-1.06465,0.6199,0.361829,0.256977,-0.155599,-0.022368,0.761006


In [3]:
# Read in the python dictionaries of the lexicons
# from which to build the lexicon embeddings.
f = BytesIO(file_io.read_file_to_string('AFINN-96-lex.bin', binary_mode=True))
lex_afinn = msgpack.unpack(f, raw=False)

f = BytesIO(file_io.read_file_to_string('MSOL-June15-09-numeric.bin', binary_mode=True))
lex_msol = msgpack.unpack(f, raw=False)

f = BytesIO(file_io.read_file_to_string('bing-liu-opinion-lex.bin', binary_mode=True))
lex_bing = msgpack.unpack(f, raw=False)

f = BytesIO(file_io.read_file_to_string('NRC-EmoLex-polarity.bin', binary_mode=True))
lex_emolex = msgpack.unpack(f, raw=False)

f = BytesIO(file_io.read_file_to_string('abusive-words-lex-first-occ.bin', binary_mode=True))
lex_abusive = msgpack.unpack(f, raw=False)

In [4]:
# Convert AFINN to pandas dataframe.
lex_afinn_df = pd.DataFrame.from_dict(lex_afinn, orient='index', columns=['afinn_polarity'])
print(len(lex_afinn_df))
lex_afinn_df.head()

1468


Unnamed: 0,afinn_polarity
abandon,-0.4
abandons,-0.4
abandoned,-0.4
absentee,-0.2
absentees,-0.2


In [5]:
# Convert MSOL to pandas dataframe.
lex_msol_df = pd.DataFrame.from_dict(lex_msol, orient='index', columns=['msol_polarity']) 
print(len(lex_msol_df))
lex_msol_df.head()

76400


Unnamed: 0,msol_polarity
10cc,1.0
12-16-18-foot_skiff,1.0
"2,4,5-t",-1.0
"2,4-d",-1.0
3-d,1.0


In [6]:
# Convert bing to pandas dataframe.
lex_bing_df = pd.DataFrame.from_dict(lex_bing, orient='index', columns=['bing_polarity']) 
print(len(lex_bing_df))
lex_bing_df.head()

6786


Unnamed: 0,bing_polarity
a+,1.0
abound,1.0
abounds,1.0
abundance,1.0
abundant,1.0


In [7]:
# Convert emolex to pandas dataframe.
lex_emolex_df = pd.DataFrame.from_dict(lex_emolex, orient='index', columns=['emolex_polarity']) 
print(len(lex_emolex_df))
lex_emolex_df.head()

5555


Unnamed: 0,emolex_polarity
abandon,-1.0
abandoned,-1.0
abandonment,-1.0
abba,1.0
abduction,-1.0


In [8]:
# Convert abusive words to pandas dataframe.
lex_abusive_df = pd.DataFrame.from_dict(lex_abusive) 
lex_abusive_df.head()

Unnamed: 0,abusive_lex
187,0.328716
1984,-0.043314
419,0.247129
86,0.082784
abandon,-0.097071


In [9]:
# Join the dataframes on the words.
merged_lex_df = pd.merge(lex_afinn_df, lex_msol_df, left_index=True, right_index=True, how='outer')
merged_lex_df = pd.merge(merged_lex_df, lex_bing_df, left_index=True, right_index=True, how='outer')
merged_lex_df = pd.merge(merged_lex_df, lex_emolex_df, left_index=True, right_index=True, how='outer')
merged_lex_df = pd.merge(merged_lex_df, lex_abusive_df, left_index=True, right_index=True, how='outer')
merged_lex_df.head(10)

Unnamed: 0,afinn_polarity,msol_polarity,bing_polarity,emolex_polarity,abusive_lex
10cc,,1.0,,,
12-16-18-foot_skiff,,1.0,,,
187,,,,,0.328716
1984,,,,,-0.043314
"2,4,5-t",,-1.0,,,
"2,4-d",,-1.0,,,
2-faced,,,-1.0,,
2-faces,,,-1.0,,
3-d,,1.0,,,
419,,,,,0.247129


## Prep lexicons to be joined with the word embeddings.

In [10]:
# Since words exists in each lexicon that are not in the other, 
# treating both as sparse data.
# Replacing NaN with zeros.
merged_lex_df.fillna(0, inplace=True)
merged_lex_df.head()

Unnamed: 0,afinn_polarity,msol_polarity,bing_polarity,emolex_polarity,abusive_lex
10cc,0.0,1.0,0.0,0.0,0.0
12-16-18-foot_skiff,0.0,1.0,0.0,0.0,0.0
187,0.0,0.0,0.0,0.0,0.328716
1984,0.0,0.0,0.0,0.0,-0.043314
"2,4,5-t",0.0,-1.0,0.0,0.0,0.0


In [11]:
# Now merge the lexicon dimensions with the word embeddings using the vocab.

# Get id-to-word dict previously created while generating word embeddings.
f = BytesIO(file_io.read_file_to_string('id2word.bin', binary_mode=True))
id2word = msgpack.unpack(f, raw=False)
id2word_df = pd.DataFrame.from_dict(id2word, orient='index', columns=['id'])
id2word_df.head()

Unnamed: 0,id
14,this
9,is
16,not
3450,creative
1,.


In [12]:
# Merge the id-to-word dict with word embeddings, then set words as index, 
# which allows the lexicons to be easily merged since both have the words 
# as the index already.
word_emb_merged = pd.merge(word_embeddings_df, id2word_df, left_index=True, right_index=True, how='outer')
word_emb_merged = word_emb_merged.set_index('id')
word_emb_merged.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.,0.2703,0.323896,0.237319,0.109343,0.850202,-0.207198,0.36029,0.311746,0.251624,0.407885,...,-0.667598,-0.048573,0.153635,-0.931134,0.778837,0.358085,0.455149,0.213865,-0.010973,0.120303
the,0.226789,0.041741,0.354136,0.043368,0.829325,0.685373,-0.908815,1.092019,0.025413,0.008967,...,-0.04064,0.336631,0.655913,-0.781886,0.014642,1.230519,0.232964,0.027527,0.047553,-0.185841
to,-0.236558,0.234574,0.260735,-0.769586,0.669529,-0.097754,-0.032788,-0.101215,0.96121,0.283292,...,-0.720961,0.062782,0.062898,-0.163448,-0.073236,0.39468,0.690109,-0.624584,0.603089,0.1561
and,0.212198,0.738136,0.309266,0.207942,-0.090359,-0.120615,0.546719,0.553092,0.024477,1.39102,...,0.056193,-0.125345,0.30219,-1.06465,0.6199,0.361829,0.256977,-0.155599,-0.022368,0.761006


In [13]:
# Now merge lexicon dimensions with word embedding dimensions.
word_lex_emb_merged = pd.merge(word_emb_merged, merged_lex_df, left_index=True, right_index=True, how='outer')
word_lex_emb_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,295,296,297,298,299,afinn_polarity,msol_polarity,bing_polarity,emolex_polarity,abusive_lex
!,-0.077877,0.344916,0.326776,0.489076,0.060497,-0.142201,0.775315,-0.717459,0.841575,-0.08721,...,0.039897,0.056755,-0.266409,-0.067895,-0.281368,,,,,
.,0.2703,0.323896,0.237319,0.109343,0.850202,-0.207198,0.36029,0.311746,0.251624,0.407885,...,0.358085,0.455149,0.213865,-0.010973,0.120303,,,,,
0,-0.116742,-0.152173,-0.207449,-0.32657,0.311591,-0.349999,-0.166362,-0.256711,-0.259762,-0.32865,...,-0.246159,-0.193446,0.732802,0.244847,0.452789,,,,,
00,-0.185029,-0.257349,0.004526,0.226381,-0.114823,0.399389,0.028907,-0.677618,-0.645236,-0.278574,...,-0.074621,0.07408,1.198059,-0.207582,0.548961,,,,,
000,0.547898,-0.954973,0.484188,0.680155,-0.552666,-0.147946,-0.392003,-0.937154,0.3069,0.590128,...,-0.467416,-0.076471,0.585052,-0.093639,-0.162717,,,,,


In [14]:
# Replace NaN's in only the lexicon embedding columns with 0,
# (so rows with NaN's in the word embeddings can be dropped after).
word_lex_emb_merged['afinn_polarity'].fillna(0.0, inplace=True)
word_lex_emb_merged['msol_polarity'].fillna(0.0, inplace=True)
word_lex_emb_merged['bing_polarity'].fillna(0.0, inplace=True)
word_lex_emb_merged['emolex_polarity'].fillna(0.0, inplace=True)
word_lex_emb_merged['abusive_lex'].fillna(0.0, inplace=True)
word_lex_emb_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,295,296,297,298,299,afinn_polarity,msol_polarity,bing_polarity,emolex_polarity,abusive_lex
!,-0.077877,0.344916,0.326776,0.489076,0.060497,-0.142201,0.775315,-0.717459,0.841575,-0.08721,...,0.039897,0.056755,-0.266409,-0.067895,-0.281368,0.0,0.0,0.0,0.0,0.0
.,0.2703,0.323896,0.237319,0.109343,0.850202,-0.207198,0.36029,0.311746,0.251624,0.407885,...,0.358085,0.455149,0.213865,-0.010973,0.120303,0.0,0.0,0.0,0.0,0.0
0,-0.116742,-0.152173,-0.207449,-0.32657,0.311591,-0.349999,-0.166362,-0.256711,-0.259762,-0.32865,...,-0.246159,-0.193446,0.732802,0.244847,0.452789,0.0,0.0,0.0,0.0,0.0
00,-0.185029,-0.257349,0.004526,0.226381,-0.114823,0.399389,0.028907,-0.677618,-0.645236,-0.278574,...,-0.074621,0.07408,1.198059,-0.207582,0.548961,0.0,0.0,0.0,0.0,0.0
000,0.547898,-0.954973,0.484188,0.680155,-0.552666,-0.147946,-0.392003,-0.937154,0.3069,0.590128,...,-0.467416,-0.076471,0.585052,-0.093639,-0.162717,0.0,0.0,0.0,0.0,0.0


In [15]:
# Now drop the rows that have NaN for word embedding values 
# to get back to original vocabulary.
word_lex_emb_merged.dropna(inplace=True)
word_lex_emb_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,295,296,297,298,299,afinn_polarity,msol_polarity,bing_polarity,emolex_polarity,abusive_lex
!,-0.077877,0.344916,0.326776,0.489076,0.060497,-0.142201,0.775315,-0.717459,0.841575,-0.08721,...,0.039897,0.056755,-0.266409,-0.067895,-0.281368,0.0,0.0,0.0,0.0,0.0
.,0.2703,0.323896,0.237319,0.109343,0.850202,-0.207198,0.36029,0.311746,0.251624,0.407885,...,0.358085,0.455149,0.213865,-0.010973,0.120303,0.0,0.0,0.0,0.0,0.0
0,-0.116742,-0.152173,-0.207449,-0.32657,0.311591,-0.349999,-0.166362,-0.256711,-0.259762,-0.32865,...,-0.246159,-0.193446,0.732802,0.244847,0.452789,0.0,0.0,0.0,0.0,0.0
00,-0.185029,-0.257349,0.004526,0.226381,-0.114823,0.399389,0.028907,-0.677618,-0.645236,-0.278574,...,-0.074621,0.07408,1.198059,-0.207582,0.548961,0.0,0.0,0.0,0.0,0.0
000,0.547898,-0.954973,0.484188,0.680155,-0.552666,-0.147946,-0.392003,-0.937154,0.3069,0.590128,...,-0.467416,-0.076471,0.585052,-0.093639,-0.162717,0.0,0.0,0.0,0.0,0.0


In [16]:
# Confirm we're back to original vocab size.
print('vocab size:', len(word_lex_emb_merged))

vocab size: 36995


In [18]:
# Last, restore original index using id2word dict.

# Flip keys and values of id2word dict.
word2id = dict((v,int(k)) for k,v in id2word.items())

# Add index as col, sort by it.
word_lex_emb_merged['idx'] = word_lex_emb_merged.index.map(word2id.get)
word_lex_emb_merged_sorted = word_lex_emb_merged.sort_values(by=['idx'])
# word_lex_emb_merged_sorted.head(20)

# Reset index, dropping the words.
word_lex_emb_merged_sorted_reset = word_lex_emb_merged_sorted.reset_index(drop=True)

# Drop unneeded idx column.
word_lex_emb_merged_sorted_reset_dropped = word_lex_emb_merged_sorted_reset.drop(['idx'], axis=1)

# Convert to np array, add zero row back at index 0, so index again 
# matches 1-indexing of the vocab.
word_lex_emb_arr = word_lex_emb_merged_sorted_reset_dropped.values
row_zero = np.full((1,305), 0.)
word_lex_emb_arr = np.insert(word_lex_emb_arr, 0, row_zero, axis=0)

# Delete the original zero row that got sorted down to the end.
word_lex_emb_arr = np.delete(word_lex_emb_arr, 36994, axis=0)

# Visually confirm. 
pd.DataFrame(word_lex_emb_arr)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,295,296,297,298,299,300,301,302,303,304
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.270300,0.323896,0.237319,0.109343,0.850202,-0.207198,0.360290,0.311746,0.251624,0.407885,...,0.358085,0.455149,0.213865,-0.010973,0.120303,0.0,0.0,0.0,0.0,0.0
2,0.226789,0.041741,0.354136,0.043368,0.829325,0.685373,-0.908815,1.092019,0.025413,0.008967,...,1.230519,0.232964,0.027527,0.047553,-0.185841,0.0,0.0,0.0,0.0,0.0
3,-0.236558,0.234574,0.260735,-0.769586,0.669529,-0.097754,-0.032788,-0.101215,0.961210,0.283292,...,0.394680,0.690109,-0.624584,0.603089,0.156100,0.0,0.0,0.0,0.0,0.0
4,0.212198,0.738136,0.309266,0.207942,-0.090359,-0.120615,0.546719,0.553092,0.024477,1.391020,...,0.361829,0.256977,-0.155599,-0.022368,0.761006,0.0,1.0,0.0,0.0,0.0
5,0.522779,0.672022,-0.239986,-0.603157,0.159206,0.352987,0.006476,0.174475,-0.024545,0.686472,...,0.399582,0.291594,0.308311,-0.177261,0.779451,0.0,0.0,0.0,0.0,0.0
6,-0.059169,0.588721,-0.601633,-0.098019,0.474290,0.205990,-0.214929,0.221648,-0.128647,0.183875,...,0.682588,0.298273,0.024632,-0.538838,0.543300,0.0,-1.0,0.0,0.0,0.0
7,0.014399,0.635920,-0.866441,-0.734539,0.609717,-0.762848,0.161034,-0.394787,0.251164,1.196285,...,0.399716,0.390844,-0.571430,1.125118,0.244292,0.0,0.0,0.0,0.0,0.0
8,0.799485,0.367415,-0.678524,-0.586149,0.460251,-0.171771,0.165325,-0.469078,-0.108768,0.741728,...,0.004504,0.297358,-0.014225,1.503099,0.009011,0.0,0.0,0.0,0.0,0.0
9,-0.467806,0.322376,-0.791115,-0.611845,0.543142,-0.023537,-0.083124,0.301101,0.011354,0.137723,...,-0.400523,-0.585404,-0.045500,0.244482,0.410365,0.0,0.0,0.0,0.0,0.0


In [19]:
len(word_lex_emb_arr[word_lex_emb_arr[:,203] == -1])

0

In [20]:
# Write new word-lex naive concatenated embeddings.
# Dimensions 0 to 199 are word embeddings and 200 to 204 are lexicon values.
np.save('PA-Gao-300-naive-concat-lex-V4', word_lex_emb_arr)

In [21]:
# Also write just the lexicon columns to file for use in the multichannel and parallel CNNs.
lexicons_only = word_lex_emb_arr[:,300:305]
lexicons_only

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [22]:
np.save('PA-Gao-lex-embeddings-V4', lexicons_only)