# Naive Concatenation of Word Embeddings with Lexicons V2

V2 adds polarity from the Bing Liu opinion lexicon, concatenated alongside the AFINN and MSOL polarities.

## Import packages, read in embeddings and lexicons.

In [1]:
# Import packages.
from io import BytesIO
from tensorflow.python.lib.io import file_io
import msgpack
import numpy as np
import pandas as pd

In [2]:
# Load the pre-trained word embedding matrix from its .npy file.
embedding_bytes = file_io.read_file_to_string(
    'wikimedia-personal-attacks-200-embeddings.npy', binary_mode=True)
vocab = np.load(BytesIO(embedding_bytes))

# Print the row count now so it can be compared again after the merges:
# seeing the same number later means no rows were added and no indexes
# were changed, i.e. the embeddings came through intact.
print('vocab size:', len(vocab))

# Row 0 is padding, which is why the vocab indexes start at 1.
word_embeddings_df = pd.DataFrame(data=vocab)
word_embeddings_df.head()

vocab size: 36995


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.443647,0.128096,0.704819,-0.413601,-0.615326,0.510453,0.081257,-0.65561,-0.357054,-0.387023,...,0.600914,-0.826428,-0.043708,-0.699184,-0.719986,0.412833,0.103456,-1.462709,0.466236,0.383801
2,-0.429288,-0.016554,0.384819,0.113181,-1.462845,0.261847,-0.875737,-0.451588,0.548763,-0.459796,...,0.000568,0.446239,0.302111,-0.397178,-0.675476,0.05389,0.523819,0.075026,0.311938,-0.055834
3,0.674015,0.280386,0.416335,-0.870266,0.490471,-0.532294,0.146954,-0.918494,0.201321,-0.624855,...,-0.092502,0.504301,0.192321,-0.295859,0.08463,-0.107387,0.118977,0.602776,-0.274779,-0.051925
4,-0.007329,-0.039159,0.599992,-0.76129,-0.340942,-0.756368,-0.926788,0.02548,0.299273,-0.697997,...,-0.180548,-0.031146,-0.675957,-0.666593,0.342779,0.510544,0.873011,0.061182,0.068458,0.256529


In [3]:
def _read_msgpack(path):
    """Read the file at `path` and return its msgpack-decoded contents."""
    payload = file_io.read_file_to_string(path, binary_mode=True)
    return msgpack.unpack(BytesIO(payload), raw=False)


# Load the python dictionaries of the three lexicons; the lexicon
# embedding columns are built from these.
lex_afinn = _read_msgpack('AFINN-96-lex.bin')
lex_msol = _read_msgpack('MSOL-June15-09-numeric.bin')
lex_bing = _read_msgpack('bing-liu-opinion-lex.bin')

In [4]:
# Turn the AFINN dict into a one-column dataframe: the dict keys become
# the index and the values fill 'afinn_polarity'.
lex_afinn_df = pd.DataFrame.from_dict(
    lex_afinn,
    orient='index',
    columns=['afinn_polarity'],
)
lex_afinn_df.head()

Unnamed: 0,afinn_polarity
abandon,-0.4
abandons,-0.4
abandoned,-0.4
absentee,-0.2
absentees,-0.2


In [5]:
# Turn the MSOL dict into a one-column dataframe: the dict keys become
# the index and the values fill 'msol_polarity'.
lex_msol_df = pd.DataFrame.from_dict(
    lex_msol,
    orient='index',
    columns=['msol_polarity'],
)
lex_msol_df.head()

Unnamed: 0,msol_polarity
10cc,1.0
12-16-18-foot_skiff,1.0
"2,4,5-t",-1.0
"2,4-d",-1.0
3-d,1.0


In [6]:
# Turn the Bing Liu opinion lexicon dict into a one-column dataframe: the
# dict keys become the index and the values fill 'bing_polarity'.
lex_bing_df = pd.DataFrame.from_dict(
    lex_bing,
    orient='index',
    columns=['bing_polarity'],
)
lex_bing_df.head()

Unnamed: 0,bing_polarity
a+,1.0
abound,1.0
abounds,1.0
abundance,1.0
abundant,1.0


In [7]:
# Outer-join the three lexicon frames on their word index, so a word that
# appears in any lexicon gets a row (NaN where a lexicon lacks the word).
merged_lex_df = (
    lex_afinn_df
    .merge(lex_msol_df, left_index=True, right_index=True, how='outer')
    .merge(lex_bing_df, left_index=True, right_index=True, how='outer')
)
merged_lex_df.head(10)

Unnamed: 0,afinn_polarity,msol_polarity,bing_polarity
10cc,,1.0,
12-16-18-foot_skiff,,1.0,
"2,4,5-t",,-1.0,
"2,4-d",,-1.0,
2-faced,,,-1.0
2-faces,,,-1.0
3-d,,1.0,
420,,1.0,
505,,1.0,
a,,-1.0,


## Prep lexicons to be joined with the word embeddings.

In [8]:
# Each lexicon contains words that the others lack, so the outer join
# leaves NaN gaps. The lexicon values are treated as sparse data:
# a missing polarity becomes zero.
merged_lex_df = merged_lex_df.fillna(0)
merged_lex_df.head()

Unnamed: 0,afinn_polarity,msol_polarity,bing_polarity
10cc,0.0,1.0,0.0
12-16-18-foot_skiff,0.0,1.0,0.0
"2,4,5-t",0.0,-1.0,0.0
"2,4-d",0.0,-1.0,0.0
2-faced,0.0,0.0,-1.0


In [9]:
# Next step: merge the lexicon dimensions with the word embeddings by way
# of the vocab.

# Load the id-to-word dict that was created earlier, while the word
# embeddings were being generated.
id2word_bytes = file_io.read_file_to_string('id2word.bin', binary_mode=True)
id2word = msgpack.unpack(BytesIO(id2word_bytes), raw=False)

# The dict keys (ids) become the index; the single column, although it is
# named 'id', holds the words.
id2word_df = pd.DataFrame.from_dict(
    id2word,
    orient='index',
    columns=['id'],
)
id2word_df.head()

Unnamed: 0,id
14,this
9,is
16,not
3450,creative
1,.


In [10]:
# Attach each embedding row's word by joining with the id-to-word frame on
# the index, then make the word (column 'id') the index. The lexicon frame
# is already indexed by word, so the two can then be merged directly.
word_emb_merged = (
    word_embeddings_df
    .merge(id2word_df, left_index=True, right_index=True, how='outer')
    .set_index('id')
)
word_emb_merged.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.,0.443647,0.128096,0.704819,-0.413601,-0.615326,0.510453,0.081257,-0.65561,-0.357054,-0.387023,...,0.600914,-0.826428,-0.043708,-0.699184,-0.719986,0.412833,0.103456,-1.462709,0.466236,0.383801
the,-0.429288,-0.016554,0.384819,0.113181,-1.462845,0.261847,-0.875737,-0.451588,0.548763,-0.459796,...,0.000568,0.446239,0.302111,-0.397178,-0.675476,0.05389,0.523819,0.075026,0.311938,-0.055834
to,0.674015,0.280386,0.416335,-0.870266,0.490471,-0.532294,0.146954,-0.918494,0.201321,-0.624855,...,-0.092502,0.504301,0.192321,-0.295859,0.08463,-0.107387,0.118977,0.602776,-0.274779,-0.051925
and,-0.007329,-0.039159,0.599992,-0.76129,-0.340942,-0.756368,-0.926788,0.02548,0.299273,-0.697997,...,-0.180548,-0.031146,-0.675957,-0.666593,0.342779,0.510544,0.873011,0.061182,0.068458,0.256529


In [11]:
# Outer-join the word embedding dimensions with the lexicon dimensions on
# the shared word index; words missing from one side get NaN there.
word_lex_emb_merged = word_emb_merged.merge(
    merged_lex_df,
    left_index=True,
    right_index=True,
    how='outer',
)
word_lex_emb_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,193,194,195,196,197,198,199,afinn_polarity,msol_polarity,bing_polarity
!,0.719402,0.289279,-0.273394,-0.528267,-0.297721,-0.127302,-0.276557,0.444393,-0.759141,-0.210949,...,-0.26566,0.162053,0.286156,-0.064141,-0.999607,-0.819493,0.76035,,,
.,0.443647,0.128096,0.704819,-0.413601,-0.615326,0.510453,0.081257,-0.65561,-0.357054,-0.387023,...,-0.699184,-0.719986,0.412833,0.103456,-1.462709,0.466236,0.383801,,,
0,-0.356113,-0.015441,0.185449,0.24063,-0.112532,0.395168,0.261011,-0.261993,0.226032,0.540223,...,-0.883688,0.619772,-0.195389,0.21292,0.67014,-0.460036,0.082508,,,
00,-0.69056,0.259574,0.428379,0.403884,-0.752359,0.95652,0.459817,0.117626,0.170718,0.654727,...,-0.626954,0.392507,-0.209626,-0.147662,0.033319,-0.259417,0.417896,,,
000,-0.134573,-0.890869,0.339105,0.84503,-0.100801,0.964447,0.199437,-0.270517,0.339413,1.024041,...,-1.021924,0.062363,-0.216607,0.817551,-0.624438,0.471237,0.194961,,,


In [12]:
# Replace NaN's in only the lexicon embedding columns with 0
# (so rows with NaN's in the word embeddings can be dropped after).
#
# The filled columns are assigned back onto the frame. Calling
# `fillna(..., inplace=True)` on a selected column is chained assignment:
# with pandas Copy-on-Write it operates on a temporary and leaves the
# frame unchanged, which would make the later dropna discard every word
# that has no lexicon entry.
lexicon_cols = ['afinn_polarity', 'msol_polarity', 'bing_polarity']
word_lex_emb_merged[lexicon_cols] = word_lex_emb_merged[lexicon_cols].fillna(0.0)
word_lex_emb_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,193,194,195,196,197,198,199,afinn_polarity,msol_polarity,bing_polarity
!,0.719402,0.289279,-0.273394,-0.528267,-0.297721,-0.127302,-0.276557,0.444393,-0.759141,-0.210949,...,-0.26566,0.162053,0.286156,-0.064141,-0.999607,-0.819493,0.76035,0.0,0.0,0.0
.,0.443647,0.128096,0.704819,-0.413601,-0.615326,0.510453,0.081257,-0.65561,-0.357054,-0.387023,...,-0.699184,-0.719986,0.412833,0.103456,-1.462709,0.466236,0.383801,0.0,0.0,0.0
0,-0.356113,-0.015441,0.185449,0.24063,-0.112532,0.395168,0.261011,-0.261993,0.226032,0.540223,...,-0.883688,0.619772,-0.195389,0.21292,0.67014,-0.460036,0.082508,0.0,0.0,0.0
00,-0.69056,0.259574,0.428379,0.403884,-0.752359,0.95652,0.459817,0.117626,0.170718,0.654727,...,-0.626954,0.392507,-0.209626,-0.147662,0.033319,-0.259417,0.417896,0.0,0.0,0.0
000,-0.134573,-0.890869,0.339105,0.84503,-0.100801,0.964447,0.199437,-0.270517,0.339413,1.024041,...,-1.021924,0.062363,-0.216607,0.817551,-0.624438,0.471237,0.194961,0.0,0.0,0.0


In [13]:
# Rows that still contain NaN have no word embedding values (they came
# from the lexicons only); removing them gets back to the original
# vocabulary.
word_lex_emb_merged = word_lex_emb_merged.dropna()
word_lex_emb_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,193,194,195,196,197,198,199,afinn_polarity,msol_polarity,bing_polarity
!,0.719402,0.289279,-0.273394,-0.528267,-0.297721,-0.127302,-0.276557,0.444393,-0.759141,-0.210949,...,-0.26566,0.162053,0.286156,-0.064141,-0.999607,-0.819493,0.76035,0.0,0.0,0.0
.,0.443647,0.128096,0.704819,-0.413601,-0.615326,0.510453,0.081257,-0.65561,-0.357054,-0.387023,...,-0.699184,-0.719986,0.412833,0.103456,-1.462709,0.466236,0.383801,0.0,0.0,0.0
0,-0.356113,-0.015441,0.185449,0.24063,-0.112532,0.395168,0.261011,-0.261993,0.226032,0.540223,...,-0.883688,0.619772,-0.195389,0.21292,0.67014,-0.460036,0.082508,0.0,0.0,0.0
00,-0.69056,0.259574,0.428379,0.403884,-0.752359,0.95652,0.459817,0.117626,0.170718,0.654727,...,-0.626954,0.392507,-0.209626,-0.147662,0.033319,-0.259417,0.417896,0.0,0.0,0.0
000,-0.134573,-0.890869,0.339105,0.84503,-0.100801,0.964447,0.199437,-0.270517,0.339413,1.024041,...,-1.021924,0.062363,-0.216607,0.817551,-0.624438,0.471237,0.194961,0.0,0.0,0.0


In [14]:
# Confirm we're back to original vocab size. The frame to measure is the
# lexicon-merged one that was just filtered; `word_emb_merged` was never
# merged with the lexicons, so its length cannot reveal added rows.
print('vocab size:', len(word_lex_emb_merged))
assert len(word_lex_emb_merged) == len(vocab), (
    'merged frame has %d rows but the embedding matrix has %d'
    % (len(word_lex_emb_merged), len(vocab)))

vocab size: 36995


In [16]:
# Last, restore original index using id2word dict.

# Flip keys and values of id2word dict.
word2id = dict((v, int(k)) for k, v in id2word.items())

# Look up the vocab id of every row. A row whose index is not a vocab word
# (the padding row, whose word is NaN) gets -1 and is skipped below.
row_ids = np.array(
    [word2id.get(word, -1) for word in word_lex_emb_merged.index], dtype=int)
has_id = row_ids > 0

# Feature columns only; 'idx' is dropped in case an earlier run of this
# cell added it to the frame.
features = word_lex_emb_merged.drop(columns=['idx'], errors='ignore').values

# Place each row at the position given by its vocab id. Row 0 is never
# written, so it stays the all-zero padding row and the array again matches
# the 1-indexing of the vocab. Sizes are derived from the data rather than
# hard-coded; the previous positional delete at a fixed index removed the
# last real word instead of the padding row that had been sorted to the end.
word_lex_emb_arr = np.zeros((len(vocab), features.shape[1]))
word_lex_emb_arr[row_ids[has_id]] = features[has_id]

# Integrity check: the word embedding columns must be exactly the matrix
# that was loaded, row for row.
assert np.allclose(word_lex_emb_arr[:, :vocab.shape[1]], vocab), (
    'word embedding columns no longer match the original embedding matrix')

# Visually confirm.
pd.DataFrame(word_lex_emb_arr)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,193,194,195,196,197,198,199,200,201,202
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
1,0.443647,0.128096,0.704819,-0.413601,-0.615326,0.510453,0.081257,-0.655610,-0.357054,-0.387023,...,-0.699184,-0.719986,0.412833,0.103456,-1.462709,0.466236,0.383801,0.0,0.0,0.0
2,-0.429288,-0.016554,0.384819,0.113181,-1.462845,0.261847,-0.875737,-0.451588,0.548763,-0.459796,...,-0.397178,-0.675476,0.053890,0.523819,0.075026,0.311938,-0.055834,0.0,0.0,0.0
3,0.674015,0.280386,0.416335,-0.870266,0.490471,-0.532294,0.146954,-0.918494,0.201321,-0.624855,...,-0.295859,0.084630,-0.107387,0.118977,0.602776,-0.274779,-0.051925,0.0,0.0,0.0
4,-0.007329,-0.039159,0.599992,-0.761290,-0.340942,-0.756368,-0.926788,0.025480,0.299273,-0.697997,...,-0.666593,0.342779,0.510544,0.873011,0.061182,0.068458,0.256529,0.0,1.0,0.0
5,0.637875,0.431644,-0.142334,-0.247996,0.252809,-0.554958,-0.548522,-0.962688,0.328486,0.167552,...,-0.657409,0.596882,0.118234,0.408964,0.282328,-0.053108,0.086466,0.0,0.0,0.0
6,-0.477269,1.112766,0.211479,-0.450701,-0.899444,0.419699,-0.484609,0.016592,0.213506,-0.653046,...,-0.434183,-0.622537,-0.221095,-0.248796,0.342602,0.015706,-0.074386,0.0,-1.0,0.0
7,0.277146,0.277347,0.558619,-0.187756,-0.493287,-0.002708,-0.086124,-0.395723,-0.358413,-1.054767,...,-0.055361,0.527424,-0.015326,-0.053649,-0.344339,-0.650715,0.000239,0.0,0.0,0.0
8,0.370951,0.153481,0.431980,0.128675,0.183913,-1.035314,-0.077974,-0.454007,0.246618,-0.483983,...,0.373857,0.350026,0.085473,0.380639,-0.550845,-0.552427,-0.095772,0.0,0.0,0.0
9,0.418232,-0.901033,-0.360525,-0.058054,-0.378379,-0.091621,0.514418,0.101297,0.027855,-0.560424,...,-0.199112,-0.353223,-0.336193,0.438961,0.261008,0.288814,-0.459271,0.0,0.0,0.0


In [17]:
# Persist the naive concatenated word-lex embeddings.
# Columns 0 to 199 hold the word embeddings; columns 200 to 202 hold the
# lexicon values.
output_path = 'naive-concat-word-lex-embeddings-203'
np.save(output_path, word_lex_emb_arr)