# Prep for Lexicon of Abusive Words (GitHub handle: uds-lsv)

In [1]:
# Import packages.
import string
from io import BytesIO
from tensorflow.python.lib.io import file_io
import msgpack
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [2]:
# Load the expanded lexicon. The file is tab-separated and has no header row,
# so pandas labels the columns with the integers 0 and 1.
lexicon_path = 'lexicons_archives/lexicon-of-abusive-words/Lexicons/expandedLexicon.txt'
abusive_words_df = pd.read_csv(lexicon_path, header=None, sep='\t')
abusive_words_df.head()

Unnamed: 0,0,1
0,horrible_noun,3.679601
1,disgusting_adj,3.493682
2,moron_noun,3.469677
3,bastard_noun,3.399238
4,stupid_noun,3.323882


In [3]:
# Rescale the raw lexicon scores (column 1) linearly onto [-1, 1].
# The bounds get their own names rather than `min` / `max`, which would
# shadow the Python builtins for every later cell run in this kernel.
NORM_MIN = -1
NORM_MAX = 1
# Double brackets keep the selection 2-D (n_rows, 1), the layout passed to the scaler.
x = abusive_words_df[[1]].values
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(NORM_MIN, NORM_MAX))
x_scaled = min_max_scaler.fit_transform(x)
# Reuse the source frame's index so an index-based join lines up row-for-row.
normalized_abusive_words_df = pd.DataFrame(x_scaled, index=abusive_words_df.index)
normalized_abusive_words_df.head()

Unnamed: 0,0
0,1.0
1,0.957866
2,0.952425
3,0.936462
4,0.919384


In [48]:
# Attach the normalized scores to the original lexicon frame.
# DataFrame.join matches rows on the index; rsuffix disambiguates the
# overlapping column label coming from the right-hand frame.
abusive_words_joined_df = abusive_words_df.join(
    normalized_abusive_words_df,
    rsuffix='_norm',
    how='outer',
)
# Relabel the three columns with string names; the last one holds the normalized score.
abusive_words_joined_df.columns = ['0', '1', 'abusive_lex']
abusive_words_joined_df.head()

Unnamed: 0,0,1,abusive_lex
0,horrible_noun,3.679601,1.0
1,disgusting_adj,3.493682,0.957866
2,moron_noun,3.469677,0.952425
3,bastard_noun,3.399238,0.936462
4,stupid_noun,3.323882,0.919384


In [50]:
# Discard column '1' (the non-normalized score); drop() returns a new frame,
# so the joined frame itself is left untouched.
abusive_words_joined_norm_df = abusive_words_joined_df.drop(columns=['1'])
abusive_words_joined_norm_df.head()

Unnamed: 0,0,abusive_lex
0,horrible_noun,1.0
1,disgusting_adj,0.957866
2,moron_noun,0.952425
3,bastard_noun,0.936462
4,stupid_noun,0.919384


In [51]:
# Spot-check: show the row(s) whose tagged entry is exactly 'horrible_noun'.
is_horrible_noun = abusive_words_joined_norm_df['0'].eq('horrible_noun')
abusive_words_joined_norm_df[is_horrible_noun]

Unnamed: 0,0,abusive_lex
0,horrible_noun,1.0


In [52]:
# Strip the part-of-speech tag (_noun / _adj / _verb) from each entry.
# The pattern is anchored at the end of the string so only a trailing tag is
# removed; a plain substring replace would also delete '_noun', '_adj' or
# '_verb' occurring in the middle of an entry.
words_df = abusive_words_joined_norm_df['0'].str.replace(
    r'_(?:noun|adj|verb)$', '', regex=True
)
# Store the bare word alongside the tagged entry.
abusive_words_joined_norm_df['word'] = words_df
abusive_words_joined_norm_df.head()

Unnamed: 0,0,abusive_lex,word
0,horrible_noun,1.0,horrible
1,disgusting_adj,0.957866,disgusting
2,moron_noun,0.952425,moron
3,bastard_noun,0.936462,bastard
4,stupid_noun,0.919384,stupid


In [53]:
# Problem: with the POS label removed, 1429 words are duplicates/triplicates.
# duplicated() marks every repeat of a word after its first appearance.
repeated_word_mask = abusive_words_joined_norm_df.duplicated(subset='word')
abusive_words_joined_norm_df[repeated_word_mask]

Unnamed: 0,0,abusive_lex,word
7,stupid_adj,0.889470,stupid
8,horrible_adj,0.887407,horrible
11,bastard_adj,0.844285,bastard
25,bitch_verb,0.743532,bitch
32,slut_verb,0.731548,slut
41,stink_verb,0.706906,stink
43,crap_verb,0.697943,crap
44,rubbish_adj,0.696167,rubbish
53,crap_adj,0.685363,crap
63,fuck_verb,0.673528,fuck


In [54]:
# Build a de-duplicated frame that keeps only the first occurrence of each
# word (in row order). The .copy() makes it an independent frame.
abusive_words_joined_norm_first_occ_df = (
    abusive_words_joined_norm_df
    .drop_duplicates(subset='word', keep='first')
    .copy()
)
abusive_words_joined_norm_first_occ_df.head()

Unnamed: 0,0,abusive_lex,word
0,horrible_noun,1.0,horrible
1,disgusting_adj,0.957866,disgusting
2,moron_noun,0.952425,moron
3,bastard_noun,0.936462,bastard
4,stupid_noun,0.919384,stupid


In [55]:
# Remove the POS-tagged entry column '0' in place.
# errors='ignore' keeps this cell safe to re-run: once '0' is gone, a second
# run is a no-op instead of raising KeyError.
abusive_words_joined_norm_first_occ_df.drop(columns='0', inplace=True, errors='ignore')

In [56]:
# Display the de-duplicated frame for inspection (bare last expression, so the
# notebook renders it as the cell output).
abusive_words_joined_norm_first_occ_df

Unnamed: 0,abusive_lex,word
0,1.000000,horrible
1,0.957866,disgusting
2,0.952425,moron
3,0.936462,bastard
4,0.919384,stupid
5,0.905426,bitch
6,0.890157,scumbag
9,0.864109,ass
10,0.852809,idiot
12,0.830330,slut


In [57]:
# Set words as index (in place).
# Guard on the index name so re-running the cell is a no-op; without it a
# second run raises KeyError because 'word' is no longer a column.
if abusive_words_joined_norm_first_occ_df.index.name != 'word':
    abusive_words_joined_norm_first_occ_df.set_index('word', inplace=True)

In [58]:
# Convert the frame to a nested dict of the form {column -> {index -> value}}.
# NOTE(review): with the frame indexed by word and holding only 'abusive_lex',
# this is presumably {'abusive_lex': {word: score}} - confirm before consuming.
abusive_words_joined_norm_first_occ_dict = (
    abusive_words_joined_norm_first_occ_df.to_dict(orient='dict')
)

In [59]:
# Serialize the lexicon dict to a MessagePack file (opened in binary mode).
path = 'abusive-words-lex-first-occ.bin'
with open(path, mode='wb') as out_file:
    msgpack.pack(abusive_words_joined_norm_first_occ_dict, out_file)