In [19]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from utilities.data_preprocessors import read_preprocess, series_to_1D_array

In [20]:
# 1 for religious and 0 for non religious
df = pd.read_csv('./data/hate-speech-data-cleaned.csv', index_col=0)
df = read_preprocess(df)

In [21]:
all_words = pd.Series(series_to_1D_array(df['comment']))
all_unique_words_counts = all_words.value_counts()
all_unique_words = all_words.unique()

In [22]:
len(all_words)

894878

In [23]:
len(all_unique_words)

47921

In [24]:
all_unique_words_counts

nigger         16186
faggot         14812
bitch          12246
tranny         11850
like           11657
               ...  
pty                1
vocorp             1
sometimesin        1
shon               1
maupin             1
Length: 47921, dtype: int64

In [25]:
# before joining again get array in df with longest length first
max_len_1 = len(max(df['comment'], key=len))

In [26]:
df['comment'] = df['comment'].apply(lambda comment: " ".join(comment))
df

Unnamed: 0,comment,label
0,woman complain cleaning house man always take ...,1
1,boy dat coldtyga dwn bad cuffin dat hoe st place,0
2,dawg ever fuck bitch start cry confused shit,0
3,look like tranny,0
4,shit hear might true might faker bitch told ya,0
...,...,...
65775,from the midnight sun where the hot spring blow,1
65776,do not say am not your type,1
65777,and therefor never send to know for whom the b...,1
65778,and cannot stand anoth day,1


In [27]:
df.loc[0, 'comment']

'woman complain cleaning house man always take trash'

**A note on the subsequent code below**

fit_on_texts Updates internal vocabulary based on a list of texts. This method creates the vocabulary index based on word frequency. So if you give it something like, "The cat sat on the mat." It will create a dictionary s.t. word_index["the"] = 1; word_index["cat"] = 2 it is word -> index dictionary so every word gets a unique integer value. 0 is reserved for padding. So lower integer means more frequent word (often the first few are stop words because they appear a lot).

texts_to_sequences Transforms each text in texts to a sequence of integers. So it basically takes each word in the text and replaces it with its corresponding integer value from the word_index dictionary. Nothing more, nothing less, certainly no magic involved.

In [28]:
# train_sents, test_sents, train_labels, test_labels = train_test_split(df['comment'], df['label'], test_size=0.3, random_state=0)
sents = df['comment']
max_len_2 = 50

num_words_1 = df.shape[0]
num_words_2 = len(all_words)
num_words_3 = len(all_unique_words)

tokenizer = Tokenizer(num_words=num_words_3, split=' ')
tokenizer.fit_on_texts(sents)
# the bug is here that's why there are wrong indeces

seqs = tokenizer.texts_to_sequences(sents)

# post means place padding of 0's on the tail or ending of the sequence
# and truncating removes the values of a sequence that is greater than the max length given
seqs_padded = pad_sequences(seqs, maxlen=max_len_2, padding='post', truncating='post')

moreover num_words of the Tokenizer can be an arbitrary number most likely based off of an educated guess like the number of sentences in the dataset itself

another is using the number of words itself in the dataset

another is even narrowing the nuber of words in the dataset by using only the number of uniquely occuring words in the dataset

<img src="./figures%20%26%20images/Nhwur.png">

In [29]:
seqs

[[42, 1301, 3583, 264, 46, 96, 76, 93],
 [158, 405, 21627, 15991, 72, 6673, 405, 13, 1041, 186],
 [2023, 87, 12, 3, 182, 402, 793, 15],
 [39, 5, 4],
 [15, 271, 153, 280, 153, 8250, 3, 178, 202],
 [15, 843, 21628, 3887, 509, 52, 18, 13],
 [808, 57, 141, 3, 24, 58, 15, 61],
 [176, 840, 142, 3, 400, 26, 1302, 67],
 [153, 8, 202, 3, 64, 209],
 [3258, 2183, 1062, 15992, 3],
 [21629, 3, 4007, 114, 55, 1332, 735, 5, 604],
 [9989, 1574, 3, 1574, 1662],
 [13, 1076, 988, 670, 29, 2400],
 [72, 3, 33, 5],
 [3, 8],
 [3, 37, 719],
 [3, 3888, 334],
 [3, 63],
 [3, 8, 455, 1042, 361],
 [14, 1716, 72, 3],
 [452, 3, 318, 90, 132],
 [4145, 3, 5, 9038],
 [318, 43, 13, 221, 214],
 [12, 3, 94, 28, 220, 78, 11288, 180, 389, 12, 2400],
 [24, 202, 3, 1533, 13072, 21630, 1833],
 [35, 1303, 5, 2851, 736],
 [13, 1641],
 [690, 35, 3324, 2459, 554, 35, 1377],
 [59, 11289, 3, 12, 15993],
 [1053, 192, 27, 3, 658],
 [11290, 98, 3, 119, 3, 35, 11291],
 [7, 10, 84, 64, 98, 98, 3, 7, 8, 2460],
 [79, 389, 970, 137, 35, 37,

Here we see that indeed 50 is not enough as our max length but for the subsequent code we will still use 50 and later 503 for our experimentation. For now 503 will be an extremely large value eespecially when applied to all sequences

In [30]:
print(max_len_1, max_len_2)

503 50


In [35]:
word_index_dict = tokenizer.word_index
print(len(word_index_dict))

47916


In [None]:
word_index_dict

In [34]:
seqs[0]

[42, 1301, 3583, 264, 46, 96, 76, 93]

In [33]:
# this is supposed to be 1301
print(word_index_dict['complain'])

# this is supposed to be 3583
print(word_index_dict['cleaning'])

1301
3583


In [36]:
seqs_padded

array([[   42,  1301,  3583, ...,     0,     0,     0],
       [  158,   405, 21627, ...,     0,     0,     0],
       [ 2023,    87,    12, ...,     0,     0,     0],
       ...,
       [  241,  8741,    53, ...,     0,     0,     0],
       [  241,    48,   430, ...,     0,     0,     0],
       [  936, 11286,   372, ...,     0,     0,     0]])

In [37]:
train_seqs, test_seqs, train_labels, test_labels = train_test_split(seqs_padded, df['label'], test_size=0.3, random_state=0)

train_seqs

array([[   80,    37,    15, ...,     0,     0,     0],
       [  197,   632,   322, ...,     0,     0,     0],
       [ 1702,    19,   618, ...,     0,     0,     0],
       ...,
       [ 7031,  2718,   133, ...,     0,     0,     0],
       [   89, 38309,   100, ...,     0,     0,     0],
       [  268,  2005,     1, ...,     0,     0,     0]])

In [40]:
len(train_seqs)

46046

In [38]:
test_seqs

array([[ 1203,   111,     4, ...,     0,     0,     0],
       [   86, 16628,     3, ...,     0,     0,     0],
       [  210,  1785,  5182, ...,     0,     0,     0],
       ...,
       [  143, 42114,    16, ...,     0,     0,     0],
       [   88,     5,   584, ...,     0,     0,     0],
       [ 1059,     7,   107, ...,    32,   297,    86]])

In [39]:
len(test_seqs)

19734