In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
import itertools
import string
from keras.preprocessing import sequence
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Tokens to discard during preprocessing: NLTK's English stop words together
# with every character in string.punctuation.
stop_words = set(stopwords.words('english')) | set(string.punctuation)

In [None]:
# Label columns read from the training data; their order here fixes the
# column order of the label matrix built from them.
categories = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Data loading

In [None]:
# Length passed as `maxlen` when the encoded comments are padded.
max_seq_length = 300
# Total vocabulary size, counting the 'unknown' placeholder stored at index 0.
vocab_size = 10000

In [None]:
# Load the training set and drop its 'id' column in one step.
data = pd.read_csv('data/train.csv').drop('id', axis=1)

In [None]:
# Raw comment strings as a 1-D Series. Selecting the column by name (rather
# than squeezing a one-column frame) keeps X a Series even when the data holds
# a single row; squeezing would collapse that case to a bare string, and the
# tokenisation loop would then iterate over characters instead of comments.
X = data['comment_text']
# Label matrix with one column per entry in `categories`
# (presumably 0/1 flags -- TODO confirm against the dataset).
Y = data[categories].values
data.head()

Number of examples

In [None]:
# Total number of comments loaded from the training file.
len(X)

Number of positive examples in each toxicity category

In [None]:
# Column-wise sum of the label matrix: one total per entry in `categories`
# (the count of positive examples per category if labels are 0/1 -- TODO confirm).
Y.sum(axis=0)

# Preprocess

## Steps to clean the data

1. Break the sentences into tokens using nltk's `word_tokenize`

Only keep words that are not stop words and have at least 4 characters (this makes the sentences shorter)

In [None]:
# Tokenise every lower-cased comment, discarding stop words / punctuation and
# any token shorter than four characters.
tokens = [
    [tok for tok in word_tokenize(comment.lower())
     if len(tok) >= 4 and tok not in stop_words]
    for comment in X
]

# Keep the (vocab_size - 1) most frequent tokens; index 0 is taken by the
# 'unknown' placeholder, giving vocab_size entries in total.
wordFrequencies = nltk.FreqDist(itertools.chain.from_iterable(tokens))
vocab = wordFrequencies.most_common(vocab_size - 1)
index_to_word = ['unknown'] + [entry[0] for entry in vocab]
word_to_index = {term: position for position, term in enumerate(index_to_word)}

# Replace each token with its vocabulary index. Tokens outside the vocabulary
# are dropped (they are NOT mapped to the 'unknown' index).
# NOTE(review): confirm that dropping, rather than mapping to 'unknown', is intended.
tokens = [
    [word_to_index[tok] for tok in comment_tokens if tok in word_to_index]
    for comment_tokens in tokens
]

# Bring every index sequence to a common length of max_seq_length.
# NOTE(review): pad_sequences fills with 0 by default, the same index as
# 'unknown' -- confirm downstream code treats 0 as padding.
X_processed = sequence.pad_sequences(tokens, maxlen=max_seq_length)

## Save index_to_word and word_to_index as pickle files

In [None]:
import pickle

# Persist both vocabulary lookups (word -> index and index -> word) as pickle
# files in the current working directory.
for filename, payload in (('word_to_index.pckl', word_to_index),
                          ('index_to_word.pckl', index_to_word)):
    with open(filename, 'wb') as f:
        pickle.dump(payload, f)

In [None]:
# Write the padded index sequences and the label matrix out as .npy files.
for target_path, array in (('./data/X.npy', X_processed),
                           ('./data/Y.npy', Y)):
    np.save(target_path, array)