In [2]:
from __future__ import print_function, division
from builtins import range
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.metrics import roc_auc_score

"""
Glove was invented by Stanford.

First we have keras tokenizer. We convert a big sentence into list of strings where
each string is now a token. A token can also be any punctuation mark. depends on the strategy being used

Next is pad_sequences. We want input strings to be of same length so that we can have rectangular output
 
Next is keras layers for defining the layers in our keras model

model class for defining keras model 

roc_auc_score which returns area under curve required for binary classification.

"""
 

Using TensorFlow backend.


'\nGlove was invented by Stanford.\n\nFirst we have keras tokenizer. We convert a big sentence into list of strings where\neach string is now a token. A token can also be any punctuation mark. depends on the strategy being used\n\nNext is pad_sequences. We want input strings to be of same length so that we can have rectangular output\n \nNext is keras layers for defining the layers in our keras model\n\nmodel class for defining keras model \n\nroc_auc_score which returns area under curve required for binary classification.\n\n'

In [3]:
# ---- configuration ----

# Text preprocessing
# Vocabulary cap handed to the tokenizer; per the author's rule of thumb this is
# roughly how many words a native English speaker knows.
MAX_VOCAB_SIZE = 20000
# Every comment is padded to this many tokens (used as `maxlen`).
MAX_SEQUENCE_LENGTH = 100

# Width of the word vectors; also selects which glove.6B.<dim>d.txt file is read.
EMBEDDING_DIM = 100

# Training hyper-parameters (presumably consumed by cells outside this excerpt).
BATCH_SIZE = 128
EPOCHS = 10
VALIDATION_SPLIT = 0.2

In [4]:
print('Loading word vectors...')
# Maps each GloVe token (str) to its pre-trained vector (float32 numpy array).
word2vec = {}
# Join the path from its components so the separator is chosen by the OS;
# EMBEDDING_DIM picks the matching glove.6B.<dim>d.txt file.
glove_path = os.path.join(
    '..', 'large_files', 'glove.6B', 'glove.6B.%sd.txt' % EMBEDDING_DIM
)
with open(glove_path, encoding="utf8") as f:
    # The file is space-separated text, one entry per line:
    # word vec[0] vec[1] vec[2] ...
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline) instead of failing
            # on values[0].
            continue
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 400000 word vectors.


In [5]:
# So far we have loaded every GloVe word and its corresponding vector into the `word2vec` dictionary.

In [11]:
# Next, load the training data set.
print('Loading in comments...')

# Label columns read from the CSV; each one becomes a column of `targets`.
possible_labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

train = pd.read_csv("../large_files/toxic_comment/train.csv")

# Raw comment strings as an array; missing comments are replaced by a
# placeholder so that every entry is a string.
sentences = (
    train["comment_text"]
    .fillna("DUMMY_VALUE")
    .values
)
# Label array: one row per comment, one column per entry in possible_labels.
targets = train[possible_labels].values

Loading in comments...


In [15]:
# Convert the sentences (strings) into sequences of integer word indices.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Measure every sequence once and reuse the lengths for all three statistics.
sequence_lengths = [len(seq) for seq in sequences]
print("max sequence length:", max(sequence_lengths))
print("min sequence length:", min(sequence_lengths))
# Median taken as the middle element of the sorted lengths (the upper middle
# when the count is even).
print("median sequence length:", sorted(sequence_lengths)[len(sequence_lengths) // 2])

max sequence length: 1400
min sequence length: 0
median sequence length: 35


In [16]:
# Word -> integer index mapping learned by the tokenizer.
word2idx = tokenizer.word_index
# Bring every sequence to MAX_SEQUENCE_LENGTH tokens so that the result is a
# rectangular N x T matrix (N comments, T = MAX_SEQUENCE_LENGTH).
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Found %s unique tokens.' % len(word2idx))
print('Shape of data tensor:', data.shape)

Found 210337 unique tokens.
Shape of data tensor: (159571, 100)


In [21]:
# Build the embedding matrix: row `index` receives the pre-trained vector of the
# word that the tokenizer mapped to `index`.
print('Filling pre-trained embeddings...')
# NOTE(review): the "+ 1" presumably leaves room for index 0, which the
# tokenizer is assumed not to assign to any word; confirm against the Keras docs.
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
# Start from zeros so that words without a pre-trained vector stay all-zero.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, index in word2idx.items():
    if index >= MAX_VOCAB_SIZE:
        # Outside the vocabulary cap: no row exists for this word.
        continue
    vector = word2vec.get(token)
    if vector is None:
        # Word not present in the loaded vectors: leave its row as zeros.
        continue
    embedding_matrix[index] = vector

Filling pre-trained embeddings...


In [22]:
# Sanity check: the matrix was allocated as (num_words, EMBEDDING_DIM).
embedding_matrix.shape

(20000, 100)