In [1]:
from __future__ import print_function,division

In [2]:
from builtins import range 

In [3]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,Input,GlobalMaxPooling1D
from keras.layers import Conv1D,MaxPooling1D,Embedding
from keras.models import Model
from sklearn.metrics import roc_auc_score

Using TensorFlow backend.


In [4]:
MAX_SEQUENCE_LENGTH=100
MAX_VOCAB_SIZE=20000
EMBEDDING_DIM=100
VALIDATION_SPLIT=0.2
BATCH_SIZE=120
EPOCHS=10

In [5]:
word2vec={}
with open(os.path.join('glove.6B/glove.6B.%sd.txt' %EMBEDDING_DIM),encoding="utf8") as f:
    for line in f:
        values=line.split()
        word=values[0]
        vec=np.asarray(values[1:], dtype='float32')
        word2vec[word]=vec
print('found %s word vectors.' % len(word2vec))


found 400000 word vectors.


In [6]:
print(word)

sandberger


In [7]:
print(vec)

[ 0.28365   -0.6263    -0.44351    0.2177    -0.087421  -0.17062
  0.29266   -0.024899   0.26414   -0.17023    0.25817    0.097484
 -0.33103   -0.43859    0.0095799  0.095624  -0.17777    0.38886
  0.27151    0.14742   -0.43973   -0.26588   -0.024271   0.27186
 -0.36761   -0.24827   -0.20815    0.22128   -0.044409   0.021373
  0.24594    0.26143    0.29303    0.13281    0.082232  -0.12869
  0.1622    -0.22567   -0.060348   0.28703    0.11381    0.34839
  0.3419     0.36996   -0.13592    0.0062694  0.080317   0.0036251
  0.43093    0.01882    0.31008    0.16722    0.074112  -0.37745
  0.47363    0.41284    0.24471    0.075965  -0.51725   -0.49481
  0.526     -0.074645   0.41434   -0.1956    -0.16544   -0.045649
 -0.40153   -0.13136   -0.4672     0.18825    0.2612     0.16854
  0.22615    0.62992   -0.1288     0.055841   0.01928    0.024572
  0.46875    0.2582    -0.31672    0.048591   0.3277    -0.50141
  0.30855    0.11997   -0.25768   -0.039867  -0.059672   0.5525
  0.13885   -0.22862

In [8]:
print(word2vec['sandberger'])

[ 0.28365   -0.6263    -0.44351    0.2177    -0.087421  -0.17062
  0.29266   -0.024899   0.26414   -0.17023    0.25817    0.097484
 -0.33103   -0.43859    0.0095799  0.095624  -0.17777    0.38886
  0.27151    0.14742   -0.43973   -0.26588   -0.024271   0.27186
 -0.36761   -0.24827   -0.20815    0.22128   -0.044409   0.021373
  0.24594    0.26143    0.29303    0.13281    0.082232  -0.12869
  0.1622    -0.22567   -0.060348   0.28703    0.11381    0.34839
  0.3419     0.36996   -0.13592    0.0062694  0.080317   0.0036251
  0.43093    0.01882    0.31008    0.16722    0.074112  -0.37745
  0.47363    0.41284    0.24471    0.075965  -0.51725   -0.49481
  0.526     -0.074645   0.41434   -0.1956    -0.16544   -0.045649
 -0.40153   -0.13136   -0.4672     0.18825    0.2612     0.16854
  0.22615    0.62992   -0.1288     0.055841   0.01928    0.024572
  0.46875    0.2582    -0.31672    0.048591   0.3277    -0.50141
  0.30855    0.11997   -0.25768   -0.039867  -0.059672   0.5525
  0.13885   -0.22862

In [9]:
train=pd.read_csv('jigsaw-toxic-comment-classification-challenge/train.csv')

In [10]:
test=pd.read_csv('jigsaw-toxic-comment-classification-challenge/test.csv')

In [12]:
sentences=train["comment_text"].fillna("dummy_value").values

In [13]:
test_sentences=test["comment_text"].fillna("dummy_value").values

In [14]:
sentences

array(["Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
       "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
       "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
       ...,
       'Spitzer \n\nUmm, theres no actual article for prostitution ring.  - Crunch Captain.',
       'And it looks like it was actually you who put on the speedy to have the first version deleted now that I look at it.',
       '"\nAnd ... I really don\'t think you understand.  I came here and my idea was bad right away.  What kind of communit

In [15]:
possible_labels=["toxic","severe_toxic","obscene","threat","insult","identity_hate"]

In [19]:
targets=train[possible_labels].values

In [20]:
targets

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [21]:
print("max sequence length:",max(len(s) for s in sentences))
print("min sequence length:",min(len(s) for s in sentences))

max sequence length: 5000
min sequence length: 6


In [22]:
s=sorted(len(s) for s in sentences)
print("median sequence length:",s[len(s) // 2])

median sequence length: 205


In [23]:
tokenizer=Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences=tokenizer.texts_to_sequences(sentences)


In [24]:
tokenizer.fit_on_texts(test_sentences)
test_sequences=tokenizer.texts_to_sequences(test_sentences)

In [25]:
word2idx=tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))

Found 394787 unique tokens.


In [26]:
len(sequences)

159571

In [27]:
data=pad_sequences(sequences,maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:',data.shape)

Shape of data tensor: (159571, 100)


In [28]:
test_data=pad_sequences(test_sequences,maxlen=MAX_SEQUENCE_LENGTH)

In [29]:
num_words=min(MAX_VOCAB_SIZE,len(word2idx)+1)
embedding_matrix=np.zeros((num_words,EMBEDDING_DIM))
for word,i in word2idx.items():
    if i<MAX_VOCAB_SIZE:
        embedding_vector=word2vec.get(word)
        if embedding_vector is not None:
            embedding_matrix[i]=embedding_vector

In [30]:
embedding_layer=Embedding(
  num_words,
  EMBEDDING_DIM,
  weights=[embedding_matrix],
  input_length=MAX_SEQUENCE_LENGTH,
  trainable=False
)

In [37]:
input_=Input(shape=(MAX_SEQUENCE_LENGTH,))
x=embedding_layer(input_)
x=Conv1D(128,3,activation='relu')(x)
x=MaxPooling1D(3)(x)
x=Conv1D(128,3,activation='relu')(x)
x=MaxPooling1D(3)(x)
x=Conv1D(128,3,activation='relu')(x)
x=GlobalMaxPooling1D()(x)
x=Dense(128,activation='relu')(x)
output=Dense(len(possible_labels),activation='sigmoid')(x)


In [38]:
model=Model(input_,output)
model.compile(
 loss='binary_crossentropy',
 optimizer='rmsprop',
 metrics =['accuracy']
)

In [39]:
r=model.fit(
data,
targets,
batch_size=BATCH_SIZE,
epochs=15,
validation_split=VALIDATION_SPLIT
)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 127656 samples, validate on 31915 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [40]:
res=model.predict(test_data)

In [42]:
print(res[0:10])

[[0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00]
 [1.7197728e-03 0.0000000e+00 7.5131655e-05 0.0000000e+00 2.4408102e-05
  8.9406967e-08]
 [2.9802322e-08 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00]
 [2.6029348e-03 0.0000000e+00 1.3208389e-04 0.0000000e+00 6.0617924e-05
  1.6391277e-06]
 [7.2511435e-02 3.6305189e-04 2.0640254e-02 9.0429187e-04 1.8567771e-02
  3.4393668e-03]
 [5.0240695e-02 7.1078539e-05 9.0350807e-03 4.6932697e-04 8.4850192e-03
  1.4385581e-03]
 [6.2775314e-03 1.1920929e-07 2.8732419e-04 8.0466270e-07 1.9723177e-04
  7.8082085e-06]
 [1.6133666e-02 1.2516975e-06 2.3224950e-03 2.6822090e-07 1.1781156e-03
  1.4275312e-05]
 [9.9197447e-02 1.3095140e-04 2.1187067e-02 9.4866753e-04 1.8467367e-02
  3.6202967e-03]
 [1.7294884e-03 0.0000000e+00 4.3213367e-05 0.0000000e+00 7.6293945e-06
  0.0000000e+00]]


In [44]:
res=pd.DataFrame(res)
#result_toxic_comment=pd.to_csv(res)