In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
import re
from numpy import *  
import matplotlib.pyplot as plt
import numpy as np
import operator
import nltk
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [None]:
# Read the raw Jigsaw training comments into a DataFrame and preview a few
# randomly chosen rows.
df = pd.read_csv(
    filepath_or_buffer='../../data/jigsaw-toxic-comment-classification-challenge/train.csv'
)
df.sample(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
83008,de1473aee5f0dfed,Why do you keep deleting a link to the most in...,0,0,0,0,0,0
4399,0bb4288aa6e24846,"Welcome\n\nHello, and welcome to Wikipedia! Th...",0,0,0,0,0,0
131467,bf5d52f37697d1e8,"""\nWelcome\n\nHello and welcome to Wikipedia! ...",0,0,0,0,0,0
30505,50f6eef1804af470,"""\n\n See below, please \n\nDoug youvan, NukeH...",0,0,0,0,0,0
80696,d7e00750334e246e,"""\n\nI am not going to keep redoing this over ...",0,0,0,0,0,0


In [None]:
# Negated character class: matches every character that is NOT a lowercase
# ASCII letter, a digit, one of ! @ # $ % ^ & * _ - , . ' or a space.
# Uppercase letters are not in the allowed set, so text has to be lowercased
# before this pattern is applied or its capital letters will be matched too.
special_chars = r"[^a-z0-9!@#\$%\^\&\*_\-,\.' ]"

class preprocessing(object):
    """Clean and stem free-text comments, one string or a whole DataFrame.

    Parameters
    ----------
    special_chars : str or None
        Regular expression matching the characters that should be replaced
        by a space while cleaning. ``None`` skips that step.
    """

    def __init__(self, special_chars):
        self.special_chars = special_chars
        # The NLTK helpers are created lazily on first use and then reused,
        # instead of being rebuilt for every row that is processed.
        self._tokenizer = None
        self._stemmer = None
        self._stop_words = None

    def _getTokenizer(self):
        """Return the shared TweetTokenizer, creating it on first use."""
        if getattr(self, '_tokenizer', None) is None:
            self._tokenizer = TweetTokenizer()
        return self._tokenizer

    def _getStemmer(self):
        """Return the shared PorterStemmer, creating it on first use."""
        if getattr(self, '_stemmer', None) is None:
            self._stemmer = PorterStemmer()
        return self._stemmer

    def _getStopWords(self):
        """Return the English stop-word set, loading it on first use."""
        if getattr(self, '_stop_words', None) is None:
            self._stop_words = frozenset(stopwords.words('english'))
        return self._stop_words

    def cleanString(self, s):
        """Return ``s`` lowercased, stripped of special characters and stop words.

        Parameters
        ----------
        s : str
            Raw comment text.

        Returns
        -------
        str
            The remaining tokens joined by single spaces.
        """
        # Lowercase first: the stop-word list is lowercase, and a pattern that
        # only whitelists a-z would otherwise blank out every capital letter.
        s = s.lower()
        # Turn real newlines and literal backslash-n sequences into spaces
        # BEFORE the regex runs; afterwards the backslash would already have
        # been replaced and a stray "n" left behind.
        s = s.replace("\\n", " ").replace("\n", " ")
        # remove special chars
        if self.special_chars is not None:
            s = re.sub(self.special_chars, ' ', s)
        # Remove stop words
        stop_words = self._getStopWords()
        tokens = self._getTokenizer().tokenize(s)
        cleaned_words = [w for w in tokens if w not in stop_words]
        return " ".join(cleaned_words)

    def stemWords(self, sentence):
        """Return ``sentence`` with every token replaced by its stem.

        Parameters
        ----------
        sentence : str
            Text to tokenize and stem.

        Returns
        -------
        str
            The stemmed tokens joined by single spaces.
        """
        stemmer = self._getStemmer()
        tokens = self._getTokenizer().tokenize(sentence)
        return " ".join(stemmer.stem(w) for w in tokens)

    def cleanFrame(self, frame):
        """Add a ``clean_comment`` column to ``frame`` (modified in place).

        The column holds ``cleanString`` applied to each ``comment_text`` value.
        """
        frame['clean_comment'] = frame.comment_text.apply(self.cleanString)

    def stemFrame(self, frame):
        """Add a ``stem_comment`` column to ``frame`` (modified in place).

        The column holds ``stemWords`` applied to each ``clean_comment`` value,
        so ``cleanFrame`` has to be run on the frame first.
        """
        frame['stem_comment'] = frame.clean_comment.apply(self.stemWords)

# Build the preprocessor with the special-character pattern, then add the
# cleaned-text column followed by the stemmed-text column to `df` in place.
Preprocessing = preprocessing(special_chars)
Preprocessing.cleanFrame(frame=df)
Preprocessing.stemFrame(frame=df)

RecursionError: maximum recursion depth exceeded in comparison

Load the preprocessed comments from a pickle file

In [10]:
import pickle

# Load the pickled DataFrame, replacing `df`.
# The context manager closes the file handle as soon as loading finishes;
# previously it was left open for the lifetime of the kernel.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only load pickles that come from a trusted source.
with open('../toxic.pickle', 'rb') as toxic_comments:
    df = pickle.load(toxic_comments)
df.sample(5)

Unnamed: 0,id,comment_text,Toxic,clean_comment,stem_comment
45030,78623a20cebc2734,Truck manufacturer \n\nIt seems this company d...,0,truck manufacturr thi company mak truck jut co...,truck manufacturr thi compani mak truck jut co...
91528,f4b9e8d7f1720cfa,"You, letter B, deleted my redirect page!!!\nIt...",0,lttr b dltd rdirct pa jut rdirct pa ur pa dfin...,lttr b dltd rdirct pa jut rdirct pa ur pa dfin...
81760,daac31f22b627f8c,No accident. And implying that I'm a child co...,0,accidnt implyin im child could b conidrd om b ...,accidnt implyin im child could b conidrd om b ...
115653,6a8663d28bbf200c,"]\n\n August 2010 \n\nIf I get blocked, I'll j...",0,auut blockd ill jut mak anothr account aum n i...,auut blockd ill jut mak anothr account aum n i...
105354,33a709a822632f12,"Thank you for taking a look, that is literally...",0,thank takin look litrally aom look th fa bio i...,thank takin look litral aom look th fa bio im ...


In [11]:
# Display the column labels of the DataFrame loaded from the pickle.
df.columns

Index(['id', 'comment_text', 'Toxic', 'clean_comment', 'stem_comment'], dtype='object')

In [20]:
# Build the multi-label target matrix, one column per label.
# Several columns must be selected with a LIST of labels; the previous
# df["a", "b", ...] form indexes with a single tuple key and raises KeyError.
# NOTE(review): the `df.columns` output recorded earlier in this notebook lists
# only a single 'Toxic' column -- confirm the loaded pickle really contains
# these six label columns.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = df[label_columns].values

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the stemmed comments (and their labels) for evaluation.
# A fixed integer seed makes the split reproducible across kernel restarts.
# The previous `random_state=np.random` passed the numpy.random *module*,
# which is not a seed and gave a different split on every run.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    df.stem_comment, y, test_size=0.2, random_state=SPLIT_SEED)
print(X_train.shape, X_test.shape, len(y_train), len(y_test))

(127656,) (31915,) 127656 31915


In [22]:
# Tokenizer, pad_sequences and the other keras names used here are already
# imported in the notebook's first cell, so they are not re-imported.

# Vocabulary size handed to the Tokenizer here and to the Embedding layer in getModel.
max_features = 20000
# Length every sequence is padded/truncated to; also the model's input length.
maxlen = 400

# Fit the vocabulary on the training texts only, and fit it once: the earlier
# version of this cell built and fitted an identical tokenizer twice.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))

# Convert each comment into a list of integer word indices.
list_tokenized_train = tokenizer.texts_to_sequences(X_train)
list_tokenized_test = tokenizer.texts_to_sequences(X_test)

# Bring every sequence to a common length of `maxlen`.
V_train = pad_sequences(list_tokenized_train, maxlen=maxlen)
V_test = pad_sequences(list_tokenized_test, maxlen=maxlen)
print(V_train.shape,V_test.shape)

(127656, 400) (31915, 400)


In [31]:
def getModel():
    """Build and compile the LSTM classifier.

    Architecture: Input(maxlen) -> Embedding(max_features, 128) ->
    LSTM(30, returning the full sequence) -> global max pooling ->
    Dropout(0.5) -> Dense(20, relu) -> Dropout(0.5) -> Dense(6, sigmoid).

    Reads the notebook-level ``maxlen`` and ``max_features`` values.

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy loss, the Adam optimizer
        and the accuracy metric.
    """
    token_ids = Input(shape=(maxlen, ))
    embed_size = 128

    embedded = Embedding(max_features, embed_size)(token_ids)
    sequence_states = LSTM(30, return_sequences=True,name='lstm_layer')(embedded)
    # Collapse the time axis by taking the maximum over it.
    pooled = GlobalMaxPool1D()(sequence_states)
    pooled = Dropout(0.5)(pooled)
    hidden = Dense(20, activation="relu")(pooled)
    hidden = Dropout(0.5)(hidden)
    # Six sigmoid units: one independent output per label.
    label_probs = Dense(6, activation="sigmoid")(hidden)

    model = Model(inputs=token_ids, outputs=label_probs)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

In [32]:
# Training configuration.
file_path="../weights_base.best.hdf5"
batch_size = 32
epochs = 3
model = getModel()

# Save weights only when the validation loss improves.
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# Stop early once the validation loss has not improved for two epochs.
early = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=2)

callbacks_list = [checkpoint, early]

# 10% of the training data is used for validation.
model.fit(
    V_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callbacks_list,
)

# Reload the weights written by the checkpoint callback.
model.load_weights(file_path)

Epoch 1/3
Epoch 00001: val_loss improved from inf to 0.05693, saving model to ../weights_base.best.hdf5
Epoch 2/3
Epoch 00002: val_loss improved from 0.05693 to 0.05629, saving model to ../weights_base.best.hdf5
Epoch 3/3
Epoch 00003: val_loss improved from 0.05629 to 0.05506, saving model to ../weights_base.best.hdf5


In [33]:
# NOTE(review): Pipeline is imported here but not used in this cell.
from sklearn.pipeline import Pipeline

# Predict per-label probabilities for the test set, round them to 0/1 labels
# and print per-label precision, recall and F1.
y_pred = model.predict(V_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred.round(),
        digits=6,
    )
)

              precision    recall  f1-score   support

           0   0.886977  0.627509  0.735016      3039
           1   0.000000  0.000000  0.000000       301
           2   0.844476  0.719856  0.777202      1667
           3   0.000000  0.000000  0.000000        96
           4   0.741608  0.620104  0.675435      1532
           5   0.000000  0.000000  0.000000       265

   micro avg   0.836150  0.587971  0.690436      6900
   macro avg   0.412177  0.327912  0.364609      6900
weighted avg   0.759334  0.587971  0.661461      6900
 samples avg   0.054598  0.050031  0.049802      6900



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
