In [1]:
import gc
import re
import operator 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import nltk.data
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
from nltk.util import ngrams

import gensim
from gensim.models.phrases import Phrases, Phraser
import re, unicodedata
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, precision_score, recall_score, auc, roc_auc_score, precision_recall_curve, roc_curve
from sklearn.metrics import confusion_matrix, classification_report

from sklearn import metrics
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Dense, Conv1D, MaxPooling1D, Flatten, Dropout, SpatialDropout1D, LSTM, GlobalMaxPool1D, \
                        GlobalAveragePooling1D, concatenate, Bidirectional, CuDNNLSTM, CuDNNGRU, Masking, GlobalAveragePooling1D, GlobalMaxPooling1D, BatchNormalization
from keras.optimizers import RMSprop, Adam
from keras.models import Model, load_model
from gensim.models import KeyedVectors
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler

from sklearn import metrics
import tensorflow as tf

pd.set_option('max_colwidth', 300)
%matplotlib inline

Using TensorFlow backend.


In [2]:
print(os.listdir('../input/fasttext-crawl-300d-2m/'))

['crawl-300d-2M.vec']


In [3]:
path_data = '../input/jigsaw-unintended-bias-in-toxicity-classification/'
path_glove = '../input/glove840b300dtxt/glove.840B.300d.txt'
path_fasttext = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'
path_output = '../input/'

In [4]:
train = pd.read_csv(path_data + 'train.csv')
test = pd.read_csv(path_data + 'test.csv')
sub = pd.read_csv(path_data + 'sample_submission.csv')

In [5]:
df = pd.concat([train[['id','comment_text']], test], axis=0)
del(train, test)
gc.collect()

22

In [6]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    with open(path) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in f)

def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            pass
    return embedding_matrix

### Preprocess

In [7]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because",
                       "could've": "could have", "couldn't": "could not", "didn't": "did not", 
                       "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                       "hasn't": "has not", "haven't": "have not", "he'd": "he would",
                       "he'll": "he will", "he's": "he is", "how'd": "how did", 
                       "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
                       "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
                       "I'll've": "I will have","I'm": "I am", "I've": "I have",
                       "i'd": "i would", "i'd've": "i would have", "i'll": "i will",
                       "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not",
                       "it'd": "it would", "it'd've": "it would have", "it'll": "it will", 
                       "it'll've": "it will have","it's": "it is", "let's": "let us",
                       "ma'am": "madam", "mayn't": "may not", "might've": "might have",
                       "mightn't": "might not","mightn't've": "might not have", 
                       "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
                       "needn't": "need not", "needn't've": "need not have",
                       "o'clock": "of the clock", "oughtn't": "ought not", 
                       "oughtn't've": "ought not have", "shan't": "shall not",
                       "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
                       "she'd've": "she would have", "she'll": "she will",
                       "she'll've": "she will have", "she's": "she is", "should've": "should have", 
                       "shouldn't": "should not", "shouldn't've": "should not have", 
                       "so've": "so have","so's": "so as", "this's": "this is",
                       "that'd": "that would", "that'd've": "that would have", "that's": "that is",
                       "there'd": "there would", "there'd've": "there would have", 
                       "there's": "there is", "here's": "here is","they'd": "they would", 
                       "they'd've": "they would have", "they'll": "they will", 
                       "they'll've": "they will have", "they're": "they are", 
                       "they've": "they have", "to've": "to have", "wasn't": "was not",
                       "we'd": "we would", "we'd've": "we would have", "we'll": 
                       "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
                       "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
                       "what're": "what are",  "what's": "what is", "what've": "what have", 
                       "when's": "when is", "when've": "when have", "where'd": "where did", 
                       "where's": "where is", "where've": "where have", "who'll": "who will", 
                       "who'll've": "who will have", "who's": "who is", "who've": "who have", 
                       "why's": "why is", "why've": "why have", "will've": "will have",
                       "won't": "will not", "won't've": "will not have", "would've": "would have",
                       "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", 
                       "y'all'd": "you all would","y'all'd've": "you all would have",
                       "y'all're": "you all are","y'all've": "you all have","you'd": "you would", 
                       "you'd've": "you would have", "you'll": "you will", 
                       "you'll've": "you will have", "you're": "you are", "you've": "you have" }

punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'

swear_words_re = ' 4r5e | 5h1t | 5hit | ass-fucker | assfucker | assfukka | asswhole | a_s_s | b!tch | b17ch | blow job \
                    | boiolas | bollok | boooobs | booooobs | booooooobs | bunny fucker | buttmuch | c0cksucker \
                    | carpet muncher | cl1t | cockface | cockmunch | cockmuncher | cocksuka | cocksukka | cokmuncher \
                    | coksucka | cunillingus | cuntlick | cuntlicker | cuntlicking | cyalis | cyberfuc | cyberfuck \
                    | cyberfucked | cyberfucker | cyberfuckers | cyberfucking | dirsa | dlck | dog-fucker | donkeyribber \
                    | ejaculatings | ejakulate | f u c k | f u c k e r | f4nny | faggitt | faggs | fannyflaps | fannyfucker \
                    | fanyy | fingerfucker | fingerfuckers | fingerfucks | fistfuck | fistfucked | fistfucker | fistfuckers \
                    | fistfucking | fistfuckings | fistfucks | fuckingshitmotherfucker | fuckwhit | fudge packer | fudgepacker \
                    | fukwhit | fukwit | fux0r | f_u_c_k | god-dam | kawk | knobead | knobed | knobend | knobjocky | knobjokey \
                    | kondum | kondums | kummer | kumming | kums | kunilingus | l3itch | m0f0 | m0fo | m45terbate | ma5terb8 \
                    | ma5terbate | master-bate | masterb8 | masterbat3 | masterbations | mof0 | mothafuck | mothafuckaz \
                    | mothafucked | mothafucking | mothafuckings | mothafucks | mother fucker | motherfucked | motherfuckings \
                    | motherfuckka | motherfucks | muthafecker | muthafuckker | n1gga | n1gger | nigg3r | nigg4h | nob jokey \
                    | nobjocky | nobjokey | penisfucker | phuked | phuking | phukked | phukking | phuks | phuq | pigfucker \
                    | pimpis | pissflaps | rimjaw | s hit | scroat | sh!t | shitdick | shitfull | shitings | shittings | s_h_i_t \
                    | t1tt1e5 | t1tties | teez | tittie5 | tittiefucker | tittywank | tw4t | twathead | twunter | v14gra \
                    | v1gra | w00se | whoar '

mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling',
                'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor',
                'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ',
                'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do',
                'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many',
                'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does',
                'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating',
                'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data',
                '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 
                'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 
                'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

In [8]:
punct_mapping = {"_":" ", "`":" "}

def clean_special_chars(text, puncts, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])    
    for p in puncts:
        text = text.replace(p, f' {p} ')     
    return text

In [9]:
def correct_spelling(words):
    for word in mispell_dict.keys():
        words = words.replace(word, mispell_dict[word])
    return words

In [10]:
def unknown_punct(embed, punct):
    unknown = ''
    for p in punct:
        if p not in embed:
            unknown += p
            unknown += ' '
    return unknown

In [11]:
def clean_contractions(text, mapping):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text

In [12]:
def clean_numbers(x):
    return re.sub('\d+', ' ', x)

In [13]:
def remove_strange_words(x):
     return re.sub('\W*\b\w{20,100}\b', ' ', x)

In [14]:
def handle_swears(text):
    text = re.sub(swear_words_re, ' fuck ', text)
    return text

In [15]:
%%time
df['comment_text'] = df['comment_text'].apply(lambda x: x.lower())
df['comment_text'] = df['comment_text'].apply(lambda x: clean_contractions(x, contraction_mapping))
df['comment_text'] = df['comment_text'].apply(lambda x: clean_special_chars(x, punct, punct_mapping))
df['comment_text'] = df['comment_text'].apply(lambda x: handle_swears(x))
df['comment_text'] = df['comment_text'].apply(correct_spelling)
df['comment_text'] = df['comment_text'].apply(clean_numbers)
df['comment_text'] = df['comment_text'].apply(remove_strange_words)

CPU times: user 3min 38s, sys: 544 ms, total: 3min 38s
Wall time: 3min 38s


In [16]:
train = df.iloc[:1804874,:]
test = df.iloc[1804874:,:]

train.head()

Unnamed: 0,id,comment_text
0,59848,"this is so cool . it is like , ' would you want your mother to read this ? ? ' really great idea , well done !"
1,59849,"thank you ! ! this would make my life a lot less anxiety - inducing . keep it up , and do not let anyone get in your way !"
2,59852,this is such an urgent design problem ; kudos to you for taking it on . very impressive !
3,59855,is this something i will be able to install on my site ? when will you be releasing it ?
4,59856,haha you guys are a bunch of losers .


In [17]:
my_columns = ['target',
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']

train_orig = pd.read_csv( path_data + "train.csv")
train = pd.concat([train, train_orig[my_columns]], axis=1)
del(train_orig)
# gc.collect()
train['target'] = np.where(train['target'] >= 0.5, 1, 0)

In [18]:
train_df, validate_df = train_test_split(train, test_size=0.1, random_state = 12)
print(f'{len(train_df)} train comments, {len(validate_df)} validate comments')

1624386 train comments, 180488 validate comments


In [19]:
del train
gc.collect()

17

In [20]:
MAX_NB_WORDS = 100_000 #574_312 unique tokens in word_index
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 220

corpus = pd.concat([train_df['comment_text'], validate_df ['comment_text'], test['comment_text']])
corpus = corpus.drop_duplicates()

# Create a text tokenizer.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(corpus)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 301731 unique tokens.


In [21]:
train_text = pad_sequences(tokenizer.texts_to_sequences(train_df['comment_text']), maxlen=MAX_SEQUENCE_LENGTH)
train_labels = train_df['target']
validate_text = pad_sequences(tokenizer.texts_to_sequences(validate_df['comment_text']), maxlen=MAX_SEQUENCE_LENGTH)
validate_labels = validate_df['target']
test_text = pad_sequences(tokenizer.texts_to_sequences(test['comment_text']), maxlen=MAX_SEQUENCE_LENGTH)

In [22]:
del (df)
gc.collect()

0

In [23]:
%%time
glove_matrix = build_matrix(tokenizer.word_index, path_glove)
#fasttext_matrix = build_matrix(tokenizer.word_index, path_fasttext)

CPU times: user 2min 36s, sys: 5.52 s, total: 2min 41s
Wall time: 2min 42s


### Model architecture

In [24]:
y_aux_train = train_df[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
y_aux_validate = validate_df[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
num_categories_train = y_aux_train.shape[1]

In [25]:
def build_model(embedding_matrix):
    sequence_input = Input(shape = (MAX_SEQUENCE_LENGTH,),  dtype='int32')
    
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(sequence_input)

    x = SpatialDropout1D(0.25)(x)

    x1 = Bidirectional(CuDNNLSTM(128, return_sequences = True))(x)
    x2 = Bidirectional(CuDNNLSTM(64, return_sequences = True))(x1)

    avg_pool = GlobalAveragePooling1D()(x2)
    max_pool = GlobalMaxPooling1D()(x2)

    vf = concatenate([avg_pool, max_pool])
    vf = BatchNormalization()(vf)

    #vf = Dropout(0.2)(Dense(64, activation='relu') (vf))

    preds = Dense(1, activation='sigmoid', name = 'target')(vf)
    preds_aux = Dense(num_categories_train, activation = 'sigmoid', name = 'categories')(vf)

    model = Model(inputs = sequence_input, outputs=[preds, preds_aux])
    model.compile(loss = 'binary_crossentropy',
                      optimizer = 'adam',
                      metrics=['acc'])
    
    return model

In [26]:
def train_model(embedding_matrix):
    NUM_MODELS = 2
    EPOCHS = 4
    checkpoint_predictions_val = []
    checkpoint_predictions_test = []
    weights_val = []
    weights_test = []
    file_path = "best_model.hdf5"

    for model_i in range(NUM_MODELS):
        model = build_model(embedding_matrix)
        for glob_epoch in range (EPOCHS):
            model.fit(train_text,
               [train_labels, y_aux_train],
               batch_size=512,
               #class_weight = class_weight,
               epochs=1,
               validation_data=(validate_text, [validate_labels, y_aux_validate]),
               callbacks = [EarlyStopping(monitor='val_categories_loss', mode='min', verbose=1, patience=1),             
                            ModelCheckpoint(file_path, monitor = "val_categories_loss", verbose = 1,
                                            save_best_only = True, mode = "min"),
                           LearningRateScheduler(lambda epoch: 1e-3 * (0.6 ** glob_epoch))],
               verbose=0) #Avoid kaggle kernels issues
            
            checkpoint_predictions_val.append(model.predict(validate_text, batch_size=2048)[0].flatten())
            weights_val.append(2 ** glob_epoch)
            checkpoint_predictions_test.append(model.predict(test_text, batch_size=2048)[0].flatten())
            weights_test.append(2 ** glob_epoch)
            
    predictions_val = np.average(checkpoint_predictions_val, weights=weights_val, axis=0)
    predictions_test = np.average(checkpoint_predictions_test, weights=weights_test, axis=0)
            
    return predictions_val, predictions_test
        
    

In [27]:
%time glove_val_preds, glove_test_preds = train_model(glove_matrix)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.

Epoch 00001: val_categories_loss improved from inf to 0.08967, saving model to best_model.hdf5

Epoch 00001: val_categories_loss improved from inf to 0.08566, saving model to best_model.hdf5

Epoch 00001: val_categories_loss improved from inf to 0.08742, saving model to best_model.hdf5

Epoch 00001: val_categories_loss improved from inf to 0.08323, saving model to best_model.hdf5

Epoch 00001: val_categories_loss improved from inf to 0.09321, saving model to best_model.hdf5

Epoch 00001: val_categories_loss improved from inf to 0.08398, saving model to best_model.hdf5

Epoch 00001: val_categories_loss improved from inf to 0.08708, saving model to best_model.hdf5

Epoch 00001: v

In [28]:
validate_df['predict_NN'] = glove_val_preds

In [29]:
print(confusion_matrix(validate_df.target, np.where(validate_df.predict_NN > 0.5, 1, 0)))
print(classification_report(validate_df.target, np.where(validate_df.predict_NN > 0.5, 1, 0)))

[[163597   2414]
 [  5908   8569]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98    166011
           1       0.78      0.59      0.67     14477

   micro avg       0.95      0.95      0.95    180488
   macro avg       0.87      0.79      0.82    180488
weighted avg       0.95      0.95      0.95    180488



In [30]:
print('ROC-AUC: {}'.format(roc_auc_score(validate_df.target, validate_df.predict_NN)))

ROC-AUC: 0.9658466871475452


In [31]:
test['prediction'] = glove_test_preds
test.head()

Unnamed: 0,id,comment_text,prediction
0,7000000,jeff sessions is another one of trump ' s orwellian choices . he believes and has believed his entire career the exact opposite of what the position requires .,0.004802
1,7000001,"i actually inspected the infrastructure on grand chief stewart philip ' s home penticton first nation in both and . exactly zero projects that had been identified in previous inspection reports had been funded by the federal government , and the entire band was housed in atco trailers ....",0.000103
2,7000002,"no it will not . that is just wishful thinking on democrats fault . for the th time , walker cited the cost of drug users treatment as being lost with obamacare . i laugh every time i hear a liberal claim republicans want to hurt people , and that is why they dumped obamacare .",0.005147
3,7000003,"instead of wringing our hands and nibbling the periphery of the issue , how about we face the actual issue head on ? i would support a city ordinance against loitering , and applaud city councilors who champion a real and permanent solution . \n\nthe details could be determined , but would i...",0.002219
4,7000004,"how many of you commenters have garbage piled high in your yard , bald tires , dead batteries , rotten pallets , car parts , blah blah blah . this town is a pigpen . drive around and look for yourself , its pathetic .",0.95411


In [32]:
submission = test[['id', 'prediction']]
submission.to_csv('submission.csv', index = False)