# jigsaw-toxic-comment-classification-challenge

**
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data

***

## History
- v_1: first version 

#### Import

In [121]:
import os, sys, pickle, random, datetime, keras, logging, warnings, re

from random import randint
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm

from sklearn import metrics, neighbors, model_selection, preprocessing
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics.scorer import make_scorer
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from mlxtend.regressor import StackingRegressor, StackingCVRegressor
import xgboost as xgb
import lightgbm as lgb
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from keras import metrics, losses
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, BatchNormalization, Dropout, LeakyReLU, PReLU, LSTM, GRU, Bidirectional,  Input, Embedding
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils.vis_utils import model_to_dot

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints


from IPython.display import SVG, display
from logging.handlers import RotatingFileHandler
from IPython.core.interactiveshell import InteractiveShell

# Plot styling: seaborn "whitegrid" theme with the "muted" colour palette.
sns.set(style="whitegrid", palette="muted")
# Show every DataFrame column and allow long text cells (up to 800 characters)
# so full comments stay readable in the rendered tables.
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', 800)

# Only auto-display the value of the last expression in each cell.
InteractiveShell.ast_node_interactivity = "last"
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Root logger that writes INFO-and-above records to logs/log.log, rotating the
# file at midnight.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# The handler opens its file as soon as it is constructed and fails when the
# target directory is missing, so make sure the directory exists first.
os.makedirs('logs', exist_ok=True)
loghandler = logging.handlers.TimedRotatingFileHandler(filename='logs/log.log', when="midnight")
loghandler.setLevel(logging.INFO)
fileformatter = logging.Formatter('%(asctime)s - %(filename)s [%(levelname)s] >>> %(message)s')
loghandler.setFormatter(fileformatter)
logger.addHandler(loghandler)

def excepthood_handler(exctype, value, tb):
    """Log an uncaught exception through the module-level ``logger``.

    The signature matches what ``sys.excepthook`` is called with.

    :param exctype: class of the exception that was raised.
    :param value: the exception instance.
    :param tb: traceback object attached to the exception.
    """
    # Imported locally because the file's import cell does not provide it.
    import traceback

    logger.error('IAO Uncaught exception')
    logger.error('Type: {0}'.format(str(exctype)))
    logger.error('Value: {0}'.format(str(value)))
    # str(tb) would only log "<traceback object at 0x...>"; format the actual
    # stack frames so the log entry shows where the error happened.
    logger.error('Traceback: {0}'.format(''.join(traceback.format_tb(tb))))
    
# Send uncaught exceptions to excepthood_handler so they end up in the log.
sys.excepthook = excepthood_handler
# Mark the beginning of this session in the log.
logger.info('- start -')

In [3]:
def df_filter(df_tmp, condition, col=None):
    """Keep the rows of ``df_tmp`` whose columns equal the requested values.

    :param df_tmp: DataFrame to filter.
    :param condition: mapping of column name -> value; the equality filters
        are applied one after another, so a row must satisfy all of them.
    :param col: optional column label (or list of labels) to select from the
        filtered frame; when falsy the whole filtered frame is returned.
    :return: the filtered DataFrame, or the ``col`` selection of it.
    """
    filtered = df_tmp
    for column_name in condition:
        wanted = condition[column_name]
        keep = filtered[column_name] == wanted
        filtered = filtered[keep]
    return filtered[col] if col else filtered
      
def get_idx(date_arr, codition):
    """Return the positions of ``date_arr`` that fall inside the given ranges.

    :param date_arr: array compared element-wise against the range bounds.
    :param codition: iterable of ``(start, end)`` pairs; each pair selects the
        half-open interval ``start <= value < end``.
    :return: one array holding the matching positions of every range,
        concatenated in the order the ranges were given (positions matched by
        several ranges appear several times).
    """
    matches = [
        np.where((date_arr >= lower) & (date_arr < upper))[0]
        for lower, upper in codition
    ]
    return np.concatenate(matches)


def df_dump(df, filename):
    """Pickle ``df`` into ``cache/<filename>``.

    :param df: object to serialise (any picklable value).
    :param filename: file name inside the ``cache`` directory.
    :return: ``None`` (the result of ``pickle.dump``).
    """
    # The context manager flushes and closes the file even if pickling fails;
    # the previous version left the handle open.
    with open('cache/{f}'.format(f=filename), 'wb') as fh:
        return pickle.dump(df, fh)
    
    
def df_load(filename):
    """Load a pickled object from ``cache/<filename>``.

    :param filename: file name inside the ``cache`` directory.
    :return: the unpickled object, or ``None`` when the file does not exist.
    """
    path = 'cache/{f}'.format(f=filename)
    if not os.path.isfile(path):
        return None
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # this project wrote itself.
    # The context manager closes the handle, which was previously left open.
    with open(path, 'rb') as fh:
        return pickle.load(fh)
    



#### **Data Exploration**

In [98]:

# Competition CSVs keyed by the short names used in the rest of the notebook:
# 'tra' = training set, 'tes' = test set, 'sam' = sample submission.
data = {}
data['tra'] = pd.read_csv('dataset/train.csv')
data['tes'] = pd.read_csv('dataset/test.csv')
data['sam'] = pd.read_csv('dataset/sample_submission.csv')


toxic: 評論標籤, 是否為toxic
severe_toxic: 嚴重的toxic
obscene: 猥褻
threat: 威脅
insult: 污辱
identity_hate: 身分仇恨

In [49]:
# Show three training rows (positions 40 to 42) followed by summary statistics
# of the numeric columns.
display(data['tra'][40:43])
display(data['tra'].describe())

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
40,001735f961a23fc4,"""\n Sure, but the lead must briefly summarize Armenia's history. I simply added what I found necessary. If anyone thinks this or that sentence is redundant for the lead, they are welcome to remove make edits. talk """,0,0,0,0,0,0
41,00173958f46763a2,"TFD \n\nI think we just eced. I think we responded to each other without seeing each others responses. I added something in response to yours, but don't know if you saw mine. (T/C//WP:CHICAGO/WP:FOUR)",0,0,0,0,0,0
42,001810bf8c45bf5f,"You are gay or antisemmitian? \n\nArchangel WHite Tiger\n\nMeow! Greetingshhh!\n\nUh, there are two ways, why you do erased my comment about WW2, that holocaust was brutally slaying of Jews and not gays/Gypsys/Slavs/anyone...\n\n1 - If you are anti-semitian, than shave your head bald and go to the skinhead meetings!\n\n2 - If you doubt words of the Bible, that homosexuality is a deadly sin, make a pentagram tatoo on your forehead go to the satanistic masses with your gay pals!\n\n3 - First and last warning, you fucking gay - I won't appreciate if any more nazi shwain would write in my page! I don't wish to talk to you anymore!\n\nBeware of the Dark Side!",1,0,1,0,1,1


Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


#### **Preprocess**
- 英文小寫
- Remove Special Characters
- Replace Numbers

In [59]:
########################################
## process texts in datasets
########################################
print('Processing text dataset')

# Regular expressions, compiled once and reused by every call below.
special_character_removal = re.compile(r'[^a-z\d ]',re.IGNORECASE)
replace_numbers = re.compile(r'\d+',re.IGNORECASE)


def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    """Normalise a raw comment into a cleaned, space-separated string.

    Steps, in order: lower-case and split on whitespace; optionally drop
    English stop words; re-join with single spaces; delete every character
    that is not a letter, a digit or a space; replace each run of digits with
    the letter ``n``; optionally stem every word.

    NOTE(review): ``stopwords`` and ``SnowballStemmer`` are not imported in
    the visible part of this file, so the two optional paths presumably rely
    on those names being imported elsewhere -- confirm before enabling either
    flag.

    :param text: raw comment string.
    :param remove_stopwords: when true, filter out English stop words.
    :param stem_words: when true, stem every remaining word.
    :return: the cleaned text as one string (despite the name, not a list).
    """
    tokens = text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stops]

    cleaned = " ".join(tokens)
    cleaned = special_character_removal.sub('', cleaned)
    cleaned = replace_numbers.sub('n', cleaned)

    if not stem_words:
        return cleaned
    stemmer = SnowballStemmer('english')
    return " ".join(stemmer.stem(word) for word in cleaned.split())
    
# Show the first training comment before and after cleaning.
print('Raw:',data['tra'].loc[0,"comment_text"]) 
print('After:',text_to_wordlist(data['tra'].loc[0,"comment_text"])) 








Processing text dataset
Raw: Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
After: explanation why the edits made under my username hardcore metallica fan were reverted they werent vandalisms just closure on some gas after i voted at new york dolls fac and please dont remove the template from the talk page since im retired nown


**序列化**
- tokenizer
- pad_sequences

In [109]:
MAX_SEQUENCE_LENGTH = 150
MAX_NB_WORDS = 100000

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Raw comment strings; missing comments become the literal "NA".
list_sentences_train = data['tra']['comment_text'].fillna("NA").values
list_sentences_test = data['tes']["comment_text"].fillna("NA").values
# Label matrix with one column per entry of list_classes.
y = data['tra'][list_classes].values

# Clean every comment with the same preprocessing function.
comments = [text_to_wordlist(raw) for raw in list_sentences_train]
test_comments = [text_to_wordlist(raw) for raw in list_sentences_test]

# The vocabulary is fitted on the training and test comments together.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(comments + test_comments)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(comments)
test_sequences = tokenizer.texts_to_sequences(test_comments)
print('Found %s unique tokens' % len(word_index))

# Pad / truncate every sequence to MAX_SEQUENCE_LENGTH token ids.
train_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', y.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor:', test_data.shape)

            

Found 392183 unique tokens


AttributeError: 'dict' object has no attribute 'shape'

**word embedding**

In [101]:
EMBEDDING_FILE='model/glove.840B.300d.txt'
MAX_NB_WORDS = 100000
EMBEDDING_DIM = 300

# embeddings_index maps every token of the pre-trained GloVe file to its
# vector.  The file is read in binary mode, so the keys are ``bytes``.
embeddings_index = {}
# ``with`` closes the file even when a line fails to parse.
with open(EMBEDDING_FILE, "rb") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

########################################
## prepare embeddings
########################################
# embedding_matrix keeps only the highest-ranked words (at most MAX_NB_WORDS
# rows) and is filled with their GloVe vectors.
print('Preparing embedding matrix')
# Keras Tokenizer indices start at 1 (0 is reserved), so a vocabulary of N
# words needs N + 1 rows.  Without the "+ 1" a vocabulary smaller than
# MAX_NB_WORDS would index one row past the end of the matrix.
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    # Keys of embeddings_index are bytes, so encode the token before the lookup.
    embedding_vector = embeddings_index.get(word.encode('utf-8'))
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

    

Preparing embedding matrix


In [118]:
########################################
## sample train/validation data
########################################
# Randomly split the padded sequences and their labels into a training part
# and a validation part.

VALIDATION_SPLIT = 0.1

# Shuffle the row positions, then cut them once at the train/validation boundary.
perm = np.random.permutation(len(train_data))
idx_train, idx_val = np.split(perm, [int(len(train_data)*(1-VALIDATION_SPLIT))])

data_train, labels_train = train_data[idx_train], y[idx_train]
data_val, labels_val = train_data[idx_val], y[idx_val]

print(data_train.shape,labels_train.shape)
print(data_val.shape,labels_val.shape)

(143613, 150) (143613, 6)
(15958, 150) (15958, 6)


#### **Deep Learning**

In [119]:
# Frozen embedding layer initialised from the pre-trained matrix.  The sizes
# come from the same constants that built embedding_matrix and the padded
# inputs, instead of repeating the literals 300 and 150.
embedding_layer = Embedding(input_dim=nb_words, output_dim=EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

class Attention(Layer):
    """Keras layer that implements an attention mechanism for temporal data.

    Supports masking.
    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    Put it on top of an RNN layer (GRU/LSTM/SimpleRNN) created with
    ``return_sequences=True``.  The feature dimension is read from the input
    shape in ``build``; the number of time steps must be passed as
    ``step_dim``.

    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention(step_dim))
    """

    def __init__(self, step_dim, W_regularizer=None, b_regularizer=None, W_constraint=None, b_constraint=None, bias=True, **kwargs):
        """Store the layer configuration.

        :param step_dim: number of time steps of the input sequences.
        :param W_regularizer: regularizer for the attention weight vector.
        :param b_regularizer: regularizer for the per-step bias.
        :param W_constraint: constraint for the attention weight vector.
        :param b_constraint: constraint for the per-step bias.
        :param bias: whether to add a bias to the attention scores.
        :param kwargs: forwarded to the base ``Layer`` constructor.
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input shape is known.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weight vector (one value per feature) and optional bias
        (one value per time step) for a 3D ``input_shape``."""
        assert len(input_shape) == 3
        # Pass the shape by keyword rather than as the first positional
        # argument, so the call does not depend on Keras' legacy-argument
        # conversion.
        self.W = self.add_weight(shape=(input_shape[-1],), initializer=self.init, name='{}_W'.format(self.name), regularizer=self.W_regularizer, constraint=self.W_constraint)
        self.features_dim = input_shape[-1]
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],), initializer='zero', name='{}_b'.format(self.name), regularizer=self.b_regularizer, constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return ``None``: the mask is not passed on to the next layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over the time axis.

        :param x: input tensor, reshaped below as (samples, steps, features).
        :param mask: optional mask; masked steps get zero attention weight.
        """
        features_dim = self.features_dim
        step_dim = self.step_dim
        # A direct K.dot(x, self.W) is not supported by the TF backend, so the
        # input is flattened to (-1, features), multiplied by W as a column
        # vector, and reshaped back to one score per time step.
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)
        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        # Add a trailing axis so the weights broadcast over the feature axis.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Return ``(batch size, features_dim)``."""
        return input_shape[0], self.features_dim
    
    
# Hyper-parameters for the recurrent and dense parts of the network.
num_lstm = 300 
num_dense = 256 
rate_drop_lstm = 0.25 
rate_drop_dense = 0.25 
act = 'relu' 

# The LSTM returns its output for every time step so that Attention can pool
# over the time axis.
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm,return_sequences=True) 
# Input: one padded sequence of MAX_SEQUENCE_LENGTH integer token ids per comment.
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32') 
embedded_sequences= embedding_layer(comment_input) 
x = lstm_layer(embedded_sequences) 
x = Dropout(rate_drop_dense)(x) 
# Collapse the sequence of LSTM outputs into a single vector per comment.
merged = Attention(MAX_SEQUENCE_LENGTH)(x) 
merged = Dense(num_dense, activation=act)(merged) 
merged = Dropout(rate_drop_dense)(merged) 
merged = BatchNormalization()(merged) 
# Six independent sigmoid outputs, matching the six label columns in list_classes.
preds = Dense(6, activation='sigmoid')(merged)

     

**Training**


In [122]:
# Assemble the model and train it with per-label binary cross-entropy.
model = Model(inputs=[comment_input],  outputs=preds)
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()

# STAMP identifies this run by its dropout rates and doubles as the path
# prefix (inside the model/ directory) for the files it produces.
STAMP = 'model/simple_lstm_glove_vectors_%.2f_%.2f'%(rate_drop_lstm,rate_drop_dense)
print('STAMP',STAMP)
bst_model_path = STAMP + '.h5'
print('bst_model_path',bst_model_path)

# Stop after 5 epochs without val_loss improvement; keep only the best weights.
early_stopping =EarlyStopping(monitor='val_loss', patience=5)
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit(data_train, labels_train, validation_data=(data_val, labels_val), epochs=50, batch_size=256, shuffle=True, callbacks=[early_stopping, model_checkpoint])

# Restore the checkpointed weights before predicting.
model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])

y_test = model.predict([test_data], batch_size=1024, verbose=1)
data['sam'][list_classes] = y_test
# STAMP starts with the "model/" directory, so prefixing the score to the
# whole string pointed at a non-existent directory such as "0.0450_model/".
# Put the score in front of the file name only.
submission_path = os.path.join(os.path.dirname(STAMP), '%.4f_'%(bst_val_score) + os.path.basename(STAMP) + '.csv')
data['sam'].to_csv(submission_path, index=False)

    
    
    

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 300)          30000000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 150, 300)          721200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 150, 300)          0         
_________________________________________________________________
attention_1 (Attention)      (None, 300)               450       
_________________________________________________________________
dense_1 (Dense)              (None, 256)               77056     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
__________

KeyboardInterrupt: 

***