# Homework 2 - TF-IDF Classifier

Ваша цель — обучить классификатор, который будет находить «токсичные» комментарии, и опубликовать решение на Kaggle: [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge).

В процессе обучения нужно ответить на ***[вопросы](https://docs.google.com/forms/d/e/1FAIpQLSd9mQx8EFpSH6FhCy1M_FmISzy3lhgyyqV3TN0pmtop7slmTA/viewform?usp=sf_link)***

Данные можно скачать тут - https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data



In [None]:
import numpy as np
import pandas as pd

from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_union

In [None]:
# The six binary target columns. Their order fixes the column order of the
# probability matrices that the training cells write into the output frames.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Load the competition CSVs; every missing value (in any column) is replaced
# by the literal string 'Unknown'.
train = pd.read_csv('./input/train.csv').fillna('Unknown')
test = pd.read_csv('./input/test.csv').fillna('Unknown')
# Frames that start with only the `id` column; the training cells add one
# probability column per class before saving them as CSV.
submission = pd.DataFrame.from_dict({'id': test['id']})
train_submission = pd.DataFrame.from_dict({'id': train['id']})

In [None]:
import re, string
from nltk.corpus import stopwords
from keras.preprocessing.text import text_to_word_sequence

#re_tok = re.compile('([%s“”¨«»®´·º½¾¿¡§£₤‘’])' % string.punctuation)
# Ordered (pattern, replacement) rewrite rules applied by ``clean_text``.
# Order matters: specific contractions must run before the generic suffix
# rules that would otherwise consume part of them.
_CLEAN_TEXT_RULES = (
    (r"what's", "what is "),
    (r"'scuse", " excuse "),   # must precede "'s", which would eat its prefix
    (r"'s", " "),
    (r"'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
    (r"\W", " "),              # every non-word character becomes a space
    (r"\s+", " "),             # collapse whitespace runs
    (r"\bu\b", "you"),         # standalone "u", also at the string edges
    (r"fucksex", "fuck sex"),
)


def clean_text(text):
    """Normalise a raw comment into lower-case, space-separated words.

    Steps: lower-case the text, expand common English contractions, replace
    every non-word character with a space, collapse whitespace, expand the
    chat abbreviation "u" to "you", and strip surrounding spaces.

    Fixes over the previous version:
      * a standalone "u" is replaced via word boundaries, so neighbouring
        words are no longer glued together ("are u ok" -> "are you ok",
        previously "areyouok") and "u" at the start/end is handled too;
      * the "'scuse" rule now runs before the generic "'s" rule, which
        previously rewrote it first and made it unreachable;
      * all patterns are raw strings (no invalid escape sequences).

    :param text: raw comment string.
    :return: cleaned string.
    """
    text = text.lower()
    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = re.sub(pattern, replacement, text)
    return text.strip(' ')

# Lazily-built stop-word set shared by all ``cleanupDoc`` calls.
_STOPSET = None


def _get_stopset():
    """Return the stop-word set (NLTK English list plus 'wikipedia'), building it once."""
    global _STOPSET
    if _STOPSET is None:
        _STOPSET = frozenset(stopwords.words('english')) | {'wikipedia'}
    return _STOPSET


def cleanupDoc(s):
    """Clean a comment with ``clean_text`` and drop stop words.

    The stop-word set is now built on first use and cached; previously it
    was rebuilt from the NLTK corpus on every call, which is wasteful when
    the function is mapped over a whole dataset.

    :param s: raw comment string.
    :return: the remaining tokens joined by single spaces.
    """
    stopset = _get_stopset()
    tokens = text_to_word_sequence(clean_text(s),
                                   filters="\"!'#$%&()*+,-˚˙./:;‘“<=·>?@[]^_`{|}~\t\n",
                                   lower=True,
                                   split=" ")
    return " ".join(word for word in tokens if word not in stopset)

In [None]:
from keras.models import Model, Layer
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.optimizers import Adam, RMSprop
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, CuDNNLSTM, Add, Concatenate
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras import initializers, regularizers, constraints
import keras.backend as K
from keras.layers import Conv1D, GaussianNoise, MaxPooling1D, GlobalMaxPooling1D
from keras.regularizers import l2

In [None]:
# Path of the pre-trained word-vector text file that is parsed into
# `embeddings_index` (one token followed by its vector components per line).
# Presumably the fastText Common Crawl 300-d vectors, judging by the file
# name — confirm against the downloaded data.
EMBEDDING_FILE = './input/crawl-300d-2M.vec'

In [None]:
# Vocabulary cap: passed to the Tokenizer as `num_words` and used as the
# upper bound on embedding-matrix rows.
max_features = 300000
# Every comment is padded/truncated to this many token ids.
maxlen = 200
# Width of each embedding vector (number of columns of the embedding matrix).
embed_size = 300

In [None]:
# Grab the label matrix first: `train` is rebound to plain text right below.
targets_train = train[class_names].values

# From here on `train` / `test` are arrays of cleaned comment strings.
train, test = (
    frame["comment_text"].fillna("fillna").map(clean_text).values
    for frame in (train, test)
)

# Fit a single vocabulary over both corpora, then turn every comment into a
# fixed-length sequence of token ids.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts([*train, *test])
X_train, X_test = (tokenizer.texts_to_sequences(corpus) for corpus in (train, test))
x_train, x_test = (
    sequence.pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (X_train, X_test)
)

In [None]:
def get_coefs(word, *arr):
    """Pair an embedding token with its components as a float32 array.

    :param word: the token.
    :param arr: the vector components (anything numpy can cast to float32).
    :return: ``(word, vector)`` tuple.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding file into a {token: float32 vector} dict.
# The file is opened in a `with` block so the handle is closed
# deterministically (the previous one-liner never closed it).
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        fields = line.rstrip().rsplit(' ')
        # Skip lines that cannot be "token + vector": blank lines and the
        # "<count> <dim>" header that fastText .vec files start with
        # (assumed format — confirm if another embedding file is used).
        # Left in, the header becomes a 1-element vector that numpy would
        # silently broadcast across a whole embedding-matrix row.
        if len(fields) <= 2:
            continue
        token, vector = get_coefs(*fields)
        embeddings_index[token] = vector

In [None]:
from sklearn.metrics import roc_auc_score
# Build the (nb_words, embed_size) matrix whose row i holds the pre-trained
# vector of the token with Tokenizer index i; tokens without a pre-trained
# vector keep an all-zero row.
word_index = tokenizer.word_index
# Tokenizer indices start at 1 (row 0 is left for the padding id), so the
# matrix needs len(word_index) + 1 rows to hold every word. Without the +1
# the highest-index word overflowed the matrix whenever the vocabulary was
# smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set.

    :param validation_data: ``(X_val, y_val)`` pair; it is unpacked in the
        constructor, so a two-element value must be supplied.
    :param interval: evaluate on every epoch whose zero-based index is a
        multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Was `super(Callback, self).__init__()`, which starts the MRO lookup
        # *after* Callback and therefore skipped Callback.__init__ entirely.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation set and print the ROC-AUC.

        :param epoch: zero-based epoch index (printed one-based).
        :param logs: metrics dict passed by Keras; not used here. Defaults
            to None instead of a shared mutable ``{}``.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, batch_size=512, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))
            
            
class Attention(Layer):
    """Keras layer implementing an attention mechanism for temporal data.

    Supports masking.
    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    Put it on top of a sequence-producing layer (e.g. an RNN with
    return_sequences=True). The feature dimension is inferred in `build`;
    the number of steps must be passed explicitly as `step_dim`.

    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention(step_dim=maxlen))

    :param step_dim: number of time steps of the input (axis 1).
    :param W_regularizer: regularizer for the attention weight vector.
    :param b_regularizer: regularizer for the per-step bias.
    :param W_constraint: constraint for the attention weight vector.
    :param b_constraint: constraint for the per-step bias.
    :param bias: whether to add a learned per-step bias to the scores.
    :param kwargs: forwarded to the base `Layer` constructor.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0  # set in build() from the input shape
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create one weight per feature and (optionally) one bias per step."""
        assert len(input_shape) == 3

        # `shape` is passed by keyword: the positional form only works
        # through the legacy add_weight signature.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over the step axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score every step: flatten to (samples*steps, features), project
        # onto W, and fold back to (samples, steps). Written with reshapes
        # because K.dot(x, self.W) is not supported by the TF backend.
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum
        # may be almost zero, hence the epsilon
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so a saved model can be rebuilt.

        Without this, the required `step_dim` argument is missing from the
        serialized layer config.
        """
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

# CV

## GRU

In [None]:
def _bn_elu():
    def func(x):
        x = BatchNormalization()(x)
        x = Activation('elu')(x)
        return x
    return func    

def get_gru_v2(dropout=0., dropout_dense=0.):
    """Build and compile the two-layer bidirectional GRU classifier.

    Architecture: frozen pre-trained embedding -> Gaussian noise ->
    2 x (BiGRU(128) + BatchNorm + ELU) -> Attention over the `maxlen`
    steps -> Dense(128) + BatchNorm + ELU -> Dropout -> 6 sigmoid outputs.

    :param dropout: accepted for interface compatibility; currently not
        used by any layer.
    :param dropout_dense: dropout rate applied before the output layer.
    :return: compiled Keras `Model` (binary cross-entropy, Adam/AMSGrad).
    """
    inp = Input(shape=(maxlen, ))
    # Size the embedding table from the matrix itself rather than from
    # max_features: the matrix has fewer rows when the vocabulary is smaller
    # than max_features, and the layer's weights must match its shape.
    vocab_size, vector_size = embedding_matrix.shape
    x = Embedding(vocab_size, vector_size, weights=[embedding_matrix], trainable=False)(inp)
    x = GaussianNoise(stddev=0.15)(x)

    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = _bn_elu()(x)

    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = _bn_elu()(x)

    x = Attention(maxlen)(x)

    x = Dense(128)(x)
    x = _bn_elu()(x)
    x = Dropout(dropout_dense)(x)
    outp = Dense(6, use_bias=True, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=0.001, amsgrad=True),
                  metrics=['accuracy'])

    return model

In [None]:
from sklearn.model_selection import KFold
def prepare_data_cv(n_splits=5, random_state=0xCAFFE):
    """Split the global training arrays into shuffled K-fold partitions.

    Reads the module-level `x_train`, `targets_train` and `x_test`.

    :param n_splits: number of folds (default 5, the previous fixed value).
    :param random_state: seed for the shuffled split (default unchanged).
    :return: ``(kfold_data, X_test)`` where `kfold_data` is a list with one
        ``(X_train_cv, y_train_cv, X_val, y_val, val_indices)`` tuple per
        fold and `X_test` is the global `x_test`, returned as-is.
    """
    global targets_train, x_train

    # Kept from the original: the globals are rebound to ndarrays so later
    # cells can rely on `.shape`. `asarray` avoids copying data that is
    # already an array (the previous `np.array` always copied).
    targets_train = np.asarray(targets_train)
    x_train = np.asarray(x_train)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    kfold_data = [
        (x_train[train_indices], targets_train[train_indices],
         x_train[val_indices], targets_train[val_indices],
         val_indices)
        for train_indices, val_indices in kf.split(targets_train)
    ]

    return (kfold_data, x_test)

In [None]:
def get_model_callbacks(save_dir):
    """Create the training callbacks for one fold and its output folders.

    Creates ``<save_dir>/board`` and ``<save_dir>/model`` if missing and
    returns, in this order: early stopping on `val_loss`, learning-rate
    reduction on plateau, and a checkpoint that keeps only the best model
    at ``<save_dir>/model/model_weights.hdf5``.

    :param save_dir: directory for this fold's artifacts.
    :return: list ``[EarlyStopping, ReduceLROnPlateau, ModelCheckpoint]``.
    """
    stopping = EarlyStopping(monitor='val_loss',
                             min_delta=1e-3,
                             patience=5,
                             verbose=False,
                             mode='min')

    # The directory is created although no callback below writes to it;
    # kept so the on-disk layout stays unchanged.
    board_path = os.path.join(save_dir, 'board')
    os.makedirs(board_path, exist_ok=True)

    # NOTE(review): newer Keras releases rename `epsilon` to `min_delta`
    # for this callback — confirm against the installed version.
    lr_sheduler = ReduceLROnPlateau(monitor='val_loss',
                                    factor=0.1,
                                    patience=2,
                                    verbose=True,
                                    mode='min',
                                    epsilon=2e-3,
                                    min_lr=1e-5)

    model_path = os.path.join(save_dir, 'model/model_weights.hdf5')
    # exist_ok avoids the check-then-create race of the exists()/makedirs() pair.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    model_checkpoint = ModelCheckpoint(model_path,
                                       monitor='val_loss',
                                       verbose=False,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='min',
                                       period=1)

    callbacks = [stopping, lr_sheduler, model_checkpoint]
    return callbacks

In [None]:
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import os

In [None]:
# Build one GRU model only to print its layer summary; the cross-validation
# cell that follows creates a fresh model for every fold.
model = get_gru_v2(dropout_dense=0.)
model.summary()

In [None]:
#tf.reset_default_graph()

# K-fold training of the GRU model: fit one model per fold, collect
# out-of-fold predictions for the training set and fold-averaged predictions
# for the test set, then write both as CSV files tagged with STAMP.
STAMP = 'gru_107'
experiment_path = './experiments/%s' % STAMP
epochs = 15
batch_size = 256

(kfold_data, X_test) = prepare_data_cv()
# Average over the folds actually produced instead of a hard-coded 5, so
# the test predictions stay a true mean if the fold count changes.
n_folds = len(kfold_data)

train_probas = np.zeros(shape=(x_train.shape[0], len(class_names)))
test_probas = np.zeros(shape=(x_test.shape[0], len(class_names)))

models_roc = []
models_train_roc = []


for idx, data in enumerate(tqdm(kfold_data)):
    X_train, y_train, X_valid, y_valid, val_indices = data
    fold_dir = os.path.join(experiment_path, 'fold_%02d' % idx)

    model = get_gru_v2()
    callbacks = get_model_callbacks(save_dir=fold_dir)

    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(X_valid, y_valid),
              shuffle=True,
              callbacks=callbacks, verbose=1)

    # Restore the best checkpoint written by ModelCheckpoint for this fold.
    model.load_weights(filepath=os.path.join(fold_dir, 'model/model_weights.hdf5'))

    proba = model.predict(X_train, batch_size=batch_size*2)
    proba_val = model.predict(X_valid, batch_size=batch_size*2)
    proba_test = model.predict(x_test, batch_size=batch_size*2)

    models_roc.append(roc_auc_score(y_valid, proba_val))
    models_train_roc.append(roc_auc_score(y_train, proba))

    # Each training row is in exactly one validation fold, so this fills the
    # out-of-fold prediction matrix; test predictions are fold-averaged.
    train_probas[val_indices] += proba_val
    test_probas += proba_test / n_folds

    # Running statistics over the folds finished so far.
    for label, scores in (('Train', models_train_roc), ('Val', models_roc)):
        print('%s ROC AUC:\nMean: %f\nStd: %f\nMin: %f\nMax: %f\n\n' % (label,
                                                                        np.mean(scores),
                                                                        np.std(scores),
                                                                        np.min(scores),
                                                                        np.max(scores)))


for i, cls_name in enumerate(class_names):
    train_submission[cls_name] = train_probas[:, i]
train_submission.to_csv('train_%s.csv' % STAMP, index=False)

for i, cls_name in enumerate(class_names):
    submission[cls_name] = test_probas[:, i]
submission.to_csv('submission_%s.csv' % STAMP, index=False)

## LSTM

In [None]:
def _bn_elu():
    def func(x):
        x = BatchNormalization()(x)
        x = Activation('elu')(x)
        return x
    return func


def get_lstm_v2(dropout=0., dropout_dense=0.):
    """Build and compile the two-layer bidirectional LSTM classifier.

    Architecture: frozen pre-trained embedding -> Gaussian noise ->
    2 x (BiLSTM(128) + BatchNorm + ELU) -> Attention over the `maxlen`
    steps -> Dense(128) + BatchNorm + ELU -> Dropout -> 6 sigmoid outputs.

    :param dropout: accepted for interface compatibility; currently not
        used by any layer.
    :param dropout_dense: dropout rate applied before the output layer.
    :return: compiled Keras `Model` (binary cross-entropy, Adam/AMSGrad).
    """
    inp = Input(shape=(maxlen, ))
    # Size the embedding table from the matrix itself rather than from
    # max_features: the matrix has fewer rows when the vocabulary is smaller
    # than max_features, and the layer's weights must match its shape.
    vocab_size, vector_size = embedding_matrix.shape
    x = Embedding(vocab_size, vector_size, weights=[embedding_matrix], trainable=False)(inp)
    x = GaussianNoise(stddev=0.15)(x)

    x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
    x = _bn_elu()(x)

    x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
    x = _bn_elu()(x)

    x = Attention(maxlen)(x)

    x = Dense(128)(x)
    x = _bn_elu()(x)
    x = Dropout(dropout_dense)(x)
    outp = Dense(6, use_bias=True, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=0.001, amsgrad=True),
                  metrics=['accuracy'])

    return model

In [None]:
# Build one LSTM model only to print its layer summary; the cross-validation
# cell that follows creates a fresh model for every fold.
model = get_lstm_v2(dropout_dense=0.)
model.summary()

In [None]:
#tf.reset_default_graph()

# K-fold training of the LSTM model: fit one model per fold, collect
# out-of-fold predictions for the training set and fold-averaged predictions
# for the test set, then write both as CSV files tagged with STAMP.
STAMP = 'lstm_100'
experiment_path = './experiments/%s' % STAMP
epochs = 15
batch_size = 256

(kfold_data, X_test) = prepare_data_cv()
# Average over the folds actually produced instead of a hard-coded 5, so
# the test predictions stay a true mean if the fold count changes.
n_folds = len(kfold_data)

train_probas = np.zeros(shape=(x_train.shape[0], len(class_names)))
test_probas = np.zeros(shape=(x_test.shape[0], len(class_names)))

models_roc = []
models_train_roc = []


for idx, data in enumerate(tqdm(kfold_data)):
    X_train, y_train, X_valid, y_valid, val_indices = data
    fold_dir = os.path.join(experiment_path, 'fold_%02d' % idx)

    model = get_lstm_v2()
    callbacks = get_model_callbacks(save_dir=fold_dir)

    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(X_valid, y_valid),
              shuffle=True,
              callbacks=callbacks, verbose=1)

    # Restore the best checkpoint written by ModelCheckpoint for this fold.
    model.load_weights(filepath=os.path.join(fold_dir, 'model/model_weights.hdf5'))

    proba = model.predict(X_train, batch_size=batch_size*2)
    proba_val = model.predict(X_valid, batch_size=batch_size*2)
    proba_test = model.predict(x_test, batch_size=batch_size*2)

    models_roc.append(roc_auc_score(y_valid, proba_val))
    models_train_roc.append(roc_auc_score(y_train, proba))

    # Each training row is in exactly one validation fold, so this fills the
    # out-of-fold prediction matrix; test predictions are fold-averaged.
    train_probas[val_indices] += proba_val
    test_probas += proba_test / n_folds

    # Running statistics over the folds finished so far.
    for label, scores in (('Train', models_train_roc), ('Val', models_roc)):
        print('%s ROC AUC:\nMean: %f\nStd: %f\nMin: %f\nMax: %f\n\n' % (label,
                                                                        np.mean(scores),
                                                                        np.std(scores),
                                                                        np.min(scores),
                                                                        np.max(scores)))


for i, cls_name in enumerate(class_names):
    train_submission[cls_name] = train_probas[:, i]
train_submission.to_csv('train_%s.csv' % STAMP, index=False)

for i, cls_name in enumerate(class_names):
    submission[cls_name] = test_probas[:, i]
submission.to_csv('submission_%s.csv' % STAMP, index=False)

## TextCNN

In [None]:
def _bn_elu():
    def func(x):
        x = BatchNormalization()(x)
        x = Activation('elu')(x)
        return x
    return func


def get_text_cnn(dropout=0., dropout_dense=0., weight_decay=0.):
    """Build and compile the convolutional text classifier.

    Architecture: frozen pre-trained embedding -> Conv1D(256, 7) +
    BatchNorm + ELU -> MaxPooling1D(2) -> Conv1D(256, 7) + BatchNorm + ELU
    -> Attention over the `maxlen // 2` pooled steps -> Dense(128, L2) +
    BatchNorm + ELU -> Dropout -> 6 sigmoid outputs.

    :param dropout: accepted for interface compatibility; currently not
        used by any layer.
    :param dropout_dense: dropout rate applied before the output layer.
    :param weight_decay: L2 factor for the kernel of the hidden Dense layer.
    :return: compiled Keras `Model` (binary cross-entropy, Adam/AMSGrad).
    """
    inp = Input(shape=(maxlen, ))
    # Size the embedding table from the matrix itself rather than from
    # max_features: the matrix has fewer rows when the vocabulary is smaller
    # than max_features, and the layer's weights must match its shape.
    vocab_size, vector_size = embedding_matrix.shape
    x = Embedding(vocab_size, vector_size, weights=[embedding_matrix], trainable=False)(inp)

    x = Conv1D(filters=256, kernel_size=7, padding='same')(x)
    x = _bn_elu()(x)
    x = MaxPooling1D(2)(x)

    x = Conv1D(filters=256, kernel_size=7, padding='same')(x)
    x = _bn_elu()(x)
    # The pooling layer above halves the number of steps.
    x = Attention(maxlen // 2)(x)

    x = Dense(128, kernel_regularizer=l2(weight_decay))(x)
    x = _bn_elu()(x)
    x = Dropout(dropout_dense)(x)
    outp = Dense(6, use_bias=True, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=0.001, amsgrad=True),
                  metrics=['accuracy'])

    return model

In [None]:
# Build one TextCNN model only to print its layer summary; the
# cross-validation cell that follows creates a fresh model for every fold
# (with non-zero dropout and weight decay).
model = get_text_cnn(dropout_dense=0., weight_decay=0.)
model.summary()

In [None]:
# K-fold training of the TextCNN model: fit one model per fold, collect
# out-of-fold predictions for the training set and fold-averaged predictions
# for the test set, then write both as CSV files tagged with STAMP.
STAMP = 'textcnn_100'
experiment_path = './experiments/%s' % STAMP
epochs = 15
batch_size = 256

(kfold_data, X_test) = prepare_data_cv()
# Average over the folds actually produced instead of a hard-coded 5, so
# the test predictions stay a true mean if the fold count changes.
n_folds = len(kfold_data)

train_probas = np.zeros(shape=(x_train.shape[0], len(class_names)))
test_probas = np.zeros(shape=(x_test.shape[0], len(class_names)))

models_roc = []
models_train_roc = []


for idx, data in enumerate(tqdm(kfold_data)):
    X_train, y_train, X_valid, y_valid, val_indices = data
    fold_dir = os.path.join(experiment_path, 'fold_%02d' % idx)

    model = get_text_cnn(dropout_dense=0.3, weight_decay=1e-4)
    callbacks = get_model_callbacks(save_dir=fold_dir)

    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(X_valid, y_valid),
              shuffle=True,
              callbacks=callbacks, verbose=1)

    # Restore the best checkpoint written by ModelCheckpoint for this fold.
    model.load_weights(filepath=os.path.join(fold_dir, 'model/model_weights.hdf5'))

    proba = model.predict(X_train, batch_size=batch_size*2)
    proba_val = model.predict(X_valid, batch_size=batch_size*2)
    proba_test = model.predict(x_test, batch_size=batch_size*2)

    models_roc.append(roc_auc_score(y_valid, proba_val))
    models_train_roc.append(roc_auc_score(y_train, proba))

    # Each training row is in exactly one validation fold, so this fills the
    # out-of-fold prediction matrix; test predictions are fold-averaged.
    train_probas[val_indices] += proba_val
    test_probas += proba_test / n_folds

    # Running statistics over the folds finished so far.
    for label, scores in (('Train', models_train_roc), ('Val', models_roc)):
        print('%s ROC AUC:\nMean: %f\nStd: %f\nMin: %f\nMax: %f\n\n' % (label,
                                                                        np.mean(scores),
                                                                        np.std(scores),
                                                                        np.min(scores),
                                                                        np.max(scores)))


for i, cls_name in enumerate(class_names):
    train_submission[cls_name] = train_probas[:, i]
train_submission.to_csv('train_%s.csv' % STAMP, index=False)

for i, cls_name in enumerate(class_names):
    submission[cls_name] = test_probas[:, i]
submission.to_csv('submission_%s.csv' % STAMP, index=False)