# Homework 2 - TF-IDF Classifier

Ваша цель — обучить классификатор, который будет находить "токсичные" комментарии, и опубликовать решения на Kaggle: [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)

В процессе обучения нужно ответить на ***[вопросы](https://docs.google.com/forms/d/e/1FAIpQLSd9mQx8EFpSH6FhCy1M_FmISzy3lhgyyqV3TN0pmtop7slmTA/viewform?usp=sf_link)***

Данные можно скачать тут - https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data



In [None]:
import numpy as np
import pandas as pd

from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_union

In [None]:
# Label columns of the training CSV; the same names are later used as the
# column headers of the prediction CSVs.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Load both CSVs, replacing every missing value with the string 'Unknown'.
train = pd.read_csv('./input/train.csv').fillna('Unknown')
test = pd.read_csv('./input/test.csv').fillna('Unknown')
# Frames that start with only the `id` column; one probability column per
# class is added to them after prediction.
submission = pd.DataFrame.from_dict({'id': test['id']})
train_submission = pd.DataFrame.from_dict({'id': train['id']})

In [None]:
import re, string
# Captures any single punctuation character: ASCII punctuation plus a few
# typographic symbols. The ASCII set is passed through re.escape so that the
# backslash and `]` it contains are treated as literals inside the character
# class; unescaped, the sequence `\]` is read as an escaped `]` and the
# backslash itself is never matched.
re_tok = re.compile('([%s“”¨«»®´·º½¾¿¡§£₤‘’])' % re.escape(string.punctuation))
# Ordered (pattern, replacement) pairs applied by clean_text. Order matters:
# specific contractions ("what's", "'scuse", "can't") must come before the
# generic suffix rules ("'s", "n't") that would otherwise consume part of them.
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"\'scuse", " excuse "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
)


def clean_text(text):
    """Normalize a comment into lowercase, space-separated word tokens.

    Steps, in order: lowercase the text, expand a fixed set of English
    contractions, replace every non-word character with a space, collapse
    whitespace runs into a single space, and strip surrounding spaces.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    # Raw strings: '\W' and '\s' are not valid escapes in a plain literal.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')

In [None]:
from keras.models import Model, Layer
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.optimizers import Adam, RMSprop
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, CuDNNLSTM
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras import initializers, regularizers, constraints
import keras.backend as K
from keras.layers import Conv1D, GaussianNoise

In [None]:
# Path of the pre-trained word-vector text file that is read line by line to
# build `embeddings_index` (one token followed by space-separated floats per
# line). Presumably the fastText Common Crawl 300-d vectors -- confirm.
EMBEDDING_FILE = './input/crawl-300d-2M.vec'

In [None]:
# Vocabulary cap: passed to the tokenizer as `num_words` and used as the
# input dimension of the Embedding layers.
max_features = 300000
# Length every tokenised comment is padded/truncated to; also the model input length.
maxlen = 200
# Dimensionality of each word vector (number of columns of the embedding matrix).
embed_size = 300

In [None]:
# Extract the six binary label columns as a NumPy array before `train` is rebound below.
targets_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
# From here on `train` / `test` are arrays of cleaned comment strings, no
# longer DataFrames.
# NOTE(review): the frames were already loaded with .fillna('Unknown'), so
# this second fillna looks redundant -- confirm before removing.
train = train["comment_text"].fillna("fillna").map(clean_text).values
test = test["comment_text"].fillna("fillna").map(clean_text).values

# The vocabulary is fitted on train and test text together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train) + list(test))
# Integer sequences per comment. NOTE: the names X_train / X_test are rebound
# later by train_test_split; the padded arrays x_train / x_test are the ones
# fed to the models.
X_train = tokenizer.texts_to_sequences(train)
X_test = tokenizer.texts_to_sequences(test)
# Pad/truncate every sequence to exactly `maxlen` entries.
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [None]:
def get_coefs(word, *arr):
    """Pair a token with its embedding values converted to a float32 array.

    Args:
        word: The token (first field of an embedding-file line).
        *arr: The remaining fields, each convertible to float.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
def _load_embeddings(path, dim):
    """Read a text embedding file into a ``{token: float32 vector}`` dict.

    Each line is split on single spaces; the first field is the token and the
    rest are the vector components. Lines whose vector does not have exactly
    ``dim`` components are skipped. This drops blank lines and a leading
    "count dim" header line (such files typically start with one -- confirm
    for the file in use), whose one-element "vector" would otherwise be
    broadcast across a whole embedding row.

    Args:
        path: Path of the UTF-8 encoded embedding file.
        dim: Expected number of components per vector.

    Returns:
        Dict mapping each token to its 1-D float32 array of length ``dim``.
    """
    index = {}
    # Context manager so the (large) file handle is closed deterministically.
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            word, vector = get_coefs(*line.rstrip().rsplit(' '))
            if vector.shape[0] == dim:
                index[word] = vector
    return index


embeddings_index = _load_embeddings(EMBEDDING_FILE, embed_size)

In [None]:
from sklearn.metrics import roc_auc_score
# Build the weight matrix for the (frozen) Embedding layers: row i holds the
# pre-trained vector of the token whose tokenizer index is i; tokens without a
# pre-trained vector keep an all-zero row.
word_index = tokenizer.word_index
# Number of rows that can actually be populated. The "+ 1" accounts for
# tokenizer indices starting at 1 (index 0 is presumably reserved for padding
# -- see the Keras Tokenizer docs); without it a vocabulary smaller than
# `max_features` would index one row past the end of the matrix.
nb_words = min(max_features, len(word_index) + 1)
# Always allocate `max_features` rows so the matrix matches the
# Embedding(max_features, embed_size, weights=[embedding_matrix]) layers
# regardless of the vocabulary size.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


class RocAucEvaluation(Callback):
    """Callback that prints the ROC-AUC of the model on a fixed data set.

    After every ``interval``-th epoch (epochs are counted from 0, so the
    first epoch is always scored) the model predicts on ``X_val`` and the
    score against ``y_val`` is printed.

    Args:
        validation_data: ``(X, y)`` pair to score on. It is unpacked into
            exactly two values, so the empty default raises ``ValueError``.
        interval: Score every ``interval`` epochs.
    """

    def __init__(self, validation_data=(), interval=1):
        # super() must name this class: super(Callback, self) would skip
        # Callback.__init__ and jump straight to its parent.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print ROC-AUC if this epoch falls on the interval.

        ``logs`` is accepted for the callback interface and not used; it
        defaults to None rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, batch_size=512, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))
            
            
class Attention(Layer):
    """Attention-weighted sum over the step axis of a 3D tensor.

    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756].
    For an input ``x`` the layer computes a score per step,
    ``e = tanh(x . W + b)``, turns the scores into weights
    ``a = exp(e) / (sum_over_steps(exp(e)) + epsilon)`` (weights of masked
    steps are zeroed before normalisation) and returns ``sum_over_steps(a * x)``.

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    Args:
        step_dim: Number of steps of the input; must equal the size of the
            input's second axis because the scores are reshaped to
            ``(-1, step_dim)``.
        W_regularizer, b_regularizer: Optional regularizers for the weight
            vector and the bias.
        W_constraint, b_constraint: Optional constraints for the weight
            vector and the bias.
        bias: Whether to add a per-step bias to the scores.
        **kwargs: Forwarded to the base ``Layer``.

    Put it on top of an RNN layer (GRU/LSTM/SimpleRNN) with
    return_sequences=True. The feature dimension is inferred from the input
    shape in ``build``.
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention(step_dim))
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0  # filled in by build()
        super(Attention, self).__init__(**kwargs)
        # NOTE(review): set after the base initializer in case it resets the
        # flag to its default. compute_mask is overridden below, so the mask
        # is consumed here and never propagated either way.
        self.supports_masking = True

    def build(self, input_shape):
        """Create the weight vector (one entry per feature) and optional bias (one per step)."""
        assert len(input_shape) == 3

        # Keyword arguments throughout: passing the shape positionally relies
        # on a legacy calling convention of add_weight.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over its step axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Per-step scores via an explicit 2D matmul: flatten to
        # (samples * steps, features), multiply by W as a column vector, then
        # restore (samples, steps). (The original author notes a direct
        # K.dot(x, self.W) was not supported by the TF backend.)
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # (samples, steps) -> (samples, steps, 1) so the weights broadcast over features.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized with a model."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [None]:
def get_gru(dropout=0., dropout_dense=0.):
    """Build and compile the Conv1D + bidirectional GRU + attention classifier.

    Pipeline: frozen pre-trained embedding -> Gaussian noise -> dilated
    Conv1D (ELU) -> bidirectional CuDNNGRU returning sequences -> Attention
    pooling over the steps -> Dense/BatchNorm/ELU/Dropout -> six sigmoid
    outputs. Compiled with binary cross-entropy and Adam (amsgrad).

    Args:
        dropout: Currently unused; kept so existing callers keep working.
        dropout_dense: Dropout rate applied before the output layer.

    Returns:
        The compiled Keras ``Model``.
    """
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
    x = GaussianNoise(stddev=0.3)(x)
    x = Conv1D(filters=1024,
               # Derived from the module constants instead of a hard-coded
               # (200, 300) so it cannot go stale when they change.
               input_shape=(maxlen, embed_size),
               kernel_size=5,
               dilation_rate=2,
               use_bias=True,
               padding='same')(x)
    x = Activation('elu')(x)
    x = Bidirectional(CuDNNGRU(1024, return_sequences=True))(x)
    x = Attention(maxlen)(x)
    x = Dense(1024)(x)
    x = BatchNormalization()(x)
    x = Activation('elu')(x)
    x = Dropout(dropout_dense)(x)
    outp = Dense(6, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=0.001, amsgrad=True),
                  metrics=['accuracy'])

    return model

In [None]:
def get_lstm(dropout=0., dropout_dense=0.):
    """Build and compile the Conv1D + bidirectional LSTM + attention classifier.

    Pipeline: frozen pre-trained embedding -> Gaussian noise -> dilated
    Conv1D (ELU) -> bidirectional CuDNNLSTM returning sequences -> Attention
    pooling over the steps -> Dense/BatchNorm/ELU/Dropout -> six sigmoid
    outputs. Compiled with binary cross-entropy and Adam (amsgrad).

    Args:
        dropout: Currently unused; kept so existing callers keep working.
        dropout_dense: Dropout rate applied before the output layer.

    Returns:
        The compiled Keras ``Model``.
    """
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
    x = GaussianNoise(stddev=0.1)(x)
    x = Conv1D(filters=512,
               # Derived from the module constants instead of a hard-coded
               # (200, 300) so it cannot go stale when they change.
               input_shape=(maxlen, embed_size),
               kernel_size=5,
               dilation_rate=2,
               use_bias=True,
               padding='same')(x)
    x = Activation('elu')(x)
    x = Bidirectional(CuDNNLSTM(512, return_sequences=True))(x)
    x = Attention(maxlen)(x)
    x = Dense(256)(x)
    x = BatchNormalization()(x)
    x = Activation('elu')(x)
    x = Dropout(dropout_dense)(x)
    outp = Dense(6, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=0.001, amsgrad=True),
                  metrics=['accuracy'])

    return model

In [None]:
from keras.layers import Conv2D, AveragePooling2D, MaxPool2D, Reshape, Concatenate, Flatten, Lambda

def _conv(filters, kernel):
    """Return a callable that applies Conv2D -> BatchNormalization -> ELU.

    Args:
        filters: Number of convolution filters.
        kernel: Kernel height; the kernel width is always ``embed_size``.

    Returns:
        A one-argument function mapping an input tensor to the activated
        feature map. New layer instances are created on every call.
    """
    def _apply(tensor):
        convolved = Conv2D(filters,
                           kernel_size=(kernel, embed_size),
                           dilation_rate=(2, 1),
                           use_bias=True,
                           padding='same')(tensor)
        normalized = BatchNormalization()(convolved)
        return Activation('elu')(normalized)

    return _apply

def get_textcnn(dropout_dense=0., dropout=None):
    """Build and compile the three-branch text CNN with attention pooling.

    The frozen embedding output is reshaped to carry a trailing channel axis
    and fed to three parallel ``_conv`` branches (16 filters each, kernel
    heights 3, 5 and 7). Each branch is average-pooled across the embedding
    axis, has that axis squeezed away, and is reduced over the steps by an
    ``Attention`` layer. The three results are concatenated, passed through
    dropout and mapped to six sigmoid outputs.

    Args:
        dropout_dense: Dropout rate applied before the output layer.
        dropout: Not used by this function.

    Returns:
        The compiled Keras ``Model``.
    """
    tokens = Input(shape=(maxlen,), dtype='int32')
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(tokens)
    # Trailing axis of size 1 acts as the channel axis for Conv2D.
    as_image = Reshape((maxlen, embed_size, 1))(embedded)

    # Layers are created stage by stage (all convolutions, then all pools,
    # ...) so the creation order matches the hand-unrolled version.
    conv_maps = [_conv(16, height)(as_image) for height in (3, 5, 7)]
    pooled = [AveragePooling2D(pool_size=(1, embed_size), padding='same')(fmap)
              for fmap in conv_maps]
    squeezed = [Lambda(lambda x : K.squeeze(x, axis=2))(pool) for pool in pooled]
    attended = [Attention(maxlen)(branch) for branch in squeezed]

    merged = Concatenate(axis=1)(attended)
    regularized = Dropout(dropout_dense)(merged)
    probabilities = Dense(6, activation='sigmoid')(regularized)

    # Tie input and output tensors together into a trainable model.
    model = Model(inputs=tokens, outputs=probabilities)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=0.001, amsgrad=True),
                  metrics=['accuracy'])

    return model

In [None]:
# Mini-batch size passed to classifier.fit.
batch_size = 256
# Upper bound on the number of training epochs passed to classifier.fit; the
# EarlyStopping callback used there may end training sooner.
epochs = 100

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Build the GRU model with both dropout arguments at zero and print its layer summary.
classifier = get_gru(dropout=0., dropout_dense=0.)
classifier.summary()

In [None]:
import os

# Run identifier used in the checkpoint file name and both CSV file names.
STAMP = 'gru_035'

# Hold out 20% of the padded training sequences for validation.
# NOTE: after this split X_test / y_test are the *validation* part of the
# training data; the real test set remains in `x_test` (lower-case x).
curr_train_rargets = np.array(targets_train)
X_train, X_test, y_train, y_test = train_test_split(x_train,
                                                    curr_train_rargets,
                                                    test_size=0.2,
                                                    random_state=0xCAFFE)


# Stop once val_loss has not improved by at least 1e-3 for 5 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, min_delta=1e-3)
# Multiply the learning rate by 0.3 when val_loss plateaus for 2 epochs,
# never going below 1e-5.
sheduler = lr_sheduler = ReduceLROnPlateau(monitor='val_loss',
                                            factor=0.3,
                                            patience=2,
                                            verbose=True,
                                            mode='min',
                                            epsilon=1e-2,
                                            min_lr=1e-5)
bst_model_path = './models/' + STAMP + '.h5'
# Create the checkpoint directory up front: a missing directory would
# otherwise only surface when the first checkpoint is written, i.e. after a
# full epoch of training.
os.makedirs(os.path.dirname(bst_model_path), exist_ok=True)
# Keep only the weights of the best epoch seen so far.
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

# ROC-AUC is printed after every epoch for both the validation split and the
# (much larger) training split.
RocAuc = RocAucEvaluation(validation_data=(X_test, y_test), interval=1)
RocAucTrain = RocAucEvaluation(validation_data=(X_train, y_train), interval=1)

hist = classifier.fit(X_train, y_train,
                      batch_size=batch_size,
                      epochs=epochs,
                      validation_data=(X_test, y_test),
                      shuffle=True,
                      callbacks=[sheduler, early_stopping, model_checkpoint, RocAucTrain, RocAuc], verbose=1)

# Restore the best checkpoint before predicting.
classifier.load_weights(bst_model_path)

# Predictions on the full training set (train + validation rows), one column per class.
probas = classifier.predict([x_train], batch_size=32, verbose=1)
for i, cls_name in enumerate(class_names):
    train_submission[cls_name] = probas[:, i]
train_submission.to_csv('train_%s.csv' % STAMP, index=False)

# Predictions on the real test set, written as the submission file.
probas = classifier.predict([x_test], batch_size=32, verbose=1)
for i, cls_name in enumerate(class_names):
    submission[cls_name] = probas[:, i]

submission.to_csv('submission_%s.csv' % STAMP, index=False)