# Packages

In [50]:
import pandas as pd
import numpy as np
import os, re, csv, codecs, operator, sys
from collections import defaultdict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from textblob import TextBlob    # For pos-tagging

from keras import optimizers, initializers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Lambda, Embedding, Dropout, Activation, SpatialDropout1D, Reshape, \
GlobalAveragePooling1D, merge, Flatten, Bidirectional, CuDNNGRU, add, Conv1D, GlobalMaxPooling1D
from keras.layers.merge import concatenate
from keras.models import Model, Sequential
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.engine import InputSpec, Layer
from keras import backend as K

# Parameters

In [57]:
PATH = 'datasets/'
FAST_TEXT_EMBEDDING = 'pretrain_embedding/crawl-300d-2M.vec'
CLEAN_WORD_PATH = None
TRAIN_DATA_FILE = 'train.csv'
TEST_DATA_FILE = 'test.csv'
MAX_SEQUENCE_LENGTH = 350
MAX_NB_WORDS = 100000
EMBEDDING_DIM = 300
FOLD_COUNT = 10
BATCH_SIZE = 200

# Load Pretrain Models

In [3]:
def load_pretrain_embedding(file):
    print('Indexing word vectors')
    embeddings_index = {}
    f = open(file, 'r', encoding='utf-8')
    for line in f:
        values = line.split()
        try:
            word = values[0]
            coefs = np.array(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        except:
            print("Error on: ", values[:3])
    f.close()
    print("Total %s word vectors" % len(embeddings_index))
    return embeddings_index

In [4]:
embeddings_index = load_pretrain_embedding(FAST_TEXT_EMBEDDING)

Indexing word vectors
Total 2000000 word vectors


# Data Overview

In [5]:
train_df = pd.read_csv('datasets/train.csv')
test_df = pd.read_csv('datasets/test.csv')

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [7]:
train_df.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
train_df.corr()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
toxic,1.0,0.308619,0.676515,0.157058,0.647518,0.266009
severe_toxic,0.308619,1.0,0.403014,0.123601,0.375807,0.2016
obscene,0.676515,0.403014,1.0,0.141179,0.741272,0.286867
threat,0.157058,0.123601,0.141179,1.0,0.150022,0.115128
insult,0.647518,0.375807,0.741272,0.150022,1.0,0.337736
identity_hate,0.266009,0.2016,0.286867,0.115128,0.337736,1.0


In [9]:
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [10]:
train_comments = train_df['comment_text'].values
test_comments = test_df['comment_text'].values
train_comments_lengths = [len(s) for s in train_comments]
test_comments_lengths = [len(s) for s in test_comments]

In [11]:
def explore_comments(arr):
    print("MAX LENGTH:\t\t", np.max(arr))
    print("AVG LENGTH:\t\t", np.average(arr))
    print("MIN LENGTH:\t\t", np.min(arr))
    print("STANDARD DIVISION:\t", np.std(arr))
    print("RANGE:\t\t\t", np.min(arr), " to ", np.average(arr) + 2 * np.std(arr))
    
print("------Train------")
explore_comments(train_comments_lengths)

print("------Test------")
explore_comments(test_comments_lengths)

------Train------
MAX LENGTH:		 5000
AVG LENGTH:		 394.0732213246768
MIN LENGTH:		 6
STANDARD DIVISION:	 590.7184309382144
RANGE:			 6  to  1575.5100832011055
------Test------
MAX LENGTH:		 5000
AVG LENGTH:		 364.8751207855632
MIN LENGTH:		 1
STANDARD DIVISION:	 592.4901645516661
RANGE:			 1  to  1549.8554498888955


# Data Cleaning

## Load Cleaned Words

In [12]:
if CLEAN_WORD_PATH == None:
    ignored_words = set(stopwords.words('english'))
else:
    ignored_words = {}
    with open(CLEAN_WORD_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip('\n')
            typo, correct = line.split(',')
            ignored_words[typo] = correct

In [13]:
# Regex to remove all Non-Alpha Numeric and space
special_character_removal = re.compile(r'[^?!.,:a-z\d ]', re.IGNORECASE)
# Regex to remove all numerics
replace_numbers = re.compile(r'd+', re.IGNORECASE)

word_count_dict = defaultdict(int)

def clean_datasets(text, remove_stopwords=False, stem_words=False, count_null_words=True, clean_wiki_tokens=True):
    text = text.lower()
    text = re.sub(r"”", "\"", text)
    text = re.sub(r"“", "\"", text)
    text = replace_numbers.sub('', text)
    
    if count_null_words:
        text = text.split()
        for t in text:
            word_count_dict[t] += 1
        text = " ".join(text)
        
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
        
    return text

In [14]:
list_sentences_train = train_df['comment_text'].fillna('no comment').values
list_sentences_test = test_df['comment_text'].fillna('no comment').values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_labels = train_df[list_classes].values

In [15]:
cleaned_train_comments, cleaned_test_comments = [], []
print('Processing data cleaning...')

for text in list_sentences_train:
    cleaned_train_comments.append(clean_datasets(text))
for text in list_sentences_test:
    cleaned_test_comments.append(clean_datasets(text))

Processing data cleaning...


In [16]:
train_df['comment_text_cleaned'] = cleaned_train_comments
test_df['comment_text_cleaned'] = cleaned_test_comments

# Data Preprocessing

In [17]:
all_comment_text = pd.concat([train_df['comment_text_cleaned'], test_df['comment_text_cleaned']], axis=0).fillna("unknown")
nrow_train = train_df.shape[0]
all_comment_text.shape[0]

312735

In [18]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=50000)
data = vectorizer.fit_transform(all_comment_text)
print(data.shape)

(312735, 50000)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [19]:
norm_data = MaxAbsScaler().fit_transform(data)
print(norm_data.shape)

(312735, 50000)


## Build Vocabulary & Tokenizer

In [20]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='"#$%&()+,-./:;<=>@[\\]^_`{|}~\t\n')

In [21]:
print('Automatically train vocab & tokenizer...')
tokenizer.fit_on_texts(cleaned_train_comments + cleaned_test_comments)

train_sequences = tokenizer.texts_to_sequences(cleaned_train_comments)
test_sequences = tokenizer.texts_to_sequences(cleaned_test_comments)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of train_data tensor: ', train_data.shape)
print('Shape of train_label tensor: ', train_labels.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor: ', test_data.shape)

Automatically train vocab & tokenizer...
Found 420473 unique tokens
Shape of train_data tensor:  (159571, 350)
Shape of train_label tensor:  (159571, 6)
Shape of test_data tensor:  (153164, 350)


## Pos Feature Extraction

In [22]:
def sent2pos(sentence):
    try:
        tag = TextBlob(sentence).tags
    except:
        print(sentence)
        
    updated_sentence = ' '.join([i[0] for i in tag])
    tagged = ' '.join([i[1] for i in tag])
    return updated_sentence, tagged

In [23]:
inverse_word_index = {v: k for k, v in word_index.items()}

In [25]:
Pos_comments = []
Pos_updated_sentence = []
for text in train_sequences:
    text_ = ' '.join([inverse_word_index[word] for word in text])    # convert to word format
    if not isinstance(text_, str):
        print(text, '\n', text_)
    updated_sentence, tags = sent2pos(text_)
    Pos_updated_sentence.append(updated_sentence)
    Pos_comments.append(tags)
    assert len(updated_sentence.split(' ')) == len(tags.split(' ')), "T1 {} T2 {}".format(len(text), len(tags.split()))
    
Pos_test_comments = []
Pos_test_updated_sentence = []
for text in test_sequences:
    text_ = ' '.join([inverse_word_index[word] for word in text])
    updated_test_sentence, test_tags = sent2pos(text_)
    Pos_test_updated_sentence.append(updated_test_sentence)
    Pos_test_comments.append(test_tags)
    assert len(updated_test_sentence.split(' ')) == len(test_tags.split(' ')), "T1 {} T2 {}".format(len(text), len(test_tags.split()))

In [26]:
pos_tokenizer = Tokenizer(num_words=50, filters='"#$%&()+,-./:;<=>@[\\]^_`{|}~\t\n')

In [29]:
print('Automatically train pos tokenizer...')
pos_tokenizer.fit_on_texts(Pos_comments + Pos_test_comments)

train_pos_sequences = pos_tokenizer.texts_to_sequences(Pos_comments)
test_pos_sequences = pos_tokenizer.texts_to_sequences(Pos_test_comments)

pos_word_index = pos_tokenizer.word_index
print('Found %s unique tokens' % len(pos_word_index))

pos_train_data = pad_sequences(train_pos_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', pos_train_data.shape)

pos_test_data = pad_sequences(test_pos_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor:', pos_test_data.shape)

Automatically train pos tokenizer...
Found 34 unique tokens
Shape of data tensor: (159571, 350)
Shape of test_data tensor: (153164, 350)


## Second time valid for tokenzier

In [31]:
print('Automatically train pos tokenizer secondly...')
cleaned_train_comments, cleaned_test_comments = [], []
for text in Pos_updated_sentence:
    cleaned_train_comments.append(clean_datasets(text))
for text in Pos_test_updated_sentence:
    cleaned_test_comments.append(clean_datasets(text))
    
train_sequences = tokenizer.texts_to_sequences(cleaned_train_comments)
test_sequences = tokenizer.texts_to_sequences(cleaned_test_comments)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of train_data tensor:', train_data.shape)
print('Shape of train_label tensor:', train_labels.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor:', test_data.shape)

Automatically train pos tokenizer secondly...
Found 420473 unique tokens
Shape of train_data tensor: (159571, 350)
Shape of train_label tensor: (159571, 6)
Shape of test_data tensor: (153164, 350)


In [38]:
train_df['cleaned_comment_text'] = cleaned_train_comments
test_df['cleaned_comment_text'] = cleaned_test_comments
train_df.to_csv(PATH + 'cleaned_train.csv', index=False)
test_df.to_csv(PATH + 'cleaned_test.csv', index=False)

## Sentence Embedding (Build a matrix)

In [37]:
print('Preparing embedding matrix...')
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))

null_words = open(PATH + 'null_words.txt', 'w', encoding='utf-8')

for word, idx in word_index.items():
    if idx >= MAX_NB_WORDS:
        null_words.write(word + ', ' + str(word_count_dict[word]) + '\n')
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector
    else:
        null_words.write(word + ', ' + str(word_count_dict[word]) + '\n')
print('Null_word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix...
Null_word embeddings: 41446


In [40]:
print('Sorting null_words...')
null_dict = {}
with open(PATH + 'null_words.txt', 'r', encoding='utf-8') as nullword:
    for line in nullword:
        word, count = line.strip('\n').split(', ')
        null_dict[word] = int(count)

null_dict = sorted(null_dict.items(), key=operator.itemgetter(1), reverse=True)

with open(PATH + 'null_words.txt', 'w', encoding='utf-8') as output:
    for word, count in null_dict:
        output.write(word + ', ' + str(count) + '\n')
print('Sorting operation Done!')

Sorting null_words...
Sorting operation Done!


# Model Training

## Training Styles

In [61]:
STAMP = 'model_pool/pavel_rnn_%.2f_%.2f'%(0.5, 0.5)

def _train_model_by_auc(model, batch_size, train_x, train_y, val_x, val_y):
    best_auc = -1
    best_weight = None
    best_epoch = 0
    current_epoch = 1
    
    while True:
        model.fit(train_x, train_y, batch_size=batch_size, epoch=1, validation_data=[val_x, val_y])
        y_pred = model.predict(val_x, batch_size=batch_size)
        current_auc = roc_auc_score(val_y, y_pred)
        print('Epoch {} auc {:.6f} best_auc {:.6f}'.format(current_epoch, current_auc, best_auc))
        current_epoch += 1
        if best_auc < current_auc or best_auc == -1:
            best_auc = current_auc
            best_weight = model.get_weights()
            best_epoch = current_epoch
        else:
            # early stop
            if current_epoch - best_epoch == 5:
                break
                
    model.set_weights(best_weights)
    return model, best_auc

def _train_model_by_logloss(model, batch_size, train_x, pos_train_x, train_y, val_x, pos_val_x, val_y, fold_id):
    early_stopping = EarlyStopping(monitor='val_loss', patience=7)
    best_model_path = STAMP + str(fold_id) + '.h5'
    model_checkpoint = ModelCheckpoint(best_model_path, save_best_only=True, save_weights_only=True)
    train_data = {'Onehot': train_x, 'POS': pos_train_x}
    val_data = {'Onehot': val_x, 'POS': pos_val_x}
    
    hist = model.fit(train_data, train_y, validation_data=(val_data, val_y), epochs=50, batch_size=batch_size, shuffle=True, callbacks=[early_stopping, model_checkpoint])
    best_val_score = min(hist.history['val_loss'])
    predictions = model.predict(val_data)
    auc = roc_auc_score(val_y, predictions)
    print('AUC Score', auc)
    return model, best_val_score, auc, predictions

def train_folds(x, pos_x, y, fold_count, batch_size, get_model_func):
    fold_size = len(x) // fold_count
    models = []
    fold_predictions = []
    score, total_auc = 0, 0
    for fold_id in range(0, fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size
        
        if fold_id == fold_count - 1:
            fold_end = len(x)
            
        train_x = np.concatenate((x[:fold_start], x[fold_end:]))
        train_y = np.concatenate((y[:fold_start], y[fold_end:]))
        
        val_x = x[fold_start: fold_end]
        val_y = y[fold_start: fold_end]
        
        pos_train_x = np.concatenate((pos_x[:fold_start], pos_x[fold_end:]))
        pos_val_x = pos_x[fold_start: fold_end]
        print('In fold #', fold_id)
        model, best_val_score, auc, fold_prediction = _train_model_by_logloss(get_model_func, batch_size, train_x, pos_train_x, train_y, val_x, pos_val_x, val_y, fold_id)
        
        score += best_val_score
        total_auc += auc
        fold_predictions.append(fold_prediction)
        models.append(model)
    return models, score / fold_count, total_auc / fold_count , fold_predictions

## Model Zoo

In [62]:
class AttentionWeightedAverage(Layer):
    """
    Model for computing a weighted average of the different channels across timesteps.
    Uses 1 parameter per channel to compute the attention value for a single timestep.
    """
    def __init__(self, return_attention=False, **kwargs):
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(**kwargs)
        
    def build(self, input_shape):
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3
        
        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        
        self.trainable_weights = [self.W]
        super(AttentionWeightedAverage, self).build(input_shape)
        
    def call(self, x, mask=None):
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))
        
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        attn_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(attn_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, attn_weights]
        return result
    
    def get_output_shape_for(self, input_shape):
        return self.compute_output_shape(input_shape)
    
    def compute_output_shape(self, input_shape):
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)
    
    def compute_mask(self, input, input_mask=None):
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

In [None]:
class KMaxPooling(Layer):
    """
    K-max pooling layer that extracts the k-highest activations from a sequence (2nd dimension).
    With TensorFlow Backend.
    """
    def __init__(self, k=1, **kwargs):
        super().__init__(**kwargs)
        self.input_spec = InputSpec(ndim=3)
        self.k = k
    
    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[2] * self.k)
    
    def call(self, inputs):
        # top_k function can only be applied along the last dimension
        shifted_input = tf.transpose(inputs, [0, 2, 1])
        top_k = tf.nn.top_k(shifted_input, k=self.k, True, None)[0]
        return Flatten()(top_k)

In [None]:
def get_av_cnn(nb_words, embedding_dim, embedding_matrix, max_sequence_length, out_size):
    embedding_layer = Embedding(nb_words,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)
    
    filter_nums = 300
    input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='Onehot')
    embedding_sequences = embedding_layer(input_layer)
    final_embedding_sequences = SpatialDropout1D(0.25)(embedding_sequences)
    
    conv_0 = Conv1D(filter_nums, 1, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_1 = Conv1D(filter_nums, 2, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_2 = Conv1D(filter_nums, 3, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_3 = Conv1D(filter_nums, 4, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    
    attn_0 = AttentionWeightedAverage()(conv_0)
    avg_0 = GlobalAveragePooling1D()(conv_0)
    maxpool_0 = GlobalMaxPooling1D()(conv_0)
    
    attn_1 = AttentionWeightedAverage()(conv_1)
    avg_1 = GlobalAveragePooling1D()(conv_1)
    maxpool_1 = GlobalMaxPooling1D()(conv_1)
    
    attn_2 = AttentionWeightedAverage()(conv_2)
    avg_2 = GlobalAveragePooling1D()(conv_2)
    maxpool_2 = GlobalMaxPooling1D()(conv_2)
    
    attn_3 = AttentionWeightedAverage()(conv_3)
    avg_3 = GlobalAveragePooling1D()(conv_3)
    maxpool_3 = GlobalMaxPooling1D()(conv_3)
    
    merged_maxpool = merge([maxpool_0, maxpool_1, maxpool_2, maxpool_3], mode='concat', concat_axis=1)
    merged_attn = merge([attn_0, attn_1, attn_2, attn_3], mode='concat', concat_axis=1)
    merged_avg = merge([avg_0, avg_1, avg_2, avg_3], mode='concat', concat_axis=1)
    merged_tensor = merge([merged_maxpool, merged_attn, merged_avg], mode='concat', concat_axis=1)
    
    output = Dropout(0.7)(merged_tensor)
    output = Dense(units=144, activation='relu')(output)
    output = Dense(units=out_size, activation='sigmoid')(output)
    
    model = Model(inputs=input_layer, outputs=output)
    adam_optimizer = optimizers.Adam(lr=1e-3, decay=1e-6)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
    return model

In [63]:
def get_av_pos_cnn(nb_words, embedding_dim, embedding_matrix, max_sequence_length, out_size):
    embedding_layer = Embedding(nb_words, 
                                embedding_dim, 
                                weights=[embedding_matrix], 
                                input_length=max_sequence_length, 
                                trainable=False)
    
    pos_embedding_layer = Embedding(50,
                                    30,
                                    input_length=max_sequence_length,
                                    trainable=True)
    
    filter_nums = 325
    input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='Onehot')
    pos_input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='POS')
    
    embedding_sequences = embedding_layer(input_layer)
    pos_sequences = pos_embedding_layer(pos_input_layer)
    merged_embedding_layer = concatenate([embedding_sequences, pos_sequences])
    final_embedding_sequences = SpatialDropout1D(0.25)(embedding_layer)
    
    conv_0 = Conv1D(filter_nums, 1, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_1 = Conv1D(filter_nums, 2, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_2 = Conv1D(filter_nums, 3, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_3 = Conv1D(filter_nums, 4, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)

    attn_0 = AttentionWeightedAverage()(conv_0)
    avg_0 = GlobalAveragePooling1D()(conv_0)
    maxpool_0 = GlobalMaxPooling1D()(conv_0)
    
    attn_1 = AttentionWeightedAverage()(conv_1)
    avg_1 = GlobalAveragePooling1D()(conv_1)
    maxpool_1 = GlobalMaxPooling1D()(conv_1)
    
    attn_2 = AttentionWeightedAverage()(conv_2)
    avg_2 = GlobalAveragePooling1D()(conv_2)
    maxpool_2 = GlobalMaxPooling1D()(conv_2)
    
    attn_3 = AttentionWeightedAverage()(conv_3)
    avg_3 = GlobalAveragePooling1D()(conv_3)
    maxpool_3 = GlobalMaxPooling1D()(conv_3)
    
    merged_tensor_maxpool = merge([maxpool_0, maxpool_1, maxpool_2, maxpool_3], mode='concat', concat_axis=1)
    merged_tensor_attn = merge([attn_0, attn_1, attn_2, attn_3], mode='concat', concat_axis=1)
    merged_tensor_avg = merge([avg_0, avg_1, avg_2, avg_3], mode='concat', concat_axis=1)
    merged_tensor = merge([merged_tensor_maxpool, merged_tensor_attn, merged_tensor_avg], mode='concat', concat_axis=1)
    
    output = Dropout(0.7)(merged_tensor)
    output = Dense(units=144, activation='relu')(output)
    output = Dense(units=out_size, activation='sigmoid')(output)
    
    model = Model(inputs=[input_layer, pos_input_layer], outputs=output)
    adam_optimizer = optimizers.Adam(lr=1e-3, decay=1e-6, clipvalue=5)
    model.compile(loss='binary_corssentropy', optimizer=adam_optimizer, matrics=['accuracy'])
    return model

In [None]:
def get_kmax_text_cnn(nb_words, embedding_dim, embedding_matrix, max_sequence_length, out_size):
    embedding_layer = Embedding(nb_words,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)
    
    filter_nums = 180
    
    input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='Onehot')
    embedding_sequences = embedding_layer(input_layer)
    final_embedding_sequences = SpatialDropout1D(embedding_sequences)
    
    conv_0 = Conv1D(filter_nums, 1, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_1 = Conv1D(filter_nums, 2, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_2 = Conv1D(filter_nums, 3, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_3 = Conv1D(filter_nums, 4, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    
    maxpool_0 = KMaxPooling(k=3)(conv_0)
    maxpool_1 = KMaxPooling(k=3)(conv_1)
    maxpool_2 = KMaxPooling(k=3)(conv_2)
    maxpool_3 = KMaxPooling(k=3)(conv_3)
    
    merged_tensor = merge([maxpool_0, maxpool_1, maxpool_2, maxpool_3], mode='concat', concat_axis=1)
    output = Dropout(0.6)(merged_tensor)
    output = Dense(units=144, activation='relu')(output)
    output = Dense(units=out_size, activation='sigmoid')(output)
    
    model = Model(inputs=input_layer, outputs=output)
    adam_optimizer = optimizers.Adam(lr=1e-3, decay=1e-6)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
    return model

In [None]:
def get_rcnn(nb_words, embedding_dim, embedding_matrix, max_sequence_length, out_size):
    embedding_layer = Embedding(nb_words,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)
    
    recurrent_units = 64
    filter_nums = 128
    
    input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='Onehot')
    embedding_sequences = embedding_layer(input_layer)
    final_embedding_sequences = SpatialDropout1D(0.2)(embedding_sequences)
    
    rnn_layer = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(final_embedding_sequences)
    conv_layer = Conv1D(filter_nums, 2, kernel_initializer='normal', padding='valid', activation='relu', strides=1)(rnn_layer)
    
    maxpool = GlobalMaxPooling1D()(conv_layer)
    attn = AttentionWeightedAverage()(conv_layer)
    avg = GlobalAveragePooling1D()(conv_layer)
    
    merged_tensor = merge([maxpool, attn, avg], mode='concat', concat_axis=1)
    output = Dropout(0.5)(merged_tensor)
    output = Dense(units=120, activation='relu')(output)
    output = Dense(units=out_size, activation='sigmoid')(output)
    
    model = Model(inputs=input_layer, outputs=output)
    adam_optimizer = optimizers.Adam(lr=1e-3, decay=1e-6, clipvalue=5)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
    return model

In [None]:
def get_av_rnn(nb_words, embedding_dim, embedding_matrix, max_sequence_length, out_size):
    embedding_layer = Embedding(nb_words,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)
    
    recurrent_units = 64
    
    input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='Onehot')
    embedding_sequences = embedding_layer(input_layer)
    final_embedding_sequences = SpatialDropout1D(0.25)(embedding_sequences)
    
    rnn_layer_0 = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(final_embedding_sequences)
    rnn_layer_1 = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(rnn_layer_0)
    merged_rnn_layer = merge([rnn_layer_0, rnn_layer_1], mode='concat', concat_axis=2)
    
    last_layer = Lambda(lambda t: t[:, -1], name='last_layer')(merged_rnn_layer)
    maxpool = GlobalMaxPooling1D()(merged_rnn_layer)
    attn = AttentionWegithedAverage()(merged_rnn_layer)
    avg = GlobalAveragePooling1D()(merged_rnn_layer)
    
    merged_tensor = merge([last_layer, maxpool, attn, avg], mode='concat', concat_axis=1)
    output = Dropout(0.5)(merged_tensor)
    output = Dense(units=144, activation='relu')(output)
    output = Dense(units=input_layer, activation='sigmoid')(output)
    
    model = Model(inputs=input_layer, outputs=output)
    adam_optimizer = optimizers.Adam(lr=1e-3, decay=1e-6, clipvalue=5)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
    return model

In [64]:
def get_av_pos_rnn(nb_words, embedding_dim, embedding_matrix, max_sequence_length, out_size):
    embedding_layer = Embedding(nb_words,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)
    
    pos_embedding_layer = Embedding(50,
                                    35,
                                    input_length=max_sequence_length,
                                    trainable=True)
    
    recurrent_units = 64
    
    input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='Onehot')
    pos_input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='POS')
    embedding_sequences = embedding_layer(input_layer)
    pos_sequences = pos_embedding_layer(pos_input_layer)
    merged_embedding_layer = concatenate([embedding_sequences, pos_sequences], axis=2)
    final_embedding_sequences = SpatialDropout1D(0.2)(merged_embedding_layer)
    
    rnn_layer_0 = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(final_embedding_sequences)
    rnn_layer_0 = SpatialDropout1D(0.3)(rnn_layer_0)
    rnn_layer_1 = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(rnn_layer_0)
    
    last_layer = Lambda(lambda t: t[:, -1], name='last_layer')(rnn_layer_1)
    maxpool = GlobalMaxPooling1D()(rnn_layer_1)
    attn = AttentionWeightedAverage()(rnn_layer_1)
    avg = GlobalAveragePooling1D()(rnn_layer_1)
    
    merged_tensor = merge([last_layer, maxpool, attn, avg], mode='concat', concat_axis=1)
    output = Dropout(0.5)(merged_tensor)
    output = Dense(units=144, activation='relu')(output)
    output = Dense(units=out_size, activation='sigmoid')(output)
    
    model = Model(inputs=[input_layer, pos_input_layer], outputs=output)
    adam_optimizer = optimizers.Adam(lr=1e-3, decay=1e-6, clipvalue=5)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
    return model

In [None]:
def get_dropout_bigru(nb_words, embedding_dims, embedding_matrix, max_sequence_length, out_size):
    embedding_layer = Emebdding(nb_words,
                                embedding_dims,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)
    
    recurrent_units = 64
    
    input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='Onehot')
    embedding_sequences = embedding_layer(input_layer)
    final_embedding_sequences = SpatialDropout1D(0.2)(embedding_sequences)
    
    rnn_layer = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(final_embedding_sequences)
    rnn_layer = Dropout(0.35)(rnn_layer)
    rnn_layer = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(rnn_layer)
    
    last_layer = Lambda(lambda t: t[:, -1])(rnn_layer)
    maxpool = GlobalMaxPooling1D()(rnn_layer)
    avg = GlobalAveragePooling1D()(rnn_layer)
    
    merged_tensor = merge([last_layer, maxpool, avg], mode='concat', concat_axis=1)
    output = Dropout(0.5)(merged_tensor)
    output = Dense(units=72, activation='relu')(output)
    output = Dense(units=out_size, activation='sigmoid')(output)
    
    model = Model(inputs=input_layer, outputs=output)
    model.compile(loss='binary_cro')

## Start Training

In [65]:
model_name = 'fasttext-avrnn-pos-' + str(nb_words) + 'vocabulary-' + str(MAX_SEQUENCE_LENGTH) + 'length'
model = get_av_pos_rnn(nb_words, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH, 6)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Onehot (InputLayer)             (None, 350)          0                                            
__________________________________________________________________________________________________
POS (InputLayer)                (None, 350)          0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 350, 300)     30000000    Onehot[0][0]                     
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 350, 35)      1750        POS[0][0]                        
__________________________________________________________________________________________________
concatenat

  name=name)


In [None]:
models, val_loss, total_auc, fold_predictions = train_folds(train_data, pos_train_data, train_labels, FOLD_COUNT, BATCH_SIZE, model)

In fold # 0
Train on 143614 samples, validate on 15957 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
AUC Score 0.984756056571395
In fold # 1
Train on 143614 samples, validate on 15957 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
AUC Score 0.9928424228111293
In fold # 2
Train on 143614 samples, validate on 15957 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
AUC Score 0.99541853509794
In fold # 3
Train on 143614 samples, validate on 15957 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
AUC Score 0.9960258036056061
In fold # 4
Train on 143614 samples, validate on 15957 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
AUC Score 0.99716175505535

Epoch 7/50
Epoch 8/50
AUC Score 0.9975237139442849
In fold # 6
Train on 143614 samples, validate on 15957 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
AUC Score 0.9978112128712229
In fold # 7
Train on 143614 samples, validate on 15957 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
AUC Score 0.998177711570556
In fold # 8
Train on 143614 samples, validate on 15957 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50

In [None]:
print('Overall val-loss: {}, AUC {}'.format(val_loss, total_auc))

# Prediction

In [None]:
submit_path_prefix = 'results/RNN_Based/' + model_name

print('Predicting testing results...')
test_predicts_list = []
for fold_id, model in enumerate(models):
    test_data_dict = {'Onehot': test_data, 'POS': pos_test_data}
    test_predict = model.predict(test_data_dict, batch_size=BATCH_SIZE, verbose=1)
    test_predicts_list.append(test_predict)
    np.save('predicts_pool/AVRNN/', test_predict)
    
test_predicts = np.zeros(test_predicts_list.shape[1])
for fold_predict in test_predicts_list:
    test_predicts += fold_predict
test_predicts /= len(test_predicts_list)

test_ids = test_df['id'].values
test_ids = test_ids.reshape((len(test_ids), 1))

test_predicts = pd.DataFrame(data=test_predicts, columns=list_classes)
test_predicts['id'] = test_ids
test_predicts = test_predicts[['id'] + list_classes]
submit_path = submit_path_prefix + '-L{:4f}-A{:4f}.csv'.format(val_loss, total_auc)
test_predicts.to_csv(submit_path, index=False)

In [None]:
train_fold_predictions = np.concatenate((fold_predictions, axis=0))
train_auc = roc_auc_score(train_labels[:-1], train_fold_predictions)
print('Training AUC', train_auc)

In [None]:
print('Predicting training results...')
train_ids = train_df['id'].values
train_ids = train_ids.reshape((len(train_ids), 1))

train_predicts = pd.DataFrame(data=train_fold_predictions, columns=list_classes)
train_predicts['id'] = train_ids
train_predicts = train_predicts[['id'] + list_classes]
submit_path = submit_path_prefix + '-(Train)-L{:4f}-A{:4f}.csv'.format(val_loss, train_auc)
train_predicts.to_csv(submit_path, index=False)

# Result Ensemble (For Test Format)

## Bagging

In [None]:
def bagging(arrs):
    print("Doing ensemble on")
    subs = []
    for arr in arrs:
        print(arr)
        subs.append(pd.read_csv(arr))
    
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    for sub in subs[1:]:
        for c in classes:
            subs[0][c] += sub[c]
    
    for c in classes:
        subs[0][c] /= len(subs)
        
    subs[0].to_csv('Bagging.csv', index=False)

# Check Correlation

In [None]:
def check_corr(arr1, arr2):
    res = 0
    for col in arr1.columns.values[1:]:
        cur = arr1[col].corr(arr2[col])
        corr = (arr1[col].rank() / len(arr1)).corr(arr2[col].rank() / len(arr2))
        print(col, corr)
        res += corr
    print("Avg Rank: ", res / len(arr1.columns.values[1:]))