# MuNks:Overview

We implemented three models for the Toxicity challenge.
   + RNN
   + Logistic Regression Model
   + Gated Recurrent Unit (GRU)

An ensemble was created from the results of these three models. The RNN is made up of 6 layers:

<ol>    
    <li> The first layer concatenated the <b>Fasttext</b> and <b>Glove</b> embeddings </li>
    <li> Spatial Dropout 1D (0.5)</li>
    <li> Bidirectional CuDNNLSTM with kernel size 40</li>
    <li> Bidirectional CuDNNGRU with kernel size 40</li>
    <li> Concatenation of the last state, max pool, average pool and two features: "unique words rate" and "rate of all-caps words"</li>
    <li> Output dense layer </li>
</ol>

The hyperparameters were as follows:

   + Batch_size = 32 & 64
   + Epochs = 3 & 5
   + Max Length = 50
   + Max Features = 100,000
 
 For the Logistic Regression the following parameters were used:
 
   + Solver = sag
   + Inverse of regularization strength, C = 0.1

 For GRU, the following parameters were:
 
   + Max Features = 200,000
   + Embedding dimension  = 300
   + Max Length = 500
   
   Fasttext embedding has a dimension of 300 and so the embedding size parameter was kept at 300.
   
 An ensemble of the three results was then created. This yielded a ROC-AUC score of <b>0.9866</b>
 

## GRU

### Import modules

In [None]:
import warnings
warnings.filterwarnings('ignore')

from keras.models import Model
from keras.layers import Input, Dense, Dropout, Conv1D, Embedding, SpatialDropout1D, concatenate
from keras.layers import Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, CuDNNGRU
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import numpy as np
import pandas as pd

np.random.seed(15)

import os
os.environ['OMP_NUM_THREADS'] = '4'



### Loading the train, test and embedding files.
### Instead of training our own word embeddings, pretrained fastText embeddings are used ("EMBEDDING_FILE")

In [None]:
# Read the preprocessed competition data and the sample submission used as
# the output template. The paths are absolute and machine-specific.
train = pd.read_csv('/home/ubuntu/tc_data/train_preprocessed.csv')
test = pd.read_csv('/home/ubuntu/tc_data/test_preprocessed.csv')
submission = pd.read_csv('/home/ubuntu/tc_data/sample_submission.csv')

# Pretrained fastText vectors in text format (one token plus its vector per line).
EMBEDDING_FILE = '/home/ubuntu/nana/crawl-300d-2M.vec'

# Raw comment strings; missing comments are replaced by the literal token "fillna".
X_train = train["comment_text"].fillna("fillna").values
# One column per target label, in the order required by the submission file.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").values

### Extracting the fasttext embedding matrix and embedding_index(dict)

In [None]:
def get_coefs(word, *arr):
    """Split one embedding-file record into its token and its vector.

    Args:
        word: the token (first field of the record).
        *arr: the remaining fields, one per vector component.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Hyperparameters of the GRU model, taken from the overview at the top of this
# notebook (Max Features = 200,000, Embedding dimension = 300, Max Length = 500).
# NOTE(review): none of these names (nor the tokenizer / padded sequences below)
# were defined anywhere before this cell, although this cell and the following
# ones read them -- confirm the values against the run that produced the results.
max_features = 200000
maxlen = 500
embed_size = 300

# Turn the raw comments into fixed-length integer sequences. The lower-case
# x_train / x_test are what the training and prediction cells consume.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=maxlen)
x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

# token -> float32 vector. The file is opened in a `with` block so the handle
# is closed once the dictionary has been built.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in embedding_file)

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

# The matrix always has `max_features` rows: get_model() declares
# Embedding(max_features, ...) and the loop below writes to every index
# i < max_features, so a matrix with only `nb_words` rows could be both too
# short for the largest index and the wrong shape for the layer weights.
# Rows of tokens without a pretrained vector stay all-zero.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    # Skip records whose length is not `embed_size` (e.g. a header line of the
    # vector file); assigning them would broadcast one number over a whole row.
    if embedding_vector is not None and embedding_vector.shape[0] == embed_size:
        embedding_matrix[i] = embedding_vector

## Define evaluation metric 
#### The metric is the area under the ROC curve (ROC-AUC); the score is printed at the end of each epoch

In [None]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC during training.

    Args:
        validation_data: ``(X_val, y_val)`` pair scored by the callback.
        interval: the score is computed on epochs whose zero-based index is a
            multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) starts the lookup *after* Callback in the MRO
        # and therefore never runs Callback.__init__; call the real parent.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation set and print the ROC-AUC.

        Args:
            epoch: zero-based epoch index supplied by Keras.
            logs: metrics dictionary supplied by Keras (unused here). Defaults
                to None instead of a shared mutable ``{}``.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

### Build the model with pretrained embeddings and bidirectional GRUs

In [None]:
def get_model():
    """Assemble and compile the pooled bidirectional-GRU classifier.

    The network reads ``maxlen`` token ids, looks them up in a frozen
    embedding initialised from ``embedding_matrix``, runs two parallel
    bidirectional CuDNNGRU layers (128 and 64 units) on the embedded
    sequence, pools their concatenated outputs (average and max) and
    classifies with a 64-unit ReLU layer followed by six sigmoid outputs.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer).
    """
    tokens = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size,
                         weights=[embedding_matrix], trainable=False)(tokens)
    embedded = SpatialDropout1D(0.2)(embedded)

    # Two recurrent branches read the same embedded sequence.
    gru_wide = Bidirectional(CuDNNGRU(128, return_sequences=True))(embedded)
    gru_narrow = Bidirectional(CuDNNGRU(64, return_sequences=True))(embedded)
    sequence_features = concatenate([gru_wide, gru_narrow])

    # Collapse the time axis in two ways and join the results.
    pooled = concatenate([
        GlobalAveragePooling1D()(sequence_features),
        GlobalMaxPooling1D()(sequence_features),
    ])

    hidden = Dense(64, activation='relu')(pooled)
    hidden = Dropout(0.2)(hidden)
    probabilities = Dense(6, activation="sigmoid")(hidden)

    classifier = Model(inputs=tokens, outputs=probabilities)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier

### Model training

In [None]:
# Build a fresh, compiled GRU model.
model = get_model()


batch_size = 50
epochs = 4

# Hold out 5% of the training rows for validation (fixed seed for repeatability).
# NOTE(review): x_train (lower case) must be the padded integer sequences, not
# the raw-text array X_train loaded earlier -- confirm it is defined upstream.
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
# Prints the validation ROC-AUC at the end of every epoch.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=2)

### Predict on test set and write to csv file.

In [None]:
# Score the test sequences and write the six label probabilities into the
# sample-submission frame. The array is assigned positionally, so the rows of
# x_test are presumed to be in the same order as `submission` -- TODO confirm.
y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('submission_fasttext.csv', index=False)

## Logistic Regression Model

In [None]:
import numpy as np
import pandas as pd

# For Data Cleaning
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
#nltk.download('punkt')
#nltk.download('stopwords')
from sklearn.feature_extraction import text as sklearn_text

# For Feature Extraction
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer

# For Model Building
from sklearn.linear_model import LogisticRegression

# For Model Evaluation
from sklearn.model_selection import cross_val_score
from scipy.stats import spearmanr

print('### Import data ###')
# Raw competition files, read from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')
 
# Target columns, in submission order.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Missing comments become a single space so the string operations below never see NaN.
train_text = train['comment_text'].fillna(' ')
test_text = test['comment_text'].fillna(' ')

# Notebook preview of the first comments (no effect when run as a script).
train_text[:20]

# Lookup table used to expand contractions and common chat abbreviations.
# Keys are matched against whole, lower-cased, space-separated tokens.
#
# Fixes relative to the previous table:
#   * "we'll" expanded to " will" (the pronoun was lost); it is now "we will".
#   * "i'd" and "didn't" were each listed twice. A dict literal keeps only the
#     last value, so the effective entries ("I had", "did not") are kept and
#     the shadowed duplicates removed.
replacement = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "I had",
    "i'll": "I will",
    "i'm": "I am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "'re": " are",
    "wasn't": "was not",
    "we'll": "we will",
}


# Apostrophe-less spellings and chat shorthand.
replacement.update({"im" : "i am", "youre" : "you are", "ur" : "you are",
                    "theyre" : "they are", "pls" : "please", "fk" : "fuck"})
print('\n#### Data Cleaning ####')

def replace_comment(comment):
    """Lower-case a comment, squeeze letter runs and expand abbreviations.

    Args:
        comment: the raw comment string.

    Returns:
        The comment in lower case, with every run of three or more identical
        word characters shortened to two (e.g. "gooood" -> "good") and every
        space-separated token found in ``replacement`` swapped for its expansion.
    """
    lowered = comment.lower()

    # A word character followed by two or more copies of itself -> two copies.
    squeezed = re.sub(r'(\w)\1{2,}', r'\1\1', lowered)

    # Tokens are split on single spaces only, so the original spacing survives
    # the round trip through split/join.
    return " ".join(replacement.get(token, token) for token in squeezed.split(' '))

# Normalise case and expand abbreviations in both data sets.
train_text = train_text.apply(replace_comment)
test_text = test_text.apply(replace_comment)

############################
# DATA CLEANING
############################

# For checking Regexp: https://regex101.com/
def standardize_text(datafile):
    """Clean a pandas Series of comments with an ordered list of substitutions.

    The text is lower-cased, then each step below is applied in order. Every
    call states explicitly whether its pattern is a regular expression:
    ``Series.str.replace`` no longer treats patterns as regexes by default in
    recent pandas releases, and without the flag the regex steps would
    silently stop matching.

    Args:
        datafile: pandas Series of strings.

    Returns:
        A new Series (same index) holding the cleaned text.
    """
    # (pattern, replacement, is_regex) -- the order matters: e.g. timestamps and
    # dates are removed before ':' '/' '-' are stripped as special characters,
    # and "@name" tags are removed before any remaining '@' becomes "at".
    steps = (
        # Remove website link
        (r"http\S+", "", True),
        (r"https\S+", "", True),
        (r"http", "", True),
        (r"https", "", True),
        # Remove name tag
        (r"@\S+", "", True),
        # Remove time related text
        (r'\w{3}[+-][0-9]{1,2}\:[0-9]{2}\b', "", True),   # e.g. UTC+09:00
        (r'\d{1,2}\:\d{2}\:\d{2}', "", True),             # e.g. 18:09:01
        (r'\d{1,2}\:\d{2}', "", True),                    # e.g. 18:09
        # Remove date related text
        # e.g. 11/12/19, 11-1-19, 1.12.19, 11/12/2019
        (r'\d{1,2}(?:\/|\-|\.)\d{1,2}(?:\/|\-|\.)\d{2,4}', "", True),
        # e.g. 11 dec, 2019   11 dec 2019   dec 11, 2019
        (r"([\d]{1,2}\s(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s[\d]{1,2})(\s|\,|\,\s|\s\,)[\d]{2,4}", "", True),
        # e.g. 11 december, 2019   11 december 2019
        (r"[\d]{1,2}\s(january|february|march|april|may|june|july|august|september|october|november|december)(\s|\,|\,\s|\s\,)[\d]{2,4}", "", True),
        # Remove line breaks
        ("\r", " ", False),
        ("\n", " ", False),
        # Remove special characters
        (r"[^A-Za-z0-9(),.!?@\`\"\_ ]", "", True),
        (' "" ', '', False),
        # Remove phone number and IP address
        (r'\d{8,}', "", True),
        (r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', "", True),
        # Adjust common abbreviation
        (r" you re ", " you are ", False),
        (r" we re ", " we are ", False),
        (r" they re ", " they are ", False),
        (r"@", "at", False),
    )

    datafile = datafile.str.lower()
    for pattern, substitute, is_regex in steps:
        datafile = datafile.str.replace(pattern, substitute, regex=is_regex)
    return datafile

# Use regular expressions to clean up our data (links, name tags, timestamps,
# dates, special characters, long digit runs and IP addresses are removed).
train_text = standardize_text(train_text)
test_text = standardize_text(test_text)

############################
# STOP WORD REMOVAL
############################
nltk.download('stopwords')
stopwords_list = nltk.corpus.stopwords.words('english') # stopwords from nltk

# Exclude from stopwords: not, cannot (negations are kept in the text).
# The sorted list is kept under its original name because the 2-gram
# vectorizer further down receives it as its stop-word list.
stopwords_list_rev = sorted(w for w in stopwords_list if w not in ('not', 'cannot'))

# Membership is tested once per token of every comment; a frozenset makes each
# test a hash lookup instead of a scan through the whole list.
stopwords_set_rev = frozenset(stopwords_list_rev)

train_text = train_text.apply(
    lambda comment: " ".join(tok for tok in comment.split() if tok not in stopwords_set_rev))
test_text = test_text.apply(
    lambda comment: " ".join(tok for tok in comment.split() if tok not in stopwords_set_rev))

# Notebook preview of the cleaned comments (no effect when run as a script).
train_text[:20]

############################
# CHECKING COMMON WORDS
############################

# Revised method based on:
#   Ref 1: https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline
#   Ref 2: https://www.kaggle.com/tunguz/logistic-regression-with-words-and-char-n-grams

# Settings shared by all three TF-IDF vectorizers.
_shared_tfidf_args = dict(sublinear_tf=True, strip_accents='unicode')

# Word 1-gram: the 10,000 most frequent single words.
word_vectorizer_1 = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000,
    **_shared_tfidf_args
)

# Word 2-gram: the 5,000 most frequent word pairs, using the revised NLTK
# stop-word list (which keeps the negations).
word_vectorizer_2 = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words=stopwords_list_rev,
    ngram_range=(2, 2),
    max_features=5000,
    **_shared_tfidf_args
)

# Char 2 to 6-gram: the 50,000 most frequent character n-grams.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000,
    **_shared_tfidf_args
)

# The vocabularies are learned from train and test text together.
all_text = pd.concat([train_text, test_text])
_vectorizers = (word_vectorizer_1, word_vectorizer_2, char_vectorizer)

print('\n### Vectorizer fitting ###')
for _vectorizer in _vectorizers:
    _vectorizer.fit(all_text)

# Turn each data set into three document-term matrices
# (word 1-grams, word 2-grams, character n-grams).
print('\n### DTM Transforming (train) ###')
train_word_features1, train_word_features2, train_char_features = [
    v.transform(train_text) for v in _vectorizers
]
print('\n### DTM Transforming (test) ###')
test_word_features1, test_word_features2, test_char_features = [
    v.transform(test_text) for v in _vectorizers
]

# Put the three sparse blocks side by side: one wide feature matrix per data set.
print('\n### Merging Features Martix ###')
train_features = hstack([train_word_features1, train_word_features2, train_char_features])
test_features = hstack([test_word_features1, test_word_features2, test_char_features])

############################
# BASE MODELS
############################

# model 1 : Simple Logistic Regression (Well Calibrated)
#   https://www.kaggle.com/tunguz/logistic-regression-with-words-and-char-n-grams
#   # 0.9792
scores = []

# Keep the ids in their original row order while predictions are written.
# predict_proba returns a plain array, which pandas assigns by position; if the
# frame were sorted by id first (as before), row k of the predictions would be
# attached to the k-th *smallest* id rather than to the k-th test row.
# Presumes sample_submission.csv lists ids in the same order as test.csv --
# TODO confirm.
LR = pd.DataFrame.from_dict({'id': sample_submission['id']})

print('\n### Model 1: Simple Logistic Regression ###')
for label in labels:
    # One independent binary classifier per label.
    pred_model = LogisticRegression(C=0.1, solver='sag')

    # 3-fold cross-validated ROC-AUC on the training data.
    score = np.mean(cross_val_score(pred_model, train_features, train[label], cv=3, scoring='roc_auc'))
    scores.append(score)
    print('For {}, AUC is {}.'.format(label, score))

    # Refit on all training rows and keep the positive-class probability.
    pred_model.fit(train_features, train[label])
    LR[label] = pred_model.predict_proba(test_features)[:, 1]

# Sort only after every column is filled; the blending step expects frames
# ordered by id.
LR = LR.sort_values('id')

print('\nOverall CV score is {}'.format(np.mean(scores)))



############################
# DATA BLENDING
############################
# https://www.kaggle.com/reppic/lazy-ensembling-algorithm

# Weighting knob read by get_merge_weights when two predictions are combined.
# 0: equal average of all inputs; 1: up to 50% of weight going to least correlated input
DENSITY_COEFF = 0
assert 0.0 <= DENSITY_COEFF <= 1.0

# Correlation threshold read by ensemble_col: when two merged files correlate
# above it, the merged result's weight is the max instead of the sum of the
# merged files' weights.
OVER_CORR_CUTOFF = 0.98
assert 0.0 <= OVER_CORR_CUTOFF <= 1.0

###############################################

def load_submissions():
    """Collect every prediction frame that takes part in the blend.

    Returns:
        A new dict mapping a model name to its DataFrame: the four submission
        CSV files (each sorted by ``id``) followed by the in-memory logistic
        regression frame ``LR`` under the key ``'LR'``.
    """
    csv_files = {'sub1': 'submission1.csv',
                 'sub4': 'submission4.csv',
                 'sub5': 'submission5.csv',
                 'sub7': 'submission7.csv'
                }
    data = {}
    for model_name, path in csv_files.items():
        data[model_name] = pd.read_csv(path).sort_values('id')
    # Adding LR model to import models
    data['LR'] = LR
    return data

def get_corr_mat(frames, label):
    """Correlate the predictions of all models for one label.

    Args:
        frames: dict mapping model name -> DataFrame containing column ``label``.
        label: name of the prediction column to compare.

    Returns:
        A square DataFrame of pairwise correlations (rows and columns are the
        model names) whose diagonal has been set to 0.0 so that a model is
        never selected as its own best match.
    """
    predictions = pd.DataFrame()
    for model_name in frames:
        predictions[model_name] = frames[model_name][label]
    corr = predictions.corr()

    # Zero the self-correlations.
    for k in range(corr.shape[0]):
        corr.iloc[k, k] = 0.0
    return corr


def highest_corr(mat):
    """Locate the largest entry of a correlation matrix.

    Args:
        mat: square DataFrame as produced by get_corr_mat.

    Returns:
        ``(corr, f1, f2)``: the largest value and the column labels that
        correspond to its row position and its column position. Ties resolve
        to the first occurrence in row-major order.
    """
    values = np.array(mat.values)
    # argmax works on the flattened array; convert back to (row, column).
    row, col = divmod(int(values.argmax()), values.shape[1])
    return values[row, col], mat.columns[row], mat.columns[col]


def get_merge_weights(m1, m2, densities, density_coeff=None):
    """Compute the blending weights for two models about to be merged.

    Each weight mixes a flat 50/50 share with the model's share of the two
    densities: ``0.5 * c + (d / (d1 + d2)) * (1 - c)``, where ``c`` is the
    density coefficient. The two weights always sum to 1.

    Args:
        m1: key of the first model in ``densities``.
        m2: key of the second model in ``densities``.
        densities: dict mapping model key -> accumulated density (weight mass).
        density_coeff: mixing coefficient in [0, 1]. When omitted, the
            module-level ``DENSITY_COEFF`` is used, which is what existing
            three-argument callers get.

    Returns:
        ``(weights1, weights2)`` for ``m1`` and ``m2`` respectively.
    """
    if density_coeff is None:
        density_coeff = DENSITY_COEFF
    d1 = densities[m1]
    d2 = densities[m2]
    d_tot = d1 + d2
    weights1 = 0.5 * density_coeff + (d1 / d_tot) * (1 - density_coeff)
    weights2 = 0.5 * density_coeff + (d2 / d_tot) * (1 - density_coeff)
    return weights1, weights2


def ensemble_col(label,frames,densities):
    """Blend all models' predictions for one label into a single column.

    While more than one frame is left, the two most correlated frames are
    replaced by their weighted combination (stored under the key
    ``"<first>_<second>"``). The combination's density is the larger of the two
    densities when the pair correlates at or above ``OVER_CORR_CUTOFF`` and
    their sum otherwise.

    Args:
        label: name of the prediction column to blend.
        frames: dict model name -> DataFrame; it is consumed (emptied) by the call.
        densities: dict model name -> density; modified in step with ``frames``.

    Returns:
        The ``label`` column of the single frame that remains.
    """
    while len(frames) != 1:
        corr, merge1, merge2 = highest_corr(get_corr_mat(frames, label))
        w1, w2 = get_merge_weights(merge1, merge2, densities)

        # Weighted combination of the two most similar prediction columns.
        blended = pd.DataFrame()
        blended[label] = (frames[merge1][label] * w1) + (frames[merge2][label] * w2)

        merged_name = merge1 + '_' + merge2
        frames[merged_name] = blended

        if corr >= OVER_CORR_CUTOFF:
            print('\t',merge1,merge2,'  (OVER CORR)')
            densities[merged_name] = max(densities[merge1], densities[merge2])
        else:
            densities[merged_name] = densities[merge1] + densities[merge2]

        # The two inputs are now represented by the merged entry.
        for consumed in (merge1, merge2):
            del frames[consumed]
        for consumed in (merge1, merge2):
            del densities[consumed]

    _, remaining = frames.popitem()
    return remaining[label]

print('\n#### Data Blending ####')

final_submission = pd.DataFrame.from_dict({'id': sample_submission['id']}).sort_values('id')

# Read the submission files once. ensemble_col empties the dict it is given but
# only reads the DataFrames inside it, so each label can work on a shallow copy
# instead of re-reading every CSV from disk.
all_frames = load_submissions()

for label in labels:
    frames = dict(all_frames)
    densities = dict.fromkeys(frames, 1.0)   # Pre-set density as 1 to all models

    print('\n\n # ', label)
    final_submission[label] = ensemble_col(label, frames, densities)

############################
# Output
############################

final_submission.to_csv('submission.csv',index=False)

## RNN

In [None]:
## Import the relevant libraries
##
## https://www.kaggle.com/larryfreeman/toxic-comments-code-for-alexander-s-9872-model
## Alexander Burmistrov Implementation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import unidecode as uc
import string
import re
import gensim
import pickle
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')
import os
os.environ['OMP_NUM_THREADS'] = '4'
import gc
import time
import nltk

from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Dropout, Conv1D, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, LSTM,Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import CuDNNLSTM, CuDNNGRU
from keras.preprocessing import text, sequence

from keras.callbacks import Callback
from keras import optimizers
from keras.layers import Lambda

from nltk.corpus import stopwords
from keras import backend as K
from sklearn.model_selection import KFold
from unidecode import unidecode


np.random.seed(42)
# Fetch the corpus *before* reading it: stopwords.words() fails with a
# LookupError on a machine where the stop-word corpus has not been downloaded.
nltk.download('stopwords')
eng_stopwords = set(stopwords.words("english"))

## ROC-AUC Score Class
class RocAucEvaluation(Callback):
    """Keras callback: validation ROC-AUC, best-weights checkpoint, early stop.

    Every ``interval`` epochs the validation set is scored. A new best score
    saves the model weights to ``best_weights.h5``; more than three scored
    epochs in a row without improvement stop the training.

    Args:
        validation_data: ``(X_val, y_val)`` pair scored by the callback.
        interval: the score is computed on epochs whose zero-based index is a
            multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) starts the lookup *after* Callback in the MRO
        # and therefore never runs Callback.__init__; call the real parent.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data
        self.max_score = 0          # best ROC-AUC seen so far
        self.not_better_count = 0   # scored epochs since the last improvement

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation set, checkpoint on improvement, maybe stop.

        Args:
            epoch: zero-based epoch index supplied by Keras.
            logs: metrics dictionary supplied by Keras (unused here). Defaults
                to None instead of a shared mutable ``{}``.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=1)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))
            if (score > self.max_score):
                print("*** New High Score (previous: %.6f) \n" % self.max_score)
                # Save the model this callback is attached to; the previous code
                # used the module-level name `model`, which is only correct
                # while that global happens to be the model being trained.
                self.model.save_weights("best_weights.h5")
                self.max_score=score
                self.not_better_count = 0
            else:
                self.not_better_count += 1
                if self.not_better_count > 3:
                    print("Epoch %05d: early stopping, high score = %.6f" % (epoch,self.max_score))
                    self.model.stop_training = True


## RNN Model Layers
def getModel(features,clipvalue=1.,num_filters=40,dropout=0.5,embed_size=501):
    """Build and compile the two-input recurrent classifier.

    Inputs are a sequence of ``maxlen`` token ids and a vector of hand-made
    per-comment features. The token ids go through a frozen embedding
    (initialised from ``embedding_matrix``), spatial dropout, a bidirectional
    LSTM and a bidirectional GRU; the pooled GRU outputs, one GRU final state
    and the extra features feed six sigmoid outputs.

    Args:
        features: 2-D array of extra features; only ``features.shape[1]`` is
            used, to size the second input.
        clipvalue: gradient clipping value passed to the Adam optimizer.
        num_filters: number of units of each recurrent layer (per direction).
        dropout: rate of the spatial dropout after the embedding.
        embed_size: width of the embedding vectors.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss).
    """
    features_input = Input(shape=(features.shape[1],))
    inp = Input(shape=(maxlen, ))

    # Layer 1: concatenated fasttext and glove twitter embeddings.
    x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)

    # Layer 2: SpatialDropout1D
    x = SpatialDropout1D(dropout)(x)

    # Layer 3: Bidirectional LSTM (the plain layer, not the CuDNN variant)
    x = Bidirectional(LSTM(num_filters, return_sequences=True))(x)

    # Layer 4: Bidirectional GRU (plain layer). With return_state=True the
    # wrapper yields the output sequence plus one final state per direction
    # (per the Keras Bidirectional documentation); a GRU has no separate cell
    # state, so the two extra tensors are named for their direction.
    x, forward_state, _backward_state = Bidirectional(
        GRU(num_filters, return_sequences=True, return_state=True))(x)

    # Layer 5: A concatenation of the last state, maximum pool, average pool and
    # two features: "Unique words rate" and "Rate of all-caps words".
    # Only the first returned state is used, exactly as before.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)

    x = concatenate([avg_pool, forward_state, max_pool,features_input])

    # Layer 6: output dense layer.
    outp = Dense(6, activation="sigmoid")(x)

    model = Model(inputs=[inp,features_input], outputs=outp)
    # `Adam` is the optimizer class itself; the lowercase `optimizers.adam`
    # alias is not available in every Keras release.
    adam = optimizers.Adam(clipvalue=clipvalue)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
    return model

## Get data and start preprocessing here
def getData():
    """Read the raw competition files from the working directory.

    Returns:
        ``(train, test)`` DataFrames read from ``train.csv`` and ``test.csv``.
    """
    return pd.read_csv('train.csv'), pd.read_csv('test.csv')


# Matches every character that is NOT a letter, one of . - ? ! , # @ % or a space.
special_character_removal = re.compile(r'[^A-Za-z\.\-\?\!\,\#\@\% ]',re.IGNORECASE)


def cleanText(x):
    """Transliterate a string with unidecode, then drop disallowed characters.

    Args:
        x: the text to clean.

    Returns:
        The transliterated text with every character matched by
        ``special_character_removal`` removed.
    """
    return special_character_removal.sub('', unidecode(x))



## Add features
def addFeatures(df):
    """Add length, capitalisation and vocabulary statistics of each comment.

    The frame is modified in place: ``comment_text`` is converted to ``str``
    and the columns ``total_length``, ``capitals``, ``caps_vs_length``,
    ``num_words``, ``num_unique_words`` and ``words_vs_unique`` are added.

    Both ratio columns use element-wise division, so an empty comment yields
    NaN (0 / 0) instead of raising ZeroDivisionError as the previous row-wise
    ``float / float`` did.

    Args:
        df: DataFrame with a ``comment_text`` column.

    Returns:
        The same DataFrame object, with the new columns.
    """
    df['comment_text'] = df['comment_text'].apply(str)
    df['total_length'] = df['comment_text'].apply(len)
    df['capitals'] = df['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    # Share of upper-case characters in the comment.
    df['caps_vs_length'] = df['capitals'] / df['total_length']
    # Raw string: '\S' in a plain literal is an invalid escape sequence.
    df['num_words'] = df['comment_text'].str.count(r'\S+')
    df['num_unique_words'] = df['comment_text'].apply(lambda comment: len(set(comment.split())))
    # Despite the column name this is unique words divided by total words.
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']

    return df




# Get the data set
train,test = getData()


# Create a new column with cleaned text of the "comment_text" column.
# Missing comments are replaced by "something" *before* str() is applied:
# str(NaN) is the string "nan", so filling afterwards (as the code below still
# does, harmlessly) could never find a missing value to fill.
train['clean_text'] = train['comment_text'].fillna("something").apply(lambda x: cleanText(str(x)))
test['clean_text'] = test['comment_text'].fillna("something").apply(lambda x: cleanText(str(x)))


# Obtain the X_Train and Y_Train arrays
X_train = train['clean_text'].fillna("something").values
Y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test['clean_text'].fillna("something").values



## Execute Add features with all
train = addFeatures(train)
test = addFeatures(test)

## FillNAs in caps_vs_length & words_vs_unique with 0s
features = train[['caps_vs_length', 'words_vs_unique']].fillna(0)
test_features = test[['caps_vs_length', 'words_vs_unique']].fillna(0)

# Transform: the scaler is fitted on train and test features together, then
# applied to each set separately.
ss = StandardScaler()
ss.fit(np.vstack((features, test_features)))
features = ss.transform(features)
test_features = ss.transform(test_features)


# For best score (Public: 9869, Private: 9865), change to max_features = 283759, maxlen = 900
# max_features is passed to the tokenizer as num_words and bounds the embedding
# rows used later; maxlen is the padded length of every sequence.
max_features = 10000
maxlen = 50

# The vocabulary is learned from train and test comments together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train_sequence = tokenizer.texts_to_sequences(X_train)
X_test_sequence = tokenizer.texts_to_sequences(X_test)

# Fixed-length integer matrices fed to the model.
x_train = sequence.pad_sequences(X_train_sequence, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test_sequence, maxlen=maxlen)
# Number of distinct tokens the tokenizer recorded.
print(len(tokenizer.word_index))


# Load the FastText Web Crawl vectors
# T. Mikolov, E. Grave, P. Bojanowski, C. Puhrsch, A. Joulin. Advances in Pre-Training Distributed Word Representations
#
# Both paths are absolute and specific to the author's machine; adjust them
# before running elsewhere.
EMBEDDING_FILE_FASTTEXT = "/Users/nana/Documents/DataScience/ModuleIII/crawl-300d-2M.vec"
EMBEDDING_FILE_TWITTER="/Users/nana/Documents/DataScience/ModuleIII/glove/glove.twitter.27B.200d.txt"

def getCoeffs(word, *arr):
    """Pair a token with its embedding vector.

    Args:
        word: the token (first field of an embedding-file line).
        *arr: the remaining fields, one per vector component.

    Returns:
        ``(word, vector)`` where ``vector`` is a float32 NumPy array of ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

# token -> vector dictionaries for both embedding files. Each file is read
# inside a `with` block so its handle is closed as soon as the dictionary is
# built (the bare open() calls left both large files open).
with open(EMBEDDING_FILE_FASTTEXT, encoding='utf-8') as fasttext_file:
    embeddings_index_ft = dict(getCoeffs(*o.rstrip().rsplit(' ')) for o in fasttext_file)
with open(EMBEDDING_FILE_TWITTER, encoding='utf-8') as twitter_file:
    embeddings_index_tw = dict(getCoeffs(*o.strip().split()) for o in twitter_file)


## Use the GENSIM library --> Word2vec embedding
spell_model = gensim.models.KeyedVectors.load_word2vec_format(EMBEDDING_FILE_FASTTEXT)


# This code is  based on: Spellchecker using Word2vec by CPMP
# https://www.kaggle.com/cpmpml/spell-checker-using-word2vec

# Vocabulary of the loaded vectors, in the model's own order.
words = spell_model.index2word

# token -> position in that vocabulary list; the position is used as a rank
# by the spell-checker helpers below.
w_rank = {token: position for position, token in enumerate(words)}

WORDS = w_rank

# Use fast text as vocabulary
def words(text):
    """Return the lower-cased runs of word characters found in ``text``, in order."""
    lowered = text.lower()
    return [match.group() for match in re.finditer(r'\w+', lowered)]

def P(word):
    """Score of `word` used to rank spelling candidates (higher is better).

    The score is the negated rank stored in WORDS, so a smaller rank scores
    higher. A word that is not in WORDS is looked up as rank 0 and therefore
    scores 0.
    """
    rank = WORDS.get(word, 0)
    return -rank

def correction(word):
    """Return the candidate spelling of `word` with the highest P score."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Generate possible spelling corrections for `word`.

    Returns the first non-empty result of: the word itself if it is known, the
    known words one edit away, the known words two edits away. If all three
    are empty, the one-element list ``[word]`` is returned. Later stages are
    only computed when the earlier ones found nothing.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit = known(edits1(word))
    if one_edit:
        return one_edit

    two_edits = known(edits2(word))
    if two_edits:
        return two_edits

    return [word]

def known(words):
    """Return, as a set, the members of `words` that are keys of WORDS."""
    return {w for w in words if w in WORDS}

def edits1(word):
    """Return the set of strings exactly one edit away from `word`.

    An edit is deleting one character, swapping two adjacent characters,
    replacing one character with a lower-case ASCII letter, or inserting a
    lower-case ASCII letter at any position.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # Insertion is possible at every cut point, including the very end.
        for ch in alphabet:
            variants.add(head + ch + tail)

        if not tail:
            continue

        # Deletion of the character right after the cut.
        variants.add(head + tail[1:])
        # Replacement of that character.
        for ch in alphabet:
            variants.add(head + ch + tail[1:])
        # Transposition needs two characters after the cut.
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])
    return variants

def edits2(word):
    """Lazily yield the strings two edits away from `word`.

    The first round of edits is computed immediately; the second round is
    produced on demand while the returned generator is consumed. Duplicates
    are not removed.
    """
    first_round = edits1(word)
    return (second for first in first_round for second in edits1(first))

def singlify(word):
    """Collapse every run of identical consecutive characters to one character."""
    pieces = []
    for position, letter in enumerate(word):
        # Skip a character that repeats its immediate predecessor.
        if position and letter == word[position - 1]:
            continue
        pieces.append(letter)
    return "".join(pieces)


word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

# One row per token id, 501 columns: 300 fastText + 200 GloVe-Twitter + 1 flag.
# The matrix always has `max_features` rows: the fill loop writes to every
# index i < max_features and getModel() declares Embedding(max_features, ...),
# so a matrix with only `nb_words` rows would be too short for the largest
# index and the wrong shape for the layer whenever the vocabulary is smaller
# than max_features. Unused rows simply stay zero.
embedding_matrix = np.zeros((max_features,501))

# Vector of the word "something", used as the stand-in for tokens that have no
# fastText vector.
something_tw = embeddings_index_tw.get("something")
something_ft = embeddings_index_ft.get("something")

something = np.zeros((501,))
something[:300,] = something_ft
something[300:500,] = something_tw
something[500,] = 0

def all_caps(word):
    """Return True when `word` has at least two characters and is upper case."""
    if len(word) <= 1:
        return False
    return word.isupper()

def embed_word(embedding_matrix,i,word):
    """Write the combined embedding of `word` into row `i` of the matrix.

    Nothing is written when `word` has no fastText vector. Otherwise columns
    0-299 receive the fastText vector, column 500 an all-caps flag (1 or 0),
    and columns 300-499 the GloVe-Twitter vector when one exists (they are
    left untouched when it does not).

    Args:
        embedding_matrix: 2-D array with at least 501 columns; modified in place.
        i: row index to fill.
        word: token to look up in both embedding dictionaries.
    """
    embedding_vector_ft = embeddings_index_ft.get(word)
    if embedding_vector_ft is None:
        return

    embedding_matrix[i,:300] = embedding_vector_ft
    # A plain scalar is stored: assigning a one-element array to a single cell
    # relies on NumPy's deprecated size-1-array-to-scalar conversion.
    embedding_matrix[i,500] = 1.0 if all_caps(word) else 0.0

    embedding_vector_tw = embeddings_index_tw.get(word)
    if embedding_vector_tw is not None:
        embedding_matrix[i,300:500] = embedding_vector_tw

            
# Fasttext vector is used by itself if there is no glove vector but not the other way around.
for word, i in word_index.items():

    # Token ids beyond the vocabulary limit have no row.
    if i >= max_features:
        continue

    # Known to fastText: embed directly.
    if embeddings_index_ft.get(word) is not None:
        embed_word(embedding_matrix,i,word)
        continue

    # Unknown word longer than the threshold: use the "something" vector
    # without attempting a spelling correction.
    # change to > 20 for better score.
    if len(word) > 0:
        embedding_matrix[i] = something
        continue

    # Unknown short word: try the spell checker first ...
    word2 = correction(word)
    if embeddings_index_ft.get(word2) is not None:
        embed_word(embedding_matrix,i,word2)
        continue

    # ... then the spell checker on the word with repeated letters collapsed ...
    word2 = correction(singlify(word))
    if embeddings_index_ft.get(word2) is not None:
        embed_word(embedding_matrix,i,word2)
        continue

    # ... and fall back to the "something" vector.
    embedding_matrix[i] = something
                    
                    

##
##   Begin writing the model here
##
# No model is built here: a fresh one is created for every fold inside the
# cross-validation loop, after the Keras session has been cleared. Building one
# at this point only cost time and memory before being discarded by
# K.clear_session() below.

batch_size = 32

# Used epochs=100 with early exiting for best score.
epochs = 1
gc.collect()
K.clear_session()

# Change to 10
num_folds = 2 #number of folds

# Accumulates the fold-averaged test predictions (one column per label).
predict = np.zeros((test.shape[0],6))

# Uncomment for out-of-fold predictions
#scores = []
#oof_predict = np.zeros((train.shape[0],6))

kf = KFold(n_splits=num_folds, shuffle=True, random_state=239)

# K-fold training: each fold trains a fresh model on its training split,
# restores the weights saved by the ROC-AUC callback and adds its share of the
# test predictions to `predict`.
for train_index, test_index in kf.split(x_train):
    
    # Split labels, token sequences and extra features for this fold.
    kfold_y_train,kfold_y_test = Y_train[train_index], Y_train[test_index]
    kfold_X_train = x_train[train_index]
    kfold_X_features = features[train_index]
    kfold_X_valid = x_train[test_index]
    kfold_X_valid_features = features[test_index] 
    
    # Release the previous fold's graph/memory before building a new model.
    gc.collect()
    K.clear_session()
    
    model = getModel(features)
    
    # Scores the held-out split after every epoch.
    ra_val = RocAucEvaluation(validation_data=([kfold_X_valid,kfold_X_valid_features], kfold_y_test), interval = 1)
    
    model.fit([kfold_X_train,kfold_X_features], kfold_y_train, batch_size=batch_size, epochs=epochs, verbose=1,
             callbacks = [ra_val])
    gc.collect()
    
    #model.load_weights(bst_model_path)
    # Restore the checkpoint written by the callback on its best-scoring epoch.
    model.load_weights("best_weights.h5")
    
    # Each fold contributes 1/num_folds of the final test prediction.
    predict += model.predict([x_test,test_features], batch_size=batch_size,verbose=1) / num_folds
    
    #gc.collect()
    # uncomment for out of fold predictions
    #oof_predict[test_index] = model.predict([kfold_X_valid, kfold_X_valid_features],batch_size=batch_size, verbose=1)
    #cv_score = roc_auc_score(kfold_y_test, oof_predict[test_index])
    
    #scores.append(cv_score)
    #print('score: ',cv_score)

print("Done")
#print('Total CV score is {}'.format(np.mean(scores)))    


# Write the fold-averaged predictions into the sample-submission template.
# The array is assigned positionally, so the test rows are presumed to be in
# the same order as sample_submission.csv -- TODO confirm.
sample_submission = pd.read_csv("sample_submission.csv")
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
sample_submission[class_names] = predict
# The output path is absolute and specific to the author's machine.
sample_submission.to_csv('/Users/nana/Documents/DataScience/ModuleIII/munks_rnn_5ep_submission.csv',index=False)                  
                    
