# Model wrapping 

## Imports

In [5]:
import pickle as pickle
import pandas as pd
import numpy as np

#keras to prepare data
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

#layers
from keras.layers import Embedding, Input, Conv1D, MaxPooling1D, Flatten, Dense, SpatialDropout1D, Dropout
from keras.layers.merge import Concatenate

#models
from keras.models import Sequential, Model

#train test split
from sklearn.model_selection import train_test_split

#for custom metric function
import tensorflow as tf
import keras.backend as K

#metrics
from sklearn.metrics import roc_curve, roc_auc_score, auc

#optimizers
from keras.optimizers import Adam

#wrapper
from keras.wrappers.scikit_learn import KerasClassifier

## Loading data and preparing data

In [3]:
# Load the labelled corpus: a header-less CSV whose two columns are named label and text.
raw_data = pd.read_csv('../data/class_data_001.csv', header = None)
raw_data.columns = ['label', 'text']
labels = raw_data['label'].values
texts = raw_data['text'].tolist()

# Tokenize the texts with the Keras Tokenizer: fit it on the corpus, then
# convert every text into a sequence of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

# The longest sequence defines the size of the model's input vector;
# every shorter text has to be padded up to this length.
MAX_SEQUENCE_LENGTH = max(len(seq) for seq in sequences)

# Pad the sequences to MAX_SEQUENCE_LENGTH (zeros are added at the beginning).
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Number of components per vector in the GloVe file read below.
EMBEDDING_DIM = 50

# Parse the pre-trained GloVe vectors into a dict {token: vector}.
# Each line of the file is "<token> <v1> <v2> ...", separated by whitespace.
embeddings_index = {}
# The context manager closes the file even if parsing a line raises.
# NOTE(review): the file is decoded with the platform default encoding; if
# decoding fails on Python 3, pass encoding='utf-8' to open() - confirm.
with open('../data/glove.6B.50d.txt') as f:
    for line in f:
        values = line.split()
        # The first element is the token itself, kept as a string.
        word = values[0]
        # The remaining elements are the components of the real-valued vector.
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

def creat_emb(random_init = False):
    """Build the embedding weight matrix for the tokenizer vocabulary.

    The matrix has ``len(word_index) + 1`` rows and ``EMBEDDING_DIM`` columns.
    Row ``i`` receives the pre-trained vector of the word whose index in
    ``word_index`` is ``i``, when that word is present in ``embeddings_index``.

    Parameters
    ----------
    random_init : bool, default False
        What to do with words that have no pre-trained vector. If False their
        rows stay all-zero; if True their rows are filled with values drawn
        by ``np.random.rand``.

    Returns
    -------
    numpy.ndarray
        The embedding matrix described above.
    """
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        elif random_init:
            # Use EMBEDDING_DIM rather than a literal 50 so the random vector
            # always matches the matrix width.
            embedding_matrix[i] = np.random.rand(EMBEDDING_DIM)
        # Otherwise the row is left untouched, i.e. all zeros.
    return embedding_matrix

emb = creat_emb()

## Creating wrapper

In [43]:
# build_fn for the scikit-learn wrapper. Its keyword arguments are the knobs
# that become tunable through the sklearn interface; here that is the number
# of convolution filters.
def create_simple_CNN(filter_num = 64):
    """Build and compile the simple 1D-convolutional text classifier.

    Parameters
    ----------
    filter_num : int, default 64
        Number of filters of the Conv1D layer.

    Returns
    -------
    A compiled Keras ``Sequential`` model ending in a single sigmoid unit.
    """
    # Embedding layer initialised from the `emb` matrix and kept frozen.
    vocab_size = len(word_index) + 1
    frozen_embedding = Embedding(vocab_size,
                                 EMBEDDING_DIM,
                                 weights=[emb],
                                 input_length=MAX_SEQUENCE_LENGTH,
                                 trainable=False)

    layer_stack = [
        frozen_embedding,
        SpatialDropout1D(0.2),
        Dropout(0.25),
        # filter_num is the parameter exposed through the wrapper
        Conv1D(filter_num, 5, padding='same', activation='relu'),
        Dropout(0.25),
        MaxPooling1D(),
        Flatten(),
        Dense(100, activation='relu'),
        Dropout(0.7),
        Dense(1, activation='sigmoid'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    # Only accuracy is tracked on the Keras side: MCC is not added here
    # because scoring is done with sklearn's own metrics.
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['acc'])
    return model

# Wrap the build function with
# keras.wrappers.scikit_learn.KerasClassifier(build_fn=None, **sk_params)
# to get an instance of the CNN that follows the scikit-learn interface.
# The default values for the parameters are supplied at this step as well.
sklearn_model = KerasClassifier(
    build_fn=create_simple_CNN,
    epochs=5,
    batch_size=32,
    verbose=0,
)

## Testing the interface

In [11]:
# Conventional scikit-learn names for the padded sequences and their labels.
X, y = data, labels

In [15]:
%%time
from sklearn.model_selection import cross_val_score

#cross val score will predict accuracy by default
print( 'Cross validation accuracy is {}'.format(cross_val_score(sklearn_model, X, y, cv = 5).mean()) )

Cross validation accuracy is 0.833113565259
CPU times: user 1min 11s, sys: 11.7 s, total: 1min 23s
Wall time: 37 s


In [39]:
%%time
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer

# class sklearn.metrics.matthews_corrcoef is not a scorer it is a metric so i have to make scorer
skl_mcc = make_scorer(matthews_corrcoef)

sklearn_model = KerasClassifier(build_fn = create_simple_CNN, epochs=5, batch_size=32, verbose=0)
print( 'Cross validation mcc is {}'.format(cross_val_score(sklearn_model, X, y, cv = 5, scoring = skl_mcc)) )

Cross validation mcc is [ 0.          0.          0.          0.73930848  0.        ]
CPU times: user 1min 23s, sys: 12.5 s, total: 1min 36s
Wall time: 58.5 s


In [40]:
from sklearn.model_selection import StratifiedKFold

# Pass an explicit stratified, shuffled splitter instead of a bare integer.
# random_state is fixed so the folds are the same on every run; without it
# shuffle=True produces a different split each time the cell is executed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print( 'Cross validation mcc is {}'.format(cross_val_score(sklearn_model, X, y, cv = kfold, scoring = skl_mcc)) )

Cross validation mcc is [ 0.76873419  0.72804584  0.7680874   0.6925121   0.76535267]


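According to the documentation, `cv=INT` uses `StratifiedKFold` with INT folds, but only when the estimator is a classifier and `y` is binary or multiclass; in all other cases plain `KFold` (without shuffling) is used. The MCC scores above suggest that scikit-learn does not recognise the `KerasClassifier` wrapper as a classifier, so `cv=5` fell back to unstratified `KFold` and most test folds contained a single class. Passing an explicit `StratifiedKFold` object fixes this.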

The training time is surprisingly low.

In [28]:
# cross_val_score fits clones of the estimator, so `sklearn_model` itself has
# not been trained at this point; fit it on the full data before predicting,
# otherwise this cell fails on a fresh kernel.
sklearn_model.fit(X, y)

# Predicted class labels for the first five samples.
sklearn_model.predict(X[:5,:])

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [29]:
# Class probabilities for the last five samples (one column per class).
sklearn_model.predict_proba(X[-5:])

array([[ 0.10878825,  0.89121175],
       [ 0.20608097,  0.79391903],
       [ 0.28616792,  0.71383208],
       [ 0.16331297,  0.83668703],
       [ 0.19768989,  0.80231011]], dtype=float32)