In [28]:
# Let's first import all of our packages. If you don't have a package installed, you can use !pip install package_name

# Pretty-printer used to display the best parameters found by the hyperparameter search
import pprint
pp = pprint.PrettyPrinter(indent = 4)

# Our standard packages for data science.
import os
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

# These are for augmenting and finding the data. We won't be using SKLearn much for actually modelling.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# These are the actual packages for deep learning. We will mostly use the high level keras package for tensorflow.
import tensorflow as tf

import keras
from keras.layers import *
from keras.callbacks import *
from keras.models import Model, Sequential
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.optimizers import *
from keras.layers import LeakyReLU, Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier

print('Packages are ready!')

Packages are ready!


In [29]:
# Let's import our data.
# Every path is a bare file name, so the CSVs are looked up relative to the
# current working directory.

# Surface-classification data (presumably the IMU dataset decoded further
# below - confirm). The files read with header=None get integer column
# names, so their single column is later selected with [0].
target = pd.read_csv('target.csv')
data = pd.read_csv('data.csv')
test = pd.read_csv('test.csv')
vdata = pd.read_csv('vdata.csv')
vtarget = pd.read_csv('vtarget.csv')
ltarget = pd.read_csv('ltarget.csv', header=None)
lvtarget = pd.read_csv('lvtarget.csv', header=None)
wdata = pd.read_csv('whole_data.csv')
wtarget = pd.read_csv('whole_target.csv', header=None)

# Wine data; its 'quality' column is split off as the target in the next cell.
winedata = pd.read_csv('winedata.csv')

# Credit card data; its 'Class' column is split off as the target in the next cell.
ccdata = pd.read_csv('creditcard.csv')

# Iris features and target are stored in separate files.
irisdata = pd.read_csv('irisdata.csv')
iristarget = pd.read_csv('iristarget.csv')

print('Data is ready!')

Data is ready!


In [30]:
# Let's go ahead and set up our data. First, make the wine target and drop it from the wine features.
# NOTE: this cell rebinds its own inputs, so running it a second time without
# reloading the CSVs fails (e.g. 'quality' has already been dropped).
winetarget = winedata['quality']
winedata = winedata.drop('quality', axis=1)

# Trim Time and Amount from ccdata, as those are features we don't want the model to learn from.
# Class is the target, so it is removed from the features as well.
cctarget = ccdata['Class']
ccdata = ccdata.drop(['Time','Amount', 'Class'], axis=1)

# Keep only the label column of each target frame (DataFrame -> Series).
iristarget = iristarget['target']

target = target['surface']
vtarget = vtarget['surface']
# These frames were read with header=None, so their only column is named 0.
ltarget = ltarget[0]
lvtarget = lvtarget[0]
# Drop the identifier columns and the label from the "whole" feature table.
wdata = wdata.drop(['series_id', 'group_id', 'surface'], axis=1)
wtarget = wtarget[0]

# Switch the feature tables to plain NumPy arrays; they are reshaped later on.
data = data.values
vdata = vdata.values
wdata = wdata.values
test = test.values

In [31]:
# Check that every feature set and its target line up by printing all of
# their shapes, one per line, in the same order as the variables are listed.
print(*(checked.shape for checked in (
    irisdata, iristarget,
    winedata, winetarget,
    ccdata, cctarget,
    target, vtarget, ltarget, lvtarget, wtarget,
    wdata, vdata, data,
)), sep='\n')
# The bare last expression is displayed as the cell's result.
test.shape

(150, 4)
(150,)
(6497, 11)
(6497,)
(284807, 28)
(284807,)
(2804,)
(1006,)
(358912,)
(128768,)
(487680,)
(487680, 23)
(128768, 23)
(358912, 23)


(488448, 23)

Perfect! All our data is set up properly and is ready to be worked on!


In [32]:
# Decoder for the IMUSD target: the position of each surface name in this
# list is the integer class id that maps back to it.
decode_dic = dict(enumerate([
    'fine_concrete',
    'concrete',
    'soft_tiles',
    'tiled',
    'soft_pvc',
    'hard_tiles_large_space',
    'carpet',
    'hard_tiles',
    'wood',
]))

Let's build an intelligent way to search over model configurations for the smaller datasets. Below is a great way to automatically find the best Keras model.

Here is another resource for keras hyperparameter optimization https://medium.com/@mikkokotila/a-comprehensive-list-of-hyperparameter-optimization-tuning-solutions-88e067f19d9

In [33]:
# Here we build a template for the gridsearch to work with

def build_model(optimizer, learning_rate, activation, dropout_rate,
                initilizer, num_unit, input_dim=784, num_classes=10):
    """Build and compile a two-hidden-layer dense classifier for the grid search.

    Args:
        optimizer: Keras optimizer class (not an instance); it is instantiated
            here with ``lr=learning_rate``.
        learning_rate: Learning rate handed to the optimizer.
        activation: Activation used by both hidden layers.
        dropout_rate: Dropout rate applied after each hidden layer.
        initilizer: Kernel initializer of the hidden layers. The spelling is
            kept because the name is part of the call interface.
        num_unit: Number of units in each hidden layer.
        input_dim: Length of one flattened input sample. Defaults to 784,
            the value that used to be hard-coded.
        num_classes: Number of softmax outputs. Defaults to 10, the value
            that used to be hard-coded.

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy and
        reporting accuracy.
    """
    # Drop the state of previously built models before creating a new one.
    keras.backend.clear_session()

    model = Sequential()
    model.add(Dense(num_unit, kernel_initializer=initilizer,
                    activation=activation, input_shape=(input_dim,)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(num_unit, kernel_initializer=initilizer,
                    activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer(lr=learning_rate),
                  metrics=['accuracy'])
    return model

In [34]:
from keras.datasets import mnist

# Load MNIST, flatten every image into a 784-long float32 vector and divide
# the pixel values by 255.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(60000, 784).astype('float32') / 255
x_test = x_test.reshape(10000, 784).astype('float32') / 255

In [35]:
# Candidate values for every hyperparameter the search may try.
# The trailing [:1] keeps only the first candidate of each list so the whole
# pipeline can be tested quickly; remove it to search over the full list.

# Fit arguments.
batch_size = [20, 50, 100][:1]

epochs = [1, 20, 50][:1]

# Kernel initializers for the hidden layers ("initilizer" matches the
# spelling of the build_model argument).
initilizer = ['lecun_uniform', 'normal', 'he_normal', 'he_uniform'][:1]

learning_rate = [0.1, 0.001, 0.02][:1]

dropout_rate = [0.3, 0.2, 0.8][:1]

# Units per hidden layer.
num_unit = [32, 10, 5][:1]

activation = ['relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear'][:1]

# Optimizer classes, not instances: build_model instantiates the chosen one
# with the learning rate.
optimizer = [SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam][:1]

In [36]:
# Now let's make our grid search.
# `parameters` maps each hyperparameter name to its list of candidate values.
parameters = {
    'batch_size': batch_size,
    'epochs': epochs,
    'dropout_rate': dropout_rate,
    'num_unit': num_unit,
    'initilizer': initilizer,
    'learning_rate': learning_rate,
    'activation': activation,
    'optimizer': optimizer,
}

# Wrap the Keras model builder so scikit-learn can drive it as an estimator.
model = KerasClassifier(build_fn=build_model, verbose=0)

# Single-process search over the grid defined above.
models = GridSearchCV(estimator=model, param_grid=parameters, n_jobs=1)

In [37]:
# Here we will be given our best model from the search.
# The larger the model and the more options given, the longer it will take to train.
# NOTE(review): best_model is whatever models.fit() returns - presumably the
# fitted search object itself rather than a Keras model; confirm. Only its
# best_params_ attribute is used below.
best_model = models.fit(x_train, y_train)
print('Best model :')
pp.pprint(best_model.best_params_)

Best model :
{   'activation': 'relu',
    'batch_size': 20,
    'dropout_rate': 0.3,
    'epochs': 1,
    'initilizer': 'lecun_uniform',
    'learning_rate': 0.1,
    'num_unit': 32,
    'optimizer': <class 'keras.optimizers.SGD'>}


Cool! Now let's tackle the big one. We will start with just a standard dense model with 3 hidden layers. You can use the earlier method to search for the best model again, but it will take a VERY long time!

In [38]:
# We need to make sure the data is the right shape for training: one row per
# reading with 23 features each. -1 lets NumPy infer the number of rows, so
# the cell keeps working if the number of samples in the CSV files changes
# (the row counts used to be hard-coded).
data = data.reshape((-1, 23))
vdata = vdata.reshape((-1, 23))
test = test.reshape((-1, 23))

In [39]:
num_classes = 9

# Here is a standard Keras fully connected model: three identical hidden
# blocks (bias-free, L2-regularised Dense -> BatchNormalization -> tanh ->
# Dropout) followed by a softmax classifier.
model = Sequential()

# Only the first Dense layer declares the input size (23 features).
for dense_kwargs in ({'input_dim': 23}, {}, {}):
    model.add(Dense(64, use_bias=False,
                    kernel_regularizer=regularizers.l2(0.001), **dense_kwargs))
    model.add(BatchNormalization())
    model.add(Activation('tanh'))
    model.add(Dropout(0.15))

model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [40]:
# Let's train the model using RMSprop with a small learning rate and decay.
opt = keras.optimizers.RMSprop(lr=0.0001, decay=1e-6)

# The sparse loss takes integer class ids as targets instead of one-hot rows.
model.compile(optimizer=opt,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [41]:
# AAAAND start the training.
# Every row of `data` is used as an independent sample here; ltarget and
# lvtarget have one label per row (see the shapes printed earlier).
# The returned History object is displayed as the cell's result.
model.fit(data, ltarget,
              batch_size=128,
              epochs=5,
              validation_data=(vdata, lvtarget),
              shuffle=True)

Train on 358912 samples, validate on 128768 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x17905300320>

Now let's make an LSTM to read the data as a time series. I used an attention layer because it's cool, but it isn't really needed. Here is a great article on attention for deep learning:

https://medium.com/syncedreview/a-brief-overview-of-attention-mechanism-13c578ba9129

In [42]:
# Reshape the data again for the time-series model, as LSTMs need 3 input
# dimensions: (series, 128 time steps, 23 features). -1 lets NumPy infer the
# number of series, so the cell keeps working if the number of samples in the
# CSV files changes (the series counts used to be hard-coded).
data = data.reshape((-1, 128, 23))
vdata = vdata.reshape((-1, 128, 23))
test = test.reshape((-1, 128, 23))

In [43]:
class Attention(Layer):
    """Attention pooling over the time axis of a 3-D tensor.

    Each time step is scored with a learned weight vector (plus an optional
    per-step bias), the scores are squashed with tanh, exponentiated and
    normalised over the steps, and the input is summed over the time axis
    using those weights: (batch, step_dim, features) -> (batch, features).

    Args:
        step_dim: Number of time steps of the input; used to fold the scores
            back into a (batch, step_dim) matrix.
        W_regularizer: Regularizer for the scoring vector ``W``.
        b_regularizer: Regularizer for the bias ``b``.
        W_constraint: Constraint for ``W``.
        b_constraint: Constraint for ``b``.
        bias: Whether to add a learned per-step bias to the scores.
        **kwargs: Forwarded to ``Layer.__init__``.
    """

    def __init__(self, step_dim, W_regularizer=None, b_regularizer=None, W_constraint=None, b_constraint=None, bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input shape is known.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring vector ``W`` and, if enabled, the bias ``b``."""
        assert len(input_shape) == 3
        self.features_dim = input_shape[-1]
        # name/shape are passed by keyword so the call does not depend on the
        # legacy positional-shape form of add_weight.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True

    def compute_mask(self, input, input_mask=None):
        # The time axis is summed away in call(), so no mask is propagated.
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over its time axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score every step: (batch * steps, features) . (features, 1),
        # folded back into (batch, steps).
        eij = K.reshape(
            K.dot(K.reshape(x, (-1, features_dim)),
                  K.reshape(self.W, (features_dim, 1))),
            (-1, step_dim))
        if self.bias:
            eij += self.b
        eij = K.tanh(eij)

        a = K.exp(eij)
        # Zero the weight of masked steps before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())
        # epsilon guards against a division by zero when all weights are zero.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        # Taken from input_shape directly so the result does not depend on
        # build() having run; build() sets features_dim to the same value.
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created.

        Without this, only the base Layer config is saved and rebuilding the
        layer from it fails because ``step_dim`` is missing.
        """
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [44]:
# Now lets make the keras model. This time we will not use a sequential model but instead a functional api model.
def make_model():
    """Build and compile the bidirectional LSTM + attention classifier.

    The network takes sequences of 128 steps with 23 features, runs them
    through a bidirectional CuDNNLSTM that returns the full sequence, pools
    the steps with the Attention layer, and classifies into 9 classes.

    Returns:
        A Keras ``Model`` compiled with sparse categorical cross-entropy,
        the 'adam' optimizer and the accuracy metric.
    """
    sequence_in = Input(shape=(128, 23))

    encoded = Bidirectional(CuDNNLSTM(32, return_sequences=True))(sequence_in)
    pooled = Attention(128)(encoded)

    hidden = Dense(32, activation='relu')(pooled)
    hidden = Dropout(.5)(hidden)
    class_probs = Dense(9, activation="softmax")(hidden)

    model = Model(inputs=sequence_in, outputs=class_probs)
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [45]:
# Here we define our k-folds and our validation data.
def k_folds(X, y, X_test, k=5):
    """Train one model per stratified fold and collect its predictions.

    Args:
        X: Training samples as a NumPy array, indexed along the first axis.
        y: Integer class label per training sample (array-like or Series).
        X_test: Samples to predict with every fold's model.
        k: Number of stratified folds.

    Returns:
        A tuple ``(y_oof, y_test)``: ``y_oof`` holds, for every training
        sample, the class predicted by the model that did not train on it;
        ``y_test`` is the sum over all folds of the per-class predictions
        for ``X_test`` (take the argmax to get class ids).
    """
    # The fold indices are positions. A pandas Series would be indexed by
    # label instead, which silently picks the wrong rows (or fails) when its
    # index is not 0..n-1, so work on a plain array.
    y = np.asarray(y)

    folds = list(StratifiedKFold(n_splits=k).split(X, y))
    # Allocated from the first prediction so the number of classes follows
    # the model instead of being hard-coded.
    y_test = None
    y_oof = np.zeros((X.shape[0]))

    for i, (train_idx, val_idx) in enumerate(folds):
        print(f'Fold {i+1}')
        model = make_model()
        # validation_data is passed as the documented (x, y) tuple.
        model.fit(X[train_idx], y[train_idx], batch_size=128, epochs=100,
                  validation_data=(X[val_idx], y[val_idx]), verbose=0)

        pred_val = np.argmax(model.predict(X[val_idx]), axis=1)
        # accuracy_score expects (y_true, y_pred).
        score = accuracy_score(y[val_idx], pred_val)
        y_oof[val_idx] = pred_val

        print(f'Scored {score:.3f} on validation data')

        fold_test_pred = model.predict(X_test)
        if y_test is None:
            y_test = np.zeros(fold_test_pred.shape)
        y_test += fold_test_pred

    return y_oof, y_test

In [46]:
# LET THE TEST BEGIN!
# Run 5-fold cross-validation on the sequence data. y_oof receives the
# out-of-fold class predictions for `data`, and y_test the per-class
# predictions for `test` accumulated over all folds.
y_oof, y_test = k_folds(data, target, test, k=5)

Fold 1
Scored 0.616 on validation data
Fold 2
Scored 0.607 on validation data
Fold 3
Scored 0.624 on validation data
Fold 4
Scored 0.611 on validation data
Fold 5
Scored 0.567 on validation data


In [47]:
# Cool, so here is our output: pick the highest-scoring class for every test
# sequence and show the resulting class ids.
y_test = y_test.argmax(axis=1)
y_test

array([3, 1, 3, ..., 6, 1, 8], dtype=int64)

Now let's make the same model but use our validation data instead of K-folds! You will notice that the validation score is much lower, but this is actually because the model is NOT overfitting to the validation data. So K-folds may not be the best choice for every deep learning use case. Instead, you want to create actual validation data that the model is never trained on!

In [48]:
# We are reusing the architecture from the last section: make_model() builds
# a new model each time it is called, so training starts from scratch here.
model = make_model()

In [49]:
# We are using RMSprop again, but feel free to try new things!
# Compiling again replaces the 'adam' optimizer that make_model() configured.
opt = keras.optimizers.RMSprop(lr=0.0001, decay=1e-6)
model.compile(optimizer=opt,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [50]:
# Train on the sequence data and validate on the separate validation set
# (vdata / vtarget), which is never used for training in this cell.
# The returned History object is displayed as the cell's result.
model.fit(data, target,
              batch_size=128,
              epochs=10,
              validation_data=(vdata, vtarget),
              shuffle=True)

Train on 2804 samples, validate on 1006 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x17903a6a9b0>

That's it! Feel free to check out my other notebooks or the Keras documentation for more deep learning goodness!

https://keras.io/