In [1]:
# Lets first import our packages for everything. If you don't have a package installed you can use !pip install package_name

# These are for automatic hyperparameter optimization
import pprint
pp = pprint.PrettyPrinter(indent = 4)

# Our standard packages for data science.
import os
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

# These are for augmenting and finding the data. We won't be using SKLearn much for actually modelling.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.preprocessing import OneHotEncoder

# These are the actual packages for deep learning. We will mostly use the high level keras package for tensorflow.
import tensorflow as tf

import keras
from keras.layers import *
from keras.callbacks import *
from keras.models import Model, Sequential
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.optimizers import *
from keras.layers import LeakyReLU, Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier

print('Packages are ready!')

Packages are ready!


Using TensorFlow backend.


In [2]:
# Lets import our data
target = pd.read_csv('target.csv')
data = pd.read_csv('data.csv')
test = pd.read_csv('test.csv')
vdata = pd.read_csv('vdata.csv')
vtarget = pd.read_csv('vtarget.csv')
ltarget = pd.read_csv('ltarget.csv', header=None)
lvtarget = pd.read_csv('lvtarget.csv', header=None)
wdata = pd.read_csv('whole_data.csv')
wtarget = pd.read_csv('whole_target.csv', header=None)

winedata = pd.read_csv('winedata.csv')

ccdata = pd.read_csv('creditcard.csv')

irisdata = pd.read_csv('irisdata.csv')
iristarget = pd.read_csv('iristarget.csv')

print('Data is ready!')

Data is ready!


In [3]:
# Lets go ahead and set up our data. First lets make our target and drop it from wine.
winetarget = winedata['quality']
winedata = winedata.drop('quality', axis=1)

# Lets trim off time and amount from ccdata as those are independent features we don't want the model to learn.
cctarget = ccdata['Class']
ccdata = ccdata.drop(['Time','Amount', 'Class'], axis=1)

iristarget = iristarget['target']

target = target['surface']
vtarget = vtarget['surface']
ltarget = ltarget[0]
lvtarget = lvtarget[0]
wdata = wdata.drop(['series_id', 'group_id', 'surface'], axis=1)
wtarget = wtarget[0]

irisd, id_test, irist, it_test = train_test_split(irisdata, iristarget, test_size=0.33, random_state=42)
wined, wd_test, winet, wt_test = train_test_split(winedata, winetarget, test_size=0.33, random_state=42)
ccd, ccd_test, cct, cct_test = train_test_split(ccdata, cctarget, test_size=0.33, random_state=42)

data = data.values
vdata = vdata.values
wdata = wdata.values
test = test.values
# irisdata = irisdata.values
# ccdata = ccdata.values
# winedata = winedata.values
# id_test = id_test.values
# ccd_test = ccd_test.values
# wd_test = wd_test.values

In [4]:
# Now lets make sure our train and target variables are even for every dataset
print(irisdata.shape)
print(iristarget.shape)

print(winedata.shape)
print(winetarget.shape)

print(ccdata.shape)
print(cctarget.shape)

print(target.shape)
print(vtarget.shape)
print(ltarget.shape)
print(lvtarget.shape)
print(wtarget.shape)
print(wdata.shape)
print(vdata.shape)
print(data.shape)
test.shape

(150, 4)
(150,)
(6497, 11)
(6497,)
(284807, 28)
(284807,)
(2804,)
(1006,)
(358912,)
(128768,)
(487680,)
(487680, 23)
(128768, 23)
(358912, 23)


(488448, 23)

Perfect! All our data is set up properally and is ready to be worked on!

One large problem we always need to think about when creating a keras model is if we want to go 'deep' or 'wide'.

Here is a good example of the difference.

<img src='http://www.coldvision.io/wp-content/uploads/2016/07/dnn_ann_vs_dnn.png' />

Having only a single, very large hidden layer is what we call a 'wide' model. Having many small hidden layers are what we call a 'deep' model. A wide model can learn any function but it will also be prone to overfitting. A deep model generalizes better but can take longer to train depending on the layers you have. So in deep learning its all about finding the balance between the two models. You want to make the model wide enough to where it learns but no wider and then as deep as you can computaionally afford!

One of the other ways to improve neural networks and reduce the chance of overfitting is by introducing a concept called 'dropout'. It randomly by some value between 0 and 1 it will stop neurons in the network from working. This forces the network to learn new paths down its layers. This can stop the model from getting stuck at local maxima during Backpropagation.

https://medium.com/@amarbudhiraja/https-medium-com-amarbudhiraja-learning-less-to-learn-better-dropout-in-deep-machine-learning-74334da4bfc5

https://arxiv.org/pdf/1902.06720.pdf

In [5]:
# This is our decoder for the target of the IMUSD
decode_dic = {0: 'fine_concrete',
              1: 'concrete',
              2: 'soft_tiles',
              3: 'tiled',
              4: 'soft_pvc',
              5: 'hard_tiles_large_space',
              6: 'carpet',
              7: 'hard_tiles',
              8: 'wood'}

Lets make an intelligent way to search our model creationg for the smaller datasets. Below is a great way to automatically find the best keras model.

Here is another resource for keras hyperparameter optimization https://medium.com/@mikkokotila/a-comprehensive-list-of-hyperparameter-optimization-tuning-solutions-88e067f19d9

Lets start with wine

In [6]:
# Here we build a template for the gridsearch to work with

input_shape = 11
num_classes = 1

def build_model(optimizer, learning_rate, activation, dropout_rate, num_unit):
    keras.backend.clear_session()
    
    model = Sequential()
    model.add(Dense(num_unit, activation=activation, input_shape=(input_shape,)))
    model.add(Dropout(dropout_rate))
    
    model.add(Dense(num_unit, activation=activation))
    model.add(Dropout(dropout_rate)) 
    
    model.add(Dense(num_classes, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer(lr=learning_rate),
                  metrics=['accuracy'])
    
    return model

In [7]:
# This is the list of options we are giving our model.
# The more parameters you have the longer it will take by x!
# I also highly recommend that you use this with gpu enabled.

batch_size = [100, 200]

epochs = [10]

learning_rate = [0.1, 0.001, 0.01]

dropout_rate = [0.3, 0.2, 0.1]

num_unit = [64, 32]

activation = ['relu', 'tanh']

optimizer = [SGD, RMSprop, Adam]

In [8]:
# Now lets make our gridsearch
# parameters is a dict with all values

parameters = dict(batch_size = batch_size,
                  epochs = epochs,
                  dropout_rate = dropout_rate,
                  num_unit = num_unit,
                  learning_rate = learning_rate,
                  activation = activation,
                  optimizer = optimizer)

model = KerasClassifier(build_fn=build_model, verbose=0)

models = GridSearchCV(estimator = model, param_grid=parameters, n_jobs=-1)

Here we will be given our best wine model from the search.
If this code fails then you will need to change the keras file.

https://stackoverflow.com/a/52132383/9975219

In [9]:
best_model = models.fit(wined, winet, validation_data=(wd_test, wt_test), shuffle=True)
print('Best model :')
pp.pprint(best_model.best_params_)



Best model :
{   'activation': 'tanh',
    'batch_size': 200,
    'dropout_rate': 0.1,
    'epochs': 10,
    'learning_rate': 0.01,
    'num_unit': 64,
    'optimizer': <class 'keras.optimizers.Adam'>}


In [10]:
# Now with this info lets see how well our best model does then we will redo this process for CCD and IRIS

input_shape = 11
num_classes = 1
opt = keras.optimizers.Adam(lr=0.01)
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(input_shape,)))
model.add(Dropout(0.1))
    
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.1)) 
    
model.add(Dense(num_classes, activation='sigmoid'))
    
model.compile(loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy'])

In [11]:
model.fit(wined, winet,
              batch_size=200,
              epochs=10,
              validation_data=(wd_test, wt_test),
              shuffle=True)

Train on 4352 samples, validate on 2145 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2a2daa3c8d0>

In [12]:
model.evaluate(wd_test, wt_test)



[0.5961327495135905, 0.6526806526806527]

We see that our model is capping around .75%. We could improve this by hand tunning the model. Or we could do some more feature engineering to the model to give it more to learn from.

In [17]:
# Lets do CCD first

input_shape = 28
num_classes = 1

def build_model(optimizer, learning_rate, activation, dropout_rate, num_unit):
    keras.backend.clear_session()
    
    model = Sequential()
    model.add(Dense(num_unit, activation=activation, input_shape=(input_shape,)))
    model.add(Dropout(dropout_rate))
    
    model.add(Dense(num_unit, activation=activation))
    model.add(Dropout(dropout_rate)) 
    
    model.add(Dense(num_classes, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer(lr=learning_rate),
                  metrics=['accuracy'])
    
    return model

In [18]:
# We could change our hyperparameters buuuuuutttttt nah.

batch_size = [100, 200]

epochs = [10]

learning_rate = [0.1, 0.001, 0.01]

dropout_rate = [0.3, 0.2, 0.1]

num_unit = [64, 32]

activation = ['relu', 'tanh']

optimizer = [SGD, RMSprop, Adam]

In [19]:
# Another GridSearch!

parameters = dict(batch_size = batch_size,
                  epochs = epochs,
                  dropout_rate = dropout_rate,
                  num_unit = num_unit,
                  learning_rate = learning_rate,
                  activation = activation,
                  optimizer = optimizer)

model = KerasClassifier(build_fn=build_model, verbose=0)

models = GridSearchCV(estimator = model, param_grid=parameters, n_jobs=-1)

In [20]:
best_model = models.fit(ccd, cct, validation_data=(ccd_test, cct_test), shuffle=True)
print('Best model :')
pp.pprint(best_model.best_params_)

Best model :
{   'activation': 'relu',
    'batch_size': 100,
    'dropout_rate': 0.1,
    'epochs': 10,
    'learning_rate': 0.001,
    'num_unit': 64,
    'optimizer': <class 'keras.optimizers.Adam'>}


In [21]:
# Awesome. That one probably took longer huh?

input_shape = 28
num_classes = 1
opt = keras.optimizers.Adam(lr=0.01)
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(input_shape,)))
model.add(Dropout(0.1))
    
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.1)) 
    
model.add(Dense(num_classes, activation='sigmoid'))
    
model.compile(loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy'])

In [22]:
model.fit(ccd, cct,
              batch_size=100,
              epochs=10,
              validation_data=(ccd_test, cct_test),
              shuffle=True)

Train on 190820 samples, validate on 93987 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2a2a34e0c88>

In [23]:
model.evaluate(ccd_test, cct_test)



[0.003477105172812994, 0.9994041729175311]

We see that our data is doing better then just guessing everything 'not fruad'. We could improve our model like we did for the SKLearn models with SMOTE or some other under sampling approach. We could also make our data more sensitive to the minority 
class using the Neual Net weights. Which is what we are doing in this notebook!

https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html

In [24]:
# First lets SMOTE our data. We are only increasing the minority sample to 50% of the majority class.
# This will hold that we have a minority class and that we need to use the weights to adjust for it.
from imblearn.over_sampling import SMOTE
sm = SMOTE(sampling_strategy=0.5, random_state=2)

ccd_SMOTE, cct_SMOTE = sm.fit_resample(ccd,cct)

In [25]:
# Lets see how many new minority samples there are.
print(len(cct))
len(cct_SMOTE)

190820


285715

In [26]:
# Lets rerun that code but add the 'class_weight' dict.

input_shape = 28
num_classes = 1
opt = keras.optimizers.Adam(lr=0.001)
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(input_shape,)))
model.add(Dropout(0.1))
    
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.1)) 
    
model.add(Dense(num_classes, activation='sigmoid'))
    
model.compile(loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy'])

In [27]:
class_weight = {0: 1.0,
                1: 2.0}

model.fit(ccd, cct,
              batch_size=200,
              epochs=30,
              validation_data=(ccd_test, cct_test),
              class_weight=class_weight,
              shuffle=True)

Train on 190820 samples, validate on 93987 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x2a2abacceb8>

In [28]:
model.evaluate(ccd_test, cct_test)



[0.003164525856868862, 0.9994360922255205]

We see some improvement!! We could get to 100% if you fine tune and do the process of gridsearch again with a deeper and more diverse pool for the search. That would take a day and if I was doing this as a product to find fraud then I would!

So here we need to do somethings different. For multiclass classification we need to use 'sparse_categorical_crossentropy' or just regular 'categorical_crossentropy' for the loss instead of 'binary_crossentropy'. This is because our model will fail in general but also it won't learn. We will also need to change the final output activation to 'softmax' instead of 'sigmoid' as sigmoid does not work for multiclass classification! This is very important and easy to forget.

In [36]:
# another template, this time for iris.

input_shape = 4
num_classes = 3

def build_model(optimizer, learning_rate, activation, dropout_rate, num_unit):
    keras.backend.clear_session()
    
    model = Sequential()
    model.add(Dense(num_unit, activation=activation, input_shape=(input_shape,)))
    model.add(Dropout(dropout_rate))
    
    model.add(Dense(num_unit, activation=activation))
    model.add(Dropout(dropout_rate)) 
    
    model.add(Dense(num_classes, activation='softmax'))
    
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=optimizer(lr=learning_rate),
                  metrics=['accuracy'])
    
    return model

In [37]:
# Our hyperparameters

batch_size = [100, 200]

epochs = [10]

learning_rate = [0.1, 0.001, 0.01]

dropout_rate = [0.3, 0.2, 0.1]

num_unit = [64, 32]

activation = ['relu', 'tanh']

optimizer = [SGD, RMSprop, Adam]

In [38]:
# Now lets make our gridsearch again.

parameters = dict(batch_size = batch_size,
                  epochs = epochs,
                  dropout_rate = dropout_rate,
                  num_unit = num_unit,
                  learning_rate = learning_rate,
                  activation = activation,
                  optimizer = optimizer)

model = KerasClassifier(build_fn=build_model, verbose=0)

models = GridSearchCV(estimator = model, param_grid=parameters, n_jobs=-1)

In [39]:
# Thank god this one won't take long!
best_model = models.fit(irisd, irist, validation_data=(id_test, it_test), shuffle=True)
print('Best model :')
pp.pprint(best_model.best_params_)



Best model :
{   'activation': 'tanh',
    'batch_size': 100,
    'dropout_rate': 0.3,
    'epochs': 10,
    'learning_rate': 0.01,
    'num_unit': 32,
    'optimizer': <class 'keras.optimizers.Adam'>}


In [40]:
# Last time lets make our best model!

input_shape = 4
num_classes = 3
opt = keras.optimizers.Adam(lr=0.01)
model = Sequential()
model.add(Dense(32, activation='tanh', input_shape=(input_shape,)))
model.add(Dropout(0.3))
    
model.add(Dense(32, activation='tanh'))
model.add(Dropout(0.3)) 
    
model.add(Dense(num_classes, activation='softmax'))
    
model.compile(loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'])

In [42]:
model.fit(irisd, irist,
              batch_size=100,
              epochs=10,
              validation_data=(id_test, it_test),
              shuffle=True)

Train on 100 samples, validate on 50 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2a2acc93fd0>

In [43]:
model.evaluate(id_test, it_test)



[0.3193650126457214, 0.9599999904632568]

Cool Now lets tackle the big boy. We will start with just a standard dense model with 3 hidden layers. You can use the earlier method to create the best model again but it will take a VERY long time!!!  

In [44]:
# We need to make sure the data is the right shape for training.
data = data.reshape((358912, 23))
vdata = vdata.reshape((128768, 23))
test = test.reshape((488448, 23))

In [45]:
model = Sequential()

num_classes = 9
input_dim = 23

# Here is a standard keras fully connected model.
# The reason I am using 'tanh' instead of 'relu' is that about half the variables are negative.
model.add(Dense(64, input_dim=input_dim, use_bias=False, kernel_regularizer=regularizers.l2(0.001)))
model.add(BatchNormalization())
model.add(Activation('tanh'))
model.add(Dropout(0.15))

model.add(Dense(64, use_bias=False, kernel_regularizer=regularizers.l2(0.001)))
model.add(BatchNormalization())
model.add(Activation('tanh'))
model.add(Dropout(0.15))

model.add(Dense(64, use_bias=False, kernel_regularizer=regularizers.l2(0.001)))
model.add(BatchNormalization())
model.add(Activation('tanh'))
model.add(Dropout(0.15))

model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [46]:
# Let's train the model using RMSprop.
opt = keras.optimizers.rmsprop(lr=0.0001, decay=1e-6)

model.compile(loss='sparse_categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

In [47]:
# AAAAND start the training.
model.fit(data, ltarget,
              batch_size=128,
              epochs=10,
              validation_data=(vdata, lvtarget),
              shuffle=True)

Train on 358912 samples, validate on 128768 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2a2accaf828>

Now lets make an lstm to read the data as a time series. I used an attention layer because it's cool but really it's not needed. Here is a great article on attention for DL.

https://medium.com/syncedreview/a-brief-overview-of-attention-mechanism-13c578ba9129

In [48]:
# Reshape the data again for the Time Series model as LSTM's need 3 input dimensions.
data = data.reshape((2804, 128, 23))
vdata = vdata.reshape((1006, 128, 23))
test = test.reshape((3816, 128, 23))

In [49]:
class Attention(Layer):
    def __init__(self, step_dim, W_regularizer=None, b_regularizer=None, W_constraint=None, b_constraint=None, bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)
        
    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = self.add_weight((input_shape[-1],), initializer=self.init, name='{}_W'.format(self.name), regularizer=self.W_regularizer, constraint=self.W_constraint)
        self.features_dim = input_shape[-1]
        if self.bias:
            self.b = self.add_weight((input_shape[1],), initializer='zero', name='{}_b'.format(self.name), regularizer=self.b_regularizer, constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
        if self.bias: eij += self.b
        eij = K.tanh(eij)
        a = K.exp(eij)
        if mask is not None: a *= K.cast(mask, K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

In [50]:
# Now lets make the keras model. This time we will not use a sequential model but instead a functional api model.
def make_model():
    inp = Input(shape=(128, 23))
    x = Bidirectional(CuDNNLSTM(32, return_sequences=True))(inp)
    x = Attention(128)(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(.5)(x)
    x = Dense(9, activation="softmax")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [51]:
# Here we define our k-folds and our validation data.
def k_folds(X, y, X_test, k=5):
    folds = list(StratifiedKFold(n_splits=k).split(X, y))
    y_test = np.zeros((X_test.shape[0], 9))
    y_oof = np.zeros((X.shape[0]))
    
    for i, (train_idx, val_idx) in  enumerate(folds):
        print(f'Fold {i+1}')
        model = make_model()
        model.fit(X[train_idx], y[train_idx], batch_size=128, epochs=100, 
                  validation_data=[X[val_idx], y[val_idx]], verbose=0)
        
        pred_val = np.argmax(model.predict(X[val_idx]), axis=1)
        score = accuracy_score(pred_val, y[val_idx])
        y_oof[val_idx] = pred_val
        
        print(f'Scored {score:.3f} on validation data')
        
        y_test += model.predict(X_test)
        
    return y_oof, y_test  

In [52]:
# LET THE TEST BEGIN!
y_oof, y_test = k_folds(data, target, test, k=5)

Fold 1
Scored 0.621 on validation data
Fold 2
Scored 0.611 on validation data
Fold 3
Scored 0.629 on validation data
Fold 4
Scored 0.608 on validation data
Fold 5
Scored 0.618 on validation data


In [53]:
# Cool, so here is our output.
y_test = np.argmax(y_test, axis=1)
y_test

array([3, 1, 3, ..., 6, 1, 8], dtype=int64)

Now lets make the same model but use our validation data instead of K-folds! You will notice that the validation score is much lower but actually this is from the model NOT overfitting. So the K-folds for deep learning may not be the best for every use case. Instead you want to create actual validation data that is never trained on by the model!

In [54]:
# We are hijacking the model from the last model.
model = make_model()

In [55]:
# We are using rmsprop again but feel free to try new things!
opt = keras.optimizers.rmsprop(lr=0.0001, decay=1e-6)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

In [56]:
model.fit(data, target,
              batch_size=128,
              epochs=10,
              validation_data=(vdata, vtarget),
              shuffle=True)

Train on 2804 samples, validate on 1006 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2a3089e8080>

Thats it! Feel Free to check out my other notebooks or the keras documentation for more Deep learning goodness!

https://keras.io/

Also not to make you feel like deep learning is limited here is a list of different networks that can be bootstrapped together to solve any problem!

https://towardsdatascience.com/the-mostly-complete-chart-of-neural-networks-explained-3fb6f2367464