# Using Keras and Tensorflow to build a MLP  

## Loading the data and data processing

First, we will load the data and process it into a form that can be fed into our MLP neural network.

In [1]:
%matplotlib inline
%config InlineBackend.figre_formats='svg'

from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
import warnings
warnings.simplefilter('ignore')

sns.set()

In [2]:
# Load the training and test data sets
train_df = pd.read_csv(r'C:\Users\Law Wen Yu\.jupyter\data\cmiyc\train_sessions.csv')
test_df = pd.read_csv(r'C:\Users\Law Wen Yu\.jupyter\data\cmiyc\test_sessions.csv')

In [3]:
# Convert time1, ..., time10 columns to datetime type
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Sort the data by time
train_df.sort_values(by='time1', inplace=True)
train_df.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
21668,21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54842,54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77291,77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114020,114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146669,146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [4]:
# Change site1, ... , site10 columns type to integer and fill NA-values with zeros
sites = ['site%s' % i for i in range(1, 11)]

# For the empty sites replace NaN with 0 and change type to int
train_df[sites] = train_df[sites].fillna(0).astype('int')
test_df[sites] = test_df[sites].fillna(0).astype('int')

# Load website dictionary
with open(r'C:\Users\Law Wen Yu\.jupyter\data\cmiyc\site_dic.pkl', 'rb') as input_file:
    site_dict = pickle.load(input_file)
    
# Create dataframe for the dictionary
site_dict = pd.DataFrame(list(site_dict.keys()),
                        index=list(site_dict.values()), columns=['site'])
print(u'Websites total:', site_dict.shape[0])
site_dict.head()

Websites total: 48371


Unnamed: 0,site
35429,damkool.com
7849,www.compteur-visite.com
18488,lieuxsacres.canalblog.com
24547,stockage.univ-brest.fr
41836,intranet.crfclermont.fr


In [5]:
# Create the target variable 
y_train = train_df['target']
full_df = pd.concat([train_df.drop('target', axis=1), test_df])

# Index to split the training and test set
idx_split = train_df.shape[0]

In [6]:
# Create a new df with only the visited sites
full_sites = full_df[sites]
full_sites.head()

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
21668,56,55,0,0,0,0,0,0,0,0
54842,56,55,56,55,0,0,0,0,0,0
77291,946,946,951,946,946,945,948,784,949,946
114020,945,948,949,948,945,946,947,945,946,946
146669,947,950,948,947,950,952,946,951,946,947


In [7]:
sites_flatten = full_sites.values.flatten()
full_sites_sparse = csr_matrix(([1] * sites_flatten.shape[0],
                               sites_flatten,
                               range(0, sites_flatten.shape[0] + 10, 10)))[:, 1:]

In [8]:
# Function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                            target='target', index_label='session_id'):
    predicted_df = pd.DataFrame(predicted_labels,
                               index = np.arange(1,
                                                predicted_labels.shape[0] + 1),
                               columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

## Feature Engineering

Next, let's create some relevant and interesting features. We will skip doing EDA since we have already done it in our previous notebook.

In [13]:
time_df = pd.DataFrame(index=full_df.index)

time_df['min'] = full_df[times].min(axis=1)
time_df['max'] = full_df[times].max(axis=1)
time_df['minutes'] = round((time_df['max'] - time_df['min']).astype('timedelta64[s]')/60,2)

time_df['start_month'] = time_df['min'].apply(lambda ts: 100 * ts.year + ts.month)
time_df['year'] = time_df['min'].apply(lambda ts: ts.year)
time_df['month'] = time_df['min'].apply(lambda ts: ts.month)

time_df['start_week'] = time_df['min'].apply(lambda ts: 100 * ts.year + ts.week)
time_df['start_day'] = time_df['min'].apply(lambda ts: ts.timetuple().tm_yday)
time_df['start_hour'] = time_df['min'].apply(lambda ts: ts.hour)

time_df['dow'] = time_df['min'].apply(lambda ts: ts.date().weekday())
time_df['is_weekend'] = time_df['min'].apply(lambda ts: 1 if ts.date().weekday() in (5,6) else 0)
time_df['work_hours'] = time_df['min'].apply(lambda ts: 1 if (ts.date().weekday() in (0,1,2,3))
                                            & ((ts.hour>=8)&(ts.hour<=17)&(ts.hour!=12)) else 0)

hour = time_df['min'].apply(lambda ts: ts.hour)
time_df['morning'] = ((hour >= 7) & (hour <= 11)).astype('int')
time_df['day'] = ((hour >= 12) & (hour <= 18)).astype('int')
time_df['evening'] = ((hour >=19) & (hour <= 23)).astype('int')
time_df['night'] = ((hour >= 0) & (hour <= 6)).astype('int')

time_df['target'] = y_train
time_df.head()

Unnamed: 0,min,max,minutes,start_month,year,month,start_week,start_day,start_hour,dow,is_weekend,work_hours,morning,day,evening,night,target
21668,2013-01-12 08:05:57,2013-01-12 08:05:57,0.0,201301,2013,1,201302,12,8,5,1,0,1,0,0,0,0
54842,2013-01-12 08:37:23,2013-01-12 09:07:09,29.77,201301,2013,1,201302,12,8,5,1,0,1,0,0,0,0
77291,2013-01-12 08:50:13,2013-01-12 08:50:17,0.07,201301,2013,1,201302,12,8,5,1,0,1,0,0,0,0
114020,2013-01-12 08:50:17,2013-01-12 08:50:20,0.05,201301,2013,1,201302,12,8,5,1,0,1,0,0,0,0
146669,2013-01-12 08:50:20,2013-01-12 08:50:22,0.03,201301,2013,1,201302,12,8,5,1,0,1,0,0,0,0


In [14]:
# Segregrate based on the length of the session
short = (time_df['minutes'] <= 1).astype('int')
long = (time_df['minutes'] > 1).astype('int')

# Segregrate based on day of week
mon = (time_df['dow']==0).astype('int')
tue = (time_df['dow']==1).astype('int')
wed = (time_df['dow']==2).astype('int')
thurs = (time_df['dow']==3).astype('int')
fri = (time_df['dow']==4).astype('int')
sat = (time_df['dow']==5).astype('int')
sun = (time_df['dow']==6).astype('int')

# Segregrate based on month
jan = (time_df['month']==1).astype('int')
feb = (time_df['month']==2).astype('int')
mar = (time_df['month']==3).astype('int')
apr = (time_df['month']==4).astype('int')
may = (time_df['month']==5).astype('int')
june = (time_df['month']==6).astype('int')
july = (time_df['month']==7).astype('int')
aug = (time_df['month']==8).astype('int')
sep = (time_df['month']==9).astype('int')
oct = (time_df['month']==10).astype('int')
nov = (time_df['month']==11).astype('int')
dec = (time_df['month']==12).astype('int')

## Logistic Regression with 8 features

Before we begin to implement our MLP NN, let's run a logistic regression again with 8 features, and evaluate the results using the correct time aware cross validation scheme.

In [20]:
from sklearn.feature_selection import SelectKBest, f_classif

raw_m = np.matrix([short, long, mon, tue, wed, thurs, fri, sat, sun]).T
m = m[:idx_split, :]
x_data_kbest = SelectKBest(f_classif, k=4).fit_transform(m, y_train)
x_data_kbest.shape

(253561, 4)

In [21]:
kbest_X_train = hstack([full_sites_sparse[:idx_split, :], x_data_kbest])

In [24]:
# Create additional matrix of the features that we would like to add
m1 = np.matrix(time_df[['morning', 'day', 'evening', 'night']])
m2 = np.matrix([short, long, mon, wed]).T

# Stack the matrices together
f_full_sites_sparse = hstack([full_sites_sparse, m1, m2], format='csr')

In [25]:
# Seperate our training and our test test
X_train = f_full_sites_sparse[:idx_split, :]
X_test = f_full_sites_sparse[idx_split:, :]

# Set up our logistic regression model 
lr = LogisticRegression(C=1.0, solver='lbfgs',
                       random_state=17).fit(X_train, y_train)

In [26]:
# Set up the correct time aware cross validation scheme
time_split = TimeSeriesSplit(n_splits=10)

cv_scores_1 = cross_val_score(lr, X_train, y_train, cv=time_split,
                           scoring='roc_auc', n_jobs=-1)

cv_scores_2 = cross_val_score(lr, kbest_X_train, y_train, cv=time_split,
                             scoring='roc_auc', n_jobs=-1)

print((cv_scores_1, cv_scores_1.mean()), (cv_scores_2, cv_scores_2.mean()))

(array([0.90078713, 0.8250601 , 0.93537853, 0.98023751, 0.91629899,
       0.96431189, 0.94736491, 0.9524913 , 0.8834486 , 0.96206353]), 0.9267442484537618) (array([0.8597026 , 0.72166563, 0.88656256, 0.96148235, 0.87898207,
       0.93132678, 0.93054671, 0.91691339, 0.81895597, 0.94242813]), 0.884856618974835)


In [14]:
%%time
# Hyperparameters tuning
param_grid = {'C': np.logspace(-2, 2, 10)}
logit_grid_searcher = GridSearchCV(lr, param_grid=param_grid,
                                  scoring='roc_auc', n_jobs=-1,
                                  cv=time_split, verbose=1)

logit_grid_searcher.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.3min finished


Wall time: 1min 22s


In [15]:
print(logit_grid_searcher.best_params_, logit_grid_searcher.best_score_)

{'C': 0.5994842503189409} 0.9260464779265046


## Training our shallow neural network

Now, we shall attempt to train our shallow neural network using Keras+Tensorflow!

In [16]:
# Load some important packages
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras import metrics
np.random.seed(17)

Using TensorFlow backend.


In [17]:
# Let's take every 12th data to reduce the size of training set
nn_X_train = X_train[::12, :]
nn_y_train = y_train[::12]

In [18]:
nn_X_train.shape

(21131, 48379)

In [19]:
def create_model():
    # Create model
    model = Sequential()
    model.add(Dense(4, input_dim=48379, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam',
                 metrics=[metrics.categorical_accuracy])
    return model

In [20]:
# Create the model
model = KerasClassifier(build_fn=create_model, verbose=1)
# Fit the model
model.fit(nn_X_train, nn_y_train, epochs=5, batch_size=10)

Epoch 1/5
 1380/21131 [>.............................] - ETA: 42s - loss: 0.5157 - categorical_accuracy: 1.0000

KeyboardInterrupt: 

In [None]:
# Create a seperate hold out set
nn_X_test = X_train[2::12, :]
nn_y_test = y_train[2::12]

In [None]:
cross_val = cross_val_score(model, X_train, y_train,
                           cv=time_split, scoring='roc_auc',
                           n_jobs=1)
print(cross_val, np.mean(cross_val))

In [30]:
predictions = model.predict(X_test)
write_to_submission_file(predictions, 'baseline_16.csv')



When we ran the MLP with all our training data, we managed to score 0.92. However, running it with a smaller training set got us a score of 0.58, a vast drop in performance for our neural network.

It seems like having more data is indeed more important... However, it took us 3 minutes per epoch when training the MLP using the complete training set. 

## Pruning Features

In order for our Neural Network to run faster, let's try pruning some of the less important features. In this case, we will be removing the sites that have only been visited once during the entire training set.

In [21]:
# Sum up the number of times that a site appears in the training set 
site_count = full_sites_sparse[:idx_split, :].sum(axis=0).tolist()[0]
keep_index = [ind for ind, x in enumerate(site_count) if x > 1]
len(keep_index)

26317

In [22]:
# Remove the sites that have only been visited once in the training set
pruned_nn_X_train = X_train[:, keep_index]
print(pruned_nn_X_train.shape, X_train.shape)

(253561, 26317) (253561, 48379)


In [23]:
def create_model():
    # Create model
    model = Sequential()
    model.add(Dense(4, input_dim=26317, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam',
                 metrics=[metrics.categorical_accuracy])
    return model

In [24]:
## Create the model
model = KerasClassifier(build_fn=create_model, verbose=1)
# Fit the model
model.fit(pruned_nn_X_train, y_train, epochs=5, batch_size=10)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1ad3b527048>

In [67]:
cross_val = cross_val_score(model, pruned_nn_X_train, y_train,
                           cv=time_split, scoring='roc_auc',
                           n_jobs=1)
print(cross_val, np.mean(cross_val))

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
[0.58742378 0.54954618 0.76366229 0.93318354 0.79425275 0.85913611
 0.89598052 0.83011927 0.89553738 0.89180542] 0.8000647252461242


## Hyperparameters Optimization

Now, let's try to optimize the parameters of our Neural Network using the GridSearchCV method. We will first optimize the epoch number and batch sizes.

In [25]:
param_grid = {'batch_size': [10, 20, 30],
             'epochs': [5, 10, 20]}

nn_model_grid = GridSearchCV(model, param_grid=param_grid, n_jobs=1,
                             scoring='roc_auc', verbose=True)

In [None]:
nn_model_grid.fit(pruned_nn_X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20