In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, Activation
from keras.optimizers import Adam

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Human-readable description of each rider-type class id.
# The position of a description in the tuple is its integer class id (0..6).
RIDER_LABEL_DICT = dict(enumerate((
    'random riders',
    'weekend riders',
    'less flexible commuters with normal commute hours',
    'less flexible commuters with early commute hours',
    'more flexible commuters with normal commute hours',
    'more flexible commuters with early commute hours',
    'weekend riders who also ride over weekdays',
)))

In [4]:
def generate_training_samples(profile_filenames, sample_factor=500, noise_std=0.2, random_state=None):
    """Build a noise-augmented training set from cluster-profile CSV files.

    Every CSV is read with its first column as the index and must contain the
    columns ``hr_1`` ... ``hr_168`` and ``manual_label``. Each profile row is
    reshaped into a 7x24 matrix (presumably day-of-week x hour-of-day -- confirm
    against the code that writes the profiles) and replicated ``sample_factor``
    times, each replica receiving independent zero-mean Gaussian noise.

    Parameters
    ----------
    profile_filenames : iterable of str
        Paths of the profile CSV files to load. May be empty.
    sample_factor : int, default 500
        Number of noisy replicas generated per profile row.
    noise_std : float, default 0.2
        Standard deviation of the Gaussian noise added to every cell.
    random_state : int or None, default None
        Seed for the noise. ``None`` draws from NumPy's global random state
        (the historical behaviour); an integer makes the output reproducible.

    Returns
    -------
    X_expand : numpy.ndarray, shape (N, 7, 24, 1), float
        Noisy samples with a trailing channel axis for 2D convolution, where
        ``N = sample_factor * number_of_profile_rows``. Replicas of the same
        profile row are contiguous.
    y_expand : numpy.ndarray, shape (N,), float
        The ``manual_label`` of the profile each sample was derived from.
    """
    hr_cols = ['hr_' + str(i) for i in range(1, 169)]

    # Collect per-file arrays and concatenate once; stacking inside the loop
    # would re-copy everything accumulated so far on every iteration.
    feature_parts = []
    label_parts = []
    for filename in profile_filenames:
        df_profile = pd.read_csv(filename, index_col=0)
        feature_parts.append(df_profile[hr_cols].values)
        label_parts.append(df_profile['manual_label'].values)

    if feature_parts:
        X_1D = np.concatenate(feature_parts, axis=0).astype(float)
        y = np.concatenate(label_parts).astype(float)
    else:
        # No input files: return correctly shaped empty arrays.
        X_1D = np.zeros((0, 168))
        y = np.zeros(0,)

    # reshape the time matrix to be 7x24
    X = X_1D.reshape((X_1D.shape[0], 7, 24))

    # Upsample the temporal matrices by adding Gaussian noise.
    # np.repeat keeps the replicas of one profile adjacent, so sample i is
    # derived from profile i // sample_factor.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    X_expand = np.repeat(X, sample_factor, axis=0)
    X_expand += rng.normal(0, noise_std, X_expand.shape)
    y_expand = np.repeat(y, sample_factor)

    # Add a dimension to X_expand for 2D convolution
    X_expand = np.expand_dims(X_expand, axis=-1)
    return X_expand, y_expand

def train_save_cnn(profile_filenames, batch_size=100, epochs=10, model_name='report_cnn.h5'):
    """Train a small CNN rider-type classifier and save it to disk.

    Samples are produced by ``generate_training_samples`` and split 80/20 into
    train and test sets with a fixed ``random_state`` of 297. The held-out set
    is used both as validation data during fitting and for the final
    evaluation whose loss and accuracy are printed.

    Parameters
    ----------
    profile_filenames : iterable of str
        Profile CSV paths forwarded to ``generate_training_samples``.
    batch_size : int, default 100
        Mini-batch size passed to ``model.fit``.
    epochs : int, default 10
        Number of training epochs passed to ``model.fit``.
    model_name : str, default 'report_cnn.h5'
        Path handed to ``model.save``.

    Returns
    -------
    None
    """
    n_classes = len(RIDER_LABEL_DICT)

    features, labels = generate_training_samples(profile_filenames)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=297
    )

    # One-hot encode the rider types: row k of the identity matrix is the
    # encoding of class k, so indexing by the integer labels does the lookup.
    onehot_lookup = np.eye(n_classes)
    y_train_onehot = onehot_lookup[y_train.astype(int)]
    y_test_onehot = onehot_lookup[y_test.astype(int)]

    # Two conv/relu/max-pool stages followed by a dense classifier head.
    model = Sequential([
        Conv2D(32, (3, 3), padding='same', input_shape=X_train.shape[1:]),
        Activation('relu'),
        MaxPooling2D(pool_size=(2, 2)),

        Conv2D(32, (3, 3), padding='same'),
        Activation('relu'),
        MaxPooling2D(pool_size=(2, 2)),

        Flatten(),
        Dense(512),
        Activation('relu'),
        Dense(n_classes),
        Activation('softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

    # Fit, using the held-out split as validation data.
    model.fit(
        X_train,
        y_train_onehot,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_test, y_test_onehot),
        shuffle=True,
    )

    # Persist the trained model before reporting the final metrics.
    model.save(model_name)

    loss, accuracy = model.evaluate(X_test, y_test_onehot, verbose=1)
    print('[Finished Fitting] loss:', loss)
    print('[Finished Fitting] accuracy:', accuracy)

In [5]:
# Each month is identified twice in a profile path: by a short 'YYMM' code in
# the file name and by a 'YYYY-Mon' label in the directory name. Keeping the
# two together in one pair makes it impossible for them to drift out of step.
month_pairs = [
    ('1612', '2016-Dec'),
    ('1701', '2017-Jan'),
    ('1702', '2017-Feb'),
    ('1703', '2017-Mar'),
    ('1704', '2017-Apr'),
    ('1705', '2017-May'),
    ('1706', '2017-Jun'),
    ('1707', '2017-Jul'),
    ('1708', '2017-Aug'),
    ('1709', '2017-Sep'),
    ('1710', '2017-Oct'),
    ('1711', '2017-Nov'),
]
months = [code for code, _ in month_pairs]
months_text = [label for _, label in month_pairs]

algo_s = ['lda', 'kmeans']
hier_s = ['non-hierarchical', 'hierarchical']

# Placeholders, in order: month label, hierarchy mode, month code, algorithm.
path_template = 'data/cached_profiles/{}/{}_cluster_profiles_{}_1_0_{}.csv'

# One path per (hierarchy mode, algorithm, month) combination, with the month
# varying fastest.
files = []
for hier in hier_s:
    for algo in algo_s:
        for month, month_text in zip(months, months_text):
            files.append(path_template.format(month_text, hier, month, algo))
len(files)

48

In [6]:
# Train the CNN on every profile CSV listed in `files`, relying on the
# function's default batch size, epoch count and output model file name.
train_save_cnn(files)

Train on 129200 samples, validate on 32300 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[Finished Fitting] loss: 0.019254854075378303
[Finished Fitting] accuracy: 0.993250773993808
