# Machine learning and deep learning methods applied to predicting customer status

Purpose: Predict active and not active clients based on the proposed data structure and naive data structure. Here, we implemented Support Vector Machines (SVM), Random Forest (RF), K-nearest Neighbours (KNN) and Lasso.

Author: Gabriel Rodrigues Palma and Rafael de Andrade Moral

# Packages used in the project

In [1]:
# LIME modules
import lime
import lime.lime_tabular

# visualisation modules
import matplotlib.pyplot as plt

# Data manipulation modules
import numpy as np
import pandas as pd

# Machine learning modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Deep learning modules
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from keras.layers import LeakyReLU

# Machine learning packages
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import collections
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Lasso
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import LeaveOneOut

# Additional packages
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Conv2DTranspose, UpSampling2D, Flatten, Reshape
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import L1
from keras import metrics
import numpy as np
import matplotlib.pyplot as plt
import keras.backend as K
import tensorflow as tf

# Testing GPU from MacOs
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


# Packages used in the project

In [7]:
def create_dataset(path, response_class):
    ''' This function reads and prepare the datasets for applying the ML and DL methods'''
    data = pd.read_csv(path)
    data = data.drop(columns = ['Unnamed: 0', 'Subject'])
    explanatory_variables = data.drop(columns = response_class)
    response_variables = data[response_class]        
    binarizer = LabelBinarizer()
    response_variables = binarizer.fit_transform(response_variables)
    
    onehot_encoder = OneHotEncoder(sparse=False)
    hot_encode_response_variable = onehot_encoder.fit_transform(np.array(response_variables).reshape(-1, 1))          
    
    return(explanatory_variables, 
           hot_encode_response_variable, response_variables)

def check_zero_division_and_get_rates(cm):
    ''' This functions checks for divisions per zeros in the computation of 
       True and false positive rate based on the confusion matrix array. Also, 
       this function returns the checked rates'''
    
    if any(np.sum(cm, axis = 1)==0):            
            tpr = cm[:,1][1]/(np.sum(cm, axis = 1)[1]+1e-16)
            fpr = cm[:,1][0]/(np.sum(cm, axis = 1)[0]+1e-16)    
    else:           
        rates = cm[:,1]/np.sum(cm, axis = 1)    
        fpr = rates[0]    
        tpr = rates[1]
    if np.isnan(tpr):        
        tpr=0
        
    return(tpr, fpr)

def check_and_compute_rates(predictions, 
                            classes, 
                            cm):    
    ''' This function returns the values of true and false positive rate for special cases
       where the division is not possible to obtain automatically based on the confusion
       matrix array provided by sklearn'''
        
    if (sum(predictions) == 0 and sum(classes) == 0):        
        fpr = 0
        tpr = 0
    elif (sum(predictions) == len(predictions) and sum(classes) == len(classes)):        
        fpr = 0
        tpr = 1
    else:            
        tpr, fpr = check_zero_division_and_get_rates(cm)
        
    return(tpr, fpr)
    
def get_rates(y_pred,
              y_true):
    ''' This function get the true and false positive rates based on the 
       predictied'''
            
    cm = confusion_matrix(y_true = y_true, y_pred = y_pred)
    tpr, fpr = check_and_compute_rates(y_pred, y_true, cm)
        
    return(tpr, fpr)

def get_rates_by_cross_validation(raw_data):
    ''' This function obtains the accuracy, true and false positive rates 
       based on the cross k-fold cross validation'''
            
    for train_index, test_index in KFold(n_splits=5, shuffle=True).split(patterns):
        
        x_train, x_test = patterns[train_index], patterns[test_index]
        y_train, y_test = classes[train_index], classes[test_index]
                
        pbp_predictions = pbp_prediction(patterns_array=x_test, 
                                         clustered_patterns = clustered_patterns, 
                                         d_base = d_base, alpha = alpha, 
                   outbreak_p_means = prediction.obtain_p_means_with_distance,
                   outbreak_prediction = prediction.predict_with_distance)
        rates = get_rates(predictions = pbp_predictions, classes = y_test)
        tpr.append(rates[0])
        fpr.append(rates[1])
    
    return(np.mean(tpr[tpr!=np.nan]), np.mean(fpr[fpr!=np.nan]))

def get_statistics(y_pred, y_true):
    ''' This function obtains the following statistics: Accuracy, True positive rate and 
       False positive rate'''

    auc = roc_auc_score(y_score = y_pred,
              y_true = y_true)
    acc = accuracy_score(y_pred = y_pred,
                         y_true = y_true)
    rates = get_rates(y_pred = y_pred,
              y_true = y_true)
    tpr = rates[0]
    fpr = rates[1]
    return([acc, tpr, fpr, auc])

In [89]:
def check_class(lime_importance):
    '''This function classify the class among control or experiment based on the LIME score
    negative scores relates to control and positive experiment'''
    if (lime_importance < 0):
        return('Control')
    else:
        return('Experiment')
def get_lime_importance(explanatory_variables, 
                        response_variables, 
                        trial_name, num_features, 
                        feature_names):
    '''This function obtains the parameters importance based on the LIME method. For that the leave one out cross 
    validation approach was used in order to obtain parameter importance for each subject of the experiment'''
    parameters = []
    loo = LeaveOneOut()
    loo.get_n_splits(explanatory_variables)
    
    
    for train_index, test_index in loo.split(explanatory_variables):
        X_train, X_test = explanatory_variables[train_index], explanatory_variables[test_index]
        y_train, y_test = response_variables[train_index], response_variables[test_index]   
        
        predict_fn_rf = lambda x: rf.predict_proba(x).astype(float)
        explainer = lime.lime_tabular.LimeTabularExplainer(X_train, 
                                                           feature_names = feature_names,
                                                           class_names=['Control', 'Experiment'], 
                                                           verbose = False, mode='classification')

        exp = explainer.explain_instance(X_test[0], predict_fn_rf, num_features=4)
        lime_results = exp.as_list()
        for feature_index in range(num_features):
            lime_feature_importance = list(lime_results[feature_index])
            lime_feature_importance.append(check_class(list(lime_results[feature_index])[1]))
            lime_feature_importance.append(trial_name)
            parameters.append(lime_feature_importance)
    parameters = pd.DataFrame(lime_results_Trial_1, columns = ['Features', 'Local \n Importance', 'Class', 'Trial'])
        
    return(parameters)

# Importing Datasets

Before obtaining the performance of the learning algorithms selected for this paper, we need to import the datasets related to the Hidden Markov Models features and the peak features.

## Coordinate data Trial 1

In [3]:
coordinates_explanatory_variables_t1, \
  coordinates_hot_encode_response_variable_t1, coordinates_response_variable_t1 = create_dataset(path = '../../output_data/New_data/CoordinateDatasets/ZscoredScaledCoordinates_data_Trial1.csv', 
                                                                                        response_class = 'Class')

In [4]:
coordinates_explanatory_variables_t1.head()

Unnamed: 0,dis_sum,angle_sum,average_speed,idle_time
0,4.328422,1444.039766,0.002537,1.5
1,4.301916,909.662506,0.002402,1.75
2,4.007659,651.069516,0.001659,4.5
3,4.350488,1406.028426,0.00266,1.25
4,4.195562,882.929882,0.002113,2.5


In [85]:
lime_results_Trial_1 = get_lime_importance(explanatory_variables = np.array(coordinates_explanatory_variables_t1),
                                           response_variables = coordinates_response_variable_t1, 
                                           trial_name = 'Trial 1',
                                           num_features = 4, 
                                           feature_names = coordinates_explanatory_variables_t1.columns.values.tolist())

[4.32842180e+00 1.44403977e+03 2.53667050e-03 1.50000000e+00]




[4.30191569e+00 9.09662506e+02 2.40240576e-03 1.75000000e+00]




[4.00765930e+00 6.51069516e+02 1.65882334e-03 4.50000000e+00]




[4.35048783e+00 1.40602843e+03 2.65962038e-03 1.25000000e+00]




[4.19556231e+00 8.82929882e+02 2.11335518e-03 2.50000000e+00]




[4.20307333e+00 1.91621540e+03 2.63619777e-03 2.25000000e+00]




[3.97888110e+00 2.06148491e+03 2.33602053e-03 3.75000000e+00]




[4.41838631e+00 1.80219393e+03 3.03468681e-03 1.00000000e+00]




[4.01331409e+00 1.43561481e+03 1.69319076e-03 4.25000000e+00]




[4.34731900e+00 7.90190243e+02 2.58571528e-03 1.50000000e+00]




[4.40716805e+00 1.58712306e+03 3.00651182e-03 1.00000000e+00]




[2.81371475e+00 1.74734699e+03 2.81947249e-03 1.20000000e+01]




[4.31555665e+00 1.54493634e+03 2.39430378e-03 1.75000000e+00]




[4.41349463e+00 2.01482920e+03 3.01743339e-03 1.00000000e+00]




[4.33671992e+00 1.75637526e+03 2.57083715e-03 1.50000000e+00]




[4.36213157e+00 9.76867362e+02 2.71862105e-03 1.25000000e+00]




[4.11534180e+00 1.15943822e+03 1.91873812e-03 3.25000000e+00]




[3.91105530e+00 1.27432277e+03 2.16871176e-03 4.00000000e+00]




[4.31487452e+00 2.89243921e+03 2.46316313e-03 1.75000000e+00]




[4.20022001e+00 2.93922646e+03 2.33676659e-03 1.75000000e+00]




[3.04314173e+00 1.04105471e+03 1.39208968e-03 1.00000000e+01]




[4.34157831e+00 1.75181005e+03 2.67906645e-03 1.25000000e+00]




[1.40997156e+00 3.38999833e+02 5.26737855e-03 1.25000000e+00]




[3.11530193e+00 1.89306752e+03 3.92744151e-03 5.00000000e-01]




[6.59445432e+00 3.74404248e+03 2.57413903e-03 5.00000000e-01]




[1.40524463e+00 4.93741199e+02 9.72467357e-04 1.07500000e+01]




[1.14682873e+00 7.32559366e+02 5.50000325e-03 1.50000000e+00]




[6.45178967e+00 2.91706744e+03 1.80898588e-03 2.00000000e+00]




[1.60587654e+00 5.81283068e+02 7.39495181e-03 5.00000000e-01]




[4.78757664e+00 1.34211348e+03 2.14586373e-03 2.75000000e+00]




[1.66020632e+00 2.85878008e+02 3.48093726e-03 2.50000000e+00]




[3.98039142e+00 2.15896327e+03 2.20223988e-03 2.75000000e+00]




[5.59066319e+00 2.59348063e+03 1.09080512e-03 7.50000000e+00]




[2.53862349e+00 1.46239659e+03 3.01004483e-03 2.25000000e+00]




[4.60259431e+00 9.29648251e+02 1.43927464e-03 5.25000000e+00]




[6.30984593e+00 1.42420342e+03 1.56697057e-03 2.75000000e+00]




[4.63374573e+00 1.68187692e+03 2.21247763e-03 1.20000000e+01]




[5.99843407e+00 2.85046610e+03 1.75462305e-03 3.00000000e+00]




[1.88215218e+00 7.80435883e+02 6.42328911e-03 5.00000000e-01]




[1.64205224e+00 7.61700258e+02 1.35211595e-03 7.75000000e+00]




[6.09888289e+00 1.29924443e+03 1.24337836e-03 4.75000000e+00]




[6.50762051e+00 2.45029323e+03 1.85591846e-03 1.25000000e+00]




[1.22843796e+00 1.70196682e+02 4.37498334e-03 2.25000000e+00]




[3.40277740e+00 1.09022121e+03 2.99913923e-03 1.25000000e+00]




[6.57067956e+00 9.14488335e+02 2.23565697e-03 5.00000000e-01]




[4.25137125e+00 1.99890718e+03 3.15436361e-03 5.00000000e-01]




[2.36303630e+00 1.46201975e+03 4.71434511e-03 5.00000000e-01]




[6.20362784e+00 1.42276528e+03 1.35660128e-03 3.50000000e+00]




[1.09112667e+00 1.19146608e+02 6.58596656e-03 1.00000000e+00]




In [88]:
pd.DataFrame(lime_results_Trial_1, columns = ['Features', 'Local \n Importance', 'Class', 'Trial'])

Unnamed: 0,0,1,2,3
0,0.00 < average_speed <= 0.00,0.060155,Experiment,Trial 1
1,1.25 < idle_time <= 1.88,-0.058647,Control,Trial 1
2,1423.48 < angle_sum <= 1898.85,0.016892,Experiment,Trial 1
3,4.23 < dis_sum <= 4.46,-0.004441,Control,Trial 1
4,1.25 < idle_time <= 1.88,-0.059490,Control,Trial 1
...,...,...,...,...
191,dis_sum > 4.41,0.005056,Experiment,Trial 1
192,average_speed > 0.00,-0.152607,Control,Trial 1
193,idle_time <= 1.25,-0.138778,Control,Trial 1
194,angle_sum <= 913.28,-0.044671,Control,Trial 1


## Coordinate data Trial 12

In [5]:
lime_results_Trial_1 = get_lime_importance(explanatory_variables = np.array(coordinates_explanatory_variables_t1),
                                           response_variables = coordinates_response_variable_t1, 
                                           trial_name = 'Trial 1',
                                           num_features = 4, 
                                           feature_names = coordinates_explanatory_variables_t1.columns.values.tolist())

In [None]:
lime_results_Trial_12 = get_lime_importance(explanatory_variables = np.array(coordinates_explanatory_variables_t12),
                                           response_variables = coordinates_response_variable_t12, 
                                           trial_name = 'Trial 12',
                                           num_features = 4, 
                                           feature_names = coordinates_explanatory_variables_t1.columns.values.tolist())