# Mode Choice Optima

### Importing data

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("ModeChoiceOptima.txt", sep="\t")
df.head()

Unnamed: 0,ID,DestAct,NbTransf,TimePT,WalkingTimePT,WaitingTimePT,CostPT,CostCar,TimeCar,NbHousehold,...,FreqTripHouseh,Region,distance_km,Choice,InVehicleTime,ModeToSchool,ReportedDuration,CoderegionCAR,age,Weight
0,10350017,2,4,85,23,10,12.4,3.17,32,2,...,4,1,30.0,1,52,3,255,1,27,0.000379
1,10350020,1,4,108,26,16,12.4,3.28,30,2,...,4,1,32.0,-1,66,3,150,1,28,0.000341
2,10350025,11,2,82,33,5,3.0,0.45,6,-1,...,2,1,4.5,0,44,-1,20,1,-1,0.000368
3,10350075,1,3,107,21,31,24.0,2.36,23,2,...,1,1,25.0,1,55,-1,30,1,63,0.000368
4,10350085,1,5,190,116,18,10.8,1.16,14,3,...,3,1,12.5,1,56,-1,20,1,57,0.000409


In [3]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,2265.0,4.506873e+07,2.402392e+07,1.035002e+07,2.824097e+07,4.464044e+07,5.444027e+07,9.604054e+07
DestAct,2265.0,4.139073e+00,3.559886e+00,-1.000000e+00,1.000000e+00,3.000000e+00,7.000000e+00,1.100000e+01
NbTransf,2265.0,2.132450e+00,2.244178e+00,0.000000e+00,0.000000e+00,2.000000e+00,4.000000e+00,1.500000e+01
TimePT,2265.0,1.136199e+02,9.117243e+01,0.000000e+00,5.000000e+01,9.100000e+01,1.490000e+02,8.260000e+02
WalkingTimePT,2265.0,4.084238e+01,2.882418e+01,0.000000e+00,2.000000e+01,3.400000e+01,5.300000e+01,3.060000e+02
...,...,...,...,...,...,...,...,...
ModeToSchool,2265.0,3.726711e+00,1.771864e+00,-2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,8.000000e+00
ReportedDuration,2265.0,6.295276e+01,7.887233e+01,-1.000000e+00,1.800000e+01,3.900000e+01,7.700000e+01,8.550000e+02
CoderegionCAR,2265.0,4.483885e+00,2.103512e+00,1.000000e+00,3.000000e+00,5.000000e+00,6.000000e+00,8.000000e+00
age,2265.0,4.659823e+01,1.897462e+01,-1.000000e+00,3.800000e+01,4.700000e+01,6.000000e+01,8.800000e+01


### Compare machine learning methods

 - Neural Networks: based on current literature this method seems to provide the highest accuracy. It can be prone to overfitting, however, and is difficult to predict.
 - Random Forests: is a method that creates multiple decision trees, thus it is more accurate and less prone to overfitting, though also complex to interpret.
 - Naïve Bayes: is seemingly the least accurate as it assumes that each input factor is independent of the other input factors. On the other hand, it is quite interpretable however, according to Hagenauer and Helbich (2017). This method can be competitive with more complex methods in some cases.


In [4]:
# Predict the choice using the following methods:
# - Naive Bayes
# - Random Forests
# - Neural Networks
# - Logistic Regression
# - K-Nearest Neighbors
# - Support Vector Machines
# - Decision Trees
# - Gradient Boosting
# - AdaBoost

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [5]:
# Split the data into training and test sets
X = df.drop('Choice', axis=1)
y = df['Choice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Create a list of the methods to be used
methods = [
    GaussianNB(),
    RandomForestClassifier(),
    MLPClassifier(),
    LogisticRegression(),
    KNeighborsClassifier(),
    SVC(),
    DecisionTreeClassifier(),
    GradientBoostingClassifier(),
    AdaBoostClassifier()
]

In [7]:
# Create a list of the method names
method_names = [
    'Naive Bayes',
    'Random Forests',
    'Neural Networks',
    'Logistic Regression',
    'K-Nearest Neighbors',
    'Support Vector Machines',
    'Decision Trees',
    'Gradient Boosting',
    'AdaBoost'
]

In [8]:
# Create a dictionary to store the method accuracies
method_accuracies = {}

In [9]:
# Loop through the methods and print the accuracy
for method in methods:
    method.fit(X_train, y_train)
    y_pred = method.predict(X_test)
    method_accuracies[method_names[methods.index(method)]] = accuracy_score(y_test, y_pred)
    print(method_names[methods.index(method)], accuracy_score(y_test, y_pred))

Naive Bayes 0.5121412803532008
Random Forests 0.6887417218543046
Neural Networks 0.059602649006622516
Logistic Regression 0.4878587196467991
K-Nearest Neighbors 0.4922737306843267
Support Vector Machines 0.5121412803532008
Decision Trees 0.5982339955849889
Gradient Boosting 0.7483443708609272
AdaBoost 0.6048565121412803


In [10]:
# Create a dataframe from the dictionary
df_accuracies = pd.DataFrame.from_dict(method_accuracies, orient='index', columns=['Accuracy'])

### Handle missing values
Handle missing values. Missing values are currently represented as -1. Replace these with the mean of the column. Then, re-run the machine learning methods and compare the results.

In [11]:
# Drop the rows with 3 or more missing values. Missing values are represented as -1.
df2 = df.drop(df[df.isin([-1]).sum(axis=1) >= 3].index)

# Print the number of dropped columns and total columns remaining, using f-string formatting.
print(f"{df.shape[0] - df2.shape[0]} of {df.shape[0]} rows dropped, {df2.shape[0]} rows remaining.")

588 of 2265 rows dropped, 1677 rows remaining.


In [12]:
# Replace the missing values with the mean of the column, rounded and saved as int
df2 = df2.replace(-1, df2.mean().round().astype(int))

In [13]:
# Split the data into training and test sets
X = df2.drop('Choice', axis=1)
y = df2['Choice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Create a new dictionary for the accuracies
method_accuracies2 = {}

In [15]:
# Loop through the methods and print the accuracy
for method in methods:
    method.fit(X_train, y_train)
    y_pred = method.predict(X_test)
    method_accuracies2[method_names[methods.index(method)]] = accuracy_score(y_test, y_pred)
    print(method_names[methods.index(method)], accuracy_score(y_test, y_pred))

Naive Bayes 0.7232142857142857
Random Forests 0.8452380952380952
Neural Networks 0.7232142857142857
Logistic Regression 0.7202380952380952
K-Nearest Neighbors 0.7351190476190477
Support Vector Machines 0.7232142857142857
Decision Trees 0.7678571428571429
Gradient Boosting 0.875
AdaBoost 0.7946428571428571


In [16]:
# Add a new column to the dataframe
df_accuracies['Accuracy (Missing Values Replaced)'] = method_accuracies2.values()

# Calculate the difference in accuracy
df_accuracies['Improvement'] = df_accuracies['Accuracy (Missing Values Replaced)'] - df_accuracies['Accuracy']
df_accuracies

Unnamed: 0,Accuracy,Accuracy (Missing Values Replaced),Improvement
Naive Bayes,0.512141,0.723214,0.211073
Random Forests,0.688742,0.845238,0.156496
Neural Networks,0.059603,0.723214,0.663612
Logistic Regression,0.487859,0.720238,0.232379
K-Nearest Neighbors,0.492274,0.735119,0.242845
Support Vector Machines,0.512141,0.723214,0.211073
Decision Trees,0.598234,0.767857,0.169623
Gradient Boosting,0.748344,0.875,0.126656
AdaBoost,0.604857,0.794643,0.189786


In [17]:
# Sort the dataframe by the accuracy (missing values replaced)
df_accuracies.sort_values(by='Accuracy (Missing Values Replaced)', ascending=False)

Unnamed: 0,Accuracy,Accuracy (Missing Values Replaced),Improvement
Gradient Boosting,0.748344,0.875,0.126656
Random Forests,0.688742,0.845238,0.156496
AdaBoost,0.604857,0.794643,0.189786
Decision Trees,0.598234,0.767857,0.169623
K-Nearest Neighbors,0.492274,0.735119,0.242845
Naive Bayes,0.512141,0.723214,0.211073
Neural Networks,0.059603,0.723214,0.663612
Support Vector Machines,0.512141,0.723214,0.211073
Logistic Regression,0.487859,0.720238,0.232379


## Determine Feature Importance
Determine which features are the most important in predicting the choice. Use the Random Forests method to determine the feature importance.

Metrics for feature importance:
 - Gini Importance: the average decrease in the Gini impurity of the nodes that use the feature.
 - Permutation Importance: the decrease in the model score when a single feature value is randomly shuffled.
 - Drop Column Importance: the decrease in the model score when a single feature is dropped.

In [19]:
# Import permutation importance
from sklearn.inspection import permutation_importance

In [20]:
# Create a list of the metrics
metrics = [
    'gini',
    'permutation',
]

In [21]:
# Create a list of the metric names
metric_names = [
    'Gini Importance',
    'Permutation Importance',
]

In [30]:
# Create a new random forest classifier
clf = RandomForestClassifier()
# Fit the classifier to the training data
clf.fit(X_train, y_train)
# Print the accuracy
print(f"Accuracy: {accuracy_score(y_test, clf.predict(X_test))}")
# Get the feature importances
feat_importances = clf.feature_importances_
# Get the permutation importances
perm_importances = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=42)
# Create a single dataframe from the feature importances and permutation importances
df_importances = pd.DataFrame({'Feature': X.columns, 'Gini Importance': feat_importances, 'Permutation Importance': perm_importances['importances_mean']})

Accuracy: 0.8690476190476191


In [31]:
df_importances.sort_values(by='Gini Importance', ascending=False)

Unnamed: 0,Feature,Gini Importance,Permutation Importance
35,MarginalCostPT,0.057487,0.028274
99,ReportedDuration,0.053393,0.038393
97,distance_km,0.045825,0.018452
7,CostCar,0.035613,0.013988
4,WalkingTimePT,0.031419,0.028869
...,...,...,...
24,Mothertongue,0.002297,0.000298
86,UrbRur,0.002202,0.001488
19,OwnHouse,0.001575,0.001488
16,NewsPaperSubs,0.001464,0.000000


In [36]:
accuracy = 0
prev_accuracy = 0

drop_df = df2.copy()
perm_importance_df = df_importances.drop('Gini Importance', axis=1)
reduced_counter = 0

# While the accuracy keeps improving
while True:
    # Set the previous accuracy to the current accuracy
    prev_accuracy = accuracy
    # Get the feature with the lowest permutation importance
    lowest_perm_importance = perm_importance_df.sort_values(by='Permutation Importance', ascending=True).iloc[0]['Feature']
    # Drop the feature with the lowest permutation importance
    drop_df = drop_df.drop(lowest_perm_importance, axis=1)
    # Split the data into training and test sets
    X = drop_df.drop('Choice', axis=1)
    y = drop_df['Choice']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Create a new random forest classifier
    clf = RandomForestClassifier()
    # Fit the classifier to the training data
    clf.fit(X_train, y_train)
    # Get the accuracy
    accuracy = accuracy_score(y_test, clf.predict(X_test))
    # Calculate the permutation importances
    perm_importances = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=42)
    # Create a dataframe from the permutation importances
    perm_importance_df = pd.DataFrame({'Feature': X.columns, 'Permutation Importance': perm_importances['importances_mean']})

    print(f"Accuracy: {accuracy:.4f}, dropped feature: {lowest_perm_importance}")

    # If the accuracy is reduced 3 times in a row, stop
    if accuracy < prev_accuracy:
        reduced_counter += 1
        if reduced_counter >= 3:
            break
    else:
        reduced_counter = 0

Accuracy: 0.8720, dropped feature: Mobil14
Accuracy: 0.8661, dropped feature: Mobil17
Accuracy: 0.8571, dropped feature: LineRelST
Accuracy: 0.8869, dropped feature: NbTransf
Accuracy: 0.8512, dropped feature: ModeToSchool
Accuracy: 0.8601, dropped feature: BirthYear
Accuracy: 0.8601, dropped feature: ResidCh07
Accuracy: 0.8958, dropped feature: Mobil04
Accuracy: 0.8810, dropped feature: LifSty02
Accuracy: 0.8780, dropped feature: Mobil21
Accuracy: 0.8720, dropped feature: SocioProfCat


In [37]:
# Convert this notebook to html
!jupyter nbconvert --to html "test.ipynb"

[NbConvertApp] Converting notebook test.ipynb to html
[NbConvertApp] Writing 632908 bytes to test.html
