In [35]:
# Import Necessary Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.metrics import roc_auc_score as rs
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
% matplotlib inline

In [36]:
# Load Dataset
df = pd.read_csv('../Data/trip_survey_working.csv', index_col=0)
df.head()

Unnamed: 0,SAMPN,PERNO,PERTYPE,ORIG_HOME,DEST_HOME,DOW_x,OTAZ,DTAZ,OTPURP_AGG,DTPURP_AGG,...,WFIPS,WMODE,O_WMODE,TTTWS,CDRIV,WSTRT,WEND,STRVR,ENDVR,STUDE
0,3000056,1,2,1,0,3,2626,2627,0,9,...,Fairfield,Auto Passenger,,8,No,100,300,Start Time Cannot Vary,16 to 30 Minutes,No
1,3000056,1,2,0,1,3,2627,2626,9,0,...,Fairfield,Auto Passenger,,8,No,100,300,Start Time Cannot Vary,16 to 30 Minutes,No
2,3000056,1,2,1,0,3,2626,2627,0,1,...,Fairfield,Auto Passenger,,8,No,100,300,Start Time Cannot Vary,16 to 30 Minutes,No
3,3000056,1,2,0,1,3,2627,2626,1,0,...,Fairfield,Auto Passenger,,8,No,100,300,Start Time Cannot Vary,16 to 30 Minutes,No
4,3000056,1,2,1,0,3,2626,2628,0,9,...,Fairfield,Auto Passenger,,8,No,100,300,Start Time Cannot Vary,16 to 30 Minutes,No


In [37]:
# Select target variable and feature space
X = pd.get_dummies(df[['ODTPURP2_R', 'GENDER', 'AGE_R', 'INCOM_R', 'HHVEH', 'LIC', 'WORKS', 'TRIPDIST_R1',
                'TRPDUR_R', 'OTAZ', 'OTRACT', 'DTAZ', 'DTRACT', 'TOD_R']])
Y = df['PMODE_R']


X.dropna(inplace=True)
Y.dropna(inplace=True)
X_train,X_test,y_train,y_test=train_test_split(X, Y, test_size=0.3, random_state=999)

In [38]:
# Find best C parameter
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth': np.linspace(1,50,30)}
dtc = DTC()
C_param = GridSearchCV(dtc, param_grid, cv=5)
C_param.fit(X_train, y_train)
C_param.best_params_

{'max_depth': 9.4482758620689662}

In [39]:
# Cross validate using best C parameter
avg_acc = []
for i in range(10):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.33, random_state=i) #Use random_state to fix samples
    X_train_dummies = pd.get_dummies(X_train)
    X_test_dummies = pd.get_dummies(X_test)

    clf = DTC(max_depth=C_param.best_params_.values()[0]) 
    clf.fit(X_train_dummies, Y_train)

    avg_acc.append(1.0*(clf.predict(X_test_dummies)==np.asarray(Y_test)).sum()/len(Y_test))

print ("Successfully (OS) predict {}% of the modes".format(np.mean(avg_acc)*100))

Successfully (OS) predict 86.1288658585% of the modes


In [40]:
Feature_importance=pd.DataFrame([list(X_train.columns),list(clf.feature_importances_)]).T
Feature_importance.columns=["variables","importance"]
Feature_importance.sort_values(by="importance",ascending=False).iloc[:15,:]

Unnamed: 0,variables,importance
5,OTAZ,0.322695
3,TRIPDIST_R1,0.224008
2,HHVEH,0.114076
7,DTAZ,0.0937297
4,TRPDUR_R,0.0926981
21,LIC_Yes,0.0829692
0,ODTPURP2_R,0.0316069
8,DTRACT,0.0106503
6,OTRACT,0.00711198
9,TOD_R,0.00707497
