### Decision Tree Model - Trip Departure Time

In [61]:
# Import Necessary Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as DTR
from sklearn.metrics import roc_auc_score as rs
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
% matplotlib inline

In [62]:
# Read the filtered trip diary CSV, taking its first column as the row index,
# then preview the first five rows.
df = pd.read_csv(
    filepath_or_buffer='../Data/trip_diary_filtered.csv',
    index_col=0,
)
df.head(5)

Unnamed: 0,PLSAM,SAMPN,PERNO,PLANO,OTAZ,OTRACT,DTAZ,DTRACT,LTMODE_AGG,TRPDIST_HN,...,WTAZ,WTRACT,STUDE,SCHOL,STAZ,STRACT,PTRIPS_V,PTRIPS_T,PTRIPS_NM,PTRIPS_O
0,30000560102,3000056,1,2,2626,5100,2627,9001205200,1.0,2.81,...,2627,9000000000.0,No,,,,6,0,0,0
1,30000560103,3000056,1,3,2627,5200,2626,9001205100,1.0,2.81,...,2627,9000000000.0,No,,,,6,0,0,0
2,30000560104,3000056,1,4,2626,5100,2627,9001205200,1.0,2.81,...,2627,9000000000.0,No,,,,6,0,0,0
6,30000560202,3000056,2,2,2626,5100,2676,34003016000,1.0,69.56,...,2676,34000000000.0,No,,,,2,0,0,0
7,30000560203,3000056,2,3,2676,6000,2626,9001205100,1.0,66.12,...,2676,34000000000.0,No,,,,2,0,0,0


In [63]:
# Show the column labels of the loaded frame, to see which fields are
# available before choosing the ones used by the model.
df.columns

Index([u'PLSAM', u'SAMPN', u'PERNO', u'PLANO', u'OTAZ', u'OTRACT', u'DTAZ',
       u'DTRACT', u'LTMODE_AGG', u'TRPDIST_HN', u'TRPDUR', u'TRP_DEP_HR',
       u'TRP_DEP_MIN', u'TRP_ARR_HR', u'TRP_ARR_MIN', u'TOUR_PURP', u'HHSIZ',
       u'INCOM', u'HHVEH', u'HHCHD', u'RESTY', u'GENDER', u'AGE_R', u'RACE',
       u'EMPLY', u'WORKS', u'WDAYS', u'INDUS', u'OCCUP', u'WTAZ', u'WTRACT',
       u'STUDE', u'SCHOL', u'STAZ', u'STRACT', u'PTRIPS_V', u'PTRIPS_T',
       u'PTRIPS_NM', u'PTRIPS_O'],
      dtype='object')

In [64]:
# Keep only the columns used for the decision tree: the two departure-time
# fields come first, followed by the candidate predictor columns.
model_columns = [
    'TRP_DEP_HR', 'TRP_DEP_MIN', 'TOUR_PURP',
    'HHSIZ', 'INCOM', 'HHVEH', 'HHCHD', 'RESTY',
    'GENDER', 'AGE_R', 'RACE',
    'EMPLY', 'WORKS', 'WDAYS', 'INDUS', 'OCCUP', 'WTAZ', 'WTRACT',
    'STUDE', 'SCHOL', 'STAZ', 'STRACT',
]
data = df[model_columns]
data.head()

Unnamed: 0,TRP_DEP_HR,TRP_DEP_MIN,TOUR_PURP,HHSIZ,INCOM,HHVEH,HHCHD,RESTY,GENDER,AGE_R,...,WORKS,WDAYS,INDUS,OCCUP,WTAZ,WTRACT,STUDE,SCHOL,STAZ,STRACT
0,7,29,9,6,5,3,2,1,Female,35-54 years,...,Works,Five,EDUCATIONAL SERVICES,OFFICE AND ADMINISTRATIVE SUPPORT OCCUPATIONS,2627,9000000000.0,No,,,
1,8,35,9,6,5,3,2,1,Female,35-54 years,...,Works,Five,EDUCATIONAL SERVICES,OFFICE AND ADMINISTRATIVE SUPPORT OCCUPATIONS,2627,9000000000.0,No,,,
2,12,37,1,6,5,3,2,1,Female,35-54 years,...,Works,Five,EDUCATIONAL SERVICES,OFFICE AND ADMINISTRATIVE SUPPORT OCCUPATIONS,2627,9000000000.0,No,,,
6,6,15,1,6,5,3,2,1,Male,35-54 years,...,Works,Five,MANUFACTURING,PERSONAL CARE AND SERVICE OCCUPATIONS,2676,34000000000.0,No,,,
7,18,7,1,6,5,3,2,1,Male,35-54 years,...,Works,Five,MANUFACTURING,PERSONAL CARE AND SERVICE OCCUPATIONS,2676,34000000000.0,No,,,


In [65]:
# For each column, report True when at least one of its values is missing.
data.isnull().any(axis=0)

TRP_DEP_HR     False
TRP_DEP_MIN    False
TOUR_PURP      False
HHSIZ          False
INCOM          False
HHVEH          False
HHCHD          False
RESTY          False
GENDER         False
AGE_R          False
RACE           False
EMPLY           True
WORKS           True
WDAYS           True
INDUS           True
OCCUP           True
WTAZ            True
WTRACT          True
STUDE          False
SCHOL           True
STAZ            True
STRACT          True
dtype: bool

In [66]:
# Replace every missing value with the literal label 'Not Applicable', so that
# "missing" is carried forward as an ordinary value instead of NaN.
data = data.fillna(value='Not Applicable')
data.head()

Unnamed: 0,TRP_DEP_HR,TRP_DEP_MIN,TOUR_PURP,HHSIZ,INCOM,HHVEH,HHCHD,RESTY,GENDER,AGE_R,...,WORKS,WDAYS,INDUS,OCCUP,WTAZ,WTRACT,STUDE,SCHOL,STAZ,STRACT
0,7,29,9,6,5,3,2,1,Female,35-54 years,...,Works,Five,EDUCATIONAL SERVICES,OFFICE AND ADMINISTRATIVE SUPPORT OCCUPATIONS,2627,9000000000.0,No,Not Applicable,Not Applicable,Not Applicable
1,8,35,9,6,5,3,2,1,Female,35-54 years,...,Works,Five,EDUCATIONAL SERVICES,OFFICE AND ADMINISTRATIVE SUPPORT OCCUPATIONS,2627,9000000000.0,No,Not Applicable,Not Applicable,Not Applicable
2,12,37,1,6,5,3,2,1,Female,35-54 years,...,Works,Five,EDUCATIONAL SERVICES,OFFICE AND ADMINISTRATIVE SUPPORT OCCUPATIONS,2627,9000000000.0,No,Not Applicable,Not Applicable,Not Applicable
6,6,15,1,6,5,3,2,1,Male,35-54 years,...,Works,Five,MANUFACTURING,PERSONAL CARE AND SERVICE OCCUPATIONS,2676,34000000000.0,No,Not Applicable,Not Applicable,Not Applicable
7,18,7,1,6,5,3,2,1,Male,35-54 years,...,Works,Five,MANUFACTURING,PERSONAL CARE AND SERVICE OCCUPATIONS,2676,34000000000.0,No,Not Applicable,Not Applicable,Not Applicable


In [67]:
# The model target is the departure hour alone, so the minute column is removed
# here to keep it out of the feature matrix built from the remaining columns.
# `axis` is passed by keyword (the positional form is rejected by pandas 2.x) and
# the result is reassigned instead of mutating in place; errors='ignore' lets the
# cell be re-run after the column is already gone.
data = data.drop(['TRP_DEP_MIN'], axis=1, errors='ignore')
data.head()

Unnamed: 0,TRP_DEP_HR,TOUR_PURP,HHSIZ,INCOM,HHVEH,HHCHD,RESTY,GENDER,AGE_R,RACE,...,WORKS,WDAYS,INDUS,OCCUP,WTAZ,WTRACT,STUDE,SCHOL,STAZ,STRACT
0,7,9,6,5,3,2,1,Female,35-54 years,White,...,Works,Five,EDUCATIONAL SERVICES,OFFICE AND ADMINISTRATIVE SUPPORT OCCUPATIONS,2627,9000000000.0,No,Not Applicable,Not Applicable,Not Applicable
1,8,9,6,5,3,2,1,Female,35-54 years,White,...,Works,Five,EDUCATIONAL SERVICES,OFFICE AND ADMINISTRATIVE SUPPORT OCCUPATIONS,2627,9000000000.0,No,Not Applicable,Not Applicable,Not Applicable
2,12,1,6,5,3,2,1,Female,35-54 years,White,...,Works,Five,EDUCATIONAL SERVICES,OFFICE AND ADMINISTRATIVE SUPPORT OCCUPATIONS,2627,9000000000.0,No,Not Applicable,Not Applicable,Not Applicable
6,6,1,6,5,3,2,1,Male,35-54 years,White,...,Works,Five,MANUFACTURING,PERSONAL CARE AND SERVICE OCCUPATIONS,2676,34000000000.0,No,Not Applicable,Not Applicable,Not Applicable
7,18,1,6,5,3,2,1,Male,35-54 years,White,...,Works,Five,MANUFACTURING,PERSONAL CARE AND SERVICE OCCUPATIONS,2676,34000000000.0,No,Not Applicable,Not Applicable,Not Applicable


In [74]:
# Target: the departure hour. Features: every column after the first one,
# with non-numeric columns expanded into indicator columns by get_dummies.
Y = data['TRP_DEP_HR']
X = pd.get_dummies(data.iloc[:, 1:])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=999,
)

In [75]:
X.columns

Index([u'TOUR_PURP', u'HHSIZ', u'INCOM', u'HHVEH', u'HHCHD', u'RESTY',
       u'GENDER_Female', u'GENDER_Male', u'GENDER_RF', u'AGE_R_16-18 years',
       ...
       u'STAZ_99', u'STAZ_Location Outside of NYBPM Area',
       u'STAZ_Not Applicable', u'STRACT_3.40E+10', u'STRACT_3.60E+10',
       u'STRACT_3.61E+10', u'STRACT_9.00E+09', u'STRACT_9.01E+09',
       u'STRACT_Location Outside of NYBPM Area', u'STRACT_Not Applicable'],
      dtype='object', length=3066)

In [76]:
# Fit a decision tree with default hyper-parameters on the training rows and
# predict the departure hour for the held-out rows.
# random_state is pinned (same seed value as the train/test split) because
# scikit-learn randomly permutes the candidate features at each split; without a
# seed the fitted tree, the score and the feature importances can differ
# between runs of this notebook.
dtr = DTR(random_state=999)
dtr.fit(X_train, y_train)
pred = dtr.predict(X_test)

In [77]:
# Macro-averaged one-vs-rest ROC AUC over the departure hours.
# Both indicator matrices are built against the SAME ordered list of hours (those
# present in y_test). One-hot encoding y_test and pred independently would give
# misaligned or differently-sized matrices whenever the tree never predicts some
# hour that occurs in the test set, or predicts one that does not.
# NOTE(review): the score is computed from hard 0/1 predictions, not class
# probabilities, so it is a coarse measure; dtr.predict_proba(X_test) would give
# a ranking-based AUC if that is what is wanted.
hours_in_test = np.unique(y_test)
y_test_indicator = (np.asarray(y_test)[:, None] == hours_in_test).astype(int)
pred_indicator = (np.asarray(pred)[:, None] == hours_in_test).astype(int)
rs(y_test_indicator, pred_indicator)

0.52515579727153583

In [78]:
# Pair each feature name with the importance the fitted tree assigned to it.
# The frame is built column-wise so that `importance` keeps its float dtype;
# transposing a list of [names, importances] would make both columns
# object-dtype. `columns=` fixes the column order explicitly.
Feature_importance = pd.DataFrame(
    {
        "variables": list(X_train.columns),
        "importance": dtr.feature_importances_,
    },
    columns=["variables", "importance"],
)

# Show the 15 features with the largest importance.
Feature_importance.sort_values(by="importance", ascending=False).head(15)

Unnamed: 0,variables,importance
2,INCOM,0.0922346
3,HHVEH,0.0685385
1,HHSIZ,0.0575951
0,TOUR_PURP,0.0411057
4,HHCHD,0.0272525
5,RESTY,0.0246217
12,AGE_R_35-54 years,0.0213423
13,AGE_R_55-64 years,0.0191846
26,RACE_White,0.0174834
2488,WTRACT_3.61E+10,0.0129678
