 # Libraries

In [1]:
#importing required libraries
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import export_graphviz
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# Getting Data

In [2]:
#importing the data from tip0 and tip1 tables at GBQ as Dataframes
projectid = "newyorkdavitabgaryan"

tip0 = pd.read_gbq('SELECT * FROM New_York.tip0', projectid)

Requesting query... ok.
Job ID: c5f2d432-6cbe-4177-9139-2682952170a8
Query running...
Query done.
Cache hit.

Retrieving results...
Got 50000 rows.

Total time taken 11.84 s.
Finished at 2018-03-15 17:14:41.


In [3]:
tip1 = pd.read_gbq('SELECT * FROM New_York.tip1', projectid)

Requesting query... ok.
Job ID: 743831e1-f30c-481a-83ba-fd196e8d6c1d
Query running...
Query done.
Cache hit.

Retrieving results...
Got 50000 rows.

Total time taken 10.6 s.
Finished at 2018-03-15 17:14:57.


In [4]:
#combining the data
data=tip0.append(tip1,ignore_index=False, verify_integrity=False)
data.head()

Unnamed: 0,tip,tip_amount,Hr,Wk,TripMonth,time,trip_distance,fare_amount,extra,mta_tax,tolls_amount,rate_code,passenger_count
0,0,0.0,0,1,7,26.216667,7.66,25.0,0.5,0.5,0.0,1,2
1,0,0.0,0,1,12,36.416667,6.4,27.0,0.5,0.5,0.0,1,1
2,0,0.0,0,1,6,44.716667,16.6,49.5,0.5,0.5,5.54,1,1
3,0,0.0,0,1,10,29.2,9.0,29.5,0.5,0.5,0.0,1,1
4,0,0.0,0,1,10,31.833333,5.23,23.0,0.5,0.5,0.0,1,2


# Cleaning Data

In [5]:
#having look at the data types
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 49999
Data columns (total 13 columns):
tip                100000 non-null object
tip_amount         100000 non-null float64
Hr                 100000 non-null int32
Wk                 100000 non-null int32
TripMonth          100000 non-null int32
time               100000 non-null float64
trip_distance      100000 non-null float64
fare_amount        100000 non-null float64
extra              100000 non-null float64
mta_tax            100000 non-null float64
tolls_amount       100000 non-null float64
rate_code          100000 non-null int32
passenger_count    100000 non-null int32
dtypes: float64(7), int32(5), object(1)
memory usage: 8.8+ MB


In [6]:
# counting values for "tip"
100.0*data.tip.value_counts()/len(data)

0    50.0
1    50.0
Name: tip, dtype: float64

In [7]:
#dropping tip_amount
data=data.drop(['tip_amount'], axis=1)
data.head()

Unnamed: 0,tip,Hr,Wk,TripMonth,time,trip_distance,fare_amount,extra,mta_tax,tolls_amount,rate_code,passenger_count
0,0,0,1,7,26.216667,7.66,25.0,0.5,0.5,0.0,1,2
1,0,0,1,12,36.416667,6.4,27.0,0.5,0.5,0.0,1,1
2,0,0,1,6,44.716667,16.6,49.5,0.5,0.5,5.54,1,1
3,0,0,1,10,29.2,9.0,29.5,0.5,0.5,0.0,1,1
4,0,0,1,10,31.833333,5.23,23.0,0.5,0.5,0.0,1,2


In [8]:
#changing type of "tip" from object to integer
data.tip = data.tip.astype("category").cat.reorder_categories(["0","1"]).cat.codes

In [9]:
#counting values again
data.tip.value_counts()

1    50000
0    50000
Name: tip, dtype: int64

# Target, Features and Train Test Split

In [10]:
# assigning target and features
y = data.tip
x = data.drop(["tip"],axis=1)
x.head()

Unnamed: 0,Hr,Wk,TripMonth,time,trip_distance,fare_amount,extra,mta_tax,tolls_amount,rate_code,passenger_count
0,0,1,7,26.216667,7.66,25.0,0.5,0.5,0.0,1,2
1,0,1,12,36.416667,6.4,27.0,0.5,0.5,0.0,1,1
2,0,1,6,44.716667,16.6,49.5,0.5,0.5,5.54,1,1
3,0,1,10,29.2,9.0,29.5,0.5,0.5,0.0,1,1
4,0,1,10,31.833333,5.23,23.0,0.5,0.5,0.0,1,2


In [11]:
# split 75% train and 25% test
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25,random_state=42)

# Decision Tree Classifier

In [12]:
# grid searching best parameters
test_model = DecisionTreeClassifier(random_state=42)

parameters = dict(max_depth=[i for i in range(5,16)], min_samples_leaf = [i for i in range(100,1000,50)])
selector = GridSearchCV(test_model, parameters)

selector.fit(x_train,y_train)
print(selector.best_params_)

{'max_depth': 7, 'min_samples_leaf': 200}


In [13]:
#appling DTC model
model_tree = DecisionTreeClassifier(random_state=42, max_depth=7,min_samples_leaf=200, class_weight="balanced")
model_tree.fit(x_train,y_train)
print(model_tree.score(x_train,y_train)*100)
print(model_tree.score(x_test,y_test)*100)

89.16266666666667
89.076


In [14]:
# doing cross-validation
print(np.mean(cross_val_score(model_tree,x,y,cv=10))*100)

71.16400000000002


In [15]:
# calculating recall and roc_auc scores
prediction = model_tree.predict(x_test)
print (recall_score(y_test,prediction)*100)
print(roc_auc_score(y_test,prediction)*100)

94.68409236690187
89.08853405062845


In [16]:
# viewing feature importances
model_tree.feature_importances_

array([4.99895575e-03, 4.94640356e-04, 1.38231809e-05, 1.23531248e-03,
       1.58351588e-03, 9.36125424e-01, 1.27745123e-03, 0.00000000e+00,
       7.54293722e-03, 4.64694412e-02, 2.58499001e-04])

In [17]:
# showing them as dataframe
tree_coef = pd.DataFrame(data=model_tree.feature_importances_.reshape(-1,1),index=x_train.columns,columns=["coefficient"])
tree_coef.head(12)

Unnamed: 0,coefficient
Hr,0.004999
Wk,0.000495
TripMonth,1.4e-05
time,0.001235
trip_distance,0.001584
fare_amount,0.936125
extra,0.001277
mta_tax,0.0
tolls_amount,0.007543
rate_code,0.046469


In [18]:
# selecting imprortant features
selected = tree_coef[tree_coef.coefficient>0.01].sort_values("coefficient", axis=0, ascending=False)
selected.head()

Unnamed: 0,coefficient
fare_amount,0.936125
rate_code,0.046469


In [19]:
# fittig final model to data with only selected features
selected_features = selected.index
x_train_selected = x_train[selected_features]
x_test_selected = x_test[selected_features]

model_tree.fit(x_train_selected, y_train)
print(model_tree.score(x_train_selected, y_train)*100)
print(model_tree.score(x_test_selected, y_test)*100)

prediction = model_tree.predict(x_test_selected)
print (recall_score(y_test,prediction)*100)
print(roc_auc_score(prediction, y_test)*100)

export_graphviz(model_tree,out_file="tree.dot",filled=True,feature_names = x_train_selected.columns)

88.96133333333334
88.724
94.41148171905067
89.22646737126378


# Random Forest

In [20]:
# applying Random forest model

model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(x_train,y_train)
print(model_rf.score(x_train,y_train)*100)
print(model_rf.score(x_test,y_test)*100)

# calculating recall and roc_auc scores
prediction = model_rf.predict(x_test)
print (recall_score(y_test,prediction)*100)
print(roc_auc_score(y_test,prediction)*100)

99.26266666666666
87.676
90.4746632456703
87.68225499448266


# Gradient Boosting

In [21]:
model_gb = GradientBoostingClassifier(random_state=42)
model_gb.fit(x_train,y_train)
print(model_gb.score(x_train,y_train)*100)
print(model_gb.score(x_test,y_test)*100)

# calculating recall and roc_auc scores
prediction = model_gb.predict(x_test)
print (recall_score(y_test,prediction)*100)
print(roc_auc_score(y_test,prediction)*100)

89.34
89.092
94.60391276459269
89.10431908983148


In [22]:
# showing feature importances as dataframe
tree_coef = pd.DataFrame(data=model_gb.feature_importances_.reshape(-1,1),index=x_train.columns,columns=["coefficient"])
tree_coef.head(12)

Unnamed: 0,coefficient
Hr,0.046884
Wk,0.039783
TripMonth,0.017999
time,0.109504
trip_distance,0.086643
fare_amount,0.448927
extra,0.026016
mta_tax,0.002808
tolls_amount,0.073541
rate_code,0.106373


In [23]:
# selecting imprortant features
selected = tree_coef[tree_coef.coefficient>0.05].sort_values("coefficient", axis=0, ascending=False)
selected.head()

Unnamed: 0,coefficient
fare_amount,0.448927
time,0.109504
rate_code,0.106373
trip_distance,0.086643
tolls_amount,0.073541


In [24]:
# fittig final model to data with only selected features
selected_features = selected.index
x_train_selected = x_train[selected_features]
x_test_selected = x_test[selected_features]

model_gb.fit(x_train_selected, y_train)
print(model_gb.score(x_train_selected, y_train)*100)
print(model_gb.score(x_test_selected, y_test)*100)

prediction = model_gb.predict(x_test_selected)
print (recall_score(y_test,prediction)*100)
print(roc_auc_score(prediction, y_test)*100)


89.30933333333333
89.032
94.6119307248236
89.51898355348179


# Logistic Regression

In [25]:
# applying Logistic Regression on initial data
model_logit = LogisticRegression(random_state=42)
model_logit.fit(x_train,y_train)
print(model_logit.score(x_train,y_train)*100)
print(model_logit.score(x_test,y_test)*100)

# calculating recall and roc_auc scores
prediction = model_logit.predict(x_test)
print (recall_score(y_test,prediction)*100)
print(roc_auc_score(y_test,prediction)*100)

86.176
86.0
87.79666452854393
86.00401553374834


In [26]:
# applying Logistic Regression on selected features
model_logit = LogisticRegression(random_state=42)
model_logit.fit(x_train_selected,y_train)
print(model_logit.score(x_train_selected,y_train)*100)
print(model_logit.score(x_test_selected,y_test)*100)

# calculating recall and roc_auc scores
prediction = model_logit.predict(x_test_selected)
print (recall_score(y_test,prediction)*100)
print(roc_auc_score(y_test,prediction)*100)

86.58533333333334
86.252
88.47017318794099
86.25695760291046
