In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier  
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import accuracy_score, recall_score, precision_score
import matplotlib.pyplot as plt

%matplotlib inline



In [6]:
churn = pd.read_csv('data/cleaned_churn.csv', index_col = 0)

In [7]:
churn['last_trip_date'] = pd.to_datetime(churn['last_trip_date'])
churn['signup_date'] = pd.to_datetime(churn['signup_date'])

In [18]:
earliest = churn['signup_date'].min().toordinal()

In [20]:
churn['signup_date'] = churn['signup_date'].apply(lambda x: x.toordinal() - earliest)

In [21]:
X = churn.drop(['last_trip_date','retention'], axis=1)
y = churn['retention']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [46]:
model = GradientBoostingClassifier(learning_rate = .1,
                                 n_estimators=100)

In [47]:
model.fit(X_train, y_train)
feat_imp = model.feature_importances_

In [48]:
yhat = model.predict(X_test)

In [49]:
yhat

array([ True, False,  True, ..., False, False,  True])

In [51]:
from sklearn.model_selection import cross_validate

scoring = {'acc': 'accuracy',
           'prec': 'precision',
           'rec': 'recall'}

scores = cross_validate(GradientBoostingClassifier(), X_train, y_train, scoring=scoring,
                         cv=10, return_train_score=True)
print(scores.keys())
print(np.mean(scores['test_acc'])) 
print(np.mean(scores['test_prec']))
print(np.mean(scores['test_rec']))

dict_keys(['fit_time', 'score_time', 'test_acc', 'train_acc', 'test_prec', 'train_prec', 'test_rec', 'train_rec'])
0.7886333333333333
0.7460465353175694
0.6620458679319889


In [31]:
for a, b in zip(X.columns, feat_imp):
    print(f"{a} : {b}")

avg_dist : 0.018007252677353636
avg_rating_by_driver : 0.22732149969624085
avg_rating_of_driver : 0.011906099882766138
avg_surge : 0.0043499868399234955
surge_pct : 0.1863985912999881
trips_in_first_30_days : 0.04401350413575053
luxury_car_user : 0.06908522816760335
weekday_pct : 0.16936555995377836
rating_by_driver? : 0.0008269753609271424
rating_of_driver? : 0.0007987614404506778
King's Landing : 0.1654351791912978
Winterfell : 0.014682716808935495
Android : 0.03762522016960844
iPhone : 0.05018342437537609


In [32]:
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm

In [38]:
gradient_boosting_grid = {'learning_rate': [0.1, 0.15, 0.05],
                          'max_depth': [2, 4, 6],
                          'min_samples_leaf': [1, 2, .5, .1],
                          'max_features': [1.0, 0.3, 0.1,'sqrt','log2'],
                          'subsample' : [1, 0.5, 0.25],
                          'n_estimators': [100],
                          'random_state': [1]}
grid_seearch_boosted_cleaned = GridSearchCV(estimator=GradientBoostingClassifier(), 
                                            n_jobs = -1,
                                            param_grid=gradient_boosting_grid, 
                                             cv=5, verbose=1, return_train_score=True)

tqdm(grid_seearch_boosted_cleaned.fit(X_train,y_train))



Fitting 5 folds for each of 540 candidates, totalling 2700 fits


0it [00:00, ?it/s]

<tqdm.std.tqdm at 0x7fded30e1100>

In [53]:
grid_seearch_boosted_cleaned.best_params_


{'learning_rate': 0.15,
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'n_estimators': 100,
 'random_state': 1,
 'subsample': 1}

In [46]:
grid_seearch_boosted_cleaned.best_score_

0.7898333333333334

In [23]:
g = GradientBoostingClassifier(criterion = 'gini')

In [51]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score


In [54]:
grid_seearch_boosted_cleaned.best_params_

{'learning_rate': 0.15,
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'n_estimators': 100,
 'random_state': 1,
 'subsample': 1}

In [43]:
from sklearn.model_selection import cross_validate

In [55]:
from sklearn.model_selection import cross_validate

scoring = {'acc': 'accuracy',
           'prec': 'precision',
           'rec': 'recall'}

scores = cross_validate(GradientBoostingClassifier(learning_rate = 0.15, max_depth = 4, max_features =  'sqrt', 
                                                  min_samples_leaf = 1,
                                                  n_estimators = 100,
                                                  random_state = 1), X_train, y_train, scoring=scoring,
                         cv=10, return_train_score=True)
print(scores.keys())
print(np.mean(scores['test_acc'])) 
print(np.mean(scores['test_prec']))
print(np.mean(scores['test_rec']))

dict_keys(['fit_time', 'score_time', 'test_acc', 'train_acc', 'test_prec', 'train_prec', 'test_rec', 'train_rec'])
0.7896666666666666
0.7484210158525815
0.6652509721994007
