In [239]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor, XGBClassifier, XGBRFRegressor, XGBRFClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier, VotingRegressor
# Silence warnings
import warnings
warnings.filterwarnings('ignore')

In [29]:
# Read only the first 10,000 rows of the cab-rides CSV (nrows caps the load),
# then preview the top of the resulting frame.
import pandas as pd
df = pd.read_csv('cab_rides.csv', nrows=10000)
df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   distance          10000 non-null  float64
 1   cab_type          10000 non-null  object 
 2   time_stamp        10000 non-null  int64  
 3   destination       10000 non-null  object 
 4   source            10000 non-null  object 
 5   price             9227 non-null   float64
 6   surge_multiplier  10000 non-null  float64
 7   id                10000 non-null  object 
 8   product_id        10000 non-null  object 
 9   name              10000 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 781.4+ KB


In [31]:
df[df.isna().any(axis=1)]

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name
18,1.11,Uber,1543673584211,West End,North End,,1.0,fa5fb705-03a0-4eb9-82d9-7fe80872f754,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
31,2.48,Uber,1543794776318,South Station,Beacon Hill,,1.0,eee70d94-6706-4b95-a8ce-0e34f0fa8f37,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
40,2.94,Uber,1543523885298,Fenway,North Station,,1.0,7f47ff53-7cf2-4a6a-8049-83c90e042593,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
60,1.16,Uber,1544731816318,West End,North End,,1.0,43abdbe4-ab9e-4f39-afdc-31cfa375dc25,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
69,2.67,Uber,1543583283653,Beacon Hill,North End,,1.0,80db1c49-9d51-4575-a4f4-1ec23b4d3e31,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
...,...,...,...,...,...,...,...,...,...,...
9949,1.08,Uber,1543272429665,North End,North Station,,1.0,74fffcba-da67-42d1-b585-13d546a125be,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
9953,2.46,Uber,1545045010035,Beacon Hill,Fenway,,1.0,18c2e91d-d594-4a22-9be7-0a5829efa4bf,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
9965,2.58,Uber,1544815809335,Beacon Hill,South Station,,1.0,77adadfb-4ac7-4cdf-aeab-6c4cfe8f7b26,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
9985,1.89,Uber,1544695512211,Beacon Hill,North End,,1.0,f2dfa974-f9d1-4e90-a0e6-77f7eea16956,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi


In [32]:
# Remove, in place, every row that has at least one missing value. Per the
# df.info() output above, `price` is the only column with nulls (9227 of 10000
# non-null), so this discards the rides that have no recorded price.
df.dropna(inplace=True)

In [35]:
# First attempt at parsing the timestamps. With no unit given, the integers are
# read as nanoseconds since the epoch, which is why every date in the output
# below lands on 1970-01-01. The next cell redoes the conversion correctly.
df['date'] = pd.to_datetime(df['time_stamp'])
df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared,1970-01-01 00:25:44.952607890
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux,1970-01-01 00:25:43.284023677
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft,1970-01-01 00:25:43.366822198
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL,1970-01-01 00:25:43.553582749
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL,1970-01-01 00:25:43.463360223


In [36]:
# time_stamp holds epoch milliseconds, so tell pandas the unit directly instead
# of scaling the integers up to nanoseconds by hand.
# NOTE(review): the parsed datetimes carry no timezone; if the hour-based
# features derived later are meant to reflect local time, convert first — confirm.
df['date'] = pd.to_datetime(df['time_stamp'], unit='ms')
df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared,2018-12-16 09:30:07.890
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux,2018-11-27 02:00:23.677
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft,2018-11-28 01:00:22.198
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL,2018-11-30 04:53:02.749
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL,2018-11-29 03:49:20.223


In [39]:
import datetime as dt
# Derive calendar features from the parsed timestamp through the pandas
# datetime accessor (the `.dt` attribute of the column, not the module alias).
stamp = df['date'].dt
df['month'] = stamp.month
df['hour'] = stamp.hour
df['dayofweek'] = stamp.dayofweek

In [40]:
def weekend(row):
    """Flag a record as falling on a weekend.

    Parameters
    ----------
    row : mapping-like
        Any record that can be indexed with 'dayofweek'.

    Returns
    -------
    int
        1 when row['dayofweek'] is 5 or 6, otherwise 0.
    """
    return 1 if row['dayofweek'] in (5, 6) else 0

# Vectorised weekend flag: 1 where dayofweek is 5 or 6, else 0. This gives the
# same values as applying weekend() to every row, without a Python-level call
# per row.
df['weekend'] = df['dayofweek'].isin([5, 6]).astype(int)

In [42]:
def rush_hour(row):
    """Flag a record as a weekday rush-hour ride.

    Parameters
    ----------
    row : mapping-like
        Any record that can be indexed with 'hour' and 'weekend'.

    Returns
    -------
    int
        1 when row['hour'] is one of 6-9 or 15-18 and row['weekend'] is 0,
        otherwise 0.
    """
    is_peak_hour = row['hour'] in (6, 7, 8, 9, 15, 16, 17, 18)
    # Logical `and` (not bitwise `&`) is the right operator for two scalar tests.
    return int(is_peak_hour and row['weekend'] == 0)

# Vectorised rush-hour flag: 1 for rides whose hour is 6-9 or 15-18 on a
# non-weekend day, else 0. This gives the same values as applying rush_hour()
# to every row, computed column-wise.
df['rush_hour'] = (
    df['hour'].isin([6, 7, 8, 9, 15, 16, 17, 18]) & (df['weekend'] == 0)
).astype(int)

In [44]:
df.tail()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date,month,hour,dayofweek,weekend,rush_hour
9995,3.05,Uber,1543504379037,Fenway,North Station,11.5,1.0,934d2fbe-f978-4495-9786-da7b4dd21107,997acbb5-e102-41e1-b155-9df7de0a73f2,UberPool,2018-11-29 15:12:59.037,11,15,3,0,1
9996,3.05,Uber,1543800477997,Fenway,North Station,26.0,1.0,af8fd57c-fe7c-4584-bd1f-beef1a53ad42,6c84fd89-3f11-4782-9b50-97c468b19529,Black,2018-12-03 01:27:57.997,12,1,0,0,0
9997,3.05,Uber,1543407083241,Fenway,North Station,19.5,1.0,b3c5db97-554b-47bf-908b-3ac880e86103,6f72dfc5-27f1-42e8-84db-ccc7a75f6969,UberXL,2018-11-28 12:11:23.241,11,12,2,0,0
9998,3.05,Uber,1544896813623,Fenway,North Station,36.5,1.0,fcb35184-9047-43f7-8909-f62a7b17b6cf,6d318bcc-22a3-4af6-bddd-b409bfce1546,Black SUV,2018-12-15 18:00:13.623,12,18,5,1,0
9999,2.03,Lyft,1543812781166,Theatre District,Northeastern University,7.0,1.0,7f0e8caf-e057-41eb-bdef-27eb14c88122,lyft_line,Shared,2018-12-03 04:53:01.166,12,4,0,0,0


In [53]:
df['cab_type'].value_counts()

Uber    4654
Lyft    4573
Name: cab_type, dtype: int64

In [55]:
# Attach to each ride the number of rows that share its cab_type.
rides_by_cab_type = df.groupby('cab_type')['cab_type']
df['cab_freq'] = rides_by_cab_type.transform('count')

In [57]:
# Express cab_freq as the fraction of rides belonging to each cab_type.
# It is recomputed from cab_type rather than dividing the existing column, so
# re-running this cell cannot shrink the values a second time.
df['cab_freq'] = df.groupby('cab_type')['cab_type'].transform('count') / len(df)

In [59]:
df.tail()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date,month,hour,dayofweek,weekend,rush_hour,cab_freq
9995,3.05,Uber,1543504379037,Fenway,North Station,11.5,1.0,934d2fbe-f978-4495-9786-da7b4dd21107,997acbb5-e102-41e1-b155-9df7de0a73f2,UberPool,2018-11-29 15:12:59.037,11,15,3,0,1,0.504389
9996,3.05,Uber,1543800477997,Fenway,North Station,26.0,1.0,af8fd57c-fe7c-4584-bd1f-beef1a53ad42,6c84fd89-3f11-4782-9b50-97c468b19529,Black,2018-12-03 01:27:57.997,12,1,0,0,0,0.504389
9997,3.05,Uber,1543407083241,Fenway,North Station,19.5,1.0,b3c5db97-554b-47bf-908b-3ac880e86103,6f72dfc5-27f1-42e8-84db-ccc7a75f6969,UberXL,2018-11-28 12:11:23.241,11,12,2,0,0,0.504389
9998,3.05,Uber,1544896813623,Fenway,North Station,36.5,1.0,fcb35184-9047-43f7-8909-f62a7b17b6cf,6d318bcc-22a3-4af6-bddd-b409bfce1546,Black SUV,2018-12-15 18:00:13.623,12,18,5,1,0,0.504389
9999,2.03,Lyft,1543812781166,Theatre District,Northeastern University,7.0,1.0,7f0e8caf-e057-41eb-bdef-27eb14c88122,lyft_line,Shared,2018-12-03 04:53:01.166,12,4,0,0,0,0.495611


In [83]:
from sklearn.datasets import load_breast_cancer

In [202]:
X, y = load_breast_cancer(return_X_y=True)

In [203]:
kfold = StratifiedKFold(n_splits=5)

In [204]:
from sklearn.model_selection import cross_val_score

def classification_model(model, X_data=None, y_data=None, cv=None):
    """Return the mean cross-validation score of `model`.

    Parameters
    ----------
    model : estimator
        Passed straight to cross_val_score.
    X_data, y_data : optional
        Features and target to score on. When omitted, the notebook-level
        `X` and `y` are used, as before.
    cv : optional
        Cross-validation splitter. When omitted, the notebook-level `kfold`
        is used, as before.

    Returns
    -------
    float
        Mean of the per-fold scores.

    Notes
    -----
    `X`, `y` and `kfold` are rebound later in the notebook, so pass the data
    explicitly when calling this after those cells have run.
    """
    features = X if X_data is None else X_data
    target = y if y_data is None else y_data
    splitter = kfold if cv is None else cv

    # One score per fold.
    scores = cross_val_score(model, features, target, cv=splitter)

    return scores.mean()

In [205]:
classification_model(XGBClassifier())

0.9648657040832168

In [207]:
classification_model(XGBClassifier(booster='gblinear'))

0.8910572892408011

In [208]:
classification_model(XGBClassifier(booster='dart', one_drop=True))

0.9631113181183046

In [206]:
classification_model(RandomForestClassifier())

0.9578326346840551

In [209]:
classification_model(LogisticRegression(max_iter=10000))

0.9490451793199813

In [220]:
classification_model(XGBClassifier(max_depth=2, n_estimators=500, learning_rate=0.1))

0.9701133364384411

In [210]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [211]:
def y_pred(model):
    """Fit `model` on the training split and return its test-set predictions.

    Reads X_train, y_train, X_test and y_test from the notebook namespace.
    Prints the test accuracy as a side effect.

    Parameters
    ----------
    model : estimator
        Object exposing fit(X, y) and predict(X).

    Returns
    -------
    The value returned by model.predict(X_test).
    """
    model.fit(X_train, y_train)
    # A distinct local name avoids shadowing this function inside its own body.
    predictions = model.predict(X_test)
    # accuracy_score takes the true labels first, then the predictions.
    score = accuracy_score(y_test, predictions)
    print(score)
    return predictions

In [212]:
y_pred_gbtree = y_pred(XGBClassifier())

0.958041958041958


In [213]:
y_pred_dart = y_pred(XGBClassifier(booster='dart', one_drop=True))

0.951048951048951


In [214]:
y_pred_lin = y_pred(XGBClassifier(booster='gblinear'))

0.8881118881118881


In [215]:
y_pred_forest = y_pred(RandomForestClassifier())

0.9440559440559441


In [216]:
y_pred_logistic = y_pred(LogisticRegression(max_iter=10000))

0.9370629370629371


In [229]:
y_pred_xgb = y_pred(XGBClassifier(max_depth=2, n_estimators=500, learning_rate=0.1))

0.965034965034965


In [245]:
# One column of test-set predictions per classifier, side by side.
predictions_by_model = {
    'gbtree': y_pred_gbtree,
    'dart': y_pred_dart,
    'gblinear': y_pred_lin,
    'forest': y_pred_forest,
    'logistic': y_pred_logistic,
    'xgb': y_pred_xgb,
}
df_pred = pd.DataFrame(
    data=np.c_[tuple(predictions_by_model.values())],
    columns=list(predictions_by_model),
)

In [246]:
df_pred.corr()

Unnamed: 0,gbtree,dart,gblinear,forest,logistic,xgb
gbtree,1.0,0.985561,0.826296,0.971247,0.927777,0.95671
dart,0.985561,1.0,0.83945,0.985476,0.941715,0.942396
gblinear,0.826296,0.83945,1.0,0.823078,0.83666,0.813388
forest,0.971247,0.985476,0.823078,1.0,0.955936,0.928198
logistic,0.927777,0.941715,0.83666,0.955936,1.0,0.914111
xgb,0.95671,0.942396,0.813388,0.928198,0.914111,1.0


In [236]:
# Combine three different classifiers in a voting ensemble and score it with
# the same cross-validation splitter used for the individual models.
logistic_model = LogisticRegression(max_iter=10000)
xgb_model = XGBClassifier(max_depth=2, n_estimators=500, learning_rate=0.1)
rf_model = RandomForestClassifier()
estimators = [
    ('logistic', logistic_model),
    ('gbtree', xgb_model),
    ('rf', rf_model),
]
ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X, y, cv=kfold)
print(results.mean())

0.971883247942866


In [None]:
# X = df_pred

In [None]:
classification_model(LogisticRegression())

In [248]:
scores = cross_val_score(LogisticRegression(), df_pred, y_test, cv=kfold)

In [242]:

# get a stacking ensemble of models
def get_stacking():
    """Build an unfitted stacking classifier.

    Base models are a logistic regression, an XGBoost classifier and a random
    forest; a logistic regression is the final estimator, and the stack is
    built with cv=5.

    Returns
    -------
    StackingClassifier
        The configured, unfitted ensemble.
    """
    # Base models. The logistic regression gets the same raised iteration limit
    # used for every other LogisticRegression fitted on the raw features in
    # this notebook.
    level0 = [
        ('lr', LogisticRegression(max_iter=10000)),
        ('xgb', XGBClassifier(max_depth=2, n_estimators=500, learning_rate=0.1)),
        ('rf', RandomForestClassifier()),
    ]
    # Meta learner fitted on the base models' outputs.
    level1 = LogisticRegression()
    return StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

In [243]:
model = get_stacking()
model.fit(X_train, y_train)
# Store the predictions under their own name: binding them to `y_pred` would
# replace the y_pred() helper defined earlier, breaking any later call to it.
y_pred_stack = model.predict(X_test)
# accuracy_score takes the true labels first, then the predictions.
accuracy_score(y_test, y_pred_stack)

0.951048951048951

In [171]:
X, y = fetch_california_housing(return_X_y=True)

In [172]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [180]:
# NOTE(review): y_pred_reg is not defined in any cell visible in this notebook,
# so this call only works while the helper is still in the kernel namespace.
# Presumably it is the regression counterpart of y_pred() (fit, print an error
# metric, return the test predictions) — confirm and restore its definition.
y_reg = y_pred_reg(XGBRegressor(objective='reg:squarederror', random_state=2))

0.29179467660677316


In [181]:
y_lin = y_pred_reg(XGBRegressor(booster='gblinear', objective='reg:squarederror', random_state=2))

0.7436996379066284


In [183]:
y_dart = y_pred_reg(XGBRegressor(booster='dart', objective='reg:squarederror', one_drop=True, random_state=2))

0.304899453326878


In [185]:
y_forest = y_pred_reg(RandomForestRegressor())

0.2630470217274644


In [186]:
y_ridge = y_pred_reg(Ridge())

0.5412646245664976


In [187]:
y_lasso = y_pred_reg(Lasso())

0.9684557723019674


In [189]:
# One column of test-set predictions per regressor, side by side.
regression_predictions = {
    'gbtree': y_reg,
    'dart': y_dart,
    'gblinear': y_lin,
    'forest': y_forest,
    'ridge': y_ridge,
    'lasso': y_lasso,
}
df = pd.DataFrame(
    data=np.c_[tuple(regression_predictions.values())],
    columns=list(regression_predictions),
)

In [190]:
df.corr()

Unnamed: 0,gbtree,dart,gblinear,forest,ridge,lasso
gbtree,1.0,0.997357,0.817918,0.971847,0.887739,0.818925
dart,0.997357,1.0,0.824795,0.97069,0.891045,0.825774
gblinear,0.817918,0.824795,1.0,0.791677,0.914602,0.997838
forest,0.971847,0.97069,0.791677,1.0,0.856447,0.792965
ridge,0.887739,0.891045,0.914602,0.856447,1.0,0.91501
lasso,0.818925,0.825774,0.997838,0.792965,0.91501,1.0


In [193]:
kfold = KFold(n_splits=5)

# Sub-models that make up the ensemble.
model1 = RandomForestRegressor()
model2 = XGBRegressor(booster='gblinear', objective='reg:squarederror')
model3 = Ridge()
estimators = [
    ('rf', model1),
    ('gblinear', model2),
    ('ridge', model3),
]

# Ensemble of the three regressors, scored per fold.
ensemble = VotingRegressor(estimators)
scores = cross_val_score(
    ensemble, X, y, cv=kfold, scoring='neg_mean_squared_error'
)

# The scorer yields negated MSE values; flip the sign and take the square root
# to get one RMSE per fold, then report their mean.
rmse = (-scores) ** 0.5
print(rmse.mean())

-0.4867647357982808


In [194]:
print(rmse.mean())

0.6973562601458638


In [227]:
df_new = pd.DataFrame([[0, 0, 0, 1, 1], [0, 0, 0, 1, 1], [0 , 1, 0, 1, 0]])

In [228]:
df_new.head()

Unnamed: 0,0,1,2,3,4
0,0,0,0,1,1
1,0,0,0,1,1
2,0,1,0,1,0
