In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 100)
pd.set_option("max_colwidth", 200)

from sklearn import preprocessing
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import RepeatedKFold

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the prepared CSV (path is relative to the notebook's working directory)
# and show its dimensions as a quick sanity check.
ordinal = pd.read_csv(
    filepath_or_buffer='../../Isolation Forest/OrdinalDrop1.csv',
)
ordinal.shape

(2579, 81)

In [3]:
# Remove the 'Unnamed: 0' column (presumably an index saved with the CSV — TODO confirm).
# errors='ignore' keeps this cell safe to re-run: because `ordinal` is reassigned,
# a second execution would otherwise raise KeyError for the already-dropped column.
ordinal = ordinal.drop(columns=['Unnamed: 0'], errors='ignore')
ordinal.shape

(2579, 80)

In [4]:
# Target is the SalePrice column; every other column is used as a feature.
y = ordinal['SalePrice']
X = ordinal.drop(columns=['SalePrice'])

In [5]:
# The two learners reused by the stacking configurations in this notebook.
# Both are seeded with random_state=1; CatBoost's training log is silenced.
cb = CatBoostRegressor(verbose=False, random_state=1)
gb = GradientBoostingRegressor(random_state=1)

### n_jobs = 2, 1, or None has no impact on stacking_regressor.score

In [6]:
# GB as the single base learner, CatBoost as the final estimator.
# passthrough=True hands the original features to the final estimator as well.
# Variant 1: explicit seeded RepeatedKFold splitter, n_jobs=2.
stacking_regressor1 = StackingRegressor(
    estimators=[('GB', gb)],
    final_estimator=cb,
    cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=1),
    passthrough=True,
    n_jobs=2,
)

In [7]:
stacking_regressor1.fit(X,y)

In [8]:
# Scored on the same (X, y) the model was just fitted on, so this is an
# in-sample figure rather than an estimate of out-of-sample performance.
stacking_regressor1.score(X,y)

0.9897657978120481

In [9]:
# Variant 2: same stack and splitter as variant 1, but with n_jobs=1.
stacking_regressor2 = StackingRegressor(
    estimators=[('GB', gb)],
    final_estimator=cb,
    cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=1),
    passthrough=True,
    n_jobs=1,
)

In [10]:
stacking_regressor2.fit(X,y)

In [11]:
stacking_regressor2.score(X,y)

0.9897657978120481

In [12]:
# Variant 3: same stack and splitter again, with n_jobs left at its default.
stacking_regressor3 = StackingRegressor(
    estimators=[('GB', gb)],
    final_estimator=cb,
    cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=1),
    passthrough=True,
)

In [13]:
stacking_regressor3.fit(X,y)

In [14]:
stacking_regressor3.score(X,y)

0.9897657978120481

### cv=5 (stacking_regressor4 and stacking_regressor5) or cv=None (stacking_regressor6) has no impact on stacking_regressor.score

In [15]:
# Variant 4: internal CV given as an integer fold count (cv=5) instead of an
# explicit splitter object.
stacking_regressor4 = StackingRegressor(
    estimators=[('GB', gb)],
    final_estimator=cb,
    passthrough=True,
    cv=5,
)

In [16]:
stacking_regressor4.fit(X,y)

In [17]:
stacking_regressor4.score(X,y)

0.9893671498149426

In [18]:
# Variant 5: built with exactly the same arguments as stacking_regressor4 (cv=5).
stacking_regressor5 = StackingRegressor(
    estimators=[('GB', gb)],
    final_estimator=cb,
    passthrough=True,
    cv=5,
)

In [19]:
stacking_regressor5.fit(X,y)

In [20]:
stacking_regressor5.score(X,y)

0.9893671498149426

In [21]:
# Variant 6: no cv argument, so StackingRegressor uses its default CV setting.
stacking_regressor6 = StackingRegressor(
    estimators=[('GB', gb)],
    final_estimator=cb,
    passthrough=True,
)

In [22]:
stacking_regressor6.fit(X,y)

In [23]:
stacking_regressor6.score(X,y)

0.9893671498149426

### scores1, scores2, scores3 --> produce same results (same StackingRegressor + same cv)

### scores4, scores5, scores6 --> produce same results (same StackingRegressor w/ default cv for train, same cv for test)

In [24]:
# Outer evaluation of stacking_regressor1 with a seeded 5-split RepeatedKFold;
# return_estimator=True keeps the estimator fitted on each split.
scores1 = cross_validate(
    stacking_regressor1,
    X,
    y,
    return_estimator=True,
    cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=1),
)

In [25]:
scores1

{'fit_time': array([4.04158306, 4.03392816, 4.03959775, 4.01647091, 4.04221487]),
 'score_time': array([0.00351477, 0.00351477, 0.00346327, 0.00340891, 0.00342488]),
 'estimator': [StackingRegressor(cv=RepeatedKFold(n_repeats=1, n_splits=5, random_state=1),
                    estimators=[('GB',
                                 GradientBoostingRegressor(random_state=1))],
                    final_estimator=<catboost.core.CatBoostRegressor object at 0x7f7a1182afd0>,
                    n_jobs=2, passthrough=True),
  StackingRegressor(cv=RepeatedKFold(n_repeats=1, n_splits=5, random_state=1),
                    estimators=[('GB',
                                 GradientBoostingRegressor(random_state=1))],
                    final_estimator=<catboost.core.CatBoostRegressor object at 0x7f7a1182e490>,
                    n_jobs=2, passthrough=True),
  StackingRegressor(cv=RepeatedKFold(n_repeats=1, n_splits=5, random_state=1),
                    estimators=[('GB',
                     

In [26]:
# Outer evaluation of stacking_regressor2 with a seeded 5-split RepeatedKFold;
# return_estimator=True keeps the estimator fitted on each split.
scores2 = cross_validate(
    stacking_regressor2,
    X,
    y,
    return_estimator=True,
    cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=1),
)

In [27]:
scores2

{'fit_time': array([5.070333  , 5.06353402, 5.0632062 , 5.03131294, 5.05367994]),
 'score_time': array([0.00338101, 0.00498509, 0.00342679, 0.00336194, 0.00336504]),
 'estimator': [StackingRegressor(cv=RepeatedKFold(n_repeats=1, n_splits=5, random_state=1),
                    estimators=[('GB',
                                 GradientBoostingRegressor(random_state=1))],
                    final_estimator=<catboost.core.CatBoostRegressor object at 0x7f7a232d2280>,
                    n_jobs=1, passthrough=True),
  StackingRegressor(cv=RepeatedKFold(n_repeats=1, n_splits=5, random_state=1),
                    estimators=[('GB',
                                 GradientBoostingRegressor(random_state=1))],
                    final_estimator=<catboost.core.CatBoostRegressor object at 0x7f7a232d23d0>,
                    n_jobs=1, passthrough=True),
  StackingRegressor(cv=RepeatedKFold(n_repeats=1, n_splits=5, random_state=1),
                    estimators=[('GB',
                     

In [28]:
# Outer evaluation of stacking_regressor3 with a seeded 5-split RepeatedKFold;
# return_estimator=True keeps the estimator fitted on each split.
scores3 = cross_validate(
    stacking_regressor3,
    X,
    y,
    return_estimator=True,
    cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=1),
)

In [29]:
scores3

{'fit_time': array([5.06589603, 5.06124496, 5.08118916, 5.04288888, 5.04896116]),
 'score_time': array([0.00336504, 0.00466204, 0.00340199, 0.00336218, 0.00341392]),
 'estimator': [StackingRegressor(cv=RepeatedKFold(n_repeats=1, n_splits=5, random_state=1),
                    estimators=[('GB',
                                 GradientBoostingRegressor(random_state=1))],
                    final_estimator=<catboost.core.CatBoostRegressor object at 0x7f7a11879460>,
                    passthrough=True),
  StackingRegressor(cv=RepeatedKFold(n_repeats=1, n_splits=5, random_state=1),
                    estimators=[('GB',
                                 GradientBoostingRegressor(random_state=1))],
                    final_estimator=<catboost.core.CatBoostRegressor object at 0x7f7a232ddf70>,
                    passthrough=True),
  StackingRegressor(cv=RepeatedKFold(n_repeats=1, n_splits=5, random_state=1),
                    estimators=[('GB',
                                 Gradient

In [30]:
# Outer evaluation of stacking_regressor4 with a seeded 5-split RepeatedKFold;
# return_estimator=True keeps the estimator fitted on each split.
scores4 = cross_validate(
    stacking_regressor4,
    X,
    y,
    return_estimator=True,
    cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=1),
)

In [31]:
scores4

{'fit_time': array([5.05103278, 5.07132387, 5.05107784, 5.03999901, 5.06222796]),
 'score_time': array([0.00334311, 0.00338912, 0.00341988, 0.00488091, 0.00333905]),
 'estimator': [StackingRegressor(cv=5,
                    estimators=[('GB',
                                 GradientBoostingRegressor(random_state=1))],
                    final_estimator=<catboost.core.CatBoostRegressor object at 0x7f7a11879b50>,
                    passthrough=True),
  StackingRegressor(cv=5,
                    estimators=[('GB',
                                 GradientBoostingRegressor(random_state=1))],
                    final_estimator=<catboost.core.CatBoostRegressor object at 0x7f7a23151ee0>,
                    passthrough=True),
  StackingRegressor(cv=5,
                    estimators=[('GB',
                                 GradientBoostingRegressor(random_state=1))],
                    final_estimator=<catboost.core.CatBoostRegressor object at 0x7f7a23151e80>,
                    passth

In [32]:
# Outer evaluation of stacking_regressor5 with a seeded 5-split RepeatedKFold;
# return_estimator=True keeps the estimator fitted on each split.
scores5 = cross_validate(
    stacking_regressor5,
    X,
    y,
    return_estimator=True,
    cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=1),
)

In [33]:
scores5

{'fit_time': array([5.05526495, 5.06732011, 5.06050396, 5.0351069 , 5.05555797]),
 'score_time': array([0.0034461 , 0.0034492 , 0.00415206, 0.00332904, 0.00332212]),
 'estimator': [StackingRegressor(cv=5,
                    estimators=[('GB',
                                 GradientBoostingRegressor(random_state=1))],
                    final_estimator=<catboost.core.CatBoostRegressor object at 0x7f7a23173760>,
                    passthrough=True),
  StackingRegressor(cv=5,
                    estimators=[('GB',
                                 GradientBoostingRegressor(random_state=1))],
                    final_estimator=<catboost.core.CatBoostRegressor object at 0x7f7a23222d90>,
                    passthrough=True),
  StackingRegressor(cv=5,
                    estimators=[('GB',
                                 GradientBoostingRegressor(random_state=1))],
                    final_estimator=<catboost.core.CatBoostRegressor object at 0x7f7a232502b0>,
                    passth

In [34]:
# Outer evaluation of stacking_regressor6 with a seeded 5-split RepeatedKFold;
# return_estimator=True keeps the estimator fitted on each split.
scores6 = cross_validate(
    stacking_regressor6,
    X,
    y,
    return_estimator=True,
    cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=1),
)

In [35]:
scores6

{'fit_time': array([5.06252789, 5.07124114, 5.05407882, 5.04792809, 5.05622196]),
 'score_time': array([0.00339508, 0.00339293, 0.00336123, 0.00337481, 0.00338507]),
 'estimator': [StackingRegressor(estimators=[('GB',
                                 GradientBoostingRegressor(random_state=1))],
                    final_estimator=<catboost.core.CatBoostRegressor object at 0x7f7a117e56d0>,
                    passthrough=True),
  StackingRegressor(estimators=[('GB',
                                 GradientBoostingRegressor(random_state=1))],
                    final_estimator=<catboost.core.CatBoostRegressor object at 0x7f7a23222460>,
                    passthrough=True),
  StackingRegressor(estimators=[('GB',
                                 GradientBoostingRegressor(random_state=1))],
                    final_estimator=<catboost.core.CatBoostRegressor object at 0x7f7a23222550>,
                    passthrough=True),
  StackingRegressor(estimators=[('GB',
                          

In [36]:
# Use cross-validation to evaluate model performance

def evaluate_model(model, X, y, n_splits=5, n_repeats=3, random_state=1, scoring=None):
    """Cross-validate ``model`` on ``(X, y)`` with repeated K-fold and tabulate the result.

    Parameters
    ----------
    model
        Estimator handed to ``cross_validate``.
    X, y
        Features and target handed to ``cross_validate``.
    n_splits, n_repeats, random_state
        Forwarded to ``RepeatedKFold``. The defaults (5, 3, 1) reproduce the
        scheme that used to be hard-coded here.
    scoring
        Forwarded to ``cross_validate``. ``None`` (the default) keeps the
        estimator's own default scorer, matching the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        The dictionary returned by ``cross_validate`` turned into a frame
        (one row per train/test split).
    """
    # Seeded splitter so repeated calls evaluate every model on the same folds.
    cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

    # n_jobs=-1: run the splits in parallel on all available cores.
    scores = cross_validate(model, X, y, scoring=scoring, cv=cv, n_jobs=-1)

    return pd.DataFrame(scores)

In [37]:
cat = evaluate_model(cb, X, y)

In [38]:
cat

Unnamed: 0,fit_time,score_time,test_score
0,8.14605,0.01952,0.92956
1,7.925259,0.020624,0.948942
2,8.030242,0.020575,0.943433
3,5.159423,0.004241,0.913321
4,5.087507,0.006051,0.951898
5,7.968694,0.022602,0.933712
6,5.251831,0.007186,0.93256
7,5.290271,0.006072,0.927932
8,4.524969,0.005938,0.949594
9,5.19782,0.004343,0.940376


In [39]:
# Keep only the score column under a model-specific name.
# Rename first, then select: rename() skips labels that are absent, so re-running
# this cell is a no-op instead of a KeyError on the already-renamed frame.
cat = cat.rename(columns={'test_score':'cat_score'})[['cat_score']]

In [40]:
cat

Unnamed: 0,cat_score
0,0.92956
1,0.948942
2,0.943433
3,0.913321
4,0.951898
5,0.933712
6,0.93256
7,0.927932
8,0.949594
9,0.940376


In [41]:
g = evaluate_model(gb, X, y)

In [42]:
g

Unnamed: 0,fit_time,score_time,test_score
0,0.924757,0.003516,0.911278
1,0.910056,0.002331,0.938055
2,0.957707,0.002256,0.934433
3,0.904654,0.002905,0.896948
4,0.92739,0.004701,0.939217
5,0.923554,0.00229,0.908808
6,0.916041,0.006383,0.926451
7,0.918701,0.00226,0.91749
8,0.882644,0.002276,0.944368
9,0.881497,0.002246,0.924041


In [43]:
# Keep only the score column under a model-specific name.
# Rename first, then select: rename() skips labels that are absent, so re-running
# this cell is a no-op instead of a KeyError on the already-renamed frame.
g = g.rename(columns={'test_score':'gb_score'})[['gb_score']]

In [44]:
g

Unnamed: 0,gb_score
0,0.911278
1,0.938055
2,0.934433
3,0.896948
4,0.939217
5,0.908808
6,0.926451
7,0.91749
8,0.944368
9,0.924041


In [45]:
df1 = evaluate_model(stacking_regressor1, X, y)

In [46]:
df1

Unnamed: 0,fit_time,score_time,test_score
0,12.19586,0.009428,0.929655
1,12.164871,0.008604,0.947243
2,12.296514,0.00689,0.943513
3,11.802094,0.010074,0.910384
4,12.47837,0.006757,0.951178
5,11.940661,0.007172,0.925782
6,13.685141,0.009314,0.931796
7,11.708071,0.011135,0.932813
8,10.994438,0.024673,0.948536
9,11.128459,0.005256,0.937194


In [47]:
# Keep only the score column under a configuration-specific name.
# Rename first, then select: rename() skips labels that are absent, so re-running
# this cell is a no-op instead of a KeyError on the already-renamed frame.
df1 = df1.rename(columns={'test_score':'test_score_1'})[['test_score_1']]

In [48]:
df1

Unnamed: 0,test_score_1
0,0.929655
1,0.947243
2,0.943513
3,0.910384
4,0.951178
5,0.925782
6,0.931796
7,0.932813
8,0.948536
9,0.937194


In [49]:
df2 = evaluate_model(stacking_regressor2, X, y)

In [50]:
df2

Unnamed: 0,fit_time,score_time,test_score
0,11.005521,0.00703,0.929655
1,11.863029,0.005384,0.947243
2,11.568304,0.010411,0.943513
3,10.836069,0.006812,0.910384
4,10.912638,0.005213,0.951178
5,11.528972,0.01417,0.925782
6,11.837805,0.006936,0.931796
7,11.707841,0.00632,0.932813
8,13.720052,0.004899,0.948536
9,13.277202,0.018403,0.937194


In [51]:
# Keep only the score column under a configuration-specific name.
# Rename first, then select: rename() skips labels that are absent, so re-running
# this cell is a no-op instead of a KeyError on the already-renamed frame.
df2 = df2.rename(columns={'test_score':'test_score_2'})[['test_score_2']]

In [52]:
df2

Unnamed: 0,test_score_2
0,0.929655
1,0.947243
2,0.943513
3,0.910384
4,0.951178
5,0.925782
6,0.931796
7,0.932813
8,0.948536
9,0.937194


In [53]:
df3 = evaluate_model(stacking_regressor3, X, y)

In [54]:
df3

Unnamed: 0,fit_time,score_time,test_score
0,11.611137,0.01026,0.929655
1,11.529823,0.010561,0.947243
2,11.414523,0.017625,0.943513
3,13.531156,0.008239,0.910384
4,11.351115,0.011884,0.951178
5,11.594858,0.011451,0.925782
6,11.346429,0.007629,0.931796
7,11.541958,0.008749,0.932813
8,11.448381,0.003979,0.948536
9,11.374058,0.008435,0.937194


In [55]:
# Keep only the score column under a configuration-specific name.
# Rename first, then select: rename() skips labels that are absent, so re-running
# this cell is a no-op instead of a KeyError on the already-renamed frame.
df3 = df3.rename(columns={'test_score':'test_score_3'})[['test_score_3']]

In [56]:
df3

Unnamed: 0,test_score_3
0,0.929655
1,0.947243
2,0.943513
3,0.910384
4,0.951178
5,0.925782
6,0.931796
7,0.932813
8,0.948536
9,0.937194


In [57]:
df4 = evaluate_model(stacking_regressor4, X, y)

In [58]:
df4

Unnamed: 0,fit_time,score_time,test_score
0,13.241216,0.010227,0.930282
1,12.702614,0.011034,0.946361
2,12.450101,0.019895,0.938985
3,12.270227,0.020622,0.908425
4,12.383892,0.008264,0.952033
5,12.458958,0.020475,0.929325
6,13.013943,0.013006,0.932299
7,12.756338,0.007829,0.928175
8,10.688427,0.008625,0.94857
9,10.788021,0.007361,0.935603


In [59]:
# Keep only the score column under a configuration-specific name.
# Rename first, then select: rename() skips labels that are absent, so re-running
# this cell is a no-op instead of a KeyError on the already-renamed frame.
df4 = df4.rename(columns={'test_score':'test_score_4'})[['test_score_4']]

In [60]:
df4

Unnamed: 0,test_score_4
0,0.930282
1,0.946361
2,0.938985
3,0.908425
4,0.952033
5,0.929325
6,0.932299
7,0.928175
8,0.94857
9,0.935603


In [61]:
df5 = evaluate_model(stacking_regressor5, X, y)

In [62]:
df5

Unnamed: 0,fit_time,score_time,test_score
0,11.930479,0.007337,0.930282
1,11.496202,0.010461,0.946361
2,11.376078,0.015897,0.938985
3,11.16802,0.01046,0.908425
4,11.464942,0.009902,0.952033
5,11.653533,0.008877,0.929325
6,11.247543,0.017991,0.932299
7,11.324624,0.014796,0.928175
8,9.931679,0.008422,0.94857
9,10.889916,0.0034,0.935603


In [63]:
# Keep only the score column under a configuration-specific name.
# Rename first, then select: rename() skips labels that are absent, so re-running
# this cell is a no-op instead of a KeyError on the already-renamed frame.
df5 = df5.rename(columns={'test_score':'test_score_5'})[['test_score_5']]

In [64]:
df5

Unnamed: 0,test_score_5
0,0.930282
1,0.946361
2,0.938985
3,0.908425
4,0.952033
5,0.929325
6,0.932299
7,0.928175
8,0.94857
9,0.935603


In [None]:
df6 = evaluate_model(stacking_regressor6, X, y)

In [None]:
df6

In [None]:
# Keep only the score column under a configuration-specific name.
# Rename first, then select: rename() skips labels that are absent, so re-running
# this cell is a no-op instead of a KeyError on the already-renamed frame.
df6 = df6.rename(columns={'test_score':'test_score_6'})[['test_score_6']]

In [None]:
df6

In [None]:
# Line the per-split score columns up side by side: one column per model /
# stacking configuration, aligned on the shared row index.
combo = pd.concat(
    objs=[cat, g, df1, df2, df3, df4, df5, df6],
    axis='columns',
)

In [None]:
combo

In [None]:
# Draw on an explicit Axes and label it so the figure is readable on its own.
fig, ax = plt.subplots()
sns.boxplot(data=combo, showmeans=True, ax=ax)
ax.set(
    title='Cross-validated test scores by model',
    xlabel='model / configuration',
    ylabel='test score',
)
# The column names are long; tilt them so they do not overlap.
ax.tick_params(axis='x', rotation=45);

In [None]:
# NOTE(review): re-draws the same boxplot of `combo` as the preceding cell —
# TODO confirm whether this repeat is intentional or can be removed.
sns.boxplot(data=combo, showmeans=True);

In [None]:
combo.describe()