# Model Ensembling : 
- Combining the effect of multiple estimator.
- Estimator example : - Objects of  `LogisticRegression()`, `KNeighbors()`, `DecisionTreeClassifier()`...etc
- Ensembling  Techniques :
  1. Voting
  2. Bagging
  3. Boosting
  4. Stacking
- Prediction in Ensembling :
  1. Categorical : Majority Vote Approch `VotingClassifier()`
     - `HardVoting()`, `SoftVoting()`
  3. Numerical : Averaging Approch `VotingRegressor()`
     - `Averaging()` , `Weighted Averaging()`
- One model can have more estimetor

In [70]:
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB


import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Voting Classifier

In [11]:
bc = pd.read_csv('BreastCancer.csv',index_col=0)
bc.head(3)

Unnamed: 0_level_0,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
61634,5,4,3,1,2,2,2,3,1,Benign
63375,9,1,2,6,4,10,7,7,2,Malignant
76389,10,4,7,2,2,8,6,1,1,Malignant


In [15]:
X = bc.drop(columns='Class')
y = bc.Class
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=24, test_size=0.3,stratify=y)

In [31]:
dtc = DecisionTreeClassifier(random_state=24,max_depth=3)
log = LogisticRegression(random_state=24, C = 0.4)
nb = GaussianNB()
voting = VotingClassifier([('DTC',dtc),('LOG',log),('NB',nb)],voting='soft')
voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.9761904761904762


In [32]:
y_pred_proba = voting.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test,y_pred_proba))

0.9915458937198067


In [40]:
# GridSerachCV

dtc = DecisionTreeClassifier(random_state=24,max_depth=3)
log = LogisticRegression(random_state=24)
nb = GaussianNB()
voting = VotingClassifier([('DTC',dtc),('LOG',log),('NB',nb)],voting='soft')
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
params = {'DTC__min_samples_leaf' : [1,10,20],
'DTC__min_samples_split' : [2,10,20],
'DTC__max_depth' : [None, 2, 3], 'LOG__C': np.linspace(0.001,3,5)  }
gcv = GridSearchCV(voting, param_grid=params, cv = kfold, scoring='roc_auc', verbose=3)
gcv.fit(X, y)


Fitting 5 folds for each of 135 candidates, totalling 675 fits
[CV 1/5] END DTC__max_depth=None, DTC__min_samples_leaf=1, DTC__min_samples_split=2, LOG__C=0.001;, score=0.997 total time=   0.0s
[CV 2/5] END DTC__max_depth=None, DTC__min_samples_leaf=1, DTC__min_samples_split=2, LOG__C=0.001;, score=0.995 total time=   0.0s
[CV 3/5] END DTC__max_depth=None, DTC__min_samples_leaf=1, DTC__min_samples_split=2, LOG__C=0.001;, score=0.994 total time=   0.0s
[CV 4/5] END DTC__max_depth=None, DTC__min_samples_leaf=1, DTC__min_samples_split=2, LOG__C=0.001;, score=0.990 total time=   0.0s
[CV 5/5] END DTC__max_depth=None, DTC__min_samples_leaf=1, DTC__min_samples_split=2, LOG__C=0.001;, score=0.993 total time=   0.0s
[CV 1/5] END DTC__max_depth=None, DTC__min_samples_leaf=1, DTC__min_samples_split=2, LOG__C=0.75075;, score=0.995 total time=   0.0s
[CV 2/5] END DTC__max_depth=None, DTC__min_samples_leaf=1, DTC__min_samples_split=2, LOG__C=0.75075;, score=0.995 total time=   0.0s
[CV 3/5] END DTC

In [41]:
print(gcv.best_score_)
print(gcv.best_params_)

0.9937986416496842
{'DTC__max_depth': None, 'DTC__min_samples_leaf': 1, 'DTC__min_samples_split': 2, 'LOG__C': np.float64(0.001)}


# HR Dataset

In [42]:
# ct + ohe & ct + voting


In [44]:
hr = pd.read_csv('HR_comma_sep.csv')
X = hr.drop(columns='left')
y = hr.left

In [50]:
# CT + OHE
ohe = OneHotEncoder(handle_unknown='ignore',sparse_output=False)
ct = make_column_transformer(('passthrough',make_column_selector(dtype_exclude=object)),
                            (ohe, make_column_selector(dtype_include=object)),
                            verbose_feature_names_out=False)

In [54]:

dtc = DecisionTreeClassifier(random_state=24,max_depth=3)
log = LogisticRegression(random_state=24)
nb = GaussianNB()
voting = VotingClassifier([('DTC',dtc),('LOG',log),('NB',nb)],voting='soft')

pipe = Pipeline([('CT',ct),('VT',voting)])
# pipe.get_params()
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
params = {'VT__DTC__min_samples_leaf' : [1,10,20],
'VT__DTC__min_samples_split' : [2,10,20],
'VT__DTC__max_depth' : [None, 2, 3], 'VT__LOG__C': np.linspace(0.001,3,5)  }

gcv = GridSearchCV(pipe, param_grid=params, cv = kfold, scoring='roc_auc', verbose=3)
gcv.fit(X,y)

Fitting 5 folds for each of 135 candidates, totalling 675 fits
[CV 1/5] END VT__DTC__max_depth=None, VT__DTC__min_samples_leaf=1, VT__DTC__min_samples_split=2, VT__LOG__C=0.001;, score=0.981 total time=   0.1s
[CV 2/5] END VT__DTC__max_depth=None, VT__DTC__min_samples_leaf=1, VT__DTC__min_samples_split=2, VT__LOG__C=0.001;, score=0.979 total time=   0.1s
[CV 3/5] END VT__DTC__max_depth=None, VT__DTC__min_samples_leaf=1, VT__DTC__min_samples_split=2, VT__LOG__C=0.001;, score=0.976 total time=   0.1s
[CV 4/5] END VT__DTC__max_depth=None, VT__DTC__min_samples_leaf=1, VT__DTC__min_samples_split=2, VT__LOG__C=0.001;, score=0.981 total time=   0.1s
[CV 5/5] END VT__DTC__max_depth=None, VT__DTC__min_samples_leaf=1, VT__DTC__min_samples_split=2, VT__LOG__C=0.001;, score=0.983 total time=   0.1s
[CV 1/5] END VT__DTC__max_depth=None, VT__DTC__min_samples_leaf=1, VT__DTC__min_samples_split=2, VT__LOG__C=0.75075;, score=0.971 total time=   0.1s
[CV 2/5] END VT__DTC__max_depth=None, VT__DTC__min_sa

In [55]:
print(gcv.best_score_)
print(gcv.best_params_)

0.9800906153613582
{'VT__DTC__max_depth': None, 'VT__DTC__min_samples_leaf': 1, 'VT__DTC__min_samples_split': 2, 'VT__LOG__C': np.float64(0.001)}


# Voting Regressor

## With Concrete DataSet

In [94]:
cc = pd.read_csv('Concrete_Data.csv')
X = cc.drop(columns='Strength')
y = cc.Strength
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=24, test_size=0.3)

In [105]:
# Individual Linear regressor
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
r2_lr = r2_score(y_test,y_pred)
r2_lr

0.5771752777048791

In [97]:
# Individual Decision Tree
dtr = DecisionTreeRegressor(random_state=24)
dtr.fit(X_train, y_train)
y_pred = dtr.predict(X_test)
r2_dtr = r2_score(y_test,y_pred)
r2_dtr

0.83089474226832

In [99]:
# Individual ElasticNet
en = ElasticNet()
en.fit(X_train, y_train)
y_pred = en.predict(X_test)
r2_en = r2_score(y_test,y_pred)
r2_en

0.5766806310401152

In [91]:
# Using VotingRegressor
voting = VotingRegressor([('DTR',dtr),('EN',en),('LR',lr)])
voting.fit(X_train,y_train)
y_pred = voting.predict(X_test)
print(r2_score(y_test,y_pred))

0.7641542071921111


# Weighted Average

In [92]:
# Using Weighted Average
voting = VotingRegressor([('DTR',dtr),('EN',en),('LR',lr)],weights=[ 0.7, 0.15, 0.15])
voting.fit(X_train,y_train)
y_pred = voting.predict(X_test)
print(r2_score(y_test,y_pred))

0.8516273862659319


In [93]:
# You can pass each of r2 score as weight[]
voting = VotingRegressor([('DTR',dtr),('EN',en),('LR',lr)],weights=[r2_dtr, r2_en,  r2_lr])
voting.fit(X_train,y_train)
y_pred = voting.predict(X_test)
print(r2_score(y_test,y_pred))

0.7955821089551822


### Using Multiple Estimator of same algorith wijt diff parmeter

In [100]:
dtr1 = DecisionTreeRegressor(random_state = 24, max_depth = None)
dtr2 = DecisionTreeRegressor(random_state = 24, max_depth = 3)
dtr3 = DecisionTreeRegressor(random_state = 24, max_depth = 4)
voting = VotingRegressor([('M1',dtr1),('M2',dtr2),('M3',dtr3)],weights=[ 0.7, 0.15, 0.15])
voting.fit(X_train,y_train)
y_pred = voting.predict(X_test)
print(r2_score(y_test,y_pred))

0.8436797107172187


## Medical Cost Personal(Insaurance) DataSet

In [111]:
insaurance = pd.read_csv('insurance.csv')
insaurance.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


In [112]:
X = insaurance.drop(columns='charges')
y = insaurance.charges
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=24, test_size=0.3)

In [113]:
ohe = OneHotEncoder(handle_unknown='ignore',sparse_output=False)
ct = make_column_transformer(('passthrough',make_column_selector(dtype_exclude=object)),
                            (ohe, make_column_selector(dtype_include=object)),
                            verbose_feature_names_out=False)

In [114]:
# Individual Linear regressor
lr = LinearRegression()
pipe = Pipeline([('CT',ct),('LR',lr)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
r2_lr = r2_score(y_test,y_pred)
r2_lr

0.7665391799816872

In [115]:
# Individual Decision Tree
dtr = DecisionTreeRegressor(random_state=24)
pipe = Pipeline([('CT',ct),('DTR',dtr)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
r2_dtr = r2_score(y_test,y_pred)
r2_dtr

0.7073555236369116

In [116]:
# Individual ElasticNet
en = ElasticNet()
pipe = Pipeline([('CT',ct),('EN',en)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
r2_en = r2_score(y_test,y_pred)
r2_en

0.49673651151300835

In [118]:
# Using VotingRegressor
voting = VotingRegressor([('DTR',dtr),('EN',en),('LR',lr)])
pipe = Pipeline([('CT',ct),('VT',voting)])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print(r2_score(y_test,y_pred))

0.7702125264403765


In [119]:
# Using Weighted Average
voting = VotingRegressor([('DTR',dtr),('EN',en),('LR',lr)],weights=[0.4,0.2,0.4])
pipe = Pipeline([('CT',ct),('VT',voting)])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print(r2_score(y_test,y_pred))

0.79295229109966


In [132]:
# Weighted Average with K-fold and GridSearchCV
voting = VotingRegressor([('DTR',dtr),('EN',en),('LR',lr)])
kfold = KFold(n_splits=5, shuffle=True, random_state=24)

pipe = Pipeline([('CT',ct),('VT',voting)])
pipe.get_params()
params = {'VT__weights': [[0.4, 0.2, 0.4]]}
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, scoring='r2', verbose=3)
gcv.fit(X,y)
# y_pred = gcv.predict(X_test)
# print(r2_score(y_test,y_pred))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END .......VT__weights=[0.4, 0.2, 0.4];, score=0.819 total time=   0.0s
[CV 2/5] END .......VT__weights=[0.4, 0.2, 0.4];, score=0.752 total time=   0.0s
[CV 3/5] END .......VT__weights=[0.4, 0.2, 0.4];, score=0.832 total time=   0.0s
[CV 4/5] END .......VT__weights=[0.4, 0.2, 0.4];, score=0.767 total time=   0.0s
[CV 5/5] END .......VT__weights=[0.4, 0.2, 0.4];, score=0.803 total time=   0.0s


In [133]:
print("Best Score: ",gcv.best_score_)
print("Best Params: ",gcv.best_params_)

Best Score:  0.7943764486109827
Best Params:  {'VT__weights': [0.4, 0.2, 0.4]}
