# Bagging

# Custom Bagging Classifier (implemented here as a VotingClassifier ensemble)

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
X,y = make_classification(n_samples=1000,n_features=20,n_classes=2,random_state=1)

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [None]:
X_train.shape,X_test.shape

((700, 20), (300, 20))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Base classifiers that will be combined into the voting ensemble.
clf = LogisticRegression()
svm_clf = SVC()
# Seed the tree so the ensemble's accuracy is reproducible across kernel restarts.
df_clf = DecisionTreeClassifier(random_state=1)
nb_clf = GaussianNB()

In [None]:
from sklearn.ensemble import VotingClassifier
# Combine the four base classifiers; each entry is a (name, estimator) pair.
ensemble_clf = VotingClassifier(
    estimators=[
        ('log_reg', clf),
        ("support vect clf", svm_clf),
        ("decision tree", df_clf),
        ("Gaussian NB", nb_clf),
    ]
)


In [None]:
ensemble_clf

In [None]:
ensemble_clf.fit(X_train,y_train)

In [None]:
y_pred =ensemble_clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(y_test,y_pred)

0.8566666666666667

# Custom Bagging Regression (implemented here as a VotingRegressor ensemble)

In [None]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
X,y = make_regression(n_samples=1000,n_features=20,noise=0.1,random_state=1)

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [None]:
# Base regressors that will be combined into the voting ensemble.
lr = LinearRegression()
# Seed the tree so predictions are reproducible across kernel restarts.
dtr = DecisionTreeRegressor(random_state=1)
svr = SVR(kernel="linear")

In [None]:
from sklearn.ensemble import VotingRegressor
ensemble_regressor = VotingRegressor(estimators = [('mlr',lr),("dtr",dtr),("svr",svr)])

In [None]:
ensemble_regressor

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import VotingRegressor

# Base regressors for the ensemble. Each entry must be an estimator instance,
# not the estimator class itself.
lr = LinearRegression()
# Seed the tree so predictions are reproducible across kernel restarts.
dtr = DecisionTreeRegressor(random_state=1)
svr = SVR(kernel="linear")

# Combine the named base regressors into a single voting ensemble.
ensemble_regressor = VotingRegressor(estimators=[('mlr', lr), ("dtr", dtr), ("svr", svr)])

ensemble_regressor  # display the configured ensemble (it is fitted in a later cell)

In [None]:
ensemble_regressor.fit(X_train,y_train)

In [None]:
# Preview only the first few predictions instead of dumping the whole test-set array.
ensemble_regressor.predict(X_test)[:10]

array([-151.50259587,   37.21699963, -214.65159049,   67.50653128,
         88.67785167, -161.64740773, -102.81085489,  -40.62160975,
          3.64424643,  -97.15927224,  299.00025453,   24.0233115 ,
         73.61727893,  -59.12863875, -242.50223254, -227.19899056,
       -194.49764186,  -13.67938661, -176.3556873 ,  153.70958043,
        139.72287669, -110.05837891, -114.33449195, -136.00804209,
        -38.23884524,  122.59244134,   81.71606692,   23.64087208,
        208.40988962,   29.94873952,  230.04898282,  223.43756207,
        -49.71768228,  -39.00734984,    5.05066449,   20.51134499,
       -132.91294597,  397.15235958, -171.09754009,   -2.42481075,
         63.34801502,  -48.60937083,  -95.02301651, -132.36962914,
        165.54097633,  -21.70983704,   27.56230527,  109.67655992,
        -94.65177037,  157.13504832, -285.09502813, -198.59329826,
        -88.10946924,   87.09498206,   -6.51876158,  306.871099  ,
        -36.88569175,  -54.7607106 ,   29.9513012 ,  -89.43011

In [None]:
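# If M1 = 1, M2 = 1, M3 = 0, M4 = 0 (a 2-2 tie), which class will the VotingClassifier select?
# With hard voting, ties are broken by ascending sort order of the class labels, so class 0 is predicted here.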

# Model Training Pipeline & Transform

In [None]:
# See class notes
# Before Model Training => Missing Value Treatment, One Hot Encoding, Scaling
# See Notes

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
df = sns.load_dataset("tips")

In [None]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [None]:
df.time.unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [None]:
# EDA (since EDA is subjective, it will not be part of the pipeline)

In [None]:
# Encoding, Scaling, Missing Value Treatment

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the target column `time` as integers. Per the outputs shown in this
# notebook, Dinner is mapped to 0 and Lunch to 1.
encoder = LabelEncoder()
# NOTE: this overwrites df['time'] in place; the original string labels remain
# available only through the fitted `encoder`.
df['time'] = encoder.fit_transform(df['time'])



In [None]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.50,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,0,3
240,27.18,2.00,Female,Yes,Sat,0,2
241,22.67,2.00,Male,Yes,Sat,0,2
242,17.82,1.75,Male,No,Sat,0,2


In [None]:
# Aim >> To predict time

In [None]:
df.time.unique() # Lunch-1 , Dinner-0

array([0, 1])

In [None]:
# Target: the encoded `time` column. Features: every remaining column.
y = df['time']
x = df.drop(columns='time')

In [None]:
x

Unnamed: 0,total_bill,tip,sex,smoker,day,size
0,16.99,1.01,Female,No,Sun,2
1,10.34,1.66,Male,No,Sun,3
2,21.01,3.50,Male,No,Sun,3
3,23.68,3.31,Male,No,Sun,2
4,24.59,3.61,Female,No,Sun,4
...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,3
240,27.18,2.00,Female,Yes,Sat,2
241,22.67,2.00,Male,Yes,Sat,2
242,17.82,1.75,Male,No,Sat,2


In [None]:
y

Unnamed: 0,time
0,0
1,0
2,0
3,0
4,0
...,...
239,0
240,0
241,0
242,0


In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=1)

In [None]:
x_train

Unnamed: 0,total_bill,tip,sex,smoker,day,size
59,48.27,6.73,Male,No,Sat,4
102,44.30,2.50,Female,Yes,Sat,3
147,11.87,1.63,Female,No,Thur,2
223,15.98,3.00,Female,No,Fri,3
88,24.71,5.85,Male,No,Thur,2
...,...,...,...,...,...,...
137,14.15,2.00,Female,No,Thur,2
72,26.86,3.14,Female,Yes,Sat,2
140,17.47,3.50,Female,No,Thur,2
235,10.07,1.25,Male,No,Sat,2


In [None]:
x_test

Unnamed: 0,total_bill,tip,sex,smoker,day,size
67,3.07,1.00,Female,Yes,Sat,1
243,18.78,3.00,Female,No,Thur,2
206,26.59,3.41,Male,Yes,Sat,3
122,14.26,2.50,Male,No,Thur,2
89,21.16,3.00,Male,No,Thur,2
...,...,...,...,...,...,...
158,13.39,2.61,Female,No,Sun,2
99,12.46,1.50,Male,No,Fri,2
173,31.85,3.18,Male,Yes,Sun,2
176,17.89,2.00,Male,Yes,Sun,2


In [None]:
y_train

Unnamed: 0,time
59,0
102,0
147,1
223,1
88,1
...,...
137,1
72,0
140,1
235,0


In [None]:
df.isnull().sum()

Unnamed: 0,0
total_bill,0
tip,0
sex,0
smoker,0
day,0
time,0
size,0


In [None]:
# Handling missing values,
# Data Encoding
# Feature Scaling

**Let's make a pipeline**

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Import Pipeline >> a Pipeline is a sequence of data transformations
# ColumnTransformer groups the pipeline steps to apply to each set of columns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [None]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.50,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,0,3
240,27.18,2.00,Female,Yes,Sat,0,2
241,22.67,2.00,Male,Yes,Sat,0,2
242,17.82,1.75,Male,No,Sat,0,2


In [None]:
# Column groups: categorical features and numeric features of the tips data.
cat_cols = [
    'sex',
    'smoker',
    'day',
]
num_cols = [
    'total_bill',
    'tip',
    'size',
]

In [None]:
cat_cols

['sex', 'smoker', 'day']

In [None]:
# Feature engineering automation using Pipeline and ColumnTransformer

# How to make a pipeline?
# Each Pipeline is an ordered list of (name, transformer) steps applied in sequence.

# Numeric columns: fill missing values with the column median, then standardize.
pipeline_num = Pipeline(steps=[
    ('imputations', SimpleImputer(strategy="median")),
    ('scaling', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' encodes a category that was not
# seen during fit as all zeros instead of raising an error at transform time.
pipeline_cat = Pipeline(steps=[
    ('imputations', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
])

**SimpleImputer: see the scikit-learn API documentation**

In [None]:
# Send the numeric and categorical column groups through their own pipelines.
preprocessor = ColumnTransformer(
    transformers=[
        ('pipeline_num', pipeline_num, num_cols),
        ('pipeline_cat', pipeline_cat, cat_cols),
    ]
)

In [None]:
preprocessor

In [None]:
# Fit the preprocessor on the training split only, then apply the already-fitted
# transformer to the test split so no test-set statistics leak into training.
# NOTE(review): this rebinds X_train / X_test, which earlier cells used for the
# synthetic regression data.
X_train = preprocessor.fit_transform(x_train)
X_test = preprocessor.transform(x_test)

In [None]:
X_train

array([[ 3.27957576,  2.93016025,  1.4480456 , ...,  1.        ,
         0.        ,  0.        ],
       [ 2.82704615, -0.34414084,  0.43500958, ...,  1.        ,
         0.        ,  0.        ],
       [-0.86956223, -1.01757865, -0.57802643, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.23123331,  0.42992561, -0.57802643, ...,  0.        ,
         0.        ,  1.        ],
       [-1.07473939, -1.3117239 , -0.57802643, ...,  1.        ,
         0.        ,  0.        ],
       [-0.29278646,  0.09707704,  0.43500958, ...,  1.        ,
         0.        ,  0.        ]])

In [None]:
X_test

array([[-1.87265054, -1.50524051, -1.59106245,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-0.08190994,  0.04289239, -0.57802643,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.80833093,  0.36025963,  0.43500958,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-0.59713257, -0.34414084, -0.57802643,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.18937985,  0.04289239, -0.57802643,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-1.34032981, -1.16465127, -0.57802643,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.   

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Candidate classifiers keyed by display name.
# The decision tree is seeded so its score is reproducible across runs.
models = {
    "SVC": SVC(),
    "dtc": DecisionTreeClassifier(random_state=1),
    'Logreg': LogisticRegression(),
}

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score


def model_train_eval(X_train,y_train,X_test,y_test,models,scorer=None):
  """Fit every model on the training split and score it on the test split.

  Parameters
  ----------
  X_train, y_train
    Training features and labels, passed to each model's ``fit``.
  X_test, y_test
    Held-out features and labels; ``predict`` is called on ``X_test`` and the
    result is compared against ``y_test``.
  models : dict
    Maps a display name to an estimator exposing ``fit(X, y)`` and
    ``predict(X)``. The estimators are fitted in place.
  scorer : callable, optional
    Called as ``scorer(y_true, y_pred)``. Defaults to ``accuracy_score``,
    which was previously hard-coded.

  Returns
  -------
  dict
    Maps each model name to its score, in the iteration order of ``models``.
  """
  if scorer is None:
    scorer = accuracy_score
  evaluation = {}
  # Iterate over name/estimator pairs directly instead of rebuilding key and
  # value lists on every pass and indexing into them.
  for name, model in models.items():
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    evaluation[name] = scorer(y_test,y_pred)
  return evaluation

In [None]:
model_train_eval(X_train,y_train,X_test,y_test,models)

{'SVC': 0.9324324324324325,
 'dtc': 0.9054054054054054,
 'Logreg': 0.9324324324324325}