# Bagging & Random forest

## Bagging

In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the cleaned dataset (data_cleaned.csv, relative path) into a DataFrame
data = pd.read_csv(filepath_or_buffer='data_cleaned.csv')

### EDA

In [3]:
# Preview the first rows to sanity-check the loaded columns and values
data.head()

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


In [4]:
# Shape of the dataset as (number of rows, number of columns)
data.shape

(891, 25)

In [5]:
# Column names: the 'Survived' target plus the predictors
# (Age, Fare and 0/1 indicator columns such as Pclass_1 or Sex_male)
data.columns

Index(['Survived', 'Age', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3',
       'SibSp_4', 'SibSp_5', 'SibSp_8', 'Parch_0', 'Parch_1', 'Parch_2',
       'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')

In [6]:
# Per-column summary: non-null counts and dtypes (confirms whether any values are missing)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 25 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Age         891 non-null    float64
 2   Fare        891 non-null    float64
 3   Pclass_1    891 non-null    int64  
 4   Pclass_2    891 non-null    int64  
 5   Pclass_3    891 non-null    int64  
 6   Sex_female  891 non-null    int64  
 7   Sex_male    891 non-null    int64  
 8   SibSp_0     891 non-null    int64  
 9   SibSp_1     891 non-null    int64  
 10  SibSp_2     891 non-null    int64  
 11  SibSp_3     891 non-null    int64  
 12  SibSp_4     891 non-null    int64  
 13  SibSp_5     891 non-null    int64  
 14  SibSp_8     891 non-null    int64  
 15  Parch_0     891 non-null    int64  
 16  Parch_1     891 non-null    int64  
 17  Parch_2     891 non-null    int64  
 18  Parch_3     891 non-null    int64  
 19  Parch_4     891 non-null    i

### Separating Response and predictor

In [8]:
# Response: the survival label the models learn to predict
y = data['Survived']

# Predictors: every column except the response
X = data.drop(columns=['Survived'])

### Creating Training and Testing dataset

In [10]:
# import libraries
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

## 1. Model1 :: Decision Tree

In [17]:
# Import libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [13]:
# Baseline: one decision tree split on Gini impurity, with no depth limit set.
# The fixed seed makes the fitted tree reproducible.
model1 = DecisionTreeClassifier(
    criterion='gini',
    random_state=10,
)

In [14]:
# Fit the single decision tree on the training split
model1.fit(X_train, y_train)

DecisionTreeClassifier(random_state=10)

In [15]:
# Training accuracy: score of the single tree on the same rows it was fitted on
model1.score(X_train, y_train)

0.9839486356340289

In [16]:
# Testing accuracy: score of the single tree on the held-out 30% split
model1.score(X_test, y_test)

0.7313432835820896

Clearly the model is overfitting: the training accuracy is about 98% while the testing accuracy is only about 73%. <br>
To improve generalization we can use:<br>
    1. Pruning <br>
    2. Ensemble techniques

## Ensemble Technique

## 2. Model2 :: Bagging

In [20]:
# Import the library
from sklearn.ensemble import BaggingClassifier

In [21]:
# Bagging ensemble of 100 decision trees.
# The base estimator is passed positionally on purpose: its keyword was
# `base_estimator` up to scikit-learn 1.1 and is `estimator` from 1.2 on
# (the old name was removed in 1.4), so the positional form runs on every version.
model2 = BaggingClassifier(DecisionTreeClassifier(),
                          n_estimators = 100,   # number of trees in the ensemble
                          max_samples = 0.8,    # each tree is fitted on a sample 80% the size of the training set
                          bootstrap = True,     # draw that sample with replacement
                          oob_score = True,     # also score the ensemble on the out-of-bag rows
                          random_state = 10)    # reproducible sampling

Hyperparameters of Bagging:
1. base_estimator : the base algorithm fitted on each sampled subset (renamed to `estimator` in scikit-learn 1.2)
      * Here we are using a Decision Tree
2. n_estimators : number of base estimators in the ensemble
      * In this case we are using 100 decision trees
3. max_samples : number (int) or fraction (float) of training samples drawn to fit each base estimator
      * Here, each tree is trained on a sample whose size is 80% of the training set
4. bootstrap : whether the samples are drawn with replacement
      * True: sampling is done with replacement
5. oob_score : whether to use the out-of-bag samples (training rows not drawn for a given tree) to estimate the model's generalization accuracy

In [22]:
# Fit the bagging ensemble on the training split
model2.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.8,
                  n_estimators=100, oob_score=True, random_state=10)

In [23]:
# Out-of-bag score: accuracy estimated on the training rows left out of each tree's sample
# (available because the model was built with oob_score = True)
model2.oob_score_

0.78330658105939

In [24]:
# Training accuracy: score of the bagging ensemble on the rows it was fitted on
model2.score(X_train, y_train)

0.9807383627608347

In [25]:
# Testing accuracy: score of the bagging ensemble on the held-out 30% split
model2.score(X_test, y_test)

0.7985074626865671

## 3. Model3 :: Random Forest

**Hyperparameters of RandomForest**<br>
1. ***n_estimators*** : Number of trees in the forest
2. ***criterion*** : ***['gini' , 'entropy' , 'log_loss']***
3. Hyperparameters at tree level:
     *  ***max_features*** : maximum number of features considered when looking for the best split at each node<br>
     Options: ***['auto' , 'sqrt' , 'log2' , None (all features) , an integer such as 10 or 20 ]*** ('auto' is only accepted by older scikit-learn versions)<br>
     e.g., sqrt(n_features), log2(n_features)
     *  ***max_depth*** : Maximum depth of each tree (a stopping criterion for tree growth)
     *  ***min_samples_split*** : Minimum number of samples (observations) required to split an internal node (default : 2)
     *  ***min_samples_leaf*** : Minimum number of samples required at each leaf after a split (default : 1)
4. ***bootstrap*** : ***[True : sample with replacement , False : use the whole dataset for every tree]***
5. ***oob_score*** : whether to use out-of-bag samples to estimate the generalization score

In [26]:
# Import library
from sklearn.ensemble import RandomForestClassifier

In [27]:
# Create a random forest classifier; only the seed is set, so every other
# hyperparameter keeps its scikit-learn default
model3 = RandomForestClassifier(random_state = 10) # Using default hyperparameter

In [28]:
# Fit the random forest on the training split
model3.fit(X_train, y_train)

RandomForestClassifier(random_state=10)

In [29]:
# Training accuracy: score of the random forest on the rows it was fitted on
model3.score(X_train, y_train)

0.9839486356340289

In [30]:
# Testing accuracy: score of the random forest on the held-out 30% split
model3.score(X_test, y_test)

0.7798507462686567

In [31]:
# Raw feature importances of the fitted forest, one value per predictor column
# (unlabelled array, in the same order as the columns of X_train)
model3.feature_importances_

array([0.24321503, 0.23559608, 0.02877441, 0.01537382, 0.04704995,
       0.13638433, 0.14762179, 0.01455204, 0.02012748, 0.00514674,
       0.00446934, 0.00332842, 0.00119018, 0.00231142, 0.0184192 ,
       0.01442888, 0.01196393, 0.00153653, 0.0007272 , 0.00239287,
       0.00091879, 0.01676407, 0.00884849, 0.01885901])

In [32]:
# Attach the predictor names to the importances so each value can be read by column
pd.Series(data=model3.feature_importances_, index=X_train.columns)

Age           0.243215
Fare          0.235596
Pclass_1      0.028774
Pclass_2      0.015374
Pclass_3      0.047050
Sex_female    0.136384
Sex_male      0.147622
SibSp_0       0.014552
SibSp_1       0.020127
SibSp_2       0.005147
SibSp_3       0.004469
SibSp_4       0.003328
SibSp_5       0.001190
SibSp_8       0.002311
Parch_0       0.018419
Parch_1       0.014429
Parch_2       0.011964
Parch_3       0.001537
Parch_4       0.000727
Parch_5       0.002393
Parch_6       0.000919
Embarked_C    0.016764
Embarked_Q    0.008848
Embarked_S    0.018859
dtype: float64