## Importing Libraries

In [1]:
#importing necessary libraries
import numpy as np
import pandas as pd

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by later
# cells (deprecations, solver convergence, ...) will not be shown either.
import warnings
warnings.filterwarnings("ignore")

## Stacking

### Importing Dataset

In [2]:
# Load the cleaned dataset from the working directory
data = pd.read_csv('data_cleaned.csv')

# Preview the top five rows as the cell output
data.head(n=5)

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


In [3]:
# Dimensions of the dataset as a (rows, columns) tuple
data.shape

(891, 25)

In [4]:
# Number of missing entries in each column (all zeros means nothing to impute)
data.isna().sum()

Survived      0
Age           0
Fare          0
Pclass_1      0
Pclass_2      0
Pclass_3      0
Sex_female    0
Sex_male      0
SibSp_0       0
SibSp_1       0
SibSp_2       0
SibSp_3       0
SibSp_4       0
SibSp_5       0
SibSp_8       0
Parch_0       0
Parch_1       0
Parch_2       0
Parch_3       0
Parch_4       0
Parch_5       0
Parch_6       0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

### Separating Dependent and Independent Variables

In [5]:
# Feature matrix: every column except the target
x = data.drop(columns=["Survived"])

# Target vector: the Survived column
y = data['Survived']

# Show both shapes to confirm the row counts match
x.shape, y.shape

((891, 24), (891,))

### Making test and training set

In [6]:
from sklearn.model_selection import train_test_split as tts

# Stratified split with a fixed seed; the split proportion is left at the library default
train_x, test_x, train_y, test_y = tts(x, y, stratify=y, random_state=9)
# Shapes of the four resulting pieces, in the order they were unpacked
tuple(part.shape for part in (train_x, test_x, train_y, test_y))

((668, 24), (223, 24), (668,), (223,))

<img src="stacking/stacking image.png" alt="Drawing" style="width: 400px;"/>

### Base models 

In [7]:
#importing predictive models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [8]:
from sklearn.ensemble import StackingClassifier

In [9]:
# Base learners for the stack, as (name, estimator) pairs.
# The decision tree is seeded like the other tree-based models in this notebook
# so that re-running the notebook rebuilds the same tree and the same score.
models = [
    ('LR', LogisticRegression()),
    ('knn', KNeighborsClassifier()),
    ('tree', DecisionTreeClassifier(random_state=96)),
]

# Meta-learner: a logistic regression stacked on top of the base learners,
# with 3-fold cross-validation used inside the stacking procedure.
# NOTE(review): warnings are silenced globally in the first cell, so a
# convergence warning from either LogisticRegression would go unseen —
# confirm the solver converges or raise max_iter.
model = StackingClassifier(estimators=models, final_estimator=LogisticRegression(), cv=3)

In [10]:
# Fit the stacked ensemble on the training split only
model.fit(train_x, train_y)

In [11]:
# Predictions for the test split, kept in pred_y
# NOTE(review): pred_y is not referenced again in the visible cells.
pred_y=model.predict(test_x)

In [12]:
# Score of the stacked ensemble on the held-out test split
model.score(test_x,test_y)

0.8116591928251121

## Random Forest

### Loading Dataset

In [13]:
# Re-load the cleaned dataset so this section starts from a fresh frame
data = pd.read_csv('data_cleaned.csv')

# Preview the top five rows as the cell output
data.head(n=5)

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


#### Separating independent and dependent variables.

In [14]:
# Feature matrix: every column except the target
x = data.drop(columns=['Survived'])

# Target vector: the Survived column
y = data['Survived']

#### Creating the train and test dataset

In [15]:
#import the train-test split
from sklearn.model_selection import train_test_split

In [16]:
# Stratified train/test split with a fixed seed for repeatability
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    stratify=y,
    random_state=101,
)

### Building a Decision Tree Model

In [17]:
#Importing Decision Tree Classifier 
from sklearn.tree import DecisionTreeClassifier

In [18]:
# Single decision tree with library-default settings; only the seed is pinned.
# It serves as the reference point for the random forest built afterwards.
clf = DecisionTreeClassifier(random_state=96)

In [19]:
# Fit the decision tree on the training split only
clf.fit(train_x,train_y)

In [20]:
# Score on the data the tree was fitted on — an optimistic figure;
# compare it with the test score in the next cell to gauge overfitting.
clf.score(train_x, train_y)

0.9880239520958084

In [21]:
# Score of the decision tree on the held-out test split
clf.score(test_x, test_y)

0.757847533632287

### Building a Random Forest Model

In [22]:
#Importing random forest classifier 
from sklearn.ensemble import RandomForestClassifier

In [23]:
# Random forest with library-default settings; only the seed is pinned.
# Rebinding clf replaces the decision tree fitted above.
clf = RandomForestClassifier(random_state=96)

In [24]:
# Fit the random forest on the training split only
clf.fit(train_x,train_y)

In [25]:
# Score on the data the forest was fitted on — an optimistic figure;
# compare it with the test score in the next cell to gauge overfitting.
clf.score(train_x, train_y)

0.9880239520958084

In [26]:
# Score of the random forest on the held-out test split
clf.score(test_x, test_y)

0.7533632286995515

In [27]:
# Raw importance values from the fitted forest, one per feature,
# in the same order as the training columns (labelled in the next cell)
clf.feature_importances_

array([0.23041043, 0.23650759, 0.02766848, 0.01696085, 0.0500349 ,
       0.13608447, 0.1681588 , 0.01320989, 0.01617595, 0.00627351,
       0.00425904, 0.00480762, 0.00078874, 0.00262281, 0.01760913,
       0.01218149, 0.01192783, 0.00146993, 0.00190138, 0.00290028,
       0.00060589, 0.01284854, 0.00908377, 0.01550866])

In [28]:
# Pair each importance value with the name of the column it belongs to
pd.Series(data=clf.feature_importances_, index=train_x.columns)

Age           0.230410
Fare          0.236508
Pclass_1      0.027668
Pclass_2      0.016961
Pclass_3      0.050035
Sex_female    0.136084
Sex_male      0.168159
SibSp_0       0.013210
SibSp_1       0.016176
SibSp_2       0.006274
SibSp_3       0.004259
SibSp_4       0.004808
SibSp_5       0.000789
SibSp_8       0.002623
Parch_0       0.017609
Parch_1       0.012181
Parch_2       0.011928
Parch_3       0.001470
Parch_4       0.001901
Parch_5       0.002900
Parch_6       0.000606
Embarked_C    0.012849
Embarked_Q    0.009084
Embarked_S    0.015509
dtype: float64

## Gradient Boosting

In [29]:
# Re-load the cleaned dataset so this section starts from a fresh frame
data = pd.read_csv('data_cleaned.csv')

# Preview the top five rows as the cell output
data.head(n=5)

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


#### Separating independent and dependent variables.

In [30]:
# Feature matrix: every column except the target
x = data.drop(columns=['Survived'])

# Target vector: the Survived column
y = data['Survived']

#### Creating the train and test dataset

In [31]:
#import the train-test split
from sklearn.model_selection import train_test_split

# Stratified train/test split with a fixed seed for repeatability
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    stratify=y,
    random_state=101,
)

### Building a GBDT Model

In [32]:
#Importing GBDT Classifier 
from sklearn.ensemble import GradientBoostingClassifier

In [33]:
# Gradient boosting classifier with library-default settings; only the seed is pinned
clf = GradientBoostingClassifier(random_state=96)

In [34]:
# Fit the default gradient boosting model on the training split only
clf.fit(train_x,train_y)

In [35]:
# Score of the default gradient boosting model on the held-out test split
clf.score(test_x, test_y)

0.8116591928251121

### GBDT Hyperparameters

#### Model Based Hyperparameters

1. **n_estimators:** Total number of trees.
2. **loss:** The loss function to be minimized. 
3. **subsample:** The fraction of observations to be selected for each tree. Selection is done by random sampling.
4. **random_state:** The random number seed so that same random numbers are generated every time.
5. **learning_rate:** This determines the impact of each tree on the final outcome 

In [36]:
# Gradient boosting with 200 trees, each fitted on a random 70% of the rows
clf = GradientBoostingClassifier(
    n_estimators=200,
    subsample=0.7,
    random_state=96,
)

In [37]:
# Fit the reconfigured gradient boosting model on the training split only
clf.fit(train_x,train_y)

In [38]:
# Test-split score after changing n_estimators and subsample
clf.score(test_x, test_y)

0.8161434977578476

#### Tree Based Hyperparameters

1. **max_depth:** Maximum depth to which tree can grow (stopping criteria)
2. **max_features:** The number of features to consider while searching for a best split
3. **max_leaf_nodes:** The maximum number of terminal nodes or leaves in a tree
4. **min_samples_leaf:** Minimum samples required in a terminal node or leaf (stopping criteria)
5. **min_samples_split:** Minimum number of samples required in a node for splitting (stopping criteria)

In [39]:
# Gradient boosting with constrained trees: depth capped at 4, and a node
# needs at least 100 samples before it may be split
clf = GradientBoostingClassifier(
    max_depth=4,
    min_samples_split=100,
    random_state=96,
)

In [40]:
# Fit the tree-constrained gradient boosting model on the training split only
clf.fit(train_x,train_y)

In [41]:
# Test-split score after constraining max_depth and min_samples_split
clf.score(test_x, test_y)

0.8385650224215246

## Extreme Gradient Boosting

In [42]:
# Re-load the cleaned dataset so this section starts from a fresh frame
data = pd.read_csv('data_cleaned.csv')

# Preview the top five rows as the cell output
data.head(n=5)

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


#### Separating independent and dependent variables.

In [43]:
# Feature matrix: every column except the target
x = data.drop(columns=['Survived'])

# Target vector: the Survived column
y = data['Survived']

#### Creating the train and test dataset

In [44]:
#import the train-test split
from sklearn.model_selection import train_test_split

# Stratified train/test split with a fixed seed for repeatability
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    stratify=y,
    random_state=101,
)

### Building an XGBM Model

In [45]:
#Importing XGBM Classifier 
from xgboost import XGBClassifier

In [46]:
# XGBoost classifier with library-default settings; only the seed is pinned
clf = XGBClassifier(random_state=96)

In [47]:
# Fit the default XGBoost model on the training split only
clf.fit(train_x,train_y)

In [48]:
# Score on the data the model was fitted on — an optimistic figure;
# compare it with the test score in the next cell to gauge overfitting.
clf.score(train_x, train_y)

0.9775449101796407

In [49]:
# Score of the default XGBoost model on the held-out test split
clf.score(test_x, test_y)

0.7623318385650224

### Hyperparameter Tuning

#### Same as GBDT

1. **n_estimators:** Total number of trees
2. **learning_rate:** This determines the impact of each tree on the final outcome
3. **random_state:** The random number seed so that same random numbers are generated every time
4. **max_depth:** Maximum depth to which tree can grow (stopping criteria)
5. **subsample:** The fraction of observations to be selected for each tree. Selection is done by random sampling
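6. **objective:** Defines the loss function (*binary:logistic* is logistic regression for binary classification and outputs a probability, *reg:logistic* is logistic regression, *reg:squarederror* — formerly *reg:linear*, now deprecated — is for regression with squared loss)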
7. **colsample_bylevel:** Random feature selection at levels
8. **colsample_bytree:** Random feature selection at tree

In [50]:
# XGBoost with an explicit depth of 6, sampling 70% of the columns for each tree
clf = XGBClassifier(
    max_depth=6,
    colsample_bytree=0.7,
    random_state=96,
)

In [52]:
# Fit the reconfigured XGBoost model on the training split only
clf.fit(train_x,train_y)

In [53]:
# Test-split score after setting colsample_bytree and max_depth
clf.score(test_x, test_y)

0.7892376681614349

#### Regularization

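1. **gamma:** Minimum reduction in loss required to make a further split on a leaf node; larger values make the model more conservative
2. **reg_alpha:** L1 regularization on leaf weights; can push leaf weights to exactly 0
3. **reg_lambda:** L2 regularization on leaf weights; shrinks leaf weights more smoothly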

In [54]:
# XGBoost with the gamma split penalty set to 0.1; everything else stays at its default
clf = XGBClassifier(
    random_state=96,
    gamma=0.1,
)

In [55]:
# Fit the gamma-regularized XGBoost model on the training split only
clf.fit(train_x,train_y)

In [56]:
# Test-split score with gamma=0.1
clf.score(test_x, test_y)

0.7892376681614349

## Adaptive Boosting (Ada Boosting)

In [57]:
# Re-load the cleaned dataset so this section starts from a fresh frame
data = pd.read_csv('data_cleaned.csv')

# Preview the top five rows as the cell output
data.head(n=5)

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


#### Separating independent and dependent variables.

In [58]:
# Feature matrix: every column except the target
x = data.drop(columns=['Survived'])

# Target vector: the Survived column
y = data['Survived']

#### Creating the train and test dataset

In [59]:
#import the train-test split
from sklearn.model_selection import train_test_split

# Stratified train/test split with a fixed seed for repeatability
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    stratify=y,
    random_state=101,
)

### Building an AdaBoost Model

In [60]:
#Importing AdaBoost Classifier 
from sklearn.ensemble import AdaBoostClassifier

In [61]:
# AdaBoost classifier with library-default settings; only the seed is pinned
clf = AdaBoostClassifier(random_state=96)

In [63]:
# Fit the default AdaBoost model on the training split only
clf.fit(train_x,train_y)

In [64]:
# Score on the data the model was fitted on; compare it with the
# test score in the next cell to see how well the fit generalizes.
clf.score(train_x, train_y)

0.8413173652694611

In [65]:
# Score of the default AdaBoost model on the held-out test split
clf.score(test_x, test_y)

0.7982062780269058

### Hyperparameter Tuning

1. **base_estimator:** The model to ensemble (this parameter is named `estimator` in scikit-learn 1.2 and later). Default is a decision tree.
2. **n_estimators:** Total number of models to build.
3. **learning_rate:** Shrinks the contribution of each classifier by this value.
4. **random_state:** The random number seed so that same random numbers are generated every time.

In [66]:
from sklearn.ensemble import RandomForestClassifier

In [67]:
# Boost a random forest instead of the default decision tree.
# The base learner is passed positionally on purpose: its keyword was
# `base_estimator` before scikit-learn 1.2 and is `estimator` from 1.2 onwards
# (the old keyword was removed in 1.4), while the first positional slot is the
# base learner in both, so this form works across versions.
clf = AdaBoostClassifier(RandomForestClassifier(random_state=101),
                         n_estimators=100, learning_rate=0.01, random_state=96)

In [68]:
# Fit the AdaBoost-over-random-forest model on the training split only.
# NOTE(review): up to 100 boosting rounds, each fitting a whole random forest —
# expect this cell to be noticeably slower than the other fits.
clf.fit(train_x,train_y)

In [None]:
# Score of the AdaBoost-over-random-forest model on the held-out test split.
# NOTE(review): this cell has no execution count or saved output, so no result
# is recorded for it in the notebook.
clf.score(test_x, test_y)