In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from scipy.stats import zscore
from scipy.stats.mstats import winsorize
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier

In [4]:
df = pd.read_csv("./Data/Titanic_Dataset.csv")
df.head(6)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


## EDA - Exploratory Data Analysis

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
df.drop(['Cabin', 'Name', 'Ticket'], axis=1, inplace=True)

## Data Preprocessing

### Imputaion - removing null values

In [8]:
imputer = SimpleImputer(strategy='mean')
print(f"Number of null values in Age column before imputation {df['Age'].isnull().sum()}")
df['Age'] = imputer.fit_transform(df[['Age']])
print(f"Number of null values in Age column before imputation {df['Age'].isnull().sum()}")

Number of null values in Age column before imputation 177
Number of null values in Age column before imputation 0


In [16]:
imputer1 = SimpleImputer(strategy="most_frequent")
print(f"Number of null values in Embark column before imputation {df['Embarked'].isnull().sum()}")
df['Embarked'] = imputer1.fit_transform(df[['Embarked']]).ravel()
print(f"Number of null values in Embark column before imputation {df['Embarked'].isnull().sum()}")

Number of null values in Embark column before imputation 2
Number of null values in Embark column before imputation 0


### metication of outliers

In [17]:
threshold = 3
numerical_features = ['Fare', 'Age']
z_scores = np.abs(zscore(df[numerical_features]))
print(z_scores)

[[0.50244517 0.5924806 ]
 [0.78684529 0.63878901]
 [0.48885426 0.2846632 ]
 ...
 [0.17626324 0.        ]
 [0.04438104 0.2846632 ]
 [0.49237783 0.17706291]]


In [18]:
outliers = np.where(z_scores > threshold)
print(outliers)

(array([ 27,  88,  96, 116, 118, 258, 299, 311, 341, 377, 380, 438, 493,
       527, 557, 630, 672, 679, 689, 700, 716, 730, 737, 742, 745, 779,
       851]), array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1]))


In [19]:
df['Age'] = winsorize(df['Age'], limits=[0.15, 0.15])
df['Fare'] = winsorize(df['Fare'], limits=[0.15, 0.15])

In [20]:
# verifying the removal 
z_scores = np.abs(zscore(df[numerical_features]))
outliers = np.where(z_scores > threshold)
print(outliers)

(array([], dtype=int64), array([], dtype=int64))


In [21]:
oe = OrdinalEncoder()
df['Sex'] = oe.fit_transform(df[['Sex']])
df['Embarked'] = oe.fit_transform(df[['Embarked']])
df['Survived'] = oe.fit_transform(df[['Survived']])

## Model Training

In [22]:
df.head(6)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,1.0,22.0,1,0,7.75,2.0
1,2,1.0,1,0.0,38.0,1,0,56.4958,0.0
2,3,1.0,3,0.0,26.0,0,0,7.925,2.0
3,4,1.0,1,0.0,35.0,1,0,53.1,2.0
4,5,0.0,3,1.0,35.0,0,0,8.05,2.0
5,6,0.0,3,1.0,29.699118,0,0,8.4583,1.0


In [28]:
x = df.drop('Survived', axis=1)
y = df['Survived']

In [29]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=0)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((623, 8), (268, 8), (623,), (268,))

In [30]:
ab = AdaBoostClassifier(n_estimators=45, learning_rate=1, random_state=0)
ab.fit(x_train, y_train)

0,1,2
,"estimator  estimator: object, default=None The base estimator from which the boosted ensemble is built. Support for sample weighting is required, as well as proper ``classes_`` and ``n_classes_`` attributes. If ``None``, then the base estimator is :class:`~sklearn.tree.DecisionTreeClassifier` initialized with `max_depth=1`. .. versionadded:: 1.2  `base_estimator` was renamed to `estimator`.",
,"n_estimators  n_estimators: int, default=50 The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early. Values must be in the range `[1, inf)`.",45.0
,"learning_rate  learning_rate: float, default=1.0 Weight applied to each classifier at each boosting iteration. A higher learning rate increases the contribution of each classifier. There is a trade-off between the `learning_rate` and `n_estimators` parameters. Values must be in the range `(0.0, inf)`.",1.0
,"random_state  random_state: int, RandomState instance or None, default=None Controls the random seed given at each `estimator` at each boosting iteration. Thus, it is only used when `estimator` exposes a `random_state`. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.",0.0


In [31]:
ab.score(x_test, y_test)

0.7985074626865671

In [32]:
# Thank you