<a href="https://colab.research.google.com/github/ArsalanKhan17/Titanic-Prediction-Project/blob/main/Titatnic_Mortality_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<p align="center"><h1 align="center">Titanic Dataset Classification Tutorial</h1>


---



## **(1) Preprocessor Function & Setup**

> ### A more advanced example demonstrating the flexibility of a new *Column Transformer* approach.

In [None]:
# note that tabular preprocessors require scikit-learn>=0.24.0
!pip install scikit-learn --upgrade 

% tensorflow_version 1.x

In [2]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

# Read data from Titanic dataset.
titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)

# We will train our classifier with the following features:
# Numeric Features:
# - age: float.
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.

print(data.shape)

data.head()

(1309, 14)


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [3]:
# Preprocess data using sklearn's Column Transformer approach

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']

# Replacing missing values with Modal value and then one-hot encoding.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Final preprocessor object set up with ColumnTransformer...

preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Target = survived
y = data['survived']
y = y.map({0: 'died', 1: 'survived'})

X = data.drop(['survived','sibsp','parch','ticket','name','cabin','boat','body','home.dest'], axis=1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# fit preprocessor to your data
preprocess = preprocess.fit(X_train)

In [4]:
# Write function to transform data with preprocessor

def preprocessor(data):
    preprocessed_data=preprocess.transform(data)
    return preprocessed_data

In [5]:
X_train

Unnamed: 0,pclass,sex,age,fare,embarked
1118,3,male,25.0000,7.9250,S
44,1,female,41.0000,134.5000,C
1072,3,male,,7.7333,Q
1130,3,female,18.0000,7.7750,S
574,2,male,29.0000,21.0000,S
...,...,...,...,...,...
763,3,female,0.1667,20.5750,S
835,3,male,,8.0500,S
1216,3,female,,7.7333,Q
559,2,female,20.0000,36.7500,S


In [6]:
# Notice categorical feature columns have been one-hot encoded
preprocessor(X_train).shape

(1047, 10)

## **(2) Building The Model Using `sklearn`**

In [7]:
print(X_train.shape, X_test.shape, 
      y_train.shape, y_test.shape)

(1047, 5) (262, 5) (1047,) (262,)


## Logistic Classifier

In [8]:
# Penalized Logit...

hyperparameters = {'C':np.logspace(1, 10, 100), 'penalty':['l2']}

logit = LogisticRegression()
logit_cv = GridSearchCV(logit, hyperparameters, cv = 10)
logit_cv.fit(preprocessor(X_train), y_train)

print("Best Parameters {:.3f}: ", logit_cv.best_params_)

Best Parameters {:.3f}:  {'C': 10.0, 'penalty': 'l2'}


In [9]:
logit_cv.best_estimator_

LogisticRegression(C=10.0)

In [12]:
model = LogisticRegression(C=10, penalty='l2')

model.fit(preprocessor(X_train), y_train) # Fitting to the training set.

model.score(preprocessor(X_train), y_train) # Fit score, 0-1 scale.

0.7793696275071633

In [None]:
y_pred = model.predict(preprocessor(X_test))

y_pred

In [14]:
# Evaluate held out test data
from sklearn.metrics import accuracy_score

print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))

Accuracy: 79.01%


## Extreme Gradiet Boosting (using same preprocessor fxn)

In [17]:
#train an xgboost model

import xgboost as xgb

model = xgb.XGBClassifier(objective="binary:logistic", n_estimators=50, seed=123)

model.fit(preprocessor(X_train), y_train)

model.score(preprocessor(X_train), y_train)


0.833810888252149

In [None]:
#predictions using xgboost

y_pred = model.predict(preprocessor(X_test))

y_pred

In [20]:
#evaluate model using test data

print("Accuracy:  {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))

Accuracy:  81.68%


## Deep learning example (using same preprocessor fxn)

In [21]:
import keras
model = keras.Sequential([
    keras.layers.Dense(100, input_dim=10, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(2)
])


model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

# Fitting the ANN to the Training set...
model.fit(preprocessor(X_train), pd.get_dummies(y_train), epochs=30, verbose=2)

Using TensorFlow backend.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/30
 - 0s - loss: 3.0949 - accuracy: 0.5798
Epoch 2/30
 - 0s - loss: 1.1747 - accuracy: 0.6466
Epoch 3/30
 - 0s - loss: 0.6709 - accuracy: 0.7278
Epoch 4/30
 - 0s - loss: 0.5637 - accuracy: 0.7436
Epoch 5/30
 - 0s - loss: 0.5201 - accuracy: 0.7569
Epoch 6/30
 - 0s - loss: 0.5071 - accuracy: 0.7646
Epoch 7/30
 - 0s - loss: 0.5001 - accuracy: 0.7722
Epoch 8/30
 - 0s - loss: 0.4947 - accuracy: 0.7703
Epoch 9/30
 - 0s - loss: 0.4905 - accuracy: 0.7713
Epoch 10/30
 - 0s - loss: 0.4904 - accuracy: 0.7775
Epoch 11/30
 - 0s - loss: 0.4903 - accuracy: 0.7775
Epoch 12/30
 - 0s - loss: 0.4910 - accuracy: 0.7794
Epoch 13/30
 - 0s - loss: 0.4852 - accuracy: 0.7861
Epoch 14/30
 - 0s - loss: 0.4752 - accuracy: 0.7789
Epoch 15/30
 - 0s - loss: 0.4749 - accuracy: 0.7884
Epoch 16/30
 - 0s - loss: 0.4715 - accuracy: 0.7899
E

<keras.callbacks.callbacks.History at 0x7f3082de4890>

In [22]:
pred_prob = model.predict(preprocessor(X_test))
prediction_index = list([np.argmax(i) for i in pred_prob])
labels = ['died', 'survived']

predictions_keras=[]
for i in prediction_index:
 predictions_keras.append(labels[i])

predictions_keras[0:4]

['died', 'survived', 'died', 'died']

In [23]:
# Evaluate held out test data
from sklearn.metrics import accuracy_score

print("Accuracy: {:.2f}%".format(accuracy_score(y_test, predictions_keras) * 100)) #test


Accuracy: 79.39%
