# Titanic Survival Prediction 🚢

In [1]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch import nn
import pandas as pd

## Data Exploration

In [2]:
# Read the training frame and the test frame from the working directory.
X_full, X_test_full = (pd.read_csv(name) for name in ("train.csv", "test.csv"))

# Remember the test passenger ids; they are needed again for the submission table.
test_ids = X_test_full.PassengerId

In [3]:
# Column dtypes and non-null counts; in the captured output below, Age, Cabin
# and Embarked have fewer non-null values than there are rows.
X_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Preview the first rows to see what the raw columns look like.
X_full.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Count rows per target class to check how balanced `Survived` is.
X_full.Survived.value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [6]:
# Keep only rows with a known label, then pull the label out of the features.
X_full = X_full.dropna(axis=0, subset=["Survived"])
y = X_full["Survived"]

# Remove the label and `PassengerId` from the feature frames; the test ids
# stay available through `test_ids`.
X_full = X_full.drop(columns=["Survived", "PassengerId"])
X_test_full = X_test_full.drop(columns=["PassengerId"])

## Feature Engineering

In [7]:
def _with_cabin_initial(frame):
    """Return a new frame where `Cabin` is replaced by `Cabin_Initials`.

    `Cabin_Initials` is the first character of the first whitespace-separated
    token of `Cabin`; rows without a cabin value get the placeholder "N".
    """
    initials = frame["Cabin"].str.split().str[0].str[0].fillna("N")
    return frame.drop(columns=["Cabin"]).assign(Cabin_Initials=initials)


# Same derivation for the training and the test features.
X_full = _with_cabin_initial(X_full)
X_test_full = _with_cabin_initial(X_test_full)

In [8]:
# Hold out 20% of the rows as a validation set; the seed is fixed at 0.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [9]:
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [
    cname
    for cname, col_dtype in X_train_full.dtypes.items()
    if col_dtype == "object" and X_train_full[cname].nunique() < 10
]

# Numerical columns: every int64 or float64 column, whatever its cardinality.
numerical_cols = [
    cname
    for cname, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ["int64", "float64"]
]

In [10]:
# Combined column list: low-cardinality categoricals first, then numericals.
my_cols = [*categorical_cols, *numerical_cols]

## Data Preprocessing

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

### Pipelining

In [12]:
# Bucket the feature columns by dtype; each bucket is routed to its own
# transformer when the ColumnTransformer is built.
feat_float64_cols = X_full.columns[X_full.dtypes == "float64"].tolist()
feat_int64_cols = X_full.columns[X_full.dtypes == "int64"].tolist()
feat_cat_cols = X_full.columns[X_full.dtypes == "object"].tolist()

# Display the three lists (float, categorical, integer).
feat_float64_cols, feat_cat_cols, feat_int64_cols


(['Age', 'Fare'],
 ['Name', 'Sex', 'Ticket', 'Embarked', 'Cabin_Initials'],
 ['Pclass', 'SibSp', 'Parch'])

In [15]:
# Float features: fill gaps with the column median, then standardise.
float64_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Integer features: fill gaps with the most frequent value; no scaling.
int64_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
    ]
)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# and 1.4 removed the old name. Prefer the new spelling and fall back to the
# old one only when the installed version does not accept it.
try:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, categories="auto")
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False, categories="auto")

# Text features: missing values become the literal category "missing", then
# every category is expanded into a dense one-hot column.
cat_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", onehot_encoder),
    ]
)

# NOTE(review): `feat_cat_cols` also contains `Name` and `Ticket`, which are
# presumably close to unique per passenger. One-hot encoding them adds a column
# per distinct value, and values unseen at fit time encode as all zeros because
# of handle_unknown="ignore". The low-cardinality `categorical_cols` list may
# have been the intended input here; confirm.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_float64", float64_transformer, feat_float64_cols),
        ("num_int64", int64_transformer, feat_int64_cols),
        ("cat", cat_transformer, feat_cat_cols),
    ]
)

# Single-step pipeline wrapping the preprocessor; it is fitted once and reused
# for the test features.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
    ]
)



In [16]:
# Fit the preprocessing pipeline on the complete training frame and transform it.
# NOTE(review): fitting happens before the train/validation split in the next
# cell, so the imputer/scaler statistics and the encoder categories also see
# the validation rows (data leakage). Consider fitting on the training split only.
X_full_pp = pipeline.fit_transform(X_full)

In [17]:
# Split again, this time on the preprocessed matrix, with the same sizes and
# seed as the split of the raw frame. This overwrites the earlier
# X_train_full / X_valid_full / y_train / y_valid.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full_pp,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## Base Model

In [19]:
# Baseline: random forest with default hyperparameters. The seed is pinned to
# the same value as the train/validation split so that the reported metrics
# can be reproduced on a fresh kernel.
model = RandomForestClassifier(random_state=0)
model.fit(X_train_full, y_train)

In [20]:
# Predict the held-out validation rows with the random forest.
pred_1 = model.predict(X_valid_full)

In [21]:
# Per-class precision / recall / F1 for the random forest on the validation split.
print(classification_report(y_valid, pred_1))

              precision    recall  f1-score   support

           0       0.84      0.94      0.88       110
           1       0.88      0.71      0.78        69

    accuracy                           0.85       179
   macro avg       0.86      0.82      0.83       179
weighted avg       0.85      0.85      0.85       179



## XGB Model

In [22]:
# Baseline gradient-boosted trees with the library's default hyperparameters.
xgb_model = XGBClassifier()
xgb_model.fit(X_train_full, y_train)

# Evaluate on the same validation split as the random forest.
pred_2 = xgb_model.predict(X_valid_full)
print(classification_report(y_true=y_valid, y_pred=pred_2))

              precision    recall  f1-score   support

           0       0.85      0.91      0.88       110
           1       0.84      0.74      0.78        69

    accuracy                           0.84       179
   macro avg       0.84      0.82      0.83       179
weighted avg       0.84      0.84      0.84       179



## Hyperparameter Tuning XGB Model

In [23]:

# Candidate hyperparameters: 4 x 5 x 4 = 80 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[3, 5, 6, 7, 9],
    learning_rate=[0.1, 0.01, 0.001, 0.05],
)

# Exhaustive search over the grid with 4-fold cross-validation on the training split.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=4)
grid_search.fit(X_train_full, y_train)

# Winning combination and the cross-validation score it achieved.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

In [24]:
# Show the hyperparameter combination selected by the grid search.
best_params

In [25]:
# Show the cross-validation score of the selected combination.
best_score

In [26]:
# Build the tuned model straight from the grid-search result instead of
# re-typing the hyperparameters by hand, so this cell cannot drift out of sync
# with whatever the search actually selected.
best_model = XGBClassifier(**best_params)
best_model.fit(X_train_full, y_train)

# Evaluate the tuned model on the held-out validation split.
pred_3 = best_model.predict(X_valid_full)
print(classification_report(y_valid, pred_3))

              precision    recall  f1-score   support

           0       0.81      0.95      0.87       110
           1       0.88      0.64      0.74        69

    accuracy                           0.83       179
   macro avg       0.84      0.79      0.80       179
weighted avg       0.83      0.83      0.82       179



In [27]:
# Apply the already-fitted preprocessing to the test features (transform only, no refit).
X_test_pp = pipeline.transform(X_test_full)

In [28]:
# Predict the preprocessed test set with the tuned XGBoost model.
final_pred = best_model.predict(X_test_pp)

In [29]:
# Submission table: one row per test passenger id with its predicted label.
df = pd.DataFrame({"PassengerId": test_ids.values, "Survived": final_pred})

In [30]:
# Write the submission file; index=False keeps the row index out of the CSV.
df.to_csv("titanic_model.csv", index = False)

In [31]:
# Optional, Google Colab only: uncomment to download the submission file.
# from google.colab import files
# files.download('titanic_model.csv')