# Titanic Survival Prediction

# Introduction

This dataset has been acquired from Kaggle: https://www.kaggle.com/competitions/titanic/data


The analysis of this dataset has two phases. The first phase consists of exploratory data analysis and data visualization to better understand the relationships and patterns within the dataset. In the second phase, I use machine learning algorithms to predict which Titanic passengers survived. Finally, I check my predictions by uploading them to Kaggle, which evaluates how accurate they are.


# Exploratory data analysis

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier

# evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report  
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn import set_config
# Ask scikit-learn transformers to return pandas DataFrames, so the transformed
# data keeps column names (they are read later through `.columns`).
set_config(transform_output='pandas')

In [None]:
# Load the CSV used for the exploratory analysis and preview its first rows.
# NOTE(review): the modelling section reads 'data/train.csv' instead - confirm
# that both files contain the same data.
df = pd.read_csv('data/trainset_Titanic.csv')
df.head()

In [None]:
# Number of rows whose 'Age' value is missing.
df['Age'].isna().sum()

In [None]:
# (number of rows, number of columns) of the loaded data.
df.shape

In [None]:
# Number of rows per distinct value of 'Survived', most frequent value first.
counts = df['Survived'].value_counts()

In [None]:
# Bar chart: number of passengers per survival outcome.
plot = counts.plot(kind='bar')

plt.title("Number of passengers survived and not survived", weight='bold', fontsize=12)
plt.xlabel('Survival Status', fontsize=11)
plt.ylabel('Count', fontsize=11)

plot.spines['top'].set_visible(False)
plot.spines['right'].set_visible(False)

# Build the tick labels from the actual bar order. value_counts() sorts by
# frequency, so a hard-coded ['Not Survived', 'Survived'] list would silently
# mislabel the bars whenever survivors are the larger group.
label_for_value = {0: 'Not Survived', 1: 'Survived'}
Survival_status_labels = [label_for_value[value] for value in counts.index]
plot.set_xticklabels(Survival_status_labels)

# Rotating the x-tick labels
plt.xticks(rotation=0)  # rotation 0 is horizontal

In [None]:
# Keep only the passengers travelling in first class.
is_first_class = df['Pclass'] == 1
first_class = df[is_first_class]

# Share of first-class passengers per survival outcome (fractions sum to 1).
first_class['Survived'].value_counts(normalize=True)

In [None]:

# First-class passengers who survived ('Survived' equals 1); preview the first rows.
survived_firstclass = first_class[first_class['Survived'] == 1]
survived_firstclass.head()


In [None]:
# Number of first-class passengers. Count the matching rows instead of summing
# the 'Pclass' values: the sum only equals the count because the class label
# happens to be 1, and would be wrong for any other class.
total_firstclass = (df['Pclass'] == 1).sum()
total_firstclass

In [None]:
# Proportion of first-class passengers who survived.
proportion = len(survived_firstclass) / total_firstclass
proportion

In [None]:
# Two-column view (passenger class and sex) displayed for inspection.
newdf = df[['Pclass', 'Sex']]
newdf

In [None]:
# Bar chart of the mean of 'Survived' (i.e. the survival rate) per sex,
# split by passenger class.
plt.figure(figsize=(8, 6))

custom_palette = ["orange", "green", "blue"]

# Pin the bar order explicitly (order of first appearance in the data) so the
# tick labels set below are guaranteed to line up with the bars.
sex_order = list(df['Sex'].dropna().unique())

plot = sns.barplot(x='Sex', y='Survived', hue='Pclass', order=sex_order,
                   palette=custom_palette, data=df)
plt.xlabel("Sex", size=12)
plt.ylabel("Survival rate", size=12)

plot.spines['top'].set_visible(False)
plot.spines['right'].set_visible(False)

# The x axis holds the passenger's sex, so label the ticks with the
# (capitalised) sex values rather than with survival-status names.
sex_labels = [str(value).capitalize() for value in sex_order]
plot.set_xticklabels(sex_labels)

# NOTE(review): the legend relabelling assumes exactly three classes drawn in
# the order 1, 2, 3 - confirm against df['Pclass'].
legend = plt.legend(title='Passenger Class', loc='upper left', fontsize=9)
legend.get_frame().set_linewidth(0)  # Remove legend frame border
legend.texts[0].set_text('First')  # Change legend label text
legend.texts[1].set_text('Second')
legend.texts[2].set_text('Third')




In [None]:
# Age distribution drawn as frequency polygons, one per value of 'Survived'.
plt.figure(figsize=(8, 6)) 
plot = sns.histplot(x = 'Age' ,hue = 'Survived', data = df, element = 'poly')

plt.xlabel("Age", size = 12)
# NOTE(review): the y axis counts passengers in each age bin for both groups,
# so "Survival count" may be a misleading label - consider "Count".
plt.ylabel("Survival count", size = 12)


# Hide the top and right frame lines.
plot.spines['top'].set_visible(False)
plot.spines['right'].set_visible(False)



In [None]:
# Mean age of the passengers who survived ('Survived' equals 1).
survivor_mask = df['Survived'] == 1
survived = df.loc[survivor_mask, 'Age'].mean()

print('The average age of the survived passengers is:', survived)

In [None]:
# Mean age of the passengers who did not survive ('Survived' equals 0).
drowned = df.loc[df['Survived'] == 0, 'Age'].mean()

# Message typo fixed ("dronwed" -> "drowned").
print('The average age of the drowned passengers is:', drowned)

In [None]:
# Preview the first ten rows.
df.head(10)

In [None]:
# Mean of the 'Age' column (pandas skips missing values by default).
mean_age = df['Age'].mean()
mean_age

In [None]:
# Replace missing age values by the mean age.
df['Age'] = df['Age'].fillna(mean_age)

# Sanity check: number of ages still missing after imputation (expected 0).
# The former mid-cell `df.head(20)` was dropped: its result was discarded, so
# it neither displayed nor changed anything.
df.Age.isna().sum()

In [None]:
# Create a table counting the number of surviving/dead passengers separately
# for 1st/2nd/3rd class and male/female.
# Step 1: group the rows by passenger class and sex.
group_df = df.groupby(['Pclass', 'Sex'])

In [None]:
# Per (Pclass, Sex) group: number of survivors (sum of the 0/1 flag) and the
# number of passengers in the group.
survivors_per_group = group_df['Survived'].sum()
passengers_per_group = group_df['Survived'].count()
survived = pd.DataFrame({'Survived': survivors_per_group, 'Total': passengers_per_group})

In [None]:
# Passengers who did not survive = group size minus survivors.
survived['Drowned'] = survived['Total'] - survived['Survived']
survived

In [None]:
# Show the column labels of the summary table.
print(survived.columns)

In [None]:
# Turn the group keys ('Pclass', 'Sex') from index levels into ordinary
# columns so they can be referenced by name when plotting.
survived = survived.reset_index()
survived

In [None]:
# Bar chart of the number of survivors per passenger class, split by sex,
# drawn from the summary table.
sns.set_style('whitegrid')
plt.figure(figsize = (8, 6))

custom_palette = ['lightgreen', 'lightblue']

plot= sns.barplot(x = 'Pclass', y = 'Survived', hue = 'Sex', palette = custom_palette, data = survived)

plt.xlabel("Passenger Class", weight = 'bold', size= 11)
plt.ylabel("Survival Count",  weight = 'bold', size= 11)
plt.title("Survival Count by Passenger Class and Sex",  weight = 'bold', size= 11)


# Hide the top and right frame lines.
plot.spines['top'].set_visible(False)
plot.spines['right'].set_visible(False)


# NOTE(review): the tick labels assume the bars are drawn for classes 1, 2, 3
# in that order - confirm against the plotted data.
Survival_status_labels = ['First', 'Second', 'Third']  # Custom labels for the levels
plot.set_xticklabels(Survival_status_labels)


# NOTE(review): the legend relabelling assumes the hue levels appear as
# female first, then male - confirm against the 'Sex' column of the table.
legend = plt.legend(title='Sex', loc='upper right', fontsize = 9)
legend.get_frame().set_linewidth(0)  # Remove legend frame border
legend.texts[0].set_text('Female')  # Change legend label text
legend.texts[1].set_text('Male')  

# Machine learning algorithms

In [None]:
# Reload the training data from disk so modelling starts from the raw file
# (this rebinds `df`, discarding the changes made during the exploration).
df = pd.read_csv('data/train.csv')
df.head()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Mean age over the whole training file, used below to fill missing ages.
# NOTE(review): this is computed before the train/validation split, so the
# validation rows contribute to the statistic.
mean_age = df['Age'].mean()
mean_age

In [None]:
# Fill missing ages with the mean age computed above.
df['Age'] = df['Age'].fillna(mean_age)

# Sanity check: number of ages still missing after imputation (expected 0).
# The former mid-cell `df.head(20)` was dropped: its result was discarded, so
# it neither displayed nor changed anything.
df.Age.isna().sum()

In [None]:
# Distinct values found in the 'Sex' column.
df.Sex.unique()

In [None]:
# Distinct values found in the 'Pclass' column.
df.Pclass.unique()

## Test-train-Split

In [None]:
# Feature matrix (passenger class, sex, age) and target vector (survival flag).
x = df[['Pclass', 'Sex', 'Age']]
y = df['Survived']

In [None]:
# Preview the selected feature columns.
x.head()

In [None]:
# Split features and target into training and validation parts; the fixed
# random_state makes the split reproducible. No test_size is given, so
# scikit-learn's default split proportion applies.
x_train, x_val, y_train, y_val = train_test_split(x,y, random_state=42)

# Shapes of the four resulting pieces.
x_train.shape, x_val.shape, y_train.shape, y_val.shape

In [None]:
#df['Sex'] = pd.to_numeric(df['Sex'], errors = 'coerce')

#print(df['Sex'].dtype)

In [None]:
# Print the dtype of the 'Age' column to confirm it is numeric before binning.
print(df['Age'].dtype)


In [None]:
# Preprocessing applied to the feature columns:
#   * 'Sex' -> one-hot encoded, dropping the first level to avoid a redundant
#     column; categories unseen during fit are ignored at transform time.
#   * 'Age' -> discretised into 4 quantile bins, each bin one-hot encoded.
#   * every other column is passed through unchanged.
# `sparse_output` replaces OneHotEncoder's `sparse` argument, which was
# deprecated in scikit-learn 1.2 and removed in 1.4. Dense output is needed
# because the global transform_output='pandas' setting cannot wrap sparse data.
column_transform = ColumnTransformer(
    [
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop='first'), ['Sex']),
        ("binner", KBinsDiscretizer(n_bins=4, encode='onehot-dense', strategy='quantile'), ['Age']),
    ],
    remainder='passthrough',
)

In [None]:
# Learn the encoding and the age-bin edges from the training split only, then
# apply them to it.
x_train_transform = column_transform.fit_transform(x_train)
x_train_transform

In [None]:
# Apply the already-fitted transformation to the validation split (transform
# only, no re-fitting) so both splits end up with the same columns.
x_val_transform = column_transform.transform(x_val)
x_val_transform

## Logistic Regression

In [None]:
# Fit a logistic regression with default hyper-parameters on the transformed
# training data.
logreg = LogisticRegression()
logreg.fit(x_train_transform, y_train)

In [None]:
# Mean accuracy on the training split.
logreg.score(x_train_transform, y_train)

In [None]:
# Mean accuracy on the held-out validation split.
logreg.score(x_val_transform, y_val)

## Decision Trees

### Run Training Model

In [None]:
# Decision tree limited to depth 4; the fixed random_state keeps it reproducible.
clf_DT = DecisionTreeClassifier(max_depth = 4, random_state = 42) # ccp_alpha = 0.01  - tree pruning

In [None]:
# Train the tree on the transformed training split.
clf_DT.fit(x_train_transform, y_train)

### Make predictions

In [None]:
# Predicted survival labels for the validation split.
clf_DT_predict = clf_DT.predict(x_val_transform)
clf_DT_predict

In [None]:
# Compare the predictions with the true validation labels: fraction of
# passengers classified correctly.

accuracy_score(y_val, clf_DT_predict)

In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
confusion_matrix(y_val, clf_DT_predict)

In [None]:
##ConfusionMatrixDisplay.from_predictions

In [None]:
# Precision for the positive class (label 1): share of predicted survivors
# who actually survived.
precision_score(y_val, clf_DT_predict)

In [None]:
# Recall for the positive class (label 1): share of actual survivors that
# the model identified.
recall_score(y_val, clf_DT_predict)

In [None]:
# Per-class precision / recall / F1 summary with readable class names.
print(classification_report(y_val, clf_DT_predict, target_names=['Not Survived', 'Survived']))

In [None]:
# Check which features are more important than the others.
# First, collect the column names of the transformed training data.
feature_names = x_train_transform.columns
feature_names

- Note about **feature importances**: The importance score represents the relative contribution of a feature to the model's ability to make accurate predictions. Higher values indicate greater importance, while lower values indicate lesser importance.
    For scikit-learn tree-based models the importances are normalized, so they normally sum to 1.

In [None]:
# Importance of each feature; the values are in the same order as the
# columns of the transformed training data.

clf_DT.feature_importances_

In [None]:
# Pair every importance with its feature name and sort from most to least
# important (the single unnamed column gets the label 0).
feature_imporatance = pd.DataFrame(clf_DT.feature_importances_, index = feature_names).sort_values(by = 0, ascending=False)
feature_imporatance

In [None]:
# Bar chart of the sorted feature importances.
feature_imporatance.plot(kind = 'bar')


### Plotting the Decision Trees

In [None]:
# Draw the fitted decision tree to see which splits the model actually uses.
from sklearn import tree

fig = plt.figure(figsize=(25, 20))
plot = tree.plot_tree(
    clf_DT,
    # plot_tree documents list / array-like input here; pass a plain list of
    # the transformed column names rather than a pandas Index.
    feature_names=list(feature_names),
    # Class names must be a sequence in ascending class order (0, 1); a dict
    # only worked by accident of integer indexing.
    class_names=['Not Survived', 'Survived'],
    filled=True,
    fontsize=12,
)

## Random Forest

In [None]:
# Small random forest: 10 shallow trees (depth 2). The random_state is fixed,
# like for the train/validation split and the decision tree, so the reported
# accuracy is reproducible between runs.
clf_RF = RandomForestClassifier(n_estimators=10, max_depth=2, random_state=42)

clf_RF.fit(x_train_transform, y_train)

In [None]:
# Random-forest predictions for the validation split, wrapped in a
# single-column DataFrame.
clf_RF_preds = pd.DataFrame(clf_RF.predict(x_val_transform))

In [None]:
# Fraction of validation passengers the random forest classified correctly.
accuracy_score(y_true = y_val, y_pred = clf_RF_preds)


## Evaluating the predictions on Kaggle

### Based on the logistic regression model

In [None]:
# Load the file the final predictions are made for and show its dimensions.
# The former mid-cell `test_df.head()` was dropped: its result was discarded,
# so it displayed nothing.
test_df = pd.read_csv('data/test.csv')
test_df.shape

In [None]:
# Count the missing values in each column of the test data.
test_df.isna().sum()

In [None]:
# Mean of the 'Age' column in the test data (missing values are skipped).
meanAge = test_df['Age'].mean()
meanAge

In [None]:
# Fill missing test ages with the mean age of the TRAINING data (`mean_age`),
# i.e. the same value that was imputed before the model was fitted.
# Preprocessing statistics must come from the training data only, otherwise
# the test rows are prepared differently from what the model saw.
test_df['Age'] = test_df['Age'].fillna(mean_age)
test_df.isna().sum()

In [None]:
# Select the same three feature columns that the models were trained on
# (this rebinds `x`, which previously held the training features).
x = test_df[['Pclass', 'Sex', 'Age']]
x.head()

In [None]:
column_transformation = ColumnTransformer(
    [("encoder", OneHotEncoder(handle_unknown = "ignore", sparse = False, drop = 'first'), ['Sex']),
     ("binner" , KBinsDiscretizer(n_bins=4, encode='onehot-dense', strategy='quantile'), ['Age'])
    ],
    remainder = 'passthrough'
)

In [None]:
x_columntransformation = column_transformation.fit_transform(x)
x_columntransformation.head()

In [None]:
# Predict survival for every test passenger with the logistic regression
# model, then show how many predictions were produced.
predict = logreg.predict(x_columntransformation)
predict.shape

In [None]:
# Build the submission table (passenger id plus predicted survival flag) and
# write it to 'submission.csv' without the DataFrame index.
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': predict})
submission.to_csv('submission.csv', index=False)