In [27]:

import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt


In [28]:
titanic_file_path = "./titanic.csv"
titanic_data = pd.read_csv(titanic_file_path)


FEATURESELECTION

In [29]:

#Print out the target value to be predicted 
y = titanic_data.Survived
titanic_features = ["Age", "Sex", "Embarked", "Fare", "Pclass"]
#Features of target to be predicted
X = titanic_data[titanic_features].copy()
X['Sex'] = X['Sex'].map({'male': 1, 'female': 0})
X['Embarked'] = X['Embarked'].fillna('S')  # Fill missing with mode
X = pd.get_dummies(X, columns=['Embarked'])


DATA CLEANING

In [30]:
titanic_data['Cabin'] = titanic_data['Cabin'].fillna(titanic_data['Cabin'].mode()[0])
X.loc[:, 'Age'] = X['Age'].fillna(X['Age'].median()) 

DATA SPLIT

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=1)

LOG MODEL


In [32]:
log_model = LogisticRegression(random_state=1,max_iter=1000)
#fit the model 
log_model.fit(X_train, y_train)
#Prediction
log_y_pred = log_model.predict(X_test)
log_accuracy = accuracy_score(y_test, log_y_pred)
print("Accuracy for the logisticRegression:", log_accuracy)

Accuracy for the logisticRegression: 0.770949720670391


Random Model

In [33]:
#Define The Model
rf_model = RandomForestClassifier(random_state=1)
#Fit the model
rf_model.fit(X_train, y_train)
#Prediction
rf_y_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("Accuracy for the RandomForestRegressor:",rf_accuracy )


Accuracy for the RandomForestRegressor: 0.8324022346368715


XGBOOST Model

In [34]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
xgb_y_pred = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, rf_y_pred)
print('XGBOOST ACCURACY:', xgb_accuracy)

XGBOOST ACCURACY: 0.8324022346368715


PERFORMANCE OF MODELS

In [35]:
#Performance between models
print("\nModel Comparison:")
print(f"Logistic Regression: {log_accuracy:.4f}")
print(f"Random Forest      : {rf_accuracy:.4f}")
print(f"XGBoost            : {xgb_accuracy:.4f}")


Model Comparison:
Logistic Regression: 0.7709
Random Forest      : 0.8324
XGBoost            : 0.8324


In [36]:
print("Making predictions for the survival of the shipwreck:")
print(X.head())
print("The predictions are")
print(log_model.predict(X.head()))

Making predictions for the survival of the shipwreck:
    Age  Sex     Fare  Pclass  Embarked_C  Embarked_Q  Embarked_S
0  22.0    1   7.2500       3       False       False        True
1  38.0    0  71.2833       1        True       False       False
2  26.0    0   7.9250       3       False       False        True
3  35.0    0  53.1000       1       False       False        True
4  35.0    1   8.0500       3       False       False        True
The predictions are
[0 1 1 1 0]
