In [7]:
# 🛳️ Titanic Survival Prediction - Logistic Regression
# Author: Daron John
# Description: A beginner-friendly machine learning project to predict survival on the Titanic dataset.

# ------------------------------------------------------
# 📦 Step 1: Import Libraries
# ------------------------------------------------------

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ------------------------------------------------------
# 📥 Step 2: Load Dataset
# ------------------------------------------------------

# Read the training data; expects 'train.csv' in the current working directory
df = pd.read_csv(filepath_or_buffer='train.csv')

# Optional: Explore the dataset
# print(df.head())
# print(df.info())
# print(df.describe())
# print(df.isnull().sum())

# ------------------------------------------------------
# 🧹 Step 3: Handle Missing Values
# ------------------------------------------------------

# Impute missing ages with the median of the observed ages
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)

# 'Cabin' is discarded entirely rather than imputed (too many missing values)
df.drop(columns=['Cabin'], inplace=True)

# Impute missing embarkation ports with the most frequent port
most_common_port = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_common_port)

# ------------------------------------------------------
# 📊 Step 4: Exploratory Data Analysis (Optional Visuals)
# ------------------------------------------------------

# Overall survival count
# sns.countplot(data=df, x='Survived')
# plt.title("Overall Survival Count (0 = Did Not Survive, 1 = Survived)")
# plt.xlabel("Survival")
# plt.ylabel("Number of Passengers")
# plt.show()

# Survival by Gender
# sns.countplot(data=df, x='Sex', hue='Survived')
# plt.title("Survival by Gender")
# plt.xlabel("Gender")
# plt.ylabel("Number of Passengers")
# plt.legend(title="Survived", labels=["No", "Yes"])
# plt.show()

# Survival by Passenger Class
# sns.countplot(data=df, x='Pclass', hue='Survived')
# plt.title("Survival by Passenger Class")
# plt.xlabel("Passenger Class (1 = 1st, 2 = 2nd, 3 = 3rd)")
# plt.ylabel("Number of Passengers")
# plt.legend(title="Survived", labels=["No", "Yes"])
# plt.show()

# Survival by Age and Gender
# sns.set(style="whitegrid")
# g = sns.FacetGrid(df, col="Survived", row="Sex", height=4, aspect=1.5)
# g.map(sns.histplot, "Age", bins=20, kde=False)
# g.fig.subplots_adjust(top=0.9)
# g.fig.suptitle("Survival by Age and Gender", fontsize=16)
# plt.show()

# ------------------------------------------------------
# 🧾 Step 5: Preprocessing - Encoding Categorical Data
# ------------------------------------------------------

# Integer codes for each categorical column:
#   Sex:      male=0, female=1
#   Embarked: S=0, C=1, Q=2
category_codes = {
    'Sex': {'male': 0, 'female': 1},
    'Embarked': {'S': 0, 'C': 1, 'Q': 2},
}

# Replace every label in each listed column with its integer code
for column, codes in category_codes.items():
    df[column] = df[column].map(codes)

# ------------------------------------------------------
# 🧠 Step 6: Define Features and Target
# ------------------------------------------------------

# Columns fed to the model as inputs
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# X holds the input features, y the survival label to predict
X = df[feature_columns]
y = df['Survived']

# ------------------------------------------------------
# 🔀 Step 7: Train/Test Split
# ------------------------------------------------------

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
split_options = {'test_size': 0.2, 'random_state': 42}
split_parts = train_test_split(X, y, **split_options)
X_train, X_test, y_train, y_test = split_parts

# ------------------------------------------------------
# 🤖 Step 8: Train Logistic Regression Model
# ------------------------------------------------------

# Build the logistic regression classifier (max_iter raised to 1000) and fit it
# on the training split in one step; fit() returns the fitted estimator itself
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict survival for the held-out split
y_pred = model.predict(X_test)

# ------------------------------------------------------
# 📈 Step 9: Evaluate the Model
# ------------------------------------------------------

# Accuracy on the held-out split, printed with two decimals
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Confusion matrix of true labels against predictions
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)

# Per-class precision / recall / f1 summary
class_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", class_report)

# ------------------------------------------------------
# 🚀 Step 10: Predicting on Test Set & Saving Results
# ------------------------------------------------------

# Load test dataset
test_df = pd.read_csv('test.csv')

# Fill missing Age and Fare with the medians taken from the training frame `df`,
# so the test rows are imputed with the same statistics the model was trained on
test_df['Age'] = test_df['Age'].fillna(df['Age'].median())
test_df['Fare'] = test_df['Fare'].fillna(df['Fare'].median())

# Drop Cabin (same as train)
test_df.drop(columns='Cabin', inplace=True)

# Encode categorical variables with the same codes used for the training data
test_df['Sex'] = test_df['Sex'].map({'male': 0, 'female': 1})
test_df['Embarked'] = test_df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Fill missing Embarked values (if any) AFTER encoding.
# df['Embarked'] was already converted to numeric codes in Step 5, so its mode
# is a code, not a port letter. Filling before the map would insert that code
# and the map would then turn it into NaN, which makes model.predict fail.
test_df['Embarked'] = test_df['Embarked'].fillna(df['Embarked'].mode()[0])

# Select the same features used during training
X_final = test_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

# Make predictions
final_predictions = model.predict(X_final)

# Create a DataFrame for submission: one row per test passenger with its
# PassengerId and the predicted Survived label
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': final_predictions
})

# Export to CSV (index=False keeps the DataFrame index out of the file)
submission.to_csv('titanic_submission.csv', index=False)

print("✅ Submission file 'titanic_submission.csv' has been created!")


Model Accuracy: 0.80

Confusion Matrix:
 [[89 16]
 [20 54]]

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.85      0.83       105
           1       0.77      0.73      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

✅ Submission file 'titanic_submission.csv' has been created!
