# Titanic Survival Prediction Model Development
This notebook builds a machine learning model to predict Titanic passenger survival.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

print('All libraries imported successfully!')

## Step 1: Load the Titanic Dataset

In [None]:
# Load the raw Titanic passenger table into `df`.
# The path is relative, so it is resolved against the kernel's current working
# directory.
dataset_path = '../../../Titanic2/Titanic-Dataset.csv'
df = pd.read_csv(dataset_path)

# Quick structural overview: size, a sample of rows, dtypes, and null counts.
print(f'Dataset shape: {df.shape}')
print('\nFirst few rows:')
print(df.head())
print('\nDataset info:')
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
df.info()
print('\nMissing values:')
print(df.isnull().sum())

## Step 2: Data Preprocessing
Selected features: Pclass, Sex, Age, SibSp, Fare (5 features)

In [None]:
# Keep just the five model inputs plus the target column, and work on an
# independent copy so the raw frame `df` stays untouched.
required_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Survived']
df_clean = df.loc[:, required_features].copy()

# Confirm which columns survived the selection and the resulting size.
print('Selected features:')
print(list(df_clean.columns))
print(f'\nShape after feature selection: {df_clean.shape}')

In [None]:
# Handle missing values
print('Missing values before handling:')
print(df_clean.isnull().sum())

# Drop rows with missing Survived values (target variable): a row without a
# label cannot be used for supervised training or evaluation.
df_clean = df_clean.dropna(subset=['Survived'])

# Fill missing Age and Fare values with the column median.
# The result is assigned back to the column rather than calling
# `df_clean[col].fillna(..., inplace=True)`. That chained in-place form operates
# on an intermediate Series: pandas 2.x warns about it, and under Copy-on-Write
# it does not update `df_clean` at all, leaving the NaNs in place.
# NOTE(review): both medians are computed over the full dataset, i.e. before the
# train/test split in Step 3, so test rows influence the imputed values.
df_clean['Age'] = df_clean['Age'].fillna(df_clean['Age'].median())
df_clean['Fare'] = df_clean['Fare'].fillna(df_clean['Fare'].median())

print('\nMissing values after handling:')
print(df_clean.isnull().sum())
print(f'\nDataset shape after preprocessing: {df_clean.shape}')

In [None]:
# Encode categorical variables
# Sex: 'male' -> 1, any other value -> 0
# Only encode while the column still holds the raw labels. Without this guard,
# re-running the cell would compare the already-encoded integers to 'male',
# which is False everywhere, and silently overwrite every row with 0.
if not pd.api.types.is_numeric_dtype(df_clean['Sex']):
    df_clean['Sex'] = (df_clean['Sex'] == 'male').astype(int)

print('Encoded Sex feature:')
print(df_clean['Sex'].value_counts())
print('\nFirst few rows after encoding:')
print(df_clean.head())

In [None]:
# Split the cleaned frame into the model inputs (X) and the label (y).
# The ordered list of input column names is kept in `selected_features` so it
# can be persisted later and reused by the web app.
selected_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Fare']
X = df_clean[selected_features]
y = df_clean['Survived']

print(f'Input features: {selected_features}')
print('Target variable: Survived')
print(f'\nFeature matrix shape: {X.shape}')
print(f'Target vector shape: {y.shape}')

## Step 3: Train-Test Split (BEFORE Scaling - Prevent Data Leakage)

In [None]:
# Hold out 20% of the rows for testing *before* any scaler is fitted, so the
# scaler never sees test rows. `stratify=y` keeps the class ratio the same in
# both splits; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# Report split sizes and the class balance in each split.
print(f'Training set size: {X_train.shape[0]}')
print(f'Test set size: {X_test.shape[0]}')
print('\nTraining set survival distribution:')
print(y_train.value_counts())
print('\nTest set survival distribution:')
print(y_test.value_counts())

## Step 4: Feature Scaling (FIT ONLY ON TRAINING DATA)

In [None]:
# Learn the per-feature mean and scale from the training split only.
scaler = StandardScaler().fit(X_train)

# Apply the already-fitted statistics to both splits. The test split is only
# transformed, never used for fitting.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('Scaler fitted on training data only')
print(f'Training data scaled shape: {X_train_scaled.shape}')
print(f'Test data scaled shape: {X_test_scaled.shape}')
print(f'\nScaler mean: {scaler.mean_}')
print(f'Scaler scale: {scaler.scale_}')

## Step 5: Train Random Forest Classifier

In [None]:
# Build and fit the classifier in one expression; `fit` returns the estimator
# itself. 100 trees capped at depth 10, all CPU cores, fixed seed for
# repeatable results.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

print('Random Forest Classifier trained successfully!')

## Step 6: Model Evaluation

In [None]:
# Training split: predictions and accuracy (compared with the test figure
# below to see how much the model overfits).
y_pred_train = model.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_pred_train)

# Held-out split: predictions and accuracy.
y_pred_test = model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

# 60-character rule used to frame the two report sections.
banner = '=' * 60

# Per-class precision / recall / F1 on the test split.
print('\n' + banner)
print('CLASSIFICATION REPORT (Test Set)')
print(banner)
class_labels = ['Did Not Survive', 'Survived']
print(classification_report(y_test, y_pred_test, target_names=class_labels))

# Raw counts of true vs. predicted labels on the test split.
print('\n' + banner)
print('CONFUSION MATRIX (Test Set)')
print(banner)
print(confusion_matrix(y_test, y_pred_test))

In [None]:
# Pair every input column with the importance score the fitted forest assigned
# to it, then rank from most to least important.
importance_table = pd.DataFrame({
    'Feature': selected_features,
    'Importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

print('Feature Importance:')
print(feature_importance)

## Step 7: Save Model Artifacts

In [None]:
# Artifacts are written to the current working directory. The makedirs call
# targets '.', which already exists, so it has no effect here.
os.makedirs('.', exist_ok=True)

# Output file names for the three artifacts; the reload cells below reuse
# these variables.
model_path = 'titanic_survival_model.pkl'
scaler_path = 'titanic_scaler.pkl'
features_path = 'selected_features.pkl'

# Trained classifier.
joblib.dump(model, model_path)
print(f'Model saved to {model_path}')

# Scaler fitted on the training split.
joblib.dump(scaler, scaler_path)
print(f'Scaler saved to {scaler_path}')

# Ordered list of input column names.
joblib.dump(selected_features, features_path)
print(f'Selected features saved to {features_path}')

## Step 8: Verify Model Can Be Reloaded

In [None]:
# Read the three artifacts back from disk, in the same order they were saved,
# to confirm they round-trip. joblib.load unpickles the file, so it should only
# be pointed at artifacts you produced or trust.
loaded_model, loaded_scaler, loaded_features = (
    joblib.load(artifact_path)
    for artifact_path in (model_path, scaler_path, features_path)
)

print('All artifacts reloaded successfully!')
print(f'\nLoaded features: {loaded_features}')

In [None]:
# Smoke-test the reloaded artifacts on the first held-out row. The row is kept
# as a one-row DataFrame so the scaler receives 2-D input.
test_sample = X_test.head(1)
test_sample_scaled = loaded_scaler.transform(test_sample)

# Class probabilities for the single row, and the hard label prediction.
probabilities = loaded_model.predict_proba(test_sample_scaled)[0]
predicted_class = loaded_model.predict(test_sample_scaled)[0]

# Confidence is the largest class probability, expressed as a percentage.
confidence = float(probabilities.max()) * 100

print('Test prediction with reloaded model:')
print(f'Sample: {test_sample.values}')
print(f'Predicted class: {predicted_class}')
print(f'Probabilities: {probabilities}')
print(f'Confidence: {confidence:.2f}%')
print(f'\nActual value: {y_test.iloc[0]}')
print(f'Prediction correct: {predicted_class == y_test.iloc[0]}')

## Summary
- **Algorithm**: Random Forest Classifier
- **Features Used**: Pclass, Sex, Age, SibSp, Fare (5 features)
- **Target Variable**: Survived (0 = Did Not Survive, 1 = Survived)
- **Test Accuracy**: see the `Test Accuracy` value printed in Step 6
- **Model Persistence**: Joblib
- **Data Leakage Prevention**: Scaler fitted ONLY on the training split, then applied unchanged to the test split
- **Feature Validation**: Feature names saved for app.py validation