In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve

# random seed for reproducibility
np.random.seed(42)

# Load the data 
def load_titanic_data(train_path='Downloads/titanic.csv'):
    """
    Loads the Titanic dataset from CSV file.
    Returns the dataset as a DataFrame.
    """
    try:
        data = pd.read_csv(train_path)
        print(f"Successfully loaded dataset with {data.shape[0]} rows and {data.shape[1]} columns")
        return data
    except FileNotFoundError:
        print(f"Could not find the dataset file at {train_path}")
        print("Creating a dummy dataset structure for demonstration purposes")
        
        # Creating dummy dataset
        dummy_data = pd.DataFrame({
            'PassengerId': range(1, 101),
            'Survived': np.random.choice([0, 1], size=100),
            'Pclass': np.random.choice([1, 2, 3], size=100),
            'Name': ['Person ' + str(i) for i in range(1, 101)],
            'Sex': np.random.choice(['male', 'female'], size=100),
            'Age': np.random.normal(30, 14, 100),
            'SibSp': np.random.choice(range(0, 5), size=100),
            'Parch': np.random.choice(range(0, 5), size=100),
            'Ticket': ['TICKET_' + str(i) for i in range(1, 101)],
            'Fare': np.random.normal(30, 50, 100),
            'Cabin': [np.random.choice(['A', 'B', 'C', 'D', 'E', 'F', 'G']) + str(np.random.randint(1, 100)) if np.random.random() > 0.3 else None for _ in range(100)],
            'Embarked': np.random.choice(['C', 'Q', 'S', None], size=100, p=[0.3, 0.1, 0.58, 0.02])
        })
        
        # Adding missing values to the dataset
        dummy_data.loc[np.random.choice(dummy_data.index, 20), 'Age'] = None
        
        return dummy_data

# Load the dataset
titanic_data = load_titanic_data()

# Exploratory Data Analysis Function
def explore_data(data):
    """Perform exploratory data analysis on the dataset."""
    print("\nDataset Overview:")
    print(f"Shape: {data.shape}")
    print("\nData Types:")
    print(data.dtypes)
    print("\nMissing Values:")
    print(data.isnull().sum())
    print(f"Missing Values Percentage: {data.isnull().sum() / len(data) * 100}")
    print("\nSummary Statistics:")
    print(data.describe())
    
    if 'Survived' in data.columns:
        print("\nTarget Variable Distribution:")
        survival_counts = data['Survived'].value_counts()
        print(f"Survived: {survival_counts[1]} ({survival_counts[1]/len(data)*100:.2f}%)")
        print(f"Did not survive: {survival_counts[0]} ({survival_counts[0]/len(data)*100:.2f}%)")
        
        print("\nSurvival by Sex:")
        sex_survival = data.groupby('Sex')['Survived'].mean().sort_values(ascending=False)
        print(sex_survival)
        
        print("\nSurvival by Passenger Class:")
        class_survival = data.groupby('Pclass')['Survived'].mean().sort_values(ascending=False)
        print(class_survival)
        
        # Create age groups for analysis
        data['AgeGroup'] = pd.cut(data['Age'], 
                                  bins=[0, 12, 18, 35, 60, 100], 
                                  labels=['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior'])
        
        print("\nSurvival by Age Group:")
        if data['AgeGroup'].notna().any():
            age_survival = data.groupby('AgeGroup')['Survived'].mean().sort_values(ascending=False)
            print(age_survival)
        else:
            print("No age data available for this analysis.")
        
        # Calculate family size
        data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
        
        print("\nSurvival by Family Size:")
        family_survival = data.groupby('FamilySize')['Survived'].mean().sort_values(ascending=False)
        print(family_survival)
        
        # Check if Embarked data is available
        if 'Embarked' in data.columns and data['Embarked'].notna().any():
            print("\nSurvival by Port of Embarkation:")
            embark_survival = data.groupby('Embarked')['Survived'].mean().sort_values(ascending=False)
            print(embark_survival)
    
    # Cleaning up temporary columns
    if 'AgeGroup' in data.columns:
        data.drop('AgeGroup', axis=1, inplace=True)
    if 'FamilySize' in data.columns:
        data.drop('FamilySize', axis=1, inplace=True)
        
    print("\n===== END OF EDA =====")

# Perform exploratory data analysis
explore_data(titanic_data)

# Data Preprocessing
def preprocess_data(data):
    """
    Preprocess the Titanic dataset for machine learning.
    Handles missing values, performs feature engineering, and prepares data for modeling.
    
    Args:
        data: DataFrame containing Titanic dataset
        
    Returns:
        X: Features DataFrame
        y: Target Series (if available)
        feature_names: List of feature names after preprocessing
    """
    print("\n===== DATA PREPROCESSING =====")
    
    # Created a copy of the data to avoid modifying the original
    df = data.copy()
    
    # Separated features and target if target is available
    if 'Survived' in df.columns:
        y = df['Survived']
        X = df.drop('Survived', axis=1)
        print(f"Target variable 'Survived' separated with {y.sum()} positive cases out of {len(y)} total")
    else:
        y = None
        X = df
        print("No target variable 'Survived' found in the dataset")
    
    # Feature Engineering
    print("\nPerforming feature engineering...")
    
    # 1. Extract titles from names
    X['Title'] = X['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    
    # Group rare titles
    title_mapping = {
        "Mr": "Mr",
        "Miss": "Miss",
        "Mrs": "Mrs",
        "Master": "Master",
        "Dr": "Rare",
        "Rev": "Rare",
        "Col": "Rare",
        "Major": "Rare",
        "Mlle": "Miss",
        "Countess": "Rare",
        "Ms": "Miss",
        "Lady": "Rare",
        "Jonkheer": "Rare",
        "Don": "Rare",
        "Dona": "Rare",
        "Mme": "Mrs",
        "Capt": "Rare",
        "Sir": "Rare"
    }
    
    X['Title'] = X['Title'].map(lambda x: title_mapping.get(x, "Rare"))
    print(f"Created 'Title' feature with categories: {X['Title'].unique()}")
    
    # 2.Created family size and is_alone features
    X['FamilySize'] = X['SibSp'] + X['Parch'] + 1
    X['IsAlone'] = (X['FamilySize'] == 1).astype(int)
    print(f"Created 'FamilySize' and 'IsAlone' features")
    
    # 3.Extract cabin information if available
    if 'Cabin' in X.columns:
        X['CabinLetter'] = X['Cabin'].str[0]
        X['HasCabin'] = (~X['Cabin'].isna()).astype(int)
        print(f"Created 'CabinLetter' and 'HasCabin' features")
    
    # 4.Create fare per person feature
    X['FarePerPerson'] = X['Fare'] / X['FamilySize']
    print(f"Created 'FarePerPerson' feature")
    
    # Identify numerical and categorical features for preprocessing
    numerical_features = ['Age', 'Fare', 'FamilySize', 'FarePerPerson']
    categorical_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'IsAlone']
    
    if 'CabinLetter' in X.columns:
        categorical_features.append('CabinLetter')
    if 'HasCabin' in X.columns:
        categorical_features.append('HasCabin')
    
    # Features to drop
    drop_features = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin']
    
    #preprocessing steps for numerical features
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    # Define the preprocessing steps for categorical features
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='drop'  
    )
    
    print(f"\nNumerical features: {numerical_features}")
    print(f"Categorical features: {categorical_features}")
    print(f"Features to drop: {drop_features}")
        
    return X, y, preprocessor

# Preprocess the data
X, y, preprocessor = preprocess_data(titanic_data)

# Split the data into training and testing sets
def split_data(X, y, test_size=0.2):
    """
    Split the data into training and testing sets.
    
    Args:
        X: Features DataFrame
        y: Target Series
        test_size: Proportion of data to use for testing
        
    Returns:
        X_train, X_test, y_train, y_test
    """
    print(f"\nSplitting data into {100-test_size*100}% training and {test_size*100}% testing sets...")
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )
    
    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Testing set: {X_test.shape[0]} samples")
    
    return X_train, X_test, y_train, y_test

# Split the data if target is available
if y is not None:
    X_train, X_test, y_train, y_test = split_data(X, y)
else:
    print("\nWARNING: No target variable available, cannot split data.")
    X_train, X_test, y_train, y_test = X, None, y, None

# Model Selection and Training
def train_and_evaluate_models(X_train, y_train, X_test, y_test, preprocessor):
    """
    Train and evaluate multiple models on the Titanic dataset.
    
    Args:
        X_train: Training features
        y_train: Training target
        X_test: Testing features
        y_test: Testing target
        preprocessor: Data preprocessing ColumnTransformer
        
    Returns:
        Dictionary of trained models and their performance metrics
    """
    
    # Define models to try
    models = {
        'LogisticRegression': Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', LogisticRegression(max_iter=1000, random_state=42))
        ]),
        'RandomForest': Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(random_state=42))
        ]),
        'GradientBoosting': Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', GradientBoostingClassifier(random_state=42))
        ])
    }
    
    # Results dictionary
    results = {}
    
    # Train and evaluate each model
    for name, model in models.items():
        print(f"\nTraining {name}...")
        
        # Fit the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        # Store results
        results[name] = {
            'model': model,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
        
        # Print results
        print(f"\n{name} Performance:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")
        
        # Print classification report
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))
        
        # Display confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        print("Confusion Matrix:")
        print(cm)
        
        # Perform cross-validation
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        print(f"\nCross-validation scores: {cv_scores}")
        print(f"Mean CV accuracy: {cv_scores.mean():.4f}")
        print(f"Standard deviation of CV accuracy: {cv_scores.std():.4f}")
    
    # Find the best model
    best_model_name = max(results, key=lambda x: results[x]['accuracy'])
    best_model = results[best_model_name]
    
    print(f"\nBest Model: {best_model_name}")
    print(f"Best Model Accuracy: {best_model['accuracy']:.4f}")
    
    print("\n===== END OF MODEL TRAINING AND EVALUATION =====")
    
    return results, best_model_name

# Train and evaluate models if target is available
if y_train is not None and y_test is not None:
    model_results, best_model_name = train_and_evaluate_models(X_train, y_train, X_test, y_test, preprocessor)
    best_model = model_results[best_model_name]['model']
else:
    print("\nWARNING: No target variable available, cannot train models.")
    model_results, best_model_name, best_model = None, None, None

# Hyperparameter Tuning
def tune_hyperparameters(best_model, X_train, y_train, X_test, y_test):
    """
    Perform hyperparameter tuning on the best model.
    
    Args:
        best_model: Best performing model
        X_train: Training features
        y_train: Training target
        X_test: Testing features
        y_test: Testing target
        
    Returns:
        Tuned model and its performance
    """
    
    # Get the classifier name and instance
    classifier_name = best_model.steps[-1][0]
    classifier = best_model.steps[-1][1]
    
    # Define parameter grid based on the classifier type
    param_grid = {}
    
    if isinstance(classifier, LogisticRegression):
        param_grid = {
            f'{classifier_name}__C': [0.01, 0.1, 1, 10, 100],
            f'{classifier_name}__solver': ['liblinear', 'saga'],
            f'{classifier_name}__penalty': ['l1', 'l2']
        }
        print("Tuning LogisticRegression hyperparameters: C, solver, penalty")
    
    elif isinstance(classifier, RandomForestClassifier):
        param_grid = {
            f'{classifier_name}__n_estimators': [50, 100, 200],
            f'{classifier_name}__max_depth': [None, 5, 10, 15],
            f'{classifier_name}__min_samples_split': [2, 5, 10],
            f'{classifier_name}__min_samples_leaf': [1, 2, 4]
        }
        print("Tuning RandomForest hyperparameters: n_estimators, max_depth, min_samples_split, min_samples_leaf")
    
    elif isinstance(classifier, GradientBoostingClassifier):
        param_grid = {
            f'{classifier_name}__n_estimators': [50, 100, 200],
            f'{classifier_name}__learning_rate': [0.01, 0.1, 0.2],
            f'{classifier_name}__max_depth': [3, 5, 7],
            f'{classifier_name}__min_samples_split': [2, 5, 10]
        }
        print("Tuning GradientBoosting hyperparameters: n_estimators, learning_rate, max_depth, min_samples_split")
    
    # Create GridSearchCV
    grid_search = GridSearchCV(
        estimator=best_model,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    
    # Fit GridSearchCV
    print("\nPerforming grid search (this may take a while)...")
    grid_search.fit(X_train, y_train)
    
    # Get the best parameters and best estimator
    print(f"\nBest Parameters: {grid_search.best_params_}")
    print(f"Best Cross-validation Score: {grid_search.best_score_:.4f}")
    
    # Evaluate the tuned model on the test set
    tuned_model = grid_search.best_estimator_
    y_pred = tuned_model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print("\nTuned Model Performance on Test Set:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    
    # Check if tuning improved performance
    original_accuracy = accuracy_score(y_test, best_model.predict(X_test))
    
    if accuracy > original_accuracy:
        print(f"\nHyperparameter tuning improved accuracy by {(accuracy - original_accuracy) * 100:.2f}%")
    else:
        print("\nHyperparameter tuning did not improve model performance. Using original model.")
        tuned_model = best_model
    
    print("\n===== END OF HYPERPARAMETER TUNING =====")
    
    return tuned_model, accuracy, precision, recall, f1

# Tune hyperparameters if best model is available
if best_model is not None and y_train is not None and y_test is not None:
    tuned_model, final_accuracy, final_precision, final_recall, final_f1 = tune_hyperparameters(
        best_model, X_train, y_train, X_test, y_test
    )
else:
    print("\nWARNING: No best model available, cannot perform hyperparameter tuning.")
    tuned_model, final_accuracy, final_precision, final_recall, final_f1 = None, None, None, None, None

# Feature Importance Analysis
def analyze_feature_importance(model, X):
    """
    Analyze and visualize feature importance.
    
    Args:
        model: Trained model
        X: Feature DataFrame (for column names)
    """
    print("\n===== FEATURE IMPORTANCE ANALYSIS =====")
    
    # Get the classifier
    classifier = model.named_steps['classifier']
    
    # Check if the classifier has feature_importances_ attribute
    if hasattr(classifier, 'feature_importances_'):
        try:
            # For newer scikit-learn versions
            feature_names = model.named_steps['preprocessor'].get_feature_names_out()
        except:
            # Fallback for older versions or if the above fails
            feature_names = [f"feature_{i}" for i in range(len(classifier.feature_importances_))]
        
        # Create a DataFrame of feature importances
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': classifier.feature_importances_
        }).sort_values('Importance', ascending=False)
        
        # Print the top features
        print("\nTop 15 Most Important Features:")
        print(importance_df.head(15))
        
        # Group features by type (numerical vs categorical)
        if len(feature_names) > 0:
            feature_types = []
            for name in feature_names:
                if name.startswith('num__'):
                    feature_types.append('Numerical')
                elif name.startswith('cat__'):
                    feature_types.append('Categorical')
                else:
                    feature_types.append('Other')
            
            importance_df['Type'] = feature_types
            
            # Average importance by feature type
            type_importance = importance_df.groupby('Type')['Importance'].mean().sort_values(ascending=False)
            print("\nAverage Importance by Feature Type:")
            print(type_importance)
    
    elif hasattr(classifier, 'coef_'):
        # For linear models like LogisticRegression
        try:
            # For newer scikit-learn versions
            feature_names = model.named_steps['preprocessor'].get_feature_names_out()
        except:
            # Fallback for older versions or if the above fails
            feature_names = [f"feature_{i}" for i in range(classifier.coef_.shape[1])]
        
        # Created a DataFrame of coefficients
        coef_df = pd.DataFrame({
            'Feature': feature_names,
            'Coefficient': classifier.coef_[0]
        }).sort_values('Coefficient', ascending=False)
        
        # Print the top positive and negative coefficients
        print("\nTop 10 Positive Coefficients:")
        print(coef_df.head(10))
        
        print("\nTop 10 Negative Coefficients:")
        print(coef_df.tail(10))
    
    else:
        print("\nThis model doesn't provide feature importance information.")
    

# Analyze feature importance if tuned model is available
if tuned_model is not None:
    analyze_feature_importance(tuned_model, X)
else:
    print("\nWARNING: No tuned model available, cannot analyze feature importance.")

# Final Model Evaluation
def evaluate_final_model(model, X_test, y_test):
    """
    Perform final evaluation of the tuned model.
    
    Args:
        model: Tuned model
        X_test: Test features
        y_test: Test target
    """
    print("\n===== FINAL MODEL EVALUATION =====")
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    # Classification report
    print("\nDetailed Classification Report:")
    print(classification_report(y_test, y_pred))
    
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(cm)
    print(f"True Positives: {cm[1][1]}")
    print(f"True Negatives: {cm[0][0]}")
    print(f"False Positives: {cm[0][1]}")
    print(f"False Negatives: {cm[1][0]}")
    
    # Calculate additional metrics
    tn, fp, fn, tp = cm.ravel()
    
    # Specificity (true negative rate)
    specificity = tn / (tn + fp)
    
    # Negative predictive value
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0
    
    print("\nAdditional Metrics:")
    print(f"Specificity (True Negative Rate): {specificity:.4f}")
    print(f"Negative Predictive Value: {npv:.4f}")
    
    print("\n===== END OF FINAL MODEL EVALUATION =====")

# Evaluate final model if available
if tuned_model is not None and y_test is not None:
    evaluate_final_model(tuned_model, X_test, y_test)
else:
    print("\nWARNING: No tuned model or test data available, cannot perform final evaluation.")

# Project Summary
def summarize_project(model_results=None, best_model_name=None, final_accuracy=None):
    """
    Summarize the Titanic survival prediction project.
    
    Args:
        model_results: Dictionary of model results
        best_model_name: Name of the best model
        final_accuracy: Final accuracy after tuning
    """
    print("\n" + "="*80)
    print("                 TITANIC SURVIVAL PREDICTION PROJECT SUMMARY")
    print("="*80)
    
    print("\n1. DATA PREPROCESSING:")
    print("   - Handled missing values in Age, Cabin, and Embarked columns")
    print("   - Extracted title from passenger names")
    print("   - Created family size feature and 'IsAlone' indicator")
    print("   - Created 'HasCabin' feature based on cabin information availability")
    print("   - Normalized fare by family size")
    print("   - Applied one-hot encoding to categorical variables")
    print("   - Scaled numerical features")
    
    print("\n2. MODEL DEVELOPMENT:")
    if model_results:
        model_accuracies = {name: results['accuracy'] for name, results in model_results.items()}
        for name, accuracy in model_accuracies.items():
            print(f"   - {name}: {accuracy:.4f} accuracy")
        
        if best_model_name:
            print(f"   - Selected {best_model_name} as the best base model")
            
        if final_accuracy:
            print(f"   - Final model accuracy after tuning: {final_accuracy:.4f}")
    else:
        print("   - No model results available")
    
    print("\n3. KEY FINDINGS:")
    print("   - Gender was a strong predictor (women had higher survival rates)")
    print("   - Passenger class was important (higher classes had better survival rates)")
    print("   - Age was a factor (children had better survival chances)")
    print("   - Cabin information indicated social status and proximity to lifeboats")
    print("   - Family size affected survival (very large families had lower survival rates)")
    
    print("\n4. RECOMMENDATIONS FOR FURTHER IMPROVEMENT:")
    print("   - Collect more data if available")
    print("   - Try more advanced feature engineering:")
    print("     * Deck information from cabin numbers")
    print("     * More sophisticated family grouping")
    print("     * Better handling of rare categories")
    print("   - Experiment with ensemble methods (voting, stacking)")
    print("   - Try advanced models like XGBoost or neural networks")
    print("   - Perform more extensive hyperparameter tuning")
    
    print("\n5. NEXT STEPS:")
    print("   - Deploy the model via a simple web interface")
    print("   - Create a data pipeline for new predictions")
    print("   - Document the model for other data scientists")
    print("   - Prepare a presentation for stakeholders")
    
    print("\n" + "="*80)

# Print project summary
summarize_project(model_results, best_model_name, final_accuracy)

# Instructions for using the model with new data
def provide_usage_instructions():
    """Provide instructions for using the trained model with new data."""
    print("\n" + "="*80)
    print("             INSTRUCTIONS FOR USING THE MODEL WITH NEW DATA")
    print("="*80)
    
    print("""
To use this model with new passenger data:

1. Ensure your data contains the same features as the training data:
   - PassengerId, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked

2. Load your new data:
   ```python
   new_data = pd.read_csv('new_passengers.csv')
   ```

3. Preprocess the new data using the same preprocessor:
   ```python
   X_new = preprocess_data(new_data)[0]
   ```

4. Make predictions:
   ```python
   predictions = tuned_model.predict(X_new)
   prediction_probs = tuned_model.predict_proba(X_new)[:, 1]
   ```

5. Format and save the predictions:
   ```python
   results = pd.DataFrame({
       'PassengerId': new_data['PassengerId'],
       'Survived': predictions,
       'Survival_Probability': prediction_probs
   })
   results.to_csv('prediction_results.csv', index=False)
   ```

Note: For consistent results, ensure that your new data follows the same format
and has the same column names as the original training data.
    """)
    
    print("="*80)

# Provide usage instructions
provide_usage_instructions()

print("\nProject complete. The model is ready for use!")

  X['Title'] = X['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


Successfully loaded dataset with 891 rows and 12 columns

Dataset Overview:
Shape: (891, 12)

Data Types:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

Missing Values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Missing Values Percentage: PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

Sum

  age_survival = data.groupby('AgeGroup')['Survived'].mean().sort_values(ascending=False)



Cross-validation scores: [0.77622378 0.8041958  0.85211268 0.84507042 0.83098592]
Mean CV accuracy: 0.8217
Standard deviation of CV accuracy: 0.0280

Training RandomForest...

RandomForest Performance:
Accuracy: 0.7821
Precision: 0.7344
Recall: 0.6812
F1-Score: 0.7068

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       110
           1       0.73      0.68      0.71        69

    accuracy                           0.78       179
   macro avg       0.77      0.76      0.77       179
weighted avg       0.78      0.78      0.78       179

Confusion Matrix:
[[93 17]
 [22 47]]

Cross-validation scores: [0.76923077 0.72727273 0.82394366 0.82394366 0.85211268]
Mean CV accuracy: 0.7993
Standard deviation of CV accuracy: 0.0449

Training GradientBoosting...

GradientBoosting Performance:
Accuracy: 0.8156
Precision: 0.7903
Recall: 0.7101
F1-Score: 0.7481

Classification Report:
              precision    recall  f1-sc