Aude Sustronck

---



---

# Convert your code to production level

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

## 1. Refactoring my code

Data Loading and Preprocessing:

In [None]:
def load_and_preprocess_data(filepath: str):
    """Load the Titanic CSV, clean it, and return a train/test split.

    Steps performed:
      * missing ``Age`` values are replaced by the column median;
      * missing ``Embarked`` values are replaced by the most frequent value;
      * the ``Cabin`` column is dropped;
      * ``Sex`` is mapped to 0 (male) / 1 (female);
      * ``Embarked`` is one-hot encoded with the first level dropped;
      * ``Survived`` becomes the target, and ``Name``, ``Ticket`` and
        ``PassengerId`` are removed from the features.

    Args:
        filepath: Path to the CSV file to read.

    Returns:
        The result of ``train_test_split(X, y, test_size=0.2,
        random_state=42)``, which callers unpack as
        ``X_train, X_test, y_train, y_test``.
    """
    df = pd.read_csv(filepath)

    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``: the inplace form acts on an
    # intermediate object (chained assignment), which raises a FutureWarning
    # in pandas 2.x and leaves ``df`` untouched under Copy-on-Write.
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df = df.drop('Cabin', axis=1)

    # Encode categorical variables
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

    # Feature and target separation
    X = df.drop(['Survived', 'Name', 'Ticket', 'PassengerId'], axis=1)
    y = df['Survived']

    return train_test_split(X, y, test_size=0.2, random_state=42)

Model Training:

In [None]:
def train_model(X_train, y_train):
    """Fit a logistic regression and a grid-searched random forest.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        A ``(logistic_regression, random_forest)`` tuple, where the random
        forest is the best estimator found by a 5-fold, accuracy-scored
        grid search.
    """
    # Baseline linear model; ``fit`` returns the fitted estimator itself.
    logistic = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

    # Candidate random forest settings explored by the grid search.
    search_space = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }
    forest_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=search_space,
        cv=5,
        scoring='accuracy',
    )
    forest_search.fit(X_train, y_train)

    return logistic, forest_search.best_estimator_

Predictions and Evaluation:

In [None]:
def predict_and_evaluate(models, X_test, y_test):
    """Print evaluation metrics for each fitted model.

    Args:
        models: Mapping of display name -> fitted estimator exposing
            ``predict``.
        X_test: Features to predict on.
        y_test: True labels compared against the predictions.

    Returns:
        None. For every model, accuracy, precision, recall, the confusion
        matrix and the classification report are printed, followed by a
        separator line.
    """
    separator = "-" * 50
    for label in models:
        predictions = models[label].predict(X_test)
        print(f"{label} Metrics:")

        # Each metric is computed right before it is printed so that the
        # output produced so far is kept if a later metric fails.
        accuracy = accuracy_score(y_test, predictions)
        print("Accuracy:", accuracy)
        precision = precision_score(y_test, predictions)
        print("Precision:", precision)
        recall = recall_score(y_test, predictions)
        print("Recall:", recall)
        matrix = confusion_matrix(y_test, predictions)
        print("Confusion Matrix:\n", matrix)
        report = classification_report(y_test, predictions)
        print(report)
        print(separator)

## 2. Create Pipelines

In [None]:
def titanic_pipeline(filepath: str):
    """Run the end-to-end workflow: load, scale, train, and report.

    Args:
        filepath: Path to the CSV file handed to
            ``load_and_preprocess_data``.

    Returns:
        None. Metrics are printed by ``predict_and_evaluate``.
    """
    # Load, clean and split the dataset.
    features_train, features_test, target_train, target_test = (
        load_and_preprocess_data(filepath)
    )

    # Fit the scaler on the training split only, then apply it to both splits.
    standardizer = StandardScaler().fit(features_train)
    train_scaled = standardizer.transform(features_train)
    test_scaled = standardizer.transform(features_test)

    # Train both models and pair each with its display name.
    fitted_models = dict(
        zip(
            ("Logistic Regression", "Random Forest"),
            train_model(train_scaled, target_train),
        )
    )

    # Report metrics on the held-out split.
    predict_and_evaluate(fitted_models, test_scaled, target_test)

# Run the pipeline when this code is executed as the main program
# (not when it is imported as a module).
if __name__ == "__main__":
    # Path of the CSV passed to the pipeline; replace with the actual dataset path.
    filepath = "Titanic-Dataset.csv"  # Replace with the actual dataset path
    titanic_pipeline(filepath)

## 3. Repository Structure

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from typing import Tuple, Dict
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def load_and_preprocess_data(filepath: str) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Load and preprocess the Titanic dataset.

    Missing ``Age`` values are replaced by the column median and missing
    ``Embarked`` values by the most frequent value. ``Sex`` is mapped to
    0 (male) / 1 (female) and ``Embarked`` is one-hot encoded (all levels
    kept). ``Survived`` is the target; ``Name``, ``Ticket``, ``PassengerId``
    and ``Cabin`` are removed from the features.

    Args:
        filepath (str): Path to the CSV file containing the Titanic dataset

    Returns:
        Tuple[pd.DataFrame, pd.Series]: Preprocessed features (X) and target variable (y)

    Raises:
        Exception: Any error raised while reading or transforming the data
            is logged and re-raised unchanged.
    """
    try:
        # Load data
        logging.info(f"Loading data from {filepath}")
        df = pd.read_csv(filepath)

        # Handle missing values. The filled column is assigned back instead
        # of calling ``df[col].fillna(..., inplace=True)``: the inplace form
        # operates on an intermediate object (chained assignment), which
        # raises a FutureWarning in pandas 2.x and leaves ``df`` untouched
        # under Copy-on-Write.
        df['Age'] = df['Age'].fillna(df['Age'].median())
        df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

        # Feature engineering
        df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
        df = pd.get_dummies(df, columns=['Embarked'])

        # Drop the target and the columns not used as features
        columns_to_drop = ['Survived', 'Name', 'Ticket', 'PassengerId', 'Cabin']
        X = df.drop(columns_to_drop, axis=1)
        y = df['Survived']

        logging.info("Data preprocessing completed successfully")
        return X, y

    except Exception as e:
        logging.error(f"Error in data preprocessing: {str(e)}")
        raise

def train_model(X: pd.DataFrame, y: pd.Series, 
                hyperparameter_tuning: bool = True) -> RandomForestClassifier:
    """
    Fit a Random Forest classifier, optionally tuned with a grid search.

    Args:
        X (pd.DataFrame): Feature matrix
        y (pd.Series): Target variable
        hyperparameter_tuning (bool): When true, run a 5-fold GridSearchCV
            scored on accuracy and return its best estimator; otherwise fit
            a single forest with default settings.

    Returns:
        RandomForestClassifier: Trained model

    Raises:
        Exception: Any error raised during training is logged and re-raised.
    """
    try:
        logging.info("Starting model training")

        # Fast path: no search, just one default forest.
        if not hyperparameter_tuning:
            baseline = RandomForestClassifier(random_state=42)
            baseline.fit(X, y)
            return baseline

        # Candidate settings explored by the grid search.
        search_space = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
        }
        search = GridSearchCV(
            estimator=RandomForestClassifier(random_state=42),
            param_grid=search_space,
            cv=5,
            scoring='accuracy',
        )

        logging.info("Performing hyperparameter tuning")
        search.fit(X, y)
        logging.info(f"Best parameters: {search.best_params_}")
        return search.best_estimator_

    except Exception as e:
        logging.error(f"Error in model training: {str(e)}")
        raise

def evaluate_model(model: RandomForestClassifier, X_test: pd.DataFrame, 
                  y_test: pd.Series) -> Dict:
    """
    Score a fitted model on held-out data.

    Args:
        model (RandomForestClassifier): Trained model
        X_test (pd.DataFrame): Test features
        y_test (pd.Series): Test target

    Returns:
        Dict: Metrics keyed by 'accuracy', 'precision', 'recall' and
        'confusion_matrix'

    Raises:
        Exception: Any error raised during evaluation is logged and re-raised.
    """
    try:
        logging.info("Evaluating model performance")

        predictions = model.predict(X_test)

        # Keyword arguments are evaluated left to right, so the metrics are
        # computed and stored in this order.
        metrics = dict(
            accuracy=accuracy_score(y_test, predictions),
            precision=precision_score(y_test, predictions),
            recall=recall_score(y_test, predictions),
            confusion_matrix=confusion_matrix(y_test, predictions),
        )

        logging.info(f"Model performance metrics: {metrics}")
        return metrics

    except Exception as e:
        logging.error(f"Error in model evaluation: {str(e)}")
        raise

def main(filepath: str):
    """
    Run the complete ML pipeline: load, split, train, evaluate.

    Args:
        filepath (str): Path to the input dataset

    Returns:
        A ``(trained_model, metrics)`` tuple: the fitted model and the
        metrics dictionary produced by ``evaluate_model``.

    Raises:
        Exception: Any error raised by a pipeline step is logged and re-raised.
    """
    try:
        features, target = load_and_preprocess_data(filepath)

        # Hold out 20% of the rows for evaluation (fixed seed).
        split = train_test_split(features, target, test_size=0.2, random_state=42)
        X_train, X_test, y_train, y_test = split

        fitted = train_model(X_train, y_train, hyperparameter_tuning=True)
        scores = evaluate_model(fitted, X_test, y_test)

        return fitted, scores

    except Exception as e:
        logging.error(f"Error in pipeline execution: {str(e)}")
        raise

# Script entry point: run the full pipeline on the default dataset file and
# keep the fitted model and its metrics in module-level names.
if __name__ == "__main__":
    filepath = "Titanic-Dataset.csv"
    model, metrics = main(filepath)