Aude Sustronck

---



---

# Convert your code to production level

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

## 1. Refactoring my code

Data Loading and Preprocessing:

In [None]:
def load_and_preprocess_data(filepath: str) -> list:
    """Load the Titanic dataset, clean it, and split it into train/test sets.

    Preprocessing steps, in order:
      * missing ``Age`` values are replaced with the column median;
      * missing ``Embarked`` values are replaced with the most frequent value;
      * the ``Cabin`` column is dropped;
      * ``Sex`` is mapped to integers (``male`` -> 0, ``female`` -> 1);
      * ``Embarked`` is one-hot encoded with the first level dropped.

    Args:
        filepath: Path (or any source accepted by ``pandas.read_csv``) of a
            CSV containing at least the columns ``Survived``, ``Name``,
            ``Ticket``, ``PassengerId``, ``Age``, ``Embarked``, ``Cabin``
            and ``Sex``.

    Returns:
        The output of ``train_test_split`` with ``test_size=0.2`` and
        ``random_state=42``: ``[X_train, X_test, y_train, y_test]``.

    Raises:
        KeyError: If one of the expected columns is missing from the CSV.
    """
    df = pd.read_csv(filepath)

    # Handle missing values.
    # The filled columns are assigned back rather than using
    # ``df[col].fillna(..., inplace=True)``: that chained in-place form acts
    # on an intermediate Series, is deprecated in pandas 2.x, and leaves
    # ``df`` unchanged once Copy-on-Write is enabled.
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df = df.drop(columns='Cabin')

    # Encode categorical variables
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

    # Feature and target separation: identifiers and free-text columns are
    # excluded from the feature matrix.
    X = df.drop(columns=['Survived', 'Name', 'Ticket', 'PassengerId'])
    y = df['Survived']

    return train_test_split(X, y, test_size=0.2, random_state=42)

Model Training:

In [None]:
def train_model(X_train, y_train):
    """Fit a logistic regression and a grid-searched random forest.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        A ``(logistic_model, best_forest)`` tuple: the fitted
        ``LogisticRegression`` and the ``best_estimator_`` selected by a
        5-fold, accuracy-scored ``GridSearchCV`` over the random forest grid.
    """
    # Baseline linear model; ``fit`` returns the estimator itself.
    logistic_model = LogisticRegression(max_iter=1000, random_state=42).fit(
        X_train, y_train
    )

    # Candidate random forest hyperparameters explored by the grid search.
    forest_grid = dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    )
    forest_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        forest_grid,
        scoring='accuracy',
        cv=5,
    )
    forest_search.fit(X_train, y_train)

    return logistic_model, forest_search.best_estimator_

Predictions and Evaluation:

In [None]:
def predict_and_evaluate(models, X_test, y_test):
    """Print evaluation metrics for each model on the test set.

    For every entry in ``models`` this prints, in order: a header line,
    accuracy, precision, recall, the confusion matrix, the classification
    report, and a 50-character dashed separator.

    Args:
        models: Mapping of display name -> fitted estimator exposing
            ``predict``.
        X_test: Test feature matrix passed to each estimator's ``predict``.
        y_test: True target values the predictions are compared against.

    Returns:
        None. Results are written to standard output only.
    """
    divider = "-" * 50
    for name, estimator in models.items():
        predicted = estimator.predict(X_test)
        print(f"{name} Metrics:")
        # Each metric is computed and printed one after another so the output
        # order matches the labels below.
        for label, scorer in (
            ("Accuracy:", accuracy_score),
            ("Precision:", precision_score),
            ("Recall:", recall_score),
            ("Confusion Matrix:\n", confusion_matrix),
        ):
            print(label, scorer(y_test, predicted))
        print(classification_report(y_test, predicted))
        print(divider)