In [5]:
import argparse
import json
import os
from typing import Tuple, Optional
from dataclasses import dataclass

In [3]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import(
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
@dataclass
class TrainConfig:
    """Settings that drive data loading, model construction and the train/test split.

    The fields ``lables_path`` and ``n_estimator`` keep their original
    (misspelled) names so existing keyword arguments and attribute reads keep
    working. The correctly spelled ``labels_path`` and ``n_estimators`` are
    exposed as read/write aliases of those fields.
    """

    # Path to the features CSV.
    features_path: str = ""
    # Path to the labels CSV (legacy spelling; see the ``labels_path`` alias).
    lables_path: str = ""
    # Output directory name.
    output_dir: str = "artifacts"
    # Model selector; "logreg" and "rf" are the values handled by build_model_pipeline.
    model_type: str = "rf"
    # Presumably the held-out fraction for train_test_split — confirm against the caller.
    test_size: float = 0.2
    # Seed forwarded to the estimators for reproducibility.
    random_state: int = 42
    # Iteration cap used by the logistic-regression model.
    max_iter: int = 1000
    # Number of trees for the random forest (legacy spelling; see ``n_estimators``).
    n_estimator: int = 300
    # Parallelism passed to the estimators.
    n_jobs: int = -1

    @property
    def labels_path(self) -> str:
        """Correctly spelled alias of ``lables_path``."""
        return self.lables_path

    @labels_path.setter
    def labels_path(self, value: str) -> None:
        self.lables_path = value

    @property
    def n_estimators(self) -> int:
        """Alias of ``n_estimator`` matching scikit-learn's parameter name."""
        return self.n_estimator

    @n_estimators.setter
    def n_estimators(self, value: int) -> None:
        self.n_estimator = value

In [8]:
def load_features_and_label(
    features_path: str, labels_path: str
) -> Tuple[pd.DataFrame, pd.Series]:
    """Load features and the ``Exited`` label, aligned row-for-row on their index.

    Both CSV files are read with their first column as the index.

    Args:
        features_path: Path to the features CSV.
        labels_path: Path to the labels CSV; must contain an ``Exited`` column.

    Returns:
        ``(features, labels)`` where ``features`` holds the feature rows
        selected and ordered by the labels' index, and ``labels`` is the
        ``Exited`` column cast to ``int``.

    Raises:
        KeyError: If the labels file has no ``Exited`` column.
        ValueError: If any label index is absent from the features index.
    """
    # index_col=0 turns the first (often unnamed) column into the index.
    features_df = pd.read_csv(features_path, index_col=0)
    labels_df = pd.read_csv(labels_path, index_col=0)

    # Extract the target first: the index check below needs `labels` to exist.
    labels = labels_df["Exited"].astype(int)

    # Fail early with a clear message instead of a KeyError from .loc below.
    missing_in_features = labels.index.difference(features_df.index)
    if len(missing_in_features) > 0:
        raise ValueError(
            f"Labels contain indices not present in features: {len(missing_in_features)} missing"
        )

    # Select and reorder feature rows so that row i matches label i.
    aligned_features_df = features_df.loc[labels.index]

    return aligned_features_df, labels

In [9]:
def build_model_pipeline(config: TrainConfig):
    """Build the scikit-learn ``Pipeline`` selected by ``config.model_type``.

    Args:
        config: Training configuration. Reads ``model_type``, ``random_state``
            and ``n_jobs``, plus ``max_iter`` for ``"logreg"`` or
            ``n_estimator`` for ``"rf"``.

    Returns:
        A ``Pipeline`` whose final step is named ``"model"``. For ``"logreg"``
        a ``StandardScaler`` step named ``"scaler"`` precedes the model; the
        random forest is used without scaling.

    Raises:
        ValueError: If ``config.model_type`` is neither ``"logreg"`` nor ``"rf"``.
    """
    if config.model_type == "logreg":
        model = LogisticRegression(
            max_iter=config.max_iter,
            class_weight="balanced",
            random_state=config.random_state,
            n_jobs=config.n_jobs,
        )
        return Pipeline(steps=[("scaler", StandardScaler()), ("model", model)])

    if config.model_type == "rf":
        model = RandomForestClassifier(
            n_estimators=config.n_estimator,
            random_state=config.random_state,
            n_jobs=config.n_jobs,
            class_weight="balanced_subsample",
        )
        return Pipeline(steps=[("model", model)])

    # Name the rejected value and the accepted ones so a typo is easy to spot.
    raise ValueError(
        f"Unsupported model_type: {config.model_type!r} (expected 'logreg' or 'rf')"
    )

In [11]:
def evaluate_model(y_true, y_pred, y_prob):
    """Compute binary-classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Hard class predictions.
        y_prob: Scores or probabilities for the positive class. A two-column
            array (as returned by ``predict_proba`` on a binary problem) is
            also accepted; its second column is used. If ``None``, ROC-AUC
            falls back to ``y_pred``, which yields a single-threshold AUC.

    Returns:
        A dict with float entries ``"accuracy"``, ``"precision"``, ``"recall"``,
        ``"F1_score"`` and ``"ROC-AUC_score"``, plus ``"confusion_matrix"``
        (nested list) and ``"classification_report"`` (formatted text).
    """
    # ROC-AUC ranks samples by a continuous score; feeding it hard 0/1
    # predictions collapses the curve to a single operating point.
    if y_prob is None:
        scores = y_pred
    else:
        scores = np.asarray(y_prob)
        if scores.ndim == 2 and scores.shape[1] == 2:
            # Keep only the positive-class column of a predict_proba output.
            scores = scores[:, 1]

    metrics: dict = {}
    metrics["accuracy"] = float(accuracy_score(y_true, y_pred))
    metrics["precision"] = float(precision_score(y_true, y_pred))
    metrics["recall"] = float(recall_score(y_true, y_pred))
    metrics["F1_score"] = float(f1_score(y_true, y_pred))
    metrics["ROC-AUC_score"] = float(roc_auc_score(y_true, scores))

    cm = confusion_matrix(y_true, y_pred)
    metrics["confusion_matrix"] = cm.tolist()  # plain lists instead of an ndarray
    metrics["classification_report"] = classification_report(y_true, y_pred, digits=4)

    return metrics