In [1]:
# ==========================================
# Train & Select Best Model from Processed Data
# ==========================================
import os
import json
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, RocCurveDisplay
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
import joblib


In [2]:
# -------- Paths --------
PROJECT_ROOT = r"C:\Projects\fitness_prediction"
PROCESSED_DIR = os.path.join(PROJECT_ROOT, "data", "processed")
MODELS_DIR = os.path.join(PROJECT_ROOT, "models")
os.makedirs(MODELS_DIR, exist_ok=True)

In [3]:
# -------- Load processed splits --------
def _load_xy(processed_dir: str):
    """
    Expects files saved as either:
      - X_train.csv, X_test.csv, y_train.csv, y_test.csv  (with y files having one column: target)
    or:
      - train.csv, test.csv (with a 'target' column inside)
    """
    x_train_path = os.path.join(processed_dir, "X_train.csv")
    x_test_path  = os.path.join(processed_dir, "X_test.csv")
    y_train_path = os.path.join(processed_dir, "y_train.csv")
    y_test_path  = os.path.join(processed_dir, "y_test.csv")

    train_path = os.path.join(processed_dir, "train.csv")
    test_path  = os.path.join(processed_dir, "test.csv")

    if all(os.path.exists(p) for p in [x_train_path, x_test_path, y_train_path, y_test_path]):
        X_train = pd.read_csv(x_train_path)
        X_test  = pd.read_csv(x_test_path)
        y_train = pd.read_csv(y_train_path).squeeze()
        y_test  = pd.read_csv(y_test_path).squeeze()
        # Standardize target column name
        y_train.name = "is_fit" if y_train.name is None else y_train.name
        y_test.name  = "is_fit" if y_test.name is None else y_test.name
    elif all(os.path.exists(p) for p in [train_path, test_path]):
        train_df = pd.read_csv(train_path)
        test_df  = pd.read_csv(test_path)
        target_col = "target" if "target" in train_df.columns else "is_fit"
        if target_col not in train_df.columns:
            raise ValueError(f"Could not find target column ('target' or 'is_fit') in {train_path}")
        X_train = train_df.drop(columns=[target_col])
        y_train = train_df[target_col]
        X_test  = test_df.drop(columns=[target_col])
        y_test  = test_df[target_col]
    else:
        raise FileNotFoundError(
            f"Could not find expected processed files in {processed_dir}.\n"
            f"Looked for (X_train/X_test/y_train/y_test) OR (train/test)."
        )

    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = _load_xy(PROCESSED_DIR)
print("Loaded shapes ->",
      f"X_train: {X_train.shape}, X_test: {X_test.shape},",
      f"y_train: {y_train.shape}, y_test: {y_test.shape}")

Loaded shapes -> X_train: (1583, 11), X_test: (396, 11), y_train: (1583,), y_test: (396,)


In [7]:
# ------------------------------
# Load train and test data
# ------------------------------
processed_dir = r"C:\Projects\fitness_prediction\data\processed"

train_path = os.path.join(processed_dir, "train.csv")
test_path  = os.path.join(processed_dir, "test.csv")

train_df = pd.read_csv(train_path)
test_df  = pd.read_csv(test_path)

# Separate features and target
X_train = train_df.drop(columns=["target"])
y_train = train_df["target"]

X_test = test_df.drop(columns=["target"])
y_test = test_df["target"]


In [9]:
# ------------------------------
# Define candidate models
# ------------------------------
models = {
    "Logistic Regression": LogisticRegression(max_iter=500, solver="lbfgs"),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42)
}

# If you have xgboost or lightgbm installed, uncomment these:
try:
    from xgboost import XGBClassifier
    models["XGBoost"] = XGBClassifier(
        n_estimators=400, max_depth=5, learning_rate=0.05,
        subsample=0.9, colsample_bytree=0.9,
        eval_metric="logloss", random_state=42, n_jobs=-1
    )
except ImportError:
    print("XGBoost not installed, skipping...")

try:
    from lightgbm import LGBMClassifier
    models["LightGBM"] = LGBMClassifier(
        n_estimators=400, max_depth=-1, learning_rate=0.05,
        subsample=0.9, colsample_bytree=0.9,
        random_state=42, n_jobs=-1
    )
except ImportError:
    print("LightGBM not installed, skipping...")

In [11]:
# ------------------------------
# Train & Evaluate
# ------------------------------
from sklearn.metrics import accuracy_score, classification_report
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    acc = accuracy_score(y_test, preds)
    results[name] = acc

    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, preds))


Training Logistic Regression...
Accuracy: 0.7601
              precision    recall  f1-score   support

           0       0.80      0.80      0.80       237
           1       0.70      0.70      0.70       159

    accuracy                           0.76       396
   macro avg       0.75      0.75      0.75       396
weighted avg       0.76      0.76      0.76       396


Training Random Forest...
Accuracy: 0.7576
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       237
           1       0.70      0.69      0.69       159

    accuracy                           0.76       396
   macro avg       0.75      0.75      0.75       396
weighted avg       0.76      0.76      0.76       396


Training XGBoost...
Accuracy: 0.7500
              precision    recall  f1-score   support

           0       0.81      0.76      0.78       237
           1       0.67      0.74      0.70       159

    accuracy                           0.75       

In [12]:
# ------------------------------
# Select best model
# ------------------------------
best_model = max(results, key=results.get)
print("\nBest Model:", best_model, "with accuracy:", results[best_model])


Best Model: Logistic Regression with accuracy: 0.76010101010101


In [None]:
import os
import joblib

# Find the best model based on accuracy
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]

print(f"Best Model: {best_model_name} with Accuracy: {results[best_model_name]:.4f}")

# Define models directory
models_dir = r"C:\Projects\fitness_prediction\models"

# Create directory if it doesn't exist
os.makedirs(models_dir, exist_ok=True)

# Save best model
model_path = os.path.join(models_dir, f"best_model_{best_model_name}.pkl")
joblib.dump(best_model, model_path)

print(f"✅ Best model saved at: {model_path}")

import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

model = LogisticRegression()
model.fit(X_scaled, y_train)

# Save both model and scaler
joblib.dump(model, "../models/best_model_Logistic Regression.pkl")
joblib.dump(scaler, "models/scaler.pkl")



Best Model: Logistic Regression with Accuracy: 0.7601
✅ Best model saved at: C:\Projects\fitness_prediction\models\best_model_Logistic Regression.pkl


FileNotFoundError: [Errno 2] No such file or directory: 'models/best_model_Logistic Regression.pkl'