In [1]:
import os
import sys
from pathlib import Path

# --- Environment diagnostics -------------------------------------------
# Show where the kernel was started from.
print("Current working directory:")
print(os.getcwd())

# Enumerate every ancestor of the working directory, nearest first, so the
# index of the project root can be read off the output.
cwd = Path.cwd()
print("\nParents of CWD:")
for depth, ancestor in enumerate(cwd.parents):
    print(f"{depth}: {ancestor}")

# Only the head of the import search path is shown to keep the output short.
print("\nInitial sys.path (first 5):")
leading_entries = sys.path[:5]
for entry in leading_entries:
    print(entry)
# Next step: resolve the project root (wfa_xgb_cvd_prediction).

Current working directory:
c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction\notebooks

Parents of CWD:
0: c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction
1: c:\Users\dhanu\OneDrive\Desktop\CD_Main
2: c:\Users\dhanu\OneDrive\Desktop
3: c:\Users\dhanu\OneDrive
4: c:\Users\dhanu
5: c:\Users
6: c:\

Initial sys.path (first 5):
C:\Users\dhanu\AppData\Local\Programs\Python\Python311\python311.zip
C:\Users\dhanu\AppData\Local\Programs\Python\Python311\DLLs
C:\Users\dhanu\AppData\Local\Programs\Python\Python311\Lib
C:\Users\dhanu\AppData\Local\Programs\Python\Python311
c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction\wfa_xgb_env


In [2]:
from pathlib import Path

# Probe each ancestor of the working directory for a `src` folder; the
# ancestor that has one is the candidate project root.
cwd = Path.cwd()
print("CWD:", cwd)

print("\nParents:")
for depth, ancestor in enumerate(cwd.parents):
    has_src = (ancestor / "src").exists()
    print(f"{depth}: {ancestor}  | has src? -> {has_src}")


CWD: c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction\notebooks

Parents:
0: c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction  | has src? -> True
1: c:\Users\dhanu\OneDrive\Desktop\CD_Main  | has src? -> False
2: c:\Users\dhanu\OneDrive\Desktop  | has src? -> False
3: c:\Users\dhanu\OneDrive  | has src? -> False
4: c:\Users\dhanu  | has src? -> False
5: c:\Users  | has src? -> False
6: c:\  | has src? -> False


In [3]:
# ---- Project path fix (DO NOT SKIP) ----
import sys
from pathlib import Path

# The notebook is run from <project>/notebooks, so the project root is the
# immediate parent of the working directory. `.parent` (unlike `parents[0]`)
# never raises IndexError at a filesystem root, so a wrong working directory
# always surfaces as the RuntimeError below.
PROJECT_ROOT = Path.cwd().parent

# Fail fast: without `src` next to the notebooks folder, the `src.*` imports
# in the following cells cannot resolve.
if not (PROJECT_ROOT / "src").exists():
    raise RuntimeError(f"'src' not found at {PROJECT_ROOT}")

# Put the project root at the front of sys.path exactly once. Rebuilding the
# list keeps the root at highest priority while making the cell idempotent:
# re-running it no longer stacks duplicate entries.
_root_entry = str(PROJECT_ROOT)
sys.path[:] = [_root_entry] + [entry for entry in sys.path if entry != _root_entry]

print("✅ Project root set to:", PROJECT_ROOT)


✅ Project root set to: c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction


In [4]:
from src.config.paths import (
    HEART_VERIFIED_CSV,
    BASELINE_RESULTS_CSV,
    WFA_FEATURE_WEIGHTS_CSV,
    FEATURE_AUGMENTED_WEIGHTS_CSV,
    BASELINE_MODEL_PKL,
    WFA_XGB_MODEL_JSON
)

In [5]:
# ============================================================
# Notebook: 02_baseline_models.ipynb
# Objective: Train and evaluate baseline ML models
# Project: WFA-XGB for Cardiovascular Disease Prediction
# ============================================================

import pandas as pd

from src.data.load_data import load_dataset
from src.data.split_data import split_data
from src.models.baseline_models import BaselineModels



In [6]:
import os
from pathlib import Path

# List every file under <project>/data, directory by directory, to confirm
# which datasets are available before loading anything.
PROJECT_ROOT = Path.cwd().parent
data_dir = PROJECT_ROOT / "data"

for folder, _subfolders, filenames in os.walk(data_dir):
    folder_path = Path(folder)
    for filename in filenames:
        print(folder_path / filename)

c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction\data\feature_metadata.json
c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction\data\processed\heart_disease_processed.csv
c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction\data\processed\heart_Verified.csv


In [7]:
# Preview the dataset's column names before building features/target.
# The path comes from the shared constant imported from src.config.paths
# rather than a literal relative to the working directory, so every cell
# reads the same configured file.
# NOTE(review): assumes HEART_VERIFIED_CSV refers to
# data/processed/heart_Verified.csv — confirm in src/config/paths.py.
df_preview = pd.read_csv(HEART_VERIFIED_CSV)
df_preview.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'target'],
      dtype='object')

In [8]:
from src.data.load_data import load_dataset
from src.data.split_data import split_data

# Load features/target from the configured dataset path instead of a literal
# relative to the working directory. `str(...)` keeps the argument a plain
# string, matching what this call passed before.
# NOTE(review): assumes HEART_VERIFIED_CSV refers to
# data/processed/heart_Verified.csv — confirm in src/config/paths.py.
X, y = load_dataset(
    str(HEART_VERIFIED_CSV),
    target_col="target",
)

# split_data is called with its defaults and returns train/val/test parts
# for both X and y, in that order.
# NOTE(review): the recorded run produced only 38 validation rows against
# 1238 train / 272 test — confirm the split ratios (and any random seed)
# inside split_data are what is intended.
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)

# Display the three feature-matrix shapes as the cell output.
X_train.shape, X_val.shape, X_test.shape



((1238, 11), (38, 11), (272, 11))

In [9]:
# Build the project's baseline-model wrapper with its default configuration
# and train it on the training split only (validation/test stay untouched).
# NOTE(review): no seed is passed here — presumably BaselineModels fixes its
# own random state; verify for reproducibility.
baseline = BaselineModels()
baseline.train(X_train, y_train)


In [10]:
# Evaluate the trained baselines on the test split. The result is a nested
# mapping (model name -> metric name -> value) and is shown as the cell
# output; later cells turn it into a table and write it to disk.
results = baseline.evaluate(X_test, y_test)
results


{'logistic_regression': {'accuracy': 0.7279411764705882,
  'precision': 0.7578125,
  'recall': 0.6928571428571428,
  'f1_score': 0.7238805970149254,
  'roc_auc': 0.7839285714285714},
 'random_forest': {'accuracy': 0.8125,
  'precision': 0.8296296296296296,
  'recall': 0.8,
  'f1_score': 0.8145454545454545,
  'roc_auc': 0.8922077922077922},
 'xgboost': {'accuracy': 0.8014705882352942,
  'precision': 0.8307692307692308,
  'recall': 0.7714285714285715,
  'f1_score': 0.8,
  'roc_auc': 0.8766774891774891}}

In [11]:
import os

from pathlib import Path

import pandas as pd

# Create the directory the results file is actually written to. Previously a
# CWD-relative "experiments" folder was created, which is only the right
# place if BASELINE_RESULTS_CSV happens to resolve inside it.
results_path = Path(BASELINE_RESULTS_CSV)
results_path.parent.mkdir(parents=True, exist_ok=True)

# `results` maps model -> {metric: value}; transposing yields one row per
# model and one column per metric.
baseline_df = pd.DataFrame(results).T
baseline_df.to_csv(results_path)

# Last expression so the table is rendered as the cell output.
baseline_df




In [12]:
import os

from pathlib import Path

# Ensure the folder that will receive the saved baseline models exists.
# It is derived from BASELINE_MODEL_PKL (the path the save cell writes to)
# instead of a CWD-relative "models/baselines" literal, so the directory
# created is always the one that is needed.
Path(BASELINE_MODEL_PKL).parent.mkdir(parents=True, exist_ok=True)

In [13]:
# Persist the trained baseline models to the path configured in
# src.config.paths (BASELINE_MODEL_PKL).
# NOTE(review): presumably the parent directory must already exist —
# confirm whether BaselineModels.save creates it.
baseline.save(BASELINE_MODEL_PKL)

In [14]:
import joblib

# Round-trip check: reload from the same constant the save cell wrote to, so
# this verifies the file just produced rather than whatever happens to sit at
# a CWD-relative "models/baselines/..." path.
# Security note: joblib.load unpickles and can execute arbitrary code; only
# load artifacts produced by this project.
models = joblib.load(BASELINE_MODEL_PKL)

# Show which models were stored in the artifact.
models.keys()

dict_keys(['logistic_regression', 'random_forest', 'xgboost'])