In [1]:
import os
import sys
from pathlib import Path

# Diagnostic cell: report where the kernel is running, list every ancestor of
# that directory, and show the head of the import search path. This is the
# information needed to resolve the project root (wfa_xgb_cvd_prediction).
print("Current working directory:")
print(os.getcwd())

cwd = Path.cwd()

print("\nParents of CWD:")
for depth, ancestor in enumerate(cwd.parents):
    # `depth` is the index usable with `cwd.parents[depth]`.
    print(f"{depth}: {ancestor}")

print("\nInitial sys.path (first 5):")
for search_entry in sys.path[:5]:
    print(search_entry)

Current working directory:
c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction\notebooks

Parents of CWD:
0: c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction
1: c:\Users\dhanu\OneDrive\Desktop\CD_Main
2: c:\Users\dhanu\OneDrive\Desktop
3: c:\Users\dhanu\OneDrive
4: c:\Users\dhanu
5: c:\Users
6: c:\

Initial sys.path (first 5):
C:\Users\dhanu\AppData\Local\Programs\Python\Python311\python311.zip
C:\Users\dhanu\AppData\Local\Programs\Python\Python311\DLLs
C:\Users\dhanu\AppData\Local\Programs\Python\Python311\Lib
C:\Users\dhanu\AppData\Local\Programs\Python\Python311
c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction\wfa_xgb_env


In [2]:
from pathlib import Path

# Report, for every ancestor of the working directory, whether it contains an
# entry named `src` — the marker used to recognise the project root.
cwd = Path.cwd()
print("CWD:", cwd)

print("\nParents:")
for level, parent_dir in enumerate(cwd.parents):
    src_present = parent_dir.joinpath("src").exists()
    print(f"{level}: {parent_dir}  | has src? -> {src_present}")


CWD: c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction\notebooks

Parents:
0: c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction  | has src? -> True
1: c:\Users\dhanu\OneDrive\Desktop\CD_Main  | has src? -> False
2: c:\Users\dhanu\OneDrive\Desktop  | has src? -> False
3: c:\Users\dhanu\OneDrive  | has src? -> False
4: c:\Users\dhanu  | has src? -> False
5: c:\Users  | has src? -> False
6: c:\  | has src? -> False


In [3]:
# ---- Project path fix (DO NOT SKIP) ----
# Locate the project root (the directory that contains `src/`) and make sure it
# is the first entry on sys.path so that `from src...` imports resolve.
import sys
from pathlib import Path

_start_dir = Path.cwd()

# Walk upwards from the working directory rather than assuming the kernel was
# started exactly one level below the root: the nearest directory that holds a
# `src/` folder wins. When the kernel runs from <root>/notebooks this yields
# the same result as `Path.cwd().parents[0]`.
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (_start_dir, *_start_dir.parents)
        if (candidate / "src").is_dir()
    ),
    None,
)

if PROJECT_ROOT is None:
    raise RuntimeError(f"'src' not found at {_start_dir} or any of its parents")

# Keep the project root first on sys.path, but drop any copies that earlier
# runs of this cell already inserted so re-execution stays idempotent.
_root_entry = str(PROJECT_ROOT)
sys.path[:] = [_root_entry, *(entry for entry in sys.path if entry != _root_entry)]

print("✅ Project root set to:", PROJECT_ROOT)


✅ Project root set to: c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction


In [4]:
from src.config.paths import (
    HEART_VERIFIED_CSV,
    BASELINE_RESULTS_CSV,
    WFA_FEATURE_WEIGHTS_CSV,
    FEATURE_AUGMENTED_WEIGHTS_CSV,
    BASELINE_MODEL_PKL,
    WFA_XGB_MODEL_JSON
)

In [5]:
# ============================================================
# Notebook: 02_baseline_models.ipynb
# Objective: Train and evaluate baseline ML models
# Project: WFA-XGB for Cardiovascular Disease Prediction
# ============================================================

from src.data.load_data import load_dataset

# The processed CSV is addressed relative to the project root resolved earlier.
DATA_PATH = PROJECT_ROOT.joinpath("data", "processed", "heart_Verified.csv")

# The loader is handed the path as a plain string and the name of the label
# column; its two return values are kept as X and y.
X, y = load_dataset(path=str(DATA_PATH), target_col="target")

# Display both shapes as a quick sanity check.
(X.shape, y.shape)



((1548, 11), (1548,))

In [6]:
from src.data.split_data import split_data

# Three-way split through the project helper; the size fractions and the
# stratify flag are forwarded unchanged.
# NOTE(review): no seed is passed here — confirm split_data fixes one
# internally, otherwise the partition is not reproducible between runs.
# NOTE(review): X_val / y_val are not consumed by any cell shown in this
# notebook — confirm whether they are needed.
X_train, X_val, X_test, y_train, y_val, y_test = split_data(
    X, y, test_size=0.2, val_size=0.1, stratify=True
)

# Display the shape of each feature partition.
tuple(partition.shape for partition in (X_train, X_val, X_test))

((1238, 11), (38, 11), (272, 11))

In [7]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Reference configuration for the baseline model, collected in one place so
# the settings can be read at a glance. random_state is fixed at 42.
baseline_params = {
    "n_estimators": 300,
    "max_depth": 4,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "logloss",
    "random_state": 42,
}

baseline_xgb = XGBClassifier(**baseline_params)
baseline_xgb.fit(X_train, y_train)

# Score the fitted model on the held-out test partition.
y_pred = baseline_xgb.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred)

baseline_accuracy

0.7977941176470589

In [8]:
import pandas as pd
import os

# Folder that collects the reference metric files; created if missing.
experiments_dir = PROJECT_ROOT / "notebooks" / "experiments"
os.makedirs(experiments_dir, exist_ok=True)

# One-row table holding the baseline model's accuracy.
baseline_df = pd.DataFrame(
    {
        "model": ["XGBoost-Baseline"],
        "accuracy": [baseline_accuracy],
    }
)

# Written without the index column.
baseline_df.to_csv(experiments_dir / "baseline_reference.csv", index=False)

baseline_df


Unnamed: 0,model,accuracy
0,XGBoost-Baseline,0.797794


In [9]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Hand-tuned variant of the baseline configuration. Author's rationale for
# each setting is kept next to the value.
# NOTE(review): this model is scored on the same X_test as the baseline; if
# these values were chosen by watching test accuracy, the reported number is
# optimistic — confirm how the settings were selected.
tuned_params = {
    "n_estimators": 600,       # more trees = finer decision boundary
    "max_depth": 5,            # allow slightly more interaction
    "learning_rate": 0.03,     # lower, safer learning
    "subsample": 0.9,          # raised; intended to reduce variance
    "colsample_bytree": 0.9,   # raised; intended to preserve signal
    "min_child_weight": 1,     # default, stable
    "gamma": 0,                # no pruning yet
    "reg_lambda": 1.0,         # L2 regularization
    "eval_metric": "logloss",
    "random_state": 42,
}

xgb_tuned = XGBClassifier(**tuned_params)
xgb_tuned.fit(X_train, y_train)

# Score the tuned model on the held-out test partition.
y_pred = xgb_tuned.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred)

tuned_accuracy


0.8161764705882353

In [10]:
import pandas as pd

# Persist the tuned model's accuracy as a one-row CSV in the experiments folder.
# The folder is created here as well, so this cell does not silently depend on
# an earlier cell having already made it (to_csv does not create directories).
tuned_out_dir = PROJECT_ROOT / "notebooks" / "experiments"
tuned_out_dir.mkdir(parents=True, exist_ok=True)

tuned_df = pd.DataFrame([{
    "model": "XGBoost-Tuned",
    "accuracy": tuned_accuracy
}])

# Written without the index column.
tuned_df.to_csv(tuned_out_dir / "xgb_tuned_reference.csv", index=False)

tuned_df

Unnamed: 0,model,accuracy
0,XGBoost-Tuned,0.816176


In [11]:
# import os

# os.makedirs("experiments", exist_ok=True)

# import pandas as pd

# baseline_df = pd.DataFrame(results).T
# baseline_df
# baseline_df.to_csv(BASELINE_RESULTS_CSV)




In [12]:
# import os

# os.makedirs("models/baselines", exist_ok=True)

In [13]:
# baseline.save(BASELINE_MODEL_PKL)

In [14]:
# import joblib

# models = joblib.load("models/baselines/baseline_models.pkl")
# models.keys()