In [3]:
import os
import sys
from pathlib import Path

print("Current working directory:")
print(os.getcwd())

cwd = Path.cwd()
print("\nParents of CWD:")
for i, p in enumerate(cwd.parents):
    print(f"{i}: {p}")

print("\nInitial sys.path (first 5):")
for p in sys.path[:5]:
    print(p)
# Resolve project root: wfa_xgb_cvd_prediction

Current working directory:
c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction

Parents of CWD:
0: c:\Users\dhanu\OneDrive\Desktop\CD_Main
1: c:\Users\dhanu\OneDrive\Desktop
2: c:\Users\dhanu\OneDrive
3: c:\Users\dhanu
4: c:\Users
5: c:\

Initial sys.path (first 5):
C:\Users\dhanu\AppData\Local\Programs\Python\Python311\python311.zip
C:\Users\dhanu\AppData\Local\Programs\Python\Python311\DLLs
C:\Users\dhanu\AppData\Local\Programs\Python\Python311\Lib
C:\Users\dhanu\AppData\Local\Programs\Python\Python311
c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction\wfa_xgb_env


In [4]:
# ---- Project path fix (DO NOT SKIP) ----
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parents[0]  # ✅ VERIFIED CORRECT

if not (PROJECT_ROOT / "src").exists():
    raise RuntimeError(f"'src' not found at {PROJECT_ROOT}")

sys.path.insert(0, str(PROJECT_ROOT))

print("✅ Project root set to:", PROJECT_ROOT)

RuntimeError: 'src' not found at c:\Users\dhanu\OneDrive\Desktop\CD_Main

In [None]:
from src.config.paths import (
    HEART_VERIFIED_CSV,
    BASELINE_RESULTS_CSV,
    WFA_FEATURE_WEIGHTS_CSV,
    FEATURE_AUGMENTED_WEIGHTS_CSV,
    BASELINE_MODEL_PKL,
    WFA_XGB_MODEL_JSON
)


In [None]:
import numpy as np
import pandas as pd

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

from xgboost import XGBClassifier

In [None]:
from src.data.load_data import load_dataset
from src.data.split_data import split_data

X, y = load_dataset(
    PROJECT_ROOT / "data" / "processed" / "heart_Verified.csv",
    target_col="target"
)

X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)

print("✅ Data loaded")
print(X_train.shape, X_test.shape)
# Compute sample weights based on predicted probabilities

✅ Data loaded
(1238, 11) (272, 11)


In [None]:
WFA_WEIGHTS_PATH = PROJECT_ROOT / "notebooks" / "experiments" / "wfa_feature_weights.csv"

if not WFA_WEIGHTS_PATH.exists():
    raise FileNotFoundError(f"WFA weights not found at {WFA_WEIGHTS_PATH}")

feature_weights = pd.read_csv(
    WFA_WEIGHTS_PATH,
    index_col=0
).squeeze()

# align order
feature_weights = feature_weights.loc[X_train.columns]

print("✅ Feature weights loaded:", feature_weights.shape)


✅ Feature weights loaded: (11,)


In [None]:
wfa_xgb_safe = XGBClassifier(
    n_estimators=600,
    max_depth=5,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="logloss",
    random_state=42
)

# Apply feature weighting consistently
X_train_w = X_train * feature_weights
X_test_w  = X_test  * feature_weights

wfa_xgb_safe.fit(X_train_w, y_train)

print("✅ WFA-XGB (safe) trained")
# Predict probabilities

✅ WFA-XGB (safe) trained


In [None]:
y_prob = wfa_xgb_safe.predict_proba(X_test_w)[:, 1]

print("Probability range:", y_prob.min(), "→", y_prob.max())
print("Unique probs:", len(np.unique(np.round(y_prob, 4))))
# Compute sample weights based on predicted probabilities

Probability range: 0.0036155272 → 0.990346
Unique probs: 267


In [None]:
y_pred = (y_prob >= 0.5).astype(int)

wfa_results = {
    "model": "WFA-XGB (safe)",
    "threshold": 0.5,
    "accuracy": accuracy_score(y_test, y_pred),
    "precision": precision_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred),
    "f1_score": f1_score(y_test, y_pred),
    "roc_auc": roc_auc_score(y_test, y_prob)
}

wfa_df = pd.DataFrame([wfa_results])
wfa_df
# Compute sample weights based on predicted probabilities

Unnamed: 0,model,threshold,accuracy,precision,recall,f1_score,roc_auc
0,WFA-XGB (safe),0.5,0.8125,0.839695,0.785714,0.811808,0.887554


In [None]:
thresholds = np.linspace(0.1, 0.9, 17)
records = []

for t in thresholds:
    y_hat = (y_prob >= t).astype(int)

    records.append({
        "threshold": t,
        "accuracy": accuracy_score(y_test, y_hat),
        "precision": precision_score(y_test, y_hat, zero_division=0),
        "recall": recall_score(y_test, y_hat),
        "f1_score": f1_score(y_test, y_hat),
        "roc_auc": roc_auc_score(y_test, y_prob)  # invariant to threshold
    })

threshold_df = pd.DataFrame(records)
threshold_df.sort_values("f1_score", ascending=False).head(5)

Unnamed: 0,threshold,accuracy,precision,recall,f1_score,roc_auc
4,0.3,0.816176,0.788462,0.878571,0.831081,0.887554
5,0.35,0.816176,0.796053,0.864286,0.828767,0.887554
6,0.4,0.823529,0.828571,0.828571,0.828571,0.887554
7,0.45,0.816176,0.830882,0.807143,0.818841,0.887554
8,0.5,0.8125,0.839695,0.785714,0.811808,0.887554


In [None]:
best = threshold_df.sort_values("f1_score", ascending=False).iloc[0]

optimized_results = {
    "model": "WFA-XGB (threshold optimized)",
    **best.to_dict()
}

optimized_df = pd.DataFrame([optimized_results])
optimized_df

Unnamed: 0,model,threshold,accuracy,precision,recall,f1_score,roc_auc
0,WFA-XGB (threshold optimized),0.3,0.816176,0.788462,0.878571,0.831081,0.887554


In [None]:
OUTPUT_DIR = PROJECT_ROOT / "notebooks" / "experiments"
OUTPUT_DIR.mkdir(exist_ok=True)

eval_path = OUTPUT_DIR / "wfa_evaluation_results.csv"
optimized_df.to_csv(eval_path, index=False)

print("✅ Evaluation saved to:", eval_path)


✅ Evaluation saved to: c:\Users\dhanu\OneDrive\Desktop\CD_Main\wfa_xgb_cvd_prediction\notebooks\experiments\wfa_evaluation_results.csv
