In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score

# 1) load data
df = pd.read_csv(r"C:\Users\kevin\Downloads\Heart_Disease_Prediction.csv")

# 2) try to guess the target column (binary outcome)
# Substrings that suggest a column holds the outcome label.
name_hints = ["heart", "disease", "target", "risk", "label", "output"]

# Candidate labels: every column with exactly two distinct non-missing values.
binary_cols = [col for col in df.columns if df[col].nunique(dropna=True) == 2]

# Prefer the first binary column whose (lower-cased) name contains a hint;
# `None` means no name matched.
target = next(
    (col for col in binary_cols if any(hint in col.lower() for hint in name_hints)),
    None,
)

# No name match: fall back to the last binary column (usually the label).
if target is None and binary_cols:
    target = binary_cols[-1]

# Still nothing: stop and ask the user to choose the column explicitly.
if target is None:
    print("I couldn't auto-detect the target column.")
    print("Here are your binary columns (good candidates):", binary_cols)
    raise ValueError("Set target manually like: target = 'YOUR_COLUMN_NAME'")

print("Using target column:", target)

# 3) clean target into 0/1
# `y` ends up as an integer 0/1 Series and `X` holds every other column.
# Rows whose target is missing are removed from both, because a row without
# a label cannot be used for a stratified split or for fitting.
raw_target = df[target]
X = df.drop(columns=[target]).copy()

# Text labels with an unambiguous meaning (compared after lower-casing and
# stripping whitespace). Anything not listed here goes through the fallback.
label_map = {
    "yes": 1, "no": 0,
    "true": 1, "false": 0,
    "1": 1, "0": 0,
    "y": 1, "n": 0,
    "positive": 1, "negative": 0,
    "presence": 1, "absence": 0,
    "present": 1, "absent": 0,
}

has_label = raw_target.notna()

if pd.api.types.is_numeric_dtype(raw_target):
    # Numeric (or boolean) targets are used as-is when they are already 0/1.
    y = raw_target.copy()
else:
    # Covers both object and pandas string dtypes.
    y = raw_target.astype(str).str.strip().str.lower().map(label_map)

labelled = y[has_label]
is_clean = (not labelled.isna().any()) and set(labelled.unique()) <= {0, 1}

# if still messy, fallback: treat the "bigger" value as 1
# The two values are sorted so the encoding never depends on which label
# happens to appear first in the file (first-appearance order could silently
# make the positive class 0).
if not is_clean:
    levels = list(raw_target[has_label].unique())
    if len(levels) != 2:
        raise ValueError("Target isn't cleanly binary. Check the values in your target column.")
    try:
        low, high = sorted(levels)
    except TypeError:
        # Mixed, non-comparable types: order by their string form instead.
        low, high = sorted(levels, key=str)
    y = raw_target.map({low: 0, high: 1})

# Drop unlabeled rows from both X and y so they stay aligned.
if not has_label.all():
    X = X.loc[has_label]
    y = y.loc[has_label]

y = y.astype(int)

# 4) super simple preprocessing:
# One-hot encode the categorical columns (dropping the first level of each),
# then replace every remaining missing value with 0. Crude, but simple and
# stable enough for a mini project.
X = pd.get_dummies(X, drop_first=True).fillna(0)

# 5) train/test split
# 80/20 split with a fixed seed, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 6) train a simple logistic regression risk model
model = LogisticRegression(max_iter=2000).fit(X_train, y_train)

# Second column of predict_proba, thresholded at 0.5 for the hard 0/1 call.
proba = model.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

print("\nModel results:")
accuracy = accuracy_score(y_test, pred)
print("Accuracy:", round(accuracy, 4))
auc = roc_auc_score(y_test, proba)
print("ROC-AUC:", round(auc, 4))

# 7) actuarial-ish part: turn probabilities into risk tiers + premium relativities
# One row per test-set prediction (fresh 0..n-1 index, not the X_test index).
test_out = pd.DataFrame({"prob_heart_disease": proba})

# cut into 3 risk groups (bottom/middle/top third)
test_out["risk_tier"] = pd.qcut(
    test_out["prob_heart_disease"],
    q=3,
    labels=["Low", "Medium", "High"],
)

# premium relativity: base = Low tier average risk
# `risk_tier` is categorical, so `observed` is passed explicitly: relying on
# the default raises a pandas FutureWarning. `observed=False` keeps the
# current behaviour (every tier appears in the result).
tier_means = test_out.groupby("risk_tier", observed=False)["prob_heart_disease"].mean()
base = tier_means["Low"]
relativity = (tier_means / base).round(2)

# One row per tier: mean predicted probability and its ratio to the Low tier.
summary = pd.DataFrame({
    "avg_predicted_risk": tier_means.round(4),
    "premium_relativity_(vs_Low)": relativity
})

print("\nRisk tier pricing table (toy example):")
print(summary)


Using target column: Heart Disease

Model results:
Accuracy: 0.7593
ROC-AUC: 0.8264

Risk tier pricing table (toy example):
           avg_predicted_risk  premium_relativity_(vs_Low)
risk_tier                                                 
Low                    0.0669                         1.00
Medium                 0.5831                         8.71
High                   0.9559                        14.28


  tier_means = test_out.groupby("risk_tier")["prob_heart_disease"].mean()
