### Imports & Core Setup  
Load NumPy/Pandas, scikit-learn metrics and pipeline components, and kNN.  
This cell defines the core toolbox the rest of the notebook relies on.

In [37]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import FunctionTransformer
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix, classification_report
import numpy as np, pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix, classification_report

# Reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

### CSV I/O Helpers  
Convenience code for loading separate train/test CSVs and performing light cleaning  
if the instructor provided external files. Safe to skip if `X_train` / `y_train` are already defined.


In [38]:
# separate train/test CSVs:
df_train = pd.read_csv("NHANES_Data_P1_MI_train.csv")
df_test  = pd.read_csv("NHANES_Data_P1_MI_test.csv")

print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)


Train shape: (14470, 19)
Test shape: (230, 19)


### Leakage Guard: Drop Non-Features  
Define and drop columns the model must not see (IDs, labels, or any obvious target leakage).  
This keeps the design matrix clean and reproducible.


In [39]:
# Columns we do NOT want the model to see
DROP_COLS = ["ID", "Insurance", "Edu", "Income", "LDL"]

# Map MI to {0,1}: 1 = MI (positive), 2 = No MI (negative)
mi_mapped = df_train["MI"].map({1: 1, 2: 0})
mask = mi_mapped.notna()

y = mi_mapped[mask].astype(int).reset_index(drop=True)
X = df_train.loc[mask].drop(columns=DROP_COLS + ["MI"], errors="ignore").reset_index(drop=True)

print("X columns:", list(X.columns))
print("Class balance (train):", y.value_counts(normalize=True).to_dict())
print("Shapes:", X.shape, y.shape)


X columns: ['Sex', 'Age', 'Race', 'Systolic', 'Diastolic', 'Pulse', 'BMI', 'HDL', 'Trig', 'TCHOL', 'eGFR', 'CurrentSmoker', 'Diabetes']
Class balance (train): {0: 0.9596891342242883, 1: 0.040310865775711795}
Shapes: (13768, 13) (13768,)


### Feature Engineering: Interactions  
Create clinically-sensible interaction features (e.g., Age×BP, Age×Lipids, lipid ratios).  
These transformations can sharpen distances for kNN.


In [40]:
def add_interactions(X_arr_or_df):
    # Accept ndarray or DataFrame, return DataFrame
    if isinstance(X_arr_or_df, pd.DataFrame):
        Xdf = X_arr_or_df.copy()
    else:
        Xdf = pd.DataFrame(X_arr_or_df)

    # Guard: only add if columns exist
    if {"Age","Systolic"}.issubset(Xdf.columns):
        Xdf["age_x_Systolic"] = Xdf["Age"] * Xdf["Systolic"]
    if {"Age","TCHOL"}.issubset(Xdf.columns):
        Xdf["age_x_TCHOL"] = Xdf["Age"] * Xdf["TCHOL"]
    if {"Age","HDL"}.issubset(Xdf.columns):
        Xdf["age_x_HDL"] = Xdf["Age"] * Xdf["HDL"]
    if "Age" in Xdf.columns and "CurrentSmoker" in Xdf.columns:
        # CurrentSmoker is coded 1/2 in the spec; if your preprocessing made it 0/1 already, this still works
        smoker_binary = (Xdf["CurrentSmoker"] == 1).astype(int)
        Xdf["age_x_smoker"] = Xdf["Age"] * smoker_binary

    return Xdf

interaction_tf = FunctionTransformer(add_interactions, validate=False)

# Categorical columns to one-hot
CAT_COLS = ["Sex", "Race", "CurrentSmoker", "Diabetes"]

# Robust OHE across sklearn versions
def make_ohe():
    try:
        return OneHotEncoder(drop=None, handle_unknown="ignore", sparse_output=False)
    except TypeError:
        return OneHotEncoder(drop=None, handle_unknown="ignore", sparse=False)


### Preprocessing Pipeline  
Use a ColumnTransformer to:  
- Impute missing values  
- StandardScale numeric features  
- One-Hot Encode categorical features  

This preprocessing is applied consistently for training and inference.


In [41]:
ohe = make_ohe()

numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", ohe)
])

# After interactions, select by dtype so interaction columns are treated as numeric
preprocess = ColumnTransformer(
    transformers=[
        ("cat", categorical_pipe, CAT_COLS),
        ("num", numeric_pipe, selector(dtype_include=np.number)),
    ],
    remainder="drop"
)

pipeline = Pipeline([
    ("interactions", interaction_tf),
    ("preprocess", preprocess),
    ("clf", KNeighborsClassifier())
])

param_grid = {
    "clf__n_neighbors": [17],
    "clf__weights": ["uniform"],
    "clf__p": [2]  # Manhattan often wins on tabular; try fractional too
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

print("Pipeline is ready.")


Pipeline is ready.


### Train/Validation Split  
Stratified split into:  
- **Imbalanced TRAIN** (reflects original dataset, ~96/4)  
- **Balanced VALIDATION** (~50/50 distribution)  

The balanced validation slice mimics the grader’s conditions for accuracy/log-loss scoring.


In [42]:
# Stratified split preserves original imbalance in training portion
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE, stratify=y
)

# Build a ~50/50 validation set to mimic the test distribution
pos_valid = X_valid[y_valid == 1]
neg_valid = X_valid[y_valid == 0]
k = min(len(pos_valid), len(neg_valid))

pos_valid = pos_valid.sample(n=k, random_state=RANDOM_STATE)
neg_valid = neg_valid.sample(n=k, random_state=RANDOM_STATE)

X_valid_bal = pd.concat([pos_valid, neg_valid])
y_valid_bal = pd.concat([y_valid.loc[pos_valid.index], y_valid.loc[neg_valid.index]])

# Shuffle
perm = np.random.RandomState(RANDOM_STATE).permutation(len(X_valid_bal))
X_valid_bal = X_valid_bal.iloc[perm]
y_valid_bal = y_valid_bal.iloc[perm]

print("Train shape:", X_train.shape, y_train.shape)
print("Valid (orig) balance:", y_valid.value_counts(normalize=True).to_dict())
print("Valid (balanced) balance:", y_valid_bal.value_counts(normalize=True).to_dict())


Train shape: (11014, 13) (11014,)
Valid (orig) balance: {0: 0.9596949891067538, 1: 0.04030501089324619}
Valid (balanced) balance: {0: 0.5, 1: 0.5}


### Undersampling for kNN Fit  
Construct a balanced training set (50/50) by randomly undersampling the majority class.  
This prevents kNN neighbors from being dominated by negatives and preserves minority signal.


In [43]:
# 1:1 undersampling on the training split
pos_idx = y_train[y_train == 1].index
neg_idx = y_train[y_train == 0].index

n_pos = len(pos_idx)
neg_keep = np.random.RandomState(RANDOM_STATE).choice(neg_idx, size=n_pos, replace=False)
keep_idx = np.concatenate([pos_idx, neg_keep])

X_train_us = X_train.loc[keep_idx]
y_train_us = y_train.loc[keep_idx]

# Shuffle
perm = np.random.RandomState(RANDOM_STATE).permutation(len(X_train_us))
X_train_us = X_train_us.iloc[perm]
y_train_us = y_train_us.iloc[perm]

print("Original train balance:", y_train.value_counts(normalize=True).to_dict())
print("Undersampled train balance:", y_train_us.value_counts(normalize=True).to_dict())
print("Undersampled shapes:", X_train_us.shape, y_train_us.shape)

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_log_loss",
    cv=cv,
    n_jobs=-1,
    verbose=1
)
grid.fit(X_train_us, y_train_us)

print("\n=== Best params (undersampled) ===")
print(grid.best_params_)
print("Best CV log loss:", -grid.best_score_)

best_model = grid.best_estimator_


Original train balance: {0: 0.959687670237879, 1: 0.04031232976212094}
Undersampled train balance: {1: 0.5, 0: 0.5}
Undersampled shapes: (888, 13) (888,)
Fitting 5 folds for each of 1 candidates, totalling 5 fits

=== Best params (undersampled) ===
{'clf__n_neighbors': 17, 'clf__p': 2, 'clf__weights': 'uniform'}
Best CV log loss: 0.6500531166794496


### Validation Scoring @ t=0.50  
Evaluate the kNN model on the balanced validation split:  
- Accuracy and log loss  
- Confusion matrix  
- Quick threshold table (even though the grading threshold is fixed at 0.50)


In [44]:
probs = best_model.predict_proba(X_valid_bal)[:, 1]
preds = (probs >= 0.50).astype(int)

acc = accuracy_score(y_valid_bal, preds)
ll  = log_loss(y_valid_bal, probs)
cm  = confusion_matrix(y_valid_bal, preds)

print(f"Accuracy @ t=0.50: {acc:.4f}")
print(f"Log loss:          {ll:.4f}")
print("Confusion matrix:\n", cm)
print(classification_report(y_valid_bal, preds, digits=3))

# Threshold sweep
rows = []
for t in np.linspace(0.1, 0.9, 9):
    pp = (probs >= t).astype(int)
    rows.append({"t": round(t, 2), "positives": int(pp.sum()), "pos_rate": float(pp.mean())})
thr_table = pd.DataFrame(rows)
print("\nThreshold table:")
print(thr_table.to_string(index=False))


Accuracy @ t=0.50: 0.7748
Log loss:          0.4994
Confusion matrix:
 [[82 29]
 [21 90]]
              precision    recall  f1-score   support

           0      0.796     0.739     0.766       111
           1      0.756     0.811     0.783       111

    accuracy                          0.775       222
   macro avg      0.776     0.775     0.774       222
weighted avg      0.776     0.775     0.774       222


Threshold table:
  t  positives  pos_rate
0.1        201  0.905405
0.2        178  0.801802
0.3        154  0.693694
0.4        147  0.662162
0.5        119  0.536036
0.6         85  0.382883
0.7         67  0.301802
0.8         37  0.166667
0.9          7  0.031532


### Final Model Selector  
Choose the exact kNN configuration for submission:  
- k = 17  
- p = 2 (Euclidean distance)  
- weights = "uniform"  

Downstream code assumes this object name for inference.

In [45]:
# === FINAL MODEL PICK ===
final_model = best_model 

### Sanity Probe on Validation  
Re-check the chosen `final_model` on the balanced validation set to confirm  
it still produces the expected ~0.75 accuracy and ~0.499 log loss.  
If this drifts, you may be pointing at the wrong model.


In [46]:
def threshold_table(probs, t_values=(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9)):
    rows = []
    for t in t_values:
        preds = (probs >= t).astype(int)
        rows.append({"t": t, "positives": int(preds.sum()), "pos_rate": float(preds.mean())})
    return pd.DataFrame(rows)

def summarize_predictions(name, y_true, probs, t=0.5):
    print(f"\n=== {name} @ t={t:.2f} ===")
    preds = (probs >= t).astype(int)
    if y_true is not None:
        print(f"Accuracy: {accuracy_score(y_true, preds):.6f}")
        print(f"Log loss: {log_loss(y_true, probs):.6f}")
        print("Confusion matrix:\n", confusion_matrix(y_true, preds))
        print(classification_report(y_true, preds))
        print(f"Actual positive rate (y): {np.mean(y_true):.6f}")
    else:
        print("(No labels provided for this split; skipping accuracy/log loss/confusion.)")
    print(f"Predicted positive rate @ {t:.2f}: {preds.mean():.6f}")
    print(f"Mean predicted probability: {probs.mean():.6f}")
    print(f"Min/Max predicted probability: {probs.min():.6f} / {probs.max():.6f}")
    return threshold_table(probs)


### Refit on Undersampled TRAIN → Validate → TEST Export  
Steps performed here:  
1. Rebuild the baseline pipeline.  
2. Refit on 1:1 undersampled TRAIN (the setup that matched validation).  
3. Verify on balanced VALID (target ~0.75 acc / ~0.499 log loss).  
4. Predict on TEST and run a sanity check (predicted positive rate ≈ 45–55% if TEST is ~50/50).  
5. Export the required `submission_predictions.csv` with exactly two columns: **RowID, Probability**.


In [47]:
# refit baseline on 1:1 undersampled train, verify on VALID, export TEST predictions
# CONFIG 
TEST_CSV_PATH = "NHANES_Data_P1_MI_test.csv"
RANDOM_STATE = 42

# Build the EXACT baseline pipeline (preprocess -> kNN k=17,p=2,uniform)
# Assumes you already defined `preprocess` (your ColumnTransformer with imputer+scaler+OHE)
baseline_pipe = Pipeline([
    ("preprocess", preprocess),
    ("clf", KNeighborsClassifier(n_neighbors=17, p=2, weights="uniform"))
])

# Recreate the 1:1 undersampled training set
pos_idx = y_train[y_train == 1].index
neg_idx = y_train[y_train == 0].index

n_pos = len(pos_idx)
rng = np.random.RandomState(RANDOM_STATE)
neg_keep = rng.choice(neg_idx, size=n_pos, replace=False)
keep_idx = np.concatenate([pos_idx, neg_keep])

X_train_us = X_train.loc[keep_idx]
y_train_us = y_train.loc[keep_idx]

perm = rng.permutation(len(X_train_us))
X_train_us = X_train_us.iloc[perm]
y_train_us = y_train_us.iloc[perm]

print("[INFO] Train balance original:", y_train.value_counts(normalize=True).to_dict())
print("[INFO] Train balance undersampled:", y_train_us.value_counts(normalize=True).to_dict())
print("[INFO] Shapes (US):", X_train_us.shape, y_train_us.shape)

# Fit baseline on UNDERSAMPLED train
baseline_pipe.fit(X_train_us, y_train_us)

# Validate on your balanced validation split
probs_val = baseline_pipe.predict_proba(X_valid_bal)[:, 1]
preds_val = (probs_val >= 0.5).astype(int)
print("\n=== VALID (balanced) check @ t=0.50 ===")
print(f"Accuracy: {accuracy_score(y_valid_bal, preds_val):.6f}")
print(f"Log loss: {log_loss(y_valid_bal, probs_val):.6f}")
print("Confusion matrix:\n", confusion_matrix(y_valid_bal, preds_val))
print(classification_report(y_valid_bal, preds_val))
print(f"Pred pos rate @0.50: {preds_val.mean():.6f}")
print(f"Mean prob: {probs_val.mean():.6f}   Min/Max: {probs_val.min():.6f} / {probs_val.max():.6f}")

# Load TEST, keep ID, drop only ID (and label if present)
test_df = pd.read_csv(TEST_CSV_PATH)

# ID column detection
candidate_id_cols = ["RowID", "SEQN", "row_id", "ID", "Id", "id"]
id_col = next((c for c in candidate_id_cols if c in test_df.columns), None)
RowID = (test_df[id_col] if id_col else test_df.index.to_series()).reset_index(drop=True).rename("RowID")

X_test = test_df.drop(columns=[id_col] if id_col else []).copy()

# drop label if the test file unexpectedly contains one
for label_col in ["MI", "Outcome", "Label", "target", "y"]:
    if label_col in X_test.columns:
        X_test = X_test.drop(columns=[label_col])

# Predict TEST and export
probs_test = baseline_pipe.predict_proba(X_test)[:, 1]
preds_test = (probs_test >= 0.5).astype(int)

print("\n=== TEST sanity (t=0.50) ===")
print(f"Pred pos rate @0.50: {preds_test.mean():.4f}")
print(f"Mean prob: {probs_test.mean():.4f}   Min/Max: {probs_test.min():.4f} / {probs_test.max():.4f}")

out_df = pd.DataFrame({"RowID": RowID, "Probability": probs_test})
out_df = out_df[["RowID", "Probability"]]
out_df.to_csv("submission_predictions.csv", index=False)
print(f"[OK] Wrote {len(out_df):,} rows to submission_predictions.csv")
display(out_df.head(10))


[INFO] Train balance original: {0: 0.959687670237879, 1: 0.04031232976212094}
[INFO] Train balance undersampled: {0: 0.5, 1: 0.5}
[INFO] Shapes (US): (888, 13) (888,)

=== VALID (balanced) check @ t=0.50 ===
Accuracy: 0.747748
Log loss: 0.498849
Confusion matrix:
 [[86 25]
 [31 80]]
              precision    recall  f1-score   support

           0       0.74      0.77      0.75       111
           1       0.76      0.72      0.74       111

    accuracy                           0.75       222
   macro avg       0.75      0.75      0.75       222
weighted avg       0.75      0.75      0.75       222

Pred pos rate @0.50: 0.472973
Mean prob: 0.484102   Min/Max: 0.000000 / 1.000000

=== TEST sanity (t=0.50) ===
Pred pos rate @0.50: 0.4522
Mean prob: 0.4404   Min/Max: 0.0000 / 1.0000
[OK] Wrote 230 rows to submission_predictions.csv


Unnamed: 0,RowID,Probability
0,101,0.529412
1,102,0.0
2,103,0.588235
3,104,0.647059
4,105,0.411765
5,106,0.0
6,107,0.058824
7,108,0.0
8,109,0.882353
9,110,0.588235
