In [None]:
import os
import numpy as np

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
# ====== 0) Cross-validation plan and result containers ======
# Stratified 2-fold CV repeated 50 times (2 x 50 = 100 train/test splits),
# seeded so the same splits are produced on every run.
rskf = RepeatedStratifiedKFold(n_repeats=50, n_splits=2, random_state=42)

# One list of per-split ROC-AUC values for each model.
lr_scores = []
rf_scores = []
mlp_scores = []

# Counter for splits that are discarded by the evaluation loop.
skipped = 0

In [None]:
# ====== 1) Set the working directory (edit PROJECT_DIR to point at your qml_project) ======
# The project location is defined exactly once; the working directory and both
# data paths are derived from it, so moving the project needs a single edit.
PROJECT_DIR = r"C:\Users\93539\qml_project"  # ← real location on the author's machine
os.chdir(PROJECT_DIR)
print("CWD =", os.getcwd())

# ====== 2) Load latent features and labels ======
Z = np.load(os.path.join(PROJECT_DIR, "D8", "Z_latent_D8.npy"))  # per original note: shape (590000, 16)
y = np.load(os.path.join(PROJECT_DIR, "D8", "y_labels_D8.npy"))  # per original note: shape (590000,)

# Features and labels are indexed together below, so they must have the same number of rows.
if len(Z) != len(y):
    raise ValueError(f"Z has {len(Z)} rows but y has {len(y)} labels")

print("Z shape:", Z.shape, "y shape:", y.shape, "pos rate:", y.mean())



In [None]:
# ====== 3) Stratified subsample of size N ======
# Draw N rows from (Z, y) so that the share of positives in the subsample
# approximates the share of positives in the full label vector.
N = 200  # sample size; 100/200/300/1000 are all usable
rng = np.random.default_rng(42)  # fixed seed: the same rows are drawn on every run

# Row indices of each class in the full label vector.
pos_idx = np.flatnonzero(y == 1)
neg_idx = np.flatnonzero(y == 0)

# Positive quota: proportional to the population rate, but never below 2
# (so training does not fail for lack of positives) and never above the
# number of positives that actually exist. Negatives fill the remainder.
target_pos = min(max(2, int(round(N * y.mean()))), len(pos_idx))
target_neg = N - target_pos

# Sample each class without replacement (positives first, then negatives),
# then shuffle so the two classes are interleaved.
sel_pos = rng.choice(pos_idx, size=target_pos, replace=False)
sel_neg = rng.choice(neg_idx, size=target_neg, replace=False)
sel = np.concatenate((sel_pos, sel_neg))
rng.shuffle(sel)

X_small, y_small = Z[sel], y[sel]
print(f"N={len(y_small)}, pos={int(y_small.sum())}, pos_rate={y_small.mean():.4f}")

In [None]:
# ====== Save / load the drawn sample ======
# The two halves below are independent of each other:
#   * "save" persists the sample drawn in this session;
#   * "load" restores a saved sample, e.g. to keep one fixed sample for the
#     whole experiment.
# The file name is built from the actual sample size so that the "N..." tag
# always matches the contents (a hard-coded "N100" mislabels a 200-row sample).
# To load a specific, previously saved sample, set sample_path to that file.
sample_path = f"Xy_small_N{len(y_small)}_seed42.npz"

# --- save ---
np.savez(sample_path, X=X_small, y=y_small)
print("Saved:", X_small.shape, y_small.shape)

# --- load ---
# np.load on an .npz returns an NpzFile that keeps the archive open; the
# context manager closes it as soon as both arrays have been read.
with np.load(sample_path) as data:
    X_small = data["X"]
    y_small = data["y"]

print("Loaded:", X_small.shape, y_small.shape)
print("pos:", int(y_small.sum()), "pos_rate:", y_small.mean())

In [None]:
# ====== 4) The three classifiers being compared ======
# Logistic regression with balanced class weights.
lr = LogisticRegression(
    class_weight="balanced",
    max_iter=2000,
)

# Random forest: 300 trees of depth <= 6, balanced class weights,
# fixed seed, trained on all available cores (n_jobs=-1).
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=6,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42,
)

# MLP with one hidden layer of 32 units and a fixed seed.
mlp = MLPClassifier(
    hidden_layer_sizes=(32,),
    alpha=1e-2,
    max_iter=800,
    random_state=42,
)

In [None]:
# ====== 5) Repeated stratified CV: fit every model on each split, collect ROC-AUC ======
# Reset the accumulators every time this cell runs. Without this, re-running
# the cell appends to the scores of the previous run and the reported
# mean/std (and the skipped count) silently mix several runs.
lr_scores, rf_scores, mlp_scores = [], [], []
skipped = 0

for train_idx, test_idx in rskf.split(X_small, y_small):
    X_train, X_test = X_small[train_idx], X_small[test_idx]
    y_train, y_test = y_small[train_idx], y_small[test_idx]

    # Skip single-class folds (likely when positives are scarce): ROC-AUC is
    # undefined with only one class in the test labels, and a classifier
    # cannot be trained on a single class.
    if len(np.unique(y_train)) < 2 or len(np.unique(y_test)) < 2:
        skipped += 1
        continue

    # Same fit -> score step for each model, paired with its own score list.
    for model, scores in ((lr, lr_scores), (rf, rf_scores), (mlp, mlp_scores)):
        model.fit(X_train, y_train)
        # Column 1 of predict_proba is the second class in model.classes_,
        # i.e. label 1 for 0/1 labels.
        scores.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

print("Skipped folds:", skipped)
if lr_scores:
    print(f"LR : {np.mean(lr_scores):.4f} ± {np.std(lr_scores):.4f}")
    print(f"RF : {np.mean(rf_scores):.4f} ± {np.std(rf_scores):.4f}")
    print(f"MLP: {np.mean(mlp_scores):.4f} ± {np.std(mlp_scores):.4f}")
else:
    # Every split was skipped; mean/std of an empty list would only yield nan plus a warning.
    print("No valid folds: every split had a single-class train or test set.")

In [None]:
## Quick sanity check: a single stratified hold-out split instead of the repeated CV above.
# ====== 4) Train/test split (70% / 30%, stratified on the label, fixed seed) ======
X_train, X_test, y_train, y_test = train_test_split(
    X_small,
    y_small,
    stratify=y_small,
    test_size=0.3,
    random_state=42,
)

# ====== Single fit + ROC-AUC on the held-out part ======
# Fit all three models first, then score them.
for clf in (lr, rf, mlp):
    clf.fit(X_train, y_train)

# Second predict_proba column for each model, as the score fed to ROC-AUC.
proba_lr = lr.predict_proba(X_test)[:, 1]
proba_rf = rf.predict_proba(X_test)[:, 1]
proba_mlp = mlp.predict_proba(X_test)[:, 1]

auc_lr = roc_auc_score(y_test, proba_lr)
auc_rf = roc_auc_score(y_test, proba_rf)
auc_mlp = roc_auc_score(y_test, proba_mlp)

print(f"LR  AUC: {auc_lr:.4f}")
print(f"RF  AUC: {auc_rf:.4f}")
print(f"MLP AUC: {auc_mlp:.4f}")