In [None]:
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

# ============================
# 1. LOAD DATA
# ============================

# Read the raw spreadsheet and trim stray whitespace from the header names.
df = pd.read_excel("../data/raw/final_data_mondinoxicot.xlsx")
df.columns = df.columns.str.strip()

prodromes = ['Constipation', 'Hyposmia', 'REM', 'Depression']

# Binarise the prodromes: 1 only where the recorded value equals 1, every
# other value (missing entries included) becomes 0.
for prodrome in prodromes:
    df[f"{prodrome}_bin"] = df[prodrome].eq(1).astype(int)

# Biomechanical features used as model inputs.
biomech_cols = [
    'HR V', 'HR ML', 'HR AP',
    'iHR V', 'iHR ML', 'iHR AP',
    '%det V', '%det ML', '%det AP',
    'MSE V', 'MSE ML', 'MSE AP',
    'Tilt', 'Obliquity', 'Rotation (range)',
    'Stance', 'Swing', 'Double Support', 'Single Support',
    'Stride Length', 'Cadence', 'Gait Speed',
]

# Keep only the rows where every biomechanical feature is available.
df.dropna(subset=biomech_cols, inplace=True)

# Accumulates one metrics dict per evaluated target definition.
results = []

# ============================
# 2. TARGET DEFINITIONS
# ============================

# Every non-empty subset of the prodromes, ordered by subset size.
all_combos = [
    subset
    for size in range(1, len(prodromes) + 1)
    for subset in combinations(prodromes, size)
]

print(f"Numero totale combinazioni generate: {len(all_combos)}")
print("Combinazioni:", all_combos)

# Funzione utilitaria per lazy RF
def evaluate_lazy_RF(target, X, label, *, min_positives=8, test_size=0.3,
                     random_state=42, n_estimators=400):
    """Fit a quick random-forest baseline for one target and score it.

    The data is split once into stratified train/test partitions, a
    class-weight-balanced ``RandomForestClassifier`` is fitted on the
    training part, and the metrics are computed on the held-out part.

    Args:
        target: Binary (0/1) labels, one per row of ``X``.
        X: Feature matrix passed to ``train_test_split``.
        label: Name stored under ``"Target_Definition"`` in the result.
        min_positives: Minimum number of positive samples required to
            evaluate the target at all.
        test_size: Fraction of samples held out for testing.
        random_state: Seed used for both the split and the forest.
        n_estimators: Number of trees in the forest.

    Returns:
        A dict with the keys ``Target_Definition``, ``Positives``, ``AUC``,
        ``F1``, ``Accuracy`` and ``Recall``, or ``None`` when the target
        cannot be evaluated (too few positives, fewer than two negatives,
        or a split in which one side contains a single class).
    """
    labels = np.asarray(target)
    n_pos = int(labels.sum())
    n_neg = int(labels.size) - n_pos

    # A stratified split needs at least two members of each class, and the
    # AUC is undefined without negatives; previously only positives were
    # checked, so an (almost) all-positive target crashed further down.
    if n_pos < min_positives or n_neg < 2:
        return None

    X_train, X_test, y_train, y_test = train_test_split(
        X, target, test_size=test_size, random_state=random_state,
        stratify=target
    )

    # Both partitions must contain both classes: the forest needs them to
    # expose a positive-class probability and the AUC needs them to exist.
    if np.unique(y_train).size < 2 or np.unique(y_test).size < 2:
        return None

    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
        class_weight='balanced'
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Look the positive column up explicitly instead of assuming index 1.
    pos_col = list(clf.classes_).index(1)
    y_prob = clf.predict_proba(X_test)[:, pos_col]

    return {
        "Target_Definition": label,
        "Positives": n_pos,
        "AUC": roc_auc_score(y_test, y_prob),
        "F1": f1_score(y_test, y_pred),
        "Accuracy": accuracy_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred)
    }

# ============================
# 3. TEST ON ALL COMBINATIONS
# ============================

X = df[biomech_cols].copy()

for combo in all_combos:
    flag_cols = [f"{name}_bin" for name in combo]

    # Positive only when ALL prodromes of the combination are present.
    y = df[flag_cols].eq(1).all(axis=1).astype(int)

    res = evaluate_lazy_RF(y, X, label="AND_" + "_".join(combo))
    if res:
        results.append(res)

# ============================
# 4. "CUMULATIVE" DEFINITIONS
# ============================

# Number of prodromes present for each subject.
df["prodrome_sum"] = df[[f"{p}_bin" for p in prodromes]].sum(axis=1)
prodrome_count = df["prodrome_sum"]

definitions = {
    ">=1 prodrome": prodrome_count.ge(1).astype(int),
    ">=2 prodromes": prodrome_count.ge(2).astype(int),
    ">=3 prodromes (MODEL TARGET)": prodrome_count.ge(3).astype(int),
    "=4 prodromes": prodrome_count.eq(4).astype(int),
}

for label, y in definitions.items():
    res = evaluate_lazy_RF(y, X, label)
    if res:
        results.append(res)

# ============================
# 5. FINAL OUTPUT
# ============================

# Rank the target definitions from best to worst AUC.
results_df = (
    pd.DataFrame(results)
    .sort_values(by="AUC", ascending=False)
    .reset_index(drop=True)
)

results_df.to_csv("../tables/lazy_RF_prodrome_models_FULL.csv", index=False)

results_df.head(20)

In [None]:
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
import warnings
warnings.filterwarnings('ignore')

# ============================
# 1. LOAD DATA
# ============================

# Read the raw spreadsheet and trim stray whitespace from the header names.
df = pd.read_excel("../data/raw/final_data_mondinoxicot.xlsx")
df.columns = df.columns.str.strip()

prodromes = ['Constipation', 'Hyposmia', 'REM', 'Depression']

# Binarisation: 1 only where the recorded value equals 1, otherwise 0
# (missing entries therefore count as 0).
for prodrome in prodromes:
    df[f"{prodrome}_bin"] = df[prodrome].eq(1).astype(int)

# Biomechanical features used as model inputs.
biomech_cols = [
    'HR V', 'HR ML', 'HR AP',
    'iHR V', 'iHR ML', 'iHR AP',
    '%det V', '%det ML', '%det AP',
    'MSE V', 'MSE ML', 'MSE AP',
    'Tilt', 'Obliquity', 'Rotation (range)',
    'Stance', 'Swing', 'Double Support', 'Single Support',
    'Stride Length', 'Cadence', 'Gait Speed',
]

# Drop rows with any missing biomechanical feature, then snapshot the
# feature matrix used by every model below.
df.dropna(subset=biomech_cols, inplace=True)
X = df[biomech_cols].copy()

# Accumulates one metrics dict per evaluated target definition.
results = []


# ============================
# 2. FUNZIONE CON AUC + 95% CI
# ============================

def bootstrap_auc_ci(y_true, y_prob, n_boot=500, alpha=0.95):
    """Percentile-bootstrap confidence interval for the ROC AUC.

    Samples are redrawn with replacement (plain, NOT stratified, resampling)
    using a generator seeded with 42, so repeated calls with the same inputs
    give the same result. Resamples that contain a single class are skipped
    because the AUC is undefined for them.

    Args:
        y_true: Binary ground-truth labels.
        y_prob: Predicted scores/probabilities for the positive class,
            aligned with ``y_true``.
        n_boot: Number of bootstrap resamples to draw.
        alpha: Confidence level of the interval (0.95 gives the 2.5th and
            97.5th percentiles); note this is not a significance level.

    Returns:
        A tuple ``(mean_auc, lower, upper)`` over the valid resamples. All
        three values are NaN when the input is empty or no resample
        contained both classes.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    n_samples = len(y_true)
    undefined = (float("nan"), float("nan"), float("nan"))

    if n_samples == 0:
        return undefined

    rng = np.random.default_rng(42)
    aucs = []

    for _ in range(n_boot):
        idx = rng.integers(0, n_samples, n_samples)
        y_boot = y_true[idx]
        # Skip one-class resamples explicitly rather than relying on
        # roc_auc_score raising: a blanket ``except`` would also hide
        # genuine errors, and a NaN result would poison the percentiles.
        if np.unique(y_boot).size < 2:
            continue
        aucs.append(roc_auc_score(y_boot, y_prob[idx]))

    if not aucs:
        return undefined

    lower = np.percentile(aucs, (1 - alpha) / 2 * 100)
    upper = np.percentile(aucs, (1 + alpha) / 2 * 100)
    return np.mean(aucs), lower, upper


def evaluate_lazy_RF(target, X, label, *, min_positives=8, test_size=0.3,
                     random_state=42, n_estimators=400):
    """Fit a quick random-forest baseline and report F1, AUC and its 95% CI.

    The data is split once into stratified train/test partitions, a
    class-weight-balanced ``RandomForestClassifier`` is fitted on the
    training part, and the metrics are computed on the held-out part. The
    AUC confidence interval comes from ``bootstrap_auc_ci`` applied to the
    test-set predictions.

    Args:
        target: Binary (0/1) labels, one per row of ``X``.
        X: Feature matrix passed to ``train_test_split``.
        label: Name stored under ``"Target_Definition"`` in the result.
        min_positives: Minimum number of positive samples required to
            evaluate the target at all.
        test_size: Fraction of samples held out for testing.
        random_state: Seed used for both the split and the forest.
        n_estimators: Number of trees in the forest.

    Returns:
        A dict with the keys ``Target_Definition``, ``Positives``, ``AUC``,
        ``AUC_mean_boot``, ``AUC_CI_low``, ``AUC_CI_high`` and ``F1``, or
        ``None`` when the target cannot be evaluated (too few positives,
        fewer than two negatives, or a split in which one side contains a
        single class).
    """
    labels = np.asarray(target)
    n_pos = int(labels.sum())
    n_neg = int(labels.size) - n_pos

    # A stratified split needs at least two members of each class, and the
    # AUC is undefined without negatives; previously only positives were
    # checked, so an (almost) all-positive target crashed further down.
    if n_pos < min_positives or n_neg < 2:
        return None

    X_train, X_test, y_train, y_test = train_test_split(
        X, target, test_size=test_size, random_state=random_state,
        stratify=target
    )

    # Both partitions must contain both classes: the forest needs them to
    # expose a positive-class probability and the AUC needs them to exist.
    if np.unique(y_train).size < 2 or np.unique(y_test).size < 2:
        return None

    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
        class_weight='balanced'
    )
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # Look the positive column up explicitly instead of assuming index 1.
    pos_col = list(clf.classes_).index(1)
    y_prob = clf.predict_proba(X_test)[:, pos_col]

    auc = roc_auc_score(y_test, y_prob)
    f1 = f1_score(y_test, y_pred)

    # Bootstrap confidence interval of the AUC on the held-out predictions.
    auc_mean, ci_low, ci_high = bootstrap_auc_ci(y_test, y_prob)

    return {
        "Target_Definition": label,
        "Positives": n_pos,
        "AUC": auc,
        "AUC_mean_boot": auc_mean,
        "AUC_CI_low": ci_low,
        "AUC_CI_high": ci_high,
        "F1": f1
    }


# ============================
# 3. ALL "AND" COMBINATIONS
# ============================

# Every non-empty subset of the prodromes, ordered by subset size.
all_combos = [
    subset
    for size in range(1, len(prodromes) + 1)
    for subset in combinations(prodromes, size)
]

print("Combinazioni generate:", all_combos)

for combo in all_combos:
    flag_cols = [f"{name}_bin" for name in combo]
    # Positive only when every prodrome of the combination is present.
    y = df[flag_cols].eq(1).all(axis=1).astype(int)
    res = evaluate_lazy_RF(y, X, "AND_" + "_".join(combo))
    if res:
        results.append(res)


# ============================
# 4. CATEGORIES ≥1, ≥2, ≥3, =4
# ============================

# Number of prodromes present for each subject.
df["prodrome_sum"] = df[[f"{p}_bin" for p in prodromes]].sum(axis=1)
prodrome_count = df["prodrome_sum"]

definitions = {
    "≥1 prodrome": prodrome_count.ge(1).astype(int),
    "≥2 prodromes": prodrome_count.ge(2).astype(int),
    "≥3 prodromes (MODEL TARGET)": prodrome_count.ge(3).astype(int),
    "=4 prodromes": prodrome_count.eq(4).astype(int),
}

for label, y in definitions.items():
    res = evaluate_lazy_RF(y, X, label)
    if res:
        results.append(res)


# ============================
# 5. OUTPUT
# ============================

# Rank the target definitions from best to worst AUC.
results_df = (
    pd.DataFrame(results)
    .sort_values(by="AUC", ascending=False)
    .reset_index(drop=True)
)

results_df.to_csv("../tables/lazy_RF_prodrome_models_FULL_CI.csv", index=False)

results_df.head(20)