In [14]:
# -*- coding: utf-8 -*-
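"""
Lasso (L1-penalised) logistic regression + cross-validation + interpretability
0/1 columns & ordinal columns -> not standardised
Continuous variables -> StandardScaler
Missing values -> intended to be handled by simple imputation (SimpleImputer)
"""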
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance
from sklearn.impute import SimpleImputer  # 强制补缺失

# ========= 1. Paths and parameters =========
# Directory that holds the input workbooks; the output paths below are built
# from it as well.
DATA_DIR = pathlib.Path(r"C:\Users\DXW\Desktop\半月板部分切除术\OAI test\test 2")
X_FILE = DATA_DIR / "半月板手术_predictor_processed.xlsx"
Y_FILE = DATA_DIR / '半月板手术_outcome_3y_processed.xlsx'
# Output names carry the "Lasso" prefix because the classifier in this script
# is L1-penalised (not Ridge).
OOF_FILE = DATA_DIR / 'Lasso_OOF_prob.xlsx'
COEF_PLOT = DATA_DIR / 'Lasso_coef_top30.png'
PERM_PLOT = DATA_DIR / 'Lasso_perm_top30.png'

RANDOM_STATE = 42    # seed shared by the CV splitter, classifier and permutation test
N_SPLITS = 5         # number of stratified CV folds
LASSO_ALPHA = 0.01   # the classifier is built with C = 1 / LASSO_ALPHA

# ========= 2. Load data =========
# Predictors come from the "KneeReformatted" sheet of the predictor workbook.
X_df = pd.read_excel(X_FILE, sheet_name="KneeReformatted")
# The outcome is the WOMKP column of the outcome workbook, flattened to 1-D.
y_df = pd.read_excel(Y_FILE)[['WOMKP']].to_numpy().ravel()
# Keep the identifier columns for the OOF export, then remove them from the
# feature matrix so they are never used as predictors.
id_columns = ['ID', 'side']
ID_side = X_df[id_columns]
X_df = X_df.drop(columns=id_columns, errors='ignore')

# ========= 3. Split the columns into groups =========
# Binary (0/1) columns: every non-missing value is 0 or 1. At least one
# observed value is required so that a completely empty column is not
# classified as binary by vacuous truth.
bin_like = [
    c for c in X_df.columns
    if X_df[c].notna().any() and X_df[c].dropna().isin([0, 1]).all()
]
# Continuous columns: int64/float64 columns that are not binary.
continuous_cols = [
    c for c in X_df.select_dtypes(include=['int64', 'float64']).columns
    if c not in bin_like
]
# Ordinal columns: list them by name if known; empty by default.
ordinal_cols = []          # e.g. ['KL_grade', 'pain_scale']
# String categorical columns: none expected, so left empty.
string_cols = []

# Binary and ordinal columns share one branch: neither is standardised.
no_scale_cols = bin_like + ordinal_cols

# ========= 4. Preprocessing =========
# The imputers sit inside the pipelines so that, under cross-validation, the
# fill values are learned from the training rows of each fold only.
scale_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Unscaled branch: fill gaps with the most frequent level so imputed values
# stay valid 0/1 (or ordinal) codes; otherwise the values pass through as-is.
passthrough_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# Columns that belong to neither group (e.g. non-numeric ones) are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('scale', scale_pipe, continuous_cols),
        ('passthrough', passthrough_pipe, no_scale_cols),
    ],
    remainder='drop',
)


# ========= 5. Lasso logistic-regression pipeline =========
# Preprocessing followed by an L1-penalised logistic regression.
# scikit-learn's C is the inverse of the regularisation strength, which is
# why it is set to 1 / LASSO_ALPHA.
lasso_logit = Pipeline([
    ('prep', preprocessor),
    (
        'clf',
        LogisticRegression(
            C=1 / LASSO_ALPHA,
            penalty='l1',
            solver='liblinear',
            class_weight='balanced',
            max_iter=2000,
            random_state=RANDOM_STATE,
        ),
    ),
])

# ========= 6. Cross-validation and out-of-fold (OOF) predictions =========
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
# One slot per sample; each slot is filled by the fold in which that sample
# is held out.
oof_prob = np.zeros(y_df.shape, dtype=float)

for fold, (train_rows, valid_rows) in enumerate(cv.split(X_df, y_df)):
    # Fit the whole pipeline (preprocessing + classifier) on the training rows.
    lasso_logit.fit(X_df.iloc[train_rows], y_df[train_rows])

    # Positive-class probability for the held-out rows.
    valid_prob = lasso_logit.predict_proba(X_df.iloc[valid_rows])[:, 1]
    oof_prob[valid_rows] = valid_prob

    fold_auc = roc_auc_score(y_df[valid_rows], valid_prob)
    print(f'Fold {fold+1} AUC = {fold_auc:.4f}')

# AUC over the pooled out-of-fold predictions.
auc_lasso = roc_auc_score(y_df, oof_prob)
print(f'\nLasso OOF AUC = {auc_lasso:.4f}')

# ========= 7. Save the OOF predictions =========
OOF_FILE = DATA_DIR / 'Lasso_OOF_prob.xlsx'      # keep the file name in sync with the model
# One row per knee: identifiers, observed outcome and out-of-fold probability.
oof_table = pd.DataFrame({
    'ID': ID_side['ID'],
    'side': ID_side['side'],
    'WOMKP': y_df,
    'lasso_prob': oof_prob,
})
oof_table.to_excel(OOF_FILE, index=False)
print(f'OOF 概率已保存 → {OOF_FILE}')

# ========= 8. Top-30 coefficients =========
# The CV loop leaves the pipeline fitted on the last fold's training rows
# only. `preprocessor` is the very object used as the pipeline's 'prep' step,
# so fitting it alone would change the scaling under a classifier trained
# with different statistics. Refit the whole pipeline on all rows instead, so
# the preprocessing and the coefficients come from one and the same fit.
lasso_logit.fit(X_df, y_df)
all_names = lasso_logit.named_steps['prep'].get_feature_names_out()
coef = lasso_logit.named_steps['clf'].coef_.ravel()
# Rank by absolute size but keep the sign for colouring.
coef_df = (pd.Series(coef, index=all_names)
             .sort_values(key=abs, ascending=False)
             .head(30))

COEF_PLOT = DATA_DIR / 'Lasso_coef_top30.png'     # keep the image name in sync with the model
plt.figure(figsize=(6, 8))
# Negative coefficients in red, non-negative in blue.
coef_df.plot(kind='barh', color=['firebrick' if c < 0 else 'steelblue' for c in coef_df])
# No one-hot encoding is applied anywhere in the pipeline, so the title only
# mentions the standardisation of the continuous features.
plt.title('Top30 |Lasso Coefficients| (continuous features standardized)')
plt.xlabel('Standardized coefficient')
plt.tight_layout()
plt.savefig(COEF_PLOT, dpi=300)
plt.close()
print(f'系数图已保存 → {COEF_PLOT}')

# ========= 9. Top-30 permutation importance =========
# permutation_importance shuffles the columns of the X it is given. Here that
# is the raw X_df (the pipeline does its own preprocessing internally), so
# the result has one entry per X_df column, in X_df column order.
# NOTE(review): X_df contains rows the model was trained on, so these are
# in-sample importances.
perm = permutation_importance(lasso_logit, X_df, y_df,
                              n_repeats=30, random_state=RANDOM_STATE,
                              scoring='roc_auc', n_jobs=-1)
# Label with the raw column names: the transformed feature names are ordered
# by preprocessing branch and would attach importances to the wrong features
# (or fail outright when some columns are dropped by the preprocessor).
perm_df = (pd.Series(perm.importances_mean, index=X_df.columns)
             .sort_values(ascending=False)
             .head(30))

PERM_PLOT = DATA_DIR / 'Lasso_perm_top30.png'     # keep the image name in sync with the model
plt.figure(figsize=(6, 8))
perm_df.plot(kind='barh', color='darkgreen')
plt.title('Top30 Permutation Importance (Lasso)')
plt.xlabel('Drop in AUC if feature shuffled')
plt.tight_layout()
plt.savefig(PERM_PLOT, dpi=300)
plt.close()
print(f'置换重要性图已保存 → {PERM_PLOT}')

Fold 1 AUC = 0.6907
Fold 2 AUC = 0.6710
Fold 3 AUC = 0.8249
Fold 4 AUC = 0.7274
Fold 5 AUC = 0.6962

Lasso OOF AUC = 0.7193
OOF 概率已保存 → C:\Users\DXW\Desktop\半月板部分切除术\OAI test\test 2\Lasso_OOF_prob.xlsx
系数图已保存 → C:\Users\DXW\Desktop\半月板部分切除术\OAI test\test 2\Lasso_coef_top30.png
置换重要性图已保存 → C:\Users\DXW\Desktop\半月板部分切除术\OAI test\test 2\Lasso_perm_top30.png
