# classify

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
)


# Read the three per-subject feature tables.
# Forward slashes are used so the relative paths resolve on Windows and on
# POSIX systems alike (outside Windows a backslash is an ordinary filename
# character, so the backslash form would point at a non-existent file).
E = pd.read_csv('result/machine_learning/merged_emotion_features.csv')
I = pd.read_csv('result/machine_learning/merged_au_intensities.csv')
C = pd.read_csv('result/machine_learning/merged_au_correlations.csv')

# Reorder the rows of I and C to follow the subject order of E, so that row i
# refers to the same subject in all three tables. A name that is present in E
# but missing from I or C yields an all-NaN row here, which is zero-filled
# further down.
name_order = E['姓名'].tolist()
I = I.set_index('姓名').reindex(name_order).reset_index()
C = C.set_index('姓名').reindex(name_order).reset_index()

# Keep the label / score columns of E before they are stripped from the
# feature tables.
group, ABC, CABS = E['group'], E['ABC'], E['克氏']

# Remove the identifier and label columns wherever they occur (not every
# table necessarily carries all of them, hence errors='ignore'), then replace
# the remaining missing values with 0.
_meta_columns = ['姓名', 'group', 'ABC', '克氏']
E = E.drop(columns=_meta_columns, errors='ignore').fillna(0)
I = I.drop(columns=_meta_columns, errors='ignore').fillna(0)
C = C.drop(columns=_meta_columns, errors='ignore').fillna(0)

# Raw feature sets (no PCA applied yet), keyed by the one-letter code used in
# the feature combinations.
feature_sets_raw = {
    'E': E,
    'I': I,
    'C': C,
}

group = group.reset_index(drop=True)

# Print the table header. The header row starts with a single space because
# every result row printed by the evaluation loop starts with one as well;
# without it the column titles sit one character to the left of their values.
print("=" * 120)
print(
    f" {'Feature Type':<15} {'Sensitivity':<12} {'Specificity':<12} {'PPV':<10} {'NPV':<10} "
    f"{'Accuracy':<10} {'F1-score':<10} {'AUC':<10}"
)
print("-" * 120)


# Feature combinations to evaluate: display label -> list of raw feature-set
# keys that are concatenated for that run. Every key is a single letter, so
# splitting the label into its characters gives exactly the list of keys.
feature_combinations = {
    label: list(label)
    for label in ('E', 'EI', 'EC', 'IC', 'EIC')
}

# Evaluate every feature combination with 10-fold stratified cross-validation:
# per fold, fit StandardScaler + PCA on the training part only, train an SVC,
# collect the out-of-fold predictions, and finally print one row of metrics
# computed over all out-of-fold predictions.

# Hyper-parameter grid searched for the 'EIC' feature set. It depends on
# neither the fold nor the feature set, so it is defined once, outside the
# loops.
param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto'],
    },
    {
        'kernel': ['poly'],
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto'],
        'degree': [2, 3, 4],
        'coef0': [0, 1],
    },
]

# Upper bound on the number of principal components kept in each fold.
n_pca_components = 25

# The labels, the shuffle order and the outer CV splitter are the same for
# every feature combination, so they are built once.
y_array = group.values
# A private RandomState(1) produces the same permutation as seeding the global
# generator with 1, but leaves NumPy's global random state untouched.
indices = np.random.RandomState(1).permutation(len(y_array))
y_shuffled = y_array[indices]
cv_splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

for feature_type, feature_list in feature_combinations.items():
    # Concatenate the selected raw feature tables column-wise. The tables are
    # already zero-filled when feature_sets_raw is built, so no further
    # imputation is required here.
    X_raw = pd.concat([feature_sets_raw[f] for f in feature_list], axis=1)

    # Apply the shared shuffle order to the rows.
    X_raw_shuffled = X_raw.iloc[indices].reset_index(drop=True)

    # Out-of-fold predictions, filled in fold by fold.
    y_pred_all = np.zeros(len(y_shuffled), dtype=int)
    y_prob_all = np.zeros((len(y_shuffled), 2))

    # Set by the grid search on the first 'EIC' fold; stays None otherwise.
    tuned_info = None

    for fold_idx, (train_idx, test_idx) in enumerate(cv_splitter.split(X_raw_shuffled, y_shuffled)):
        # Split into the training and test part of this fold.
        X_train_raw = X_raw_shuffled.iloc[train_idx]
        X_test_raw = X_raw_shuffled.iloc[test_idx]
        y_train = y_shuffled[train_idx]

        # Fit the scaler on the training part only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_raw)

        # PCA cannot keep more components than there are training rows or
        # feature columns, so the requested number is clamped to the data.
        pca = PCA(
            n_components=min(n_pca_components, *X_train_scaled.shape),
            random_state=42,
        )
        X_train_pca = pca.fit_transform(X_train_scaled)

        # Transform the test part with the scaler and PCA fitted on the
        # training part.
        X_test_scaled = scaler.transform(X_test_raw)
        X_test_pca = pca.transform(X_test_scaled)

        if feature_type != 'EIC':
            # Every feature set except 'EIC' uses an RBF SVC with default
            # hyper-parameters.
            model = SVC(kernel='rbf', random_state=1, probability=True)
            model.fit(X_train_pca, y_train)
        elif tuned_info is None:
            # First 'EIC' fold: grid-search the hyper-parameters on this
            # fold's training part and keep the refitted best estimator.
            # NOTE(review): the parameters found here are reused for the
            # remaining folds, whose test samples were part of this tuning
            # data, and only 'EIC' is tuned at all. The 'EIC' row may
            # therefore be optimistic and is not a like-for-like comparison
            # with the other rows; consider nested cross-validation.
            cv_grid = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            grid_search = GridSearchCV(
                estimator=SVC(probability=True, random_state=1),
                param_grid=param_grid,
                cv=cv_grid,
                scoring='f1',
                n_jobs=-1,
                refit=True,
            )
            grid_search.fit(X_train_pca, y_train)
            model = grid_search.best_estimator_
            tuned_info = {
                'best_params': grid_search.best_params_,
                'best_score': grid_search.best_score_,
            }
        else:
            # Remaining 'EIC' folds: reuse the parameters found on the first
            # fold.
            best_params = tuned_info['best_params']
            model = SVC(**best_params, probability=True, random_state=1)
            model.fit(X_train_pca, y_train)

        # Store this fold's predictions at the positions of its test rows.
        y_pred_all[test_idx] = model.predict(X_test_pca)
        y_prob_all[test_idx] = model.predict_proba(X_test_pca)

    # The metrics are computed once over the pooled out-of-fold predictions.
    y_pred = y_pred_all
    y_prob = y_prob_all
    y_cv = y_shuffled

    tn, fp, fn, tp = confusion_matrix(y_cv, y_pred, labels=[0, 1]).ravel()

    specificity = tn / (tn + fp) if (tn + fp) > 0 else np.nan
    sensitivity = recall_score(y_cv, y_pred)
    ppv = precision_score(y_cv, y_pred)
    npv = tn / (tn + fn) if (tn + fn) > 0 else np.nan
    try:
        # Column 1 is used as the score of the positive class; this assumes
        # the labels in `group` are 0/1 -- TODO confirm.
        auc = roc_auc_score(y_cv, y_prob[:, 1])
    except ValueError:
        # Raised by roc_auc_score, e.g. when y_cv holds a single class.
        auc = np.nan

    metrics = {
        'Precision': ppv,
        'Sensitivity': sensitivity,
        'Accuracy': accuracy_score(y_cv, y_pred),
        'F1-score': f1_score(y_cv, y_pred),
        'AUC-ROC': auc,
        'Specificity': specificity,
        'PPV': ppv,
        'NPV': npv,
    }

    print(
        f" {feature_type:<15} {metrics['Sensitivity']:<12.3f} {metrics['Specificity']:<12.3f} "
        f"{metrics['PPV']:<10.3f} {metrics['NPV']:<10.3f} {metrics['Accuracy']:<10.3f} "
        f"{metrics['F1-score']:<10.3f} {metrics['AUC-ROC']:<10.3f}"
    )
    if tuned_info is not None:
        print(
            f"    -> Tuned best params: {tuned_info['best_params']}, "
            f"CV best F1: {tuned_info['best_score']:.3f}"
        )

print("=" * 120)

Feature Type    Sensitivity  Specificity  PPV        NPV        Accuracy   F1-score   AUC       
------------------------------------------------------------------------------------------------------------------------
 E               0.758        0.682        0.735      0.707      0.723      0.746      0.789     
 EI              0.828        0.882        0.891      0.815      0.853      0.859      0.941     
 EC              0.909        0.812        0.849      0.885      0.864      0.878      0.925     
 IC              0.949        0.776        0.832      0.930      0.870      0.887      0.936     
 EIC             0.960        0.882        0.905      0.949      0.924      0.931      0.977     
    -> Tuned best params: {'C': 0.1, 'coef0': 1, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly'}, CV best F1: 0.948
