In [1]:
"""
基于动画片任务特征的分类
参考 classify.py 的实现方法
使用看动画片任务提取的特征进行分类
"""

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
)


def scale_and_pca(data, prefix=None, n_components=25, random_state=42):
    """
    Standardize the columns of ``data`` and project them onto principal components.

    Parameters:
    -----------
    data : pd.DataFrame
        Input feature table; it is standardized with ``StandardScaler``
        before the PCA is fit.
    prefix : str, optional
        Prefix for the output column names. With a prefix the columns are
        named ``{prefix}_PC1``, ``{prefix}_PC2``, ...; without one they are
        named ``PC1``, ``PC2``, ... (instead of the literal ``None_PC1``).
    n_components : int, float, str or None
        Passed unchanged to ``PCA(n_components=...)``.
    random_state : int
        Seed passed to ``PCA``.

    Returns:
    --------
    pd.DataFrame: component scores, one column per retained component,
        with a fresh default integer index.
    """
    scaler = StandardScaler()
    pca = PCA(n_components=n_components, random_state=random_state)
    scores = pca.fit_transform(scaler.fit_transform(data))

    # Take the column count from the result rather than from ``n_components``
    # so that non-integer settings accepted by PCA do not break the naming.
    stem = 'PC' if prefix is None else f'{prefix}_PC'
    columns = [f'{stem}{i}' for i in range(1, scores.shape[1] + 1)]
    return pd.DataFrame(scores, columns=columns)


# Load the feature tables extracted from the cartoon-watching task.
# Forward slashes keep these relative paths valid on Windows, Linux and macOS;
# backslash-separated paths only resolve on Windows.
E = pd.read_csv('result/machine_learning/merged_emotion_features_cartoon.csv', encoding='utf-8-sig')
I = pd.read_csv('result/machine_learning/merged_au_intensities_cartoon.csv', encoding='utf-8-sig')
C = pd.read_csv('result/machine_learning/merged_au_correlations_cartoon.csv', encoding='utf-8-sig')

# Reorder the rows of I and C so they follow the participant order of E.
# NOTE(review): a name present in E but absent from I or C becomes an all-NaN
# row here and is turned into an all-zero row by the fillna(0) below.
name_order = E['姓名'].tolist()
I = I.set_index('姓名').reindex(name_order).reset_index()
C = C.set_index('姓名').reindex(name_order).reset_index()

# Pull out the group label and, when present, the clinical scale columns
# before they are dropped from the feature table.
group = E['group'].reset_index(drop=True)
ABC = E['ABC'].reset_index(drop=True) if 'ABC' in E.columns else None
CABS = E['克氏'].reset_index(drop=True) if '克氏' in E.columns else None

# Drop identifier / label / clinical columns so only feature columns remain.
# errors='ignore' skips any of these columns a table does not have.
non_feature_cols = ['姓名', 'group', 'ABC', '克氏']
E = E.drop(columns=non_feature_cols, errors='ignore')
I = I.drop(columns=non_feature_cols, errors='ignore')
C = C.drop(columns=non_feature_cols, errors='ignore')

# Replace missing feature values with 0.
E = E.fillna(0)
I = I.fillna(0)
C = C.fillna(0)

# Standardize each table and reduce it to 25 principal components.
E = scale_and_pca(E, 'E', n_components=25)
I = scale_and_pca(I, 'I', n_components=25)
C = scale_and_pca(C, 'IC', n_components=25)

# Concatenate the three reduced tables column-wise into one feature set.
EIC = pd.concat([E, I, C], axis=1)

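# Optional: save the processed feature tables.
# NOTE: this snippet is not runnable as written. `Path` is never imported
# (it needs `from pathlib import Path`), and EI, EC and IC are not defined in
# this cell (only E, I, C and EIC are) - build them or drop those lines first.
# output_dir = Path('result/classify_and_regression_cartoon')
# output_dir.mkdir(parents=True, exist_ok=True)
# E.to_csv(output_dir / 'E.csv', index=False)
# EI.to_csv(output_dir / 'EI.csv', index=False)
# EC.to_csv(output_dir / 'EC.csv', index=False)
# IC.to_csv(output_dir / 'IC.csv', index=False)
# EIC.to_csv(output_dir / 'EIC.csv', index=False)
# group.to_csv(output_dir / 'group.csv', index=False)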

# Classify using the combined EIC feature set.
feature_type = 'EIC'
X = EIC

# Mean-impute any remaining gaps (leaves X unchanged when it contains no NaN).
X = X.fillna(X.mean())
y_series = group.reset_index(drop=True)

# Standardize the combined features.
scaler = StandardScaler()
X_scaled_full = scaler.fit_transform(X)
y_array = y_series.values

# NOTE(review): the scaler above is fit on all samples, and the
# hyper-parameters below are selected on the same samples that are scored by
# the cross-validation further down, so the reported metrics are likely
# optimistic. An unbiased estimate would need the preprocessing and the grid
# search to be refit inside each outer fold (nested cross-validation).

# Grid search over SVC kernels and their hyper-parameters.
param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto'],
    },
    {
        'kernel': ['poly'],
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto'],
        'degree': [2, 3, 4],
        'coef0': [0, 1],
    },
]
cv_grid = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=SVC(probability=True, random_state=1),
    param_grid=param_grid,
    cv=cv_grid,
    scoring='f1',
    n_jobs=-1,
    refit=True,
)
print(f"\n对 {feature_type} 特征集进行网格搜索调参...")
grid_search.fit(X_scaled_full, y_array)
model = grid_search.best_estimator_
tuned_info = {
    'best_params': grid_search.best_params_,
    'best_score': grid_search.best_score_,
}

# Shuffle the samples with a fixed seed before cross-validation.
np.random.seed(1)
indices = np.random.permutation(len(y_array))
X_cv = X_scaled_full[indices]
y_cv = y_array[indices]

# Out-of-fold class predictions and class probabilities from the tuned model.
# The splitter has a fixed random_state, so both calls use the same folds.
cv_splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
y_pred = cross_val_predict(model, X_cv, y_cv, cv=cv_splitter)
y_prob = cross_val_predict(model, X_cv, y_cv, cv=cv_splitter, method='predict_proba')

# Confusion matrix with the label order fixed to [0, 1], so ravel() yields
# tn, fp, fn, tp (this expects the group column to be coded as 0 / 1).
tn, fp, fn, tp = confusion_matrix(y_cv, y_pred, labels=[0, 1]).ravel()

# Evaluation metrics; ratios with an empty denominator are reported as NaN.
specificity = tn / (tn + fp) if (tn + fp) > 0 else np.nan
sensitivity = recall_score(y_cv, y_pred)
ppv = precision_score(y_cv, y_pred)
npv = tn / (tn + fn) if (tn + fn) > 0 else np.nan
try:
    auc = roc_auc_score(y_cv, y_prob[:, 1])
except ValueError:
    # roc_auc_score raised on these labels; report the AUC as NaN instead.
    auc = np.nan

metrics = {
    'Sensitivity': sensitivity,
    'Specificity': specificity,
    'PPV': ppv,
    'NPV': npv,
    'Accuracy': accuracy_score(y_cv, y_pred),
    'F1-score': f1_score(y_cv, y_pred),
    'AUC': auc,
}

# Print the table header right before the result row so the grid-search
# progress message above does not end up in the middle of the table.
print("\n" + "=" * 120)
print(
    f"{'Feature Type':<15} {'Sensitivity':<12} {'Specificity':<12} {'PPV':<10} {'NPV':<10} "
    f"{'Accuracy':<10} {'F1-score':<10} {'AUC':<10}"
)
print("-" * 120)

# Result row: same field widths as the header and no leading space, so the
# values line up under their column titles.
print(
    f"{feature_type:<15} {metrics['Sensitivity']:<12.3f} {metrics['Specificity']:<12.3f} "
    f"{metrics['PPV']:<10.3f} {metrics['NPV']:<10.3f} {metrics['Accuracy']:<10.3f} "
    f"{metrics['F1-score']:<10.3f} {metrics['AUC']:<10.3f}"
)
print(
    f"    -> Tuned best params: {tuned_info['best_params']}, "
    f"CV best F1: {tuned_info['best_score']:.3f}"
)

print("=" * 120)
print("\n分类完成！")
print("=" * 120)




Feature Type    Sensitivity  Specificity  PPV        NPV        Accuracy   F1-score   AUC       
------------------------------------------------------------------------------------------------------------------------

对 EIC 特征集进行网格搜索调参...
 EIC             0.887        0.882        0.896      0.872      0.885      0.891      0.953     
    -> Tuned best params: {'C': 0.1, 'kernel': 'linear'}, CV best F1: 0.896

分类完成！
