In [None]:
import os
from IPython.display import display
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
from pixelmed_calc import get_param_in_cwd
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 300

# Output folders for figures and tabular results.
os.makedirs('img', exist_ok=True)
os.makedirs('results', exist_ok=True)
# Feature file
featurePath='DL_radmico_imagenet.csv'
# Name of the column holding the dataset split, and path of the label file.
group_info = get_param_in_cwd('dataset_column')
labelf = get_param_in_cwd('label_file')
# Label column name(s); falls back to 'label' when 'task_column' is not configured.
labels = [get_param_in_cwd('task_column') or 'label']
# Read the feature file. Fail here with a clear message: previously a missing
# file left `rad_data` undefined and surfaced later as a NameError.
if not os.path.exists(featurePath):
    raise FileNotFoundError(f"Feature file not found: {featurePath}")
rad_data = pd.read_csv(featurePath, header=0)
# Column names are normalised so that '-' never appears in a feature name.
rad_data.columns = [c.replace('-', '_') for c in rad_data.columns]
rad_data.head()

# 特征统计

In [None]:
import matplotlib.pyplot as plt
# Count features per family, where the family is the second-to-last
# '_'-separated token of each column name (the 'ID' column is skipped).
# NOTE(review): a column name with fewer than two '_'-separated tokens raises IndexError here.
sorted_counts = pd.DataFrame([c.split('_')[-2] for c in rad_data.columns if c !='ID']).value_counts()
# NOTE(review): this relies on how value_counts() names its result, which differs
# between pandas versions — confirm the 'count' column is populated on the pinned version.
sorted_counts = pd.DataFrame(sorted_counts, columns=['count']).reset_index()
# Column 0 holds the family name after reset_index(); sort alphabetically by it.
sorted_counts = sorted_counts.sort_values(0)
display(sorted_counts)
# Pie chart of the share of each feature family, drawn clockwise from angle 0.
plt.pie(sorted_counts['count'], labels=[i for i in sorted_counts[0]], startangle=0,
        counterclock = False, autopct = '%.1f%%')
plt.savefig(f'img/Rad_feature_ratio.svg', bbox_inches = 'tight')

# 标注数据

In [None]:
# Load the label table.
label_data = pd.read_csv(labelf)
# Append '.nii.gz' to every ID that does not already end in '.nii' or '.nii.gz'
# (presumably so the IDs match those in the feature table — verify against the data).
label_data['ID'] = label_data['ID'].map(
    lambda x: x if f"{x}".endswith(('.nii.gz', '.nii')) else f"{x}.nii.gz")
# Keep the ID, the configured split column and the label column(s). The split
# column name comes from `group_info`, as in the later cells, instead of a
# hard-coded 'group'.
label_data = label_data[['ID', group_info] + labels]
label_data.head()

# 特征拼接

In [None]:
# Inner-join features and labels on ID, then set the ID column aside so that
# only numeric features, the split column and the label(s) remain.
merged = rad_data.merge(label_data, how='inner', on=['ID'])
ids = merged['ID']
combined_data = merged.drop(columns=['ID'])
# Class balance of the label column(s).
print(combined_data[labels].value_counts())
combined_data

# 获取数据统计信息

In [None]:
# Descriptive statistics of the merged feature/label table.
combined_data.describe()

# z值标准化

In [None]:
from pixelmed_calc.custom.components.comp1 import normalize_df
# Normalise the features (label columns are excluded via not_norm), then drop
# every column that still contains a missing value.
# NOTE(review): normalize_df is a project helper; how it uses `group` is not visible here.
data = normalize_df(combined_data, group=group_info, not_norm=labels).dropna(axis=1)
data.describe()

# 统计检验，不是标配

In [None]:
import seaborn as sns
from pixelmed_calc.custom.components.stats import clinic_stats

# Run the statistical test on the training split only. The split column name
# comes from `group_info`, as in the later cells, instead of a hard-coded 'group'.
train_rows = data[data[group_info] == 'train']
# NOTE(review): assumes the last two columns of `data` are the split column and
# the label — confirm the column order after normalize_df.
stat_columns = list(data.columns[0:-2])
stats = clinic_stats(train_rows, stats_columns=stat_columns, label_column=labels[0],
                     continuous_columns=list(stat_columns))
stats

In [None]:
import matplotlib.pyplot as plt

def map2float(x):
    """Parse a p-value cell into a float.

    The value is first parsed as-is; if that fails, the first character is
    dropped (e.g. a comparison prefix as in "<0.001") and parsing is retried.
    Always dropping the first character, as before, turned "1.0" into 0.0 and
    made scientific notation such as "1e-05" unparseable (so it became 1).

    Args:
        x: Any object; its ``str()`` form is parsed.

    Returns:
        The parsed finite float, or 1 when the value cannot be parsed
        (including NaN/inf).
    """
    import math

    text = str(x).strip()
    for candidate in (text, text[1:]):
        try:
            value = float(candidate)
        except ValueError:
            continue
        # NaN/inf are not usable p-values; fall through to the default.
        if math.isfinite(value):
            return value
    return 1

# Convert p-values to floats and derive each feature's family from the
# second-to-last '_'-separated token of its name.
stats['pvalue'] = stats['pvalue'].map(map2float)
stats['group'] = stats['feature_name'].map(lambda name: name.split('_')[-2])
stats = stats[['feature_name', 'pvalue', 'group']]
# Violin plot of p-values per family, with the individual points overlaid.
g = sns.catplot(data=stats, x="group", y="pvalue", kind="violin")
g.fig.set_size_inches(15, 10)
sns.stripplot(data=stats, x="group", y="pvalue", color='black', ax=g.ax)
plt.savefig(f'img/Rad_feature_stats.svg', bbox_inches='tight')

# 通过p值筛选特征

In [None]:
# Significance threshold for keeping a feature.
pvalue = 0.01
significant = stats.loc[stats['pvalue'] < pvalue, 'feature_name'].tolist()
# Keep the significant features plus the label and split columns.
sel_feature = significant + labels + [group_info]
data = data[sel_feature]
data

# 通过相关系数筛选特征

In [None]:
# Pearson correlation between features, computed on the training split only.
# The label and split columns are excluded: the split column holds strings, and
# DataFrame.corr raises on non-numeric columns in pandas >= 2.0 (older versions
# silently dropped them, so the resulting matrix is the same).
non_feature_columns = set(labels) | {group_info, 'group'}
corr_columns = [c for c in data.columns if c not in non_feature_columns]
pearson_corr = data.loc[data[group_info] == 'train', corr_columns].corr('pearson')
# Alternative rank-based correlations:
# kendall_corr = data[corr_columns].corr('kendall')
# spearman_corr = data[corr_columns].corr('spearman')

In [None]:
from pixelmed_calc.custom.components.comp1 import select_feature
# Pick features from the correlation matrix with the project helper
# (NOTE(review): its exact selection rule for threshold/topn is not visible here),
# then re-append the label and split columns.
kept_features = select_feature(pearson_corr, threshold=0.9, topn=10, verbose=False)
sel_feature = kept_features + labels + [group_info]
sel_feature

In [None]:
# Filter features: keep only the selected columns.
sel_data = data.loc[:, sel_feature]

sel_data

# 构建数据集

In [None]:
# Number of target classes used by the ROC / confusion-matrix cells.
n_classes = 3


def _subset_by_group(group_name):
    """Return (rows, ids, y, X) for the samples whose split column equals group_name.

    The IDs are looked up with the original row labels before the rows are
    re-indexed from 0.
    """
    rows = sel_data[sel_data[group_info] == group_name]
    row_ids = ids[rows.index]
    rows = rows.reset_index().drop('index', axis=1)
    targets_ = rows[labels]
    features_ = rows.drop(labels + [group_info], axis=1)
    return rows, row_ids, targets_, features_


train_data, train_ids, y_data, X_data = _subset_by_group('train')
test_data, test_ids, y_test_data, X_test_data = _subset_by_group('test')
val_data, val_ids, y_val_data, X_val_data = _subset_by_group('val')
val1_data, val1_ids, y_val1_data, X_val1_data = _subset_by_group('val1')

# All samples, regardless of split.
y_all_data = sel_data[labels]
X_all_data = sel_data.drop(labels + [group_info], axis=1)

column_names = X_data.columns
print(f"训练集样本数：{X_data.shape}, 测试集样本数：{X_test_data.shape}，验证集样本数：{X_val_data.shape},验证集1样本数：{X_val1_data.shape}")

# lasso 回归

In [None]:
import numpy as np
import pixelmed_calc.custom.components as okcomp

# NOTE(review): lasso_cv_coefs is a project helper; presumably it cross-validates
# Lasso and draws the coefficient paths on the current figure — confirm.
# Its return value is used as the Lasso `alpha` when the models are fitted.
alpha = okcomp.comp1.lasso_cv_coefs(X_data, y_data, column_names=None)
# Save whatever figure is current after the helper call.
plt.savefig(f'img/Rad_feature_lasso.svg', bbox_inches = 'tight')


In [None]:
# Model performance
# NOTE(review): lasso_cv_efficiency is a project helper; presumably it plots the
# cross-validated error on the current figure — confirm.
okcomp.comp1.lasso_cv_efficiency(X_data, y_data, points=50)
# Save whatever figure is current after the helper call.
plt.savefig(f'img/Rad_feature_mse.svg', bbox_inches = 'tight')

In [None]:
from sklearn import linear_model

# Fit one Lasso regressor per label column, using the alpha chosen above.
models = []
for label in labels:
    # If every coefficient comes out as 0, try a smaller alpha, e.g. alpha=0.01.
    lasso = linear_model.Lasso(alpha=alpha)
    models.append(lasso.fit(X_data, y_data[label]))

### 特征筛选

筛选出其中系数绝对值大于阈值（|coef| > COEF_THRESHOLD）的特征，并打印出相应的公式。

In [None]:
# Minimum |coefficient| for a feature to be kept; None keeps every feature.
COEF_THRESHOLD = 1e-6
scores = []
selected_features = []
for label, model in zip(labels, models):
    feat_coef = [(feat_name, coef) for feat_name, coef in zip(column_names, model.coef_)
                 if COEF_THRESHOLD is None or abs(coef) > COEF_THRESHOLD]
    print(feat_coef)
    selected_features.append([feat for feat, _ in feat_coef])
    # Every term already carries its own sign (":+"), so no extra '+' is
    # inserted; previously a positive first term was rendered as "+ +0.5 * x".
    formula = ' '.join(f"{coef:+.6f} * {feat_name}" for feat_name, coef in feat_coef)
    # rstrip() keeps the formula well-formed when no coefficient survives the
    # threshold (formerly an IndexError on formula[0]).
    score = f"{label} = {model.intercept_} {formula}".rstrip()
    scores.append(score)

print(scores[0])

In [None]:
# Feature weights: horizontal bar chart of the retained coefficients, sorted
# ascending. `feat_coef` is the list left over from the last label processed in
# the selection loop.
feat_coef.sort(key=lambda pair: pair[1])
feat_coef_df = pd.DataFrame(feat_coef, columns=['feature_name', 'Coefficients'])
feat_coef_df.plot.barh(x='feature_name', y='Coefficients')

plt.savefig(f'img/Rad_feature_weights.svg', bbox_inches='tight')

In [None]:
# Further feature filtering: keep only the features selected for the first
# label, in every split.
lasso_features = selected_features[0]
X_data, X_test_data, X_val_data, X_val1_data = (
    frame[lasso_features] for frame in (X_data, X_test_data, X_val_data, X_val1_data)
)

print(X_test_data.shape)


# 模型筛选

In [None]:
#model_names = ['SVM', 'KNN', 'RandomForest', 'ExtraTrees', 'XGBoost', 'LightGBM', 'MLP', 'LR']

# Classifier name -> constructor keyword arguments, passed to the project helper create_clf_model.
# NOTE(review): LightGBM is configured for 3 classes, while XGBoost uses the binary
# objective 'binary:logistic' and the binary metric 'error' — confirm this is intended
# for the 3-class task.
# NOTE(review): 'LR' is absent here but present in the dict used by the final training
# cell, so model lists built from this dict have one entry fewer.
model_names={'LightGBM':{"objective":"multiclass","num_classes":3,"n_estimators":10, "max_depth":-1},
             'SVM':{"probability":True, "random_state":0},
             "KNN":{"algorithm":'kd_tree'},
             "RandomForest":{"n_estimators":10, "max_depth":None,"min_samples_split":2, "random_state":0},
             "ExtraTrees":{"n_estimators":10, "max_depth":None,"min_samples_split":2, "random_state":0},
             "XGBoost":{"n_estimators":10, "objective":'binary:logistic',"use_label_encoder":False, "eval_metric":'error'},
             "MLP":{"hidden_layer_sizes":(128, 64, 32), "max_iter":300, "solver":'sgd', "random_state":0},
           
            }

# `models` is used as a mapping below (its .keys() are read); from here on
# `model_names` is the list of those keys rather than the config dict.
models = okcomp.comp1.create_clf_model(model_names)
model_names = list(models.keys())
print(type(model_names))

## 交叉验证挑数据

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score
# Try n_trails random train/test splits of the training data, find the best one
# and keep it in `results`.
# NOTE(review): get_bst_split is a project helper; the structure unpacked below
# (a list of (per-model scores, split) pairs plus 'max_idx') is inferred from this usage.
results = okcomp.comp1.get_bst_split(X_data, y_data, models, test_size=0.2, metric_fn=accuracy_score, n_trails=5, cv=True, random_state=0)
_, (X_train_sel, X_test_sel, y_train_sel, y_test_sel) = results['results'][results['max_idx']]
trails, _ = zip(*results['results'])
cv_results = pd.DataFrame(trails, columns=model_names)
print(cv_results)
# Visualise how each model performs across the different splits.
sns.boxplot(data=cv_results)
# NOTE(review): accuracy_score returns a fraction in [0, 1]; confirm whether the
# helper rescales it before keeping the '%' in this label.
plt.ylabel('Accuracy %')
plt.xticks(rotation=30)
# Fixed axis-label typo ('Model Nmae').
plt.xlabel('Model Name')
plt.savefig(f'img/model_csv.svg', bbox_inches = 'tight')

## 使用最好的数据划分

In [None]:
import joblib
from pixelmed_calc.custom.components.comp1 import plot_feature_importance
import shap
# Classifier name -> constructor keyword arguments for the final models.
# NOTE(review): this repeats the model-selection config and additionally contains "LR".
# NOTE(review): XGBoost uses the binary objective 'binary:logistic' / metric 'error'
# although LightGBM is configured for 3 classes — confirm.
model_names={'LightGBM':{"objective":"multiclass","num_classes":3,"n_estimators":10, "max_depth":-1},
             'SVM':{"probability":True, "random_state":0},
             "KNN":{"algorithm":'kd_tree'},
             "RandomForest":{"n_estimators":10, "max_depth":None,"min_samples_split":2, "random_state":0},
             "ExtraTrees":{"n_estimators":10, "max_depth":None,"min_samples_split":2, "random_state":0},
             "XGBoost":{"n_estimators":10, "objective":'binary:logistic',"use_label_encoder":False, "eval_metric":'error'},
             "MLP":{"hidden_layer_sizes":(128, 64, 32), "max_iter":300, "solver":'sgd', "random_state":0},
             "LR":{"random_state":0}
            }
# targets[i] is the list of fitted models for labels[i], in the order of `model_names`.
targets = []
os.makedirs('models', exist_ok=True)
# The loop variable `l` stays defined after this cell and is read again by the SHAP cell.
for l in labels:
    new_models = list(okcomp.comp1.create_clf_model(model_names).values())
    for mn, m in zip(model_names, new_models):
        print(m)
        m=m.fit(X_train_sel, y_train_sel[l])
        # Save the trained model.
        joblib.dump(m, f'models/Rad_{mn}_{l}.pkl')
        # Plot feature importance; only meaningful for tree-based models.
        plot_feature_importance(m, selected_features[0], save_dir='img')

    targets.append(new_models)

# 预测结果

* predictions，二维数据，每个label对应的每个模型的预测结果。
* pred_scores，二维数据，每个label对应的每个模型的预测概率值。

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from pixelmed_calc.custom.components.delong import calc_95_CI
from pixelmed_calc.custom.components.metrics import analysis_pred_binary

# The four evaluation sets, in the order used throughout: train, test, val, val1.
eval_features = (X_train_sel, X_test_sel, X_val_data, X_val1_data)
# predictions[i][j] / pred_scores[i][j]: 4-tuple of predicted labels / class
# probabilities of model j for label i, one entry per evaluation set.
predictions = [[tuple(model.predict(X) for X in eval_features) for model in target]
               for label, target in zip(labels, targets)]
pred_scores = [[tuple(model.predict_proba(X) for X in eval_features) for model in target]
               for label, target in zip(labels, targets)]

metric = []
pred_sel_idx = []
for label, prediction, scores in zip(labels, predictions, pred_scores):
    pred_sel_idx_label = []
    eval_targets = (('train', y_train_sel), ('test', y_test_sel),
                    ('val', y_val_data), ('val1', y_val1_data))
    for mname, split_preds, split_scores in zip(model_names, prediction, scores):
        # One accuracy row per (model, evaluation set).
        for (split, y_frame), y_pred in zip(eval_targets, split_preds):
            metric.append((mname, accuracy_score(y_frame[label], y_pred), f"{label}-{split}"))

    pred_sel_idx.append(pred_sel_idx_label)
metric = pd.DataFrame(metric, index=None, columns=['model_name', 'Accuracy', 'Task'])
metric

## SHAP value 特征重要性
### 二分类任务时
当使用 LightGBM 二分类模型时，（旧版 shap 中）`TreeExplainer.shap_values` 会返回一个包含两个数组的列表，每个类别对应一个数组:

类别 0（负类）的 SHAP values
类别 1（正类）的 SHAP values
而 XGBoost 二分类只会返回一个数组（针对正类输出的 SHAP values）。

注意列表中的两个元素对应的是两个类别，而不是“每棵树”和“总体”：SHAP values 在计算时已经对所有树求和，不会按树单独返回。
对 LightGBM 二分类模型，返回的 shap_values 包含:

shap_values[0]: 类别 0（负类）的 SHAP values
shap_values[1]: 类别 1（正类）的 SHAP values
而对 XGBoost 二分类模型，只返回一个数组，所以 shap_values 不需要再按类别索引。

这是 LightGBM 和 XGBoost 的模型输出形式不同所导致的。二分类时两个类别的 SHAP values 互为相反数，我们通常更关心正类，所以可以只使用 shap_values[1]。

### 当使用多分类任务时：
当进行多分类任务时,LightGBM 模型计算出的 SHAP values 矩阵形状中的最后一个数字表示的是类别的个数。

也就是说,对于3分类的任务,shap_values 的形状是 (样本数,特征数,3)。

这里的3表示有3个类别,并不是树的个数。树的个数是计算SHAP values时的中间结果,最终输出到shap_values中的是按类别划分好的SHAP values。

所以如果是二分类,shap_values 形状会是(样本数,特征数,2)。如果是回归等单输出任务,那就只有(样本数,特征数)两个维度。

综上,最后一个数字表示类别个数,而不是树的个数。对于多分类问题,我们需要为每个类别计算单独的一组 SHAP values,才能完整解释模型的预测。

In [None]:

# Refit the XGBoost model on the selected training split for the SHAP analysis.
# NOTE(review): `l` is the loop variable left over from the final training cell, and
# `models` is the mapping created in the model-selection cell — this cell fails on a
# kernel where those cells have not been run.
shap_model=models['XGBoost'].fit(X_train_sel, y_train_sel[l])

import shap
explainer = shap.TreeExplainer(shap_model)
shap_values = explainer.shap_values(X_train_sel) 
# Original author's note: [:, :, 0] keeps only the SHAP values of the first class;
# without it a LightGBM model produces multiple outputs and raises, XGBoost does not.
shap_values2=explainer(X_train_sel)[:, :, 0] # [:, :, 0] keeps only the SHAP values of the first class
print('shap_values:',np.array(shap_values).shape)
print('shap_values2:',explainer(X_train_sel)[:, :, 0].shape)

#print(shap_values2)
# Bar chart of the 8 most important features, then a beeswarm plot of up to 20 features.
shap.summary_plot(shap_values, X_train_sel,plot_type="bar",max_display=8)
shap.plots.beeswarm(shap_values2,max_display=20)

In [None]:
# Decision plot for the first training sample.
# NOTE(review): index [1] presumably selects class 1 in both expected_value and
# shap_values; the layout of shap_values depends on the shap version — confirm.
expected_value = explainer.expected_value
shap.decision_plot(base_value=expected_value[1], shap_values=shap_values[1][:1], features=X_train_sel[:1])

## 分类报告说明
多分类中 accuracy 等于 micro 平均的 precision（同时也等于 micro 平均的 recall）；recall 即敏感性（二分类也一样）。


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from pixelmed_calc.custom.components.delong import calc_95_CI
from pixelmed_calc.custom.components.metrics import analysis_pred_binary
from sklearn.metrics import classification_report,roc_auc_score
from sklearn.metrics import confusion_matrix
import random
def specificity_per_class(y_true, y_pred, class_label):
    """One-vs-rest specificity, PPV and NPV of a single class.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        class_label: Row/column index of the class inside the confusion matrix.

    Returns:
        Tuple ``(specificity, ppv, npv)``.
    """
    cm = confusion_matrix(y_true, y_pred)
    # Indices of every class except the one under evaluation.
    others = [k for k in range(len(cm)) if k != class_label]
    tp = cm[class_label, class_label]
    fp = sum(cm[row, class_label] for row in others)   # predicted as the class, actually another
    fn = sum(cm[class_label, col] for col in others)   # actually the class, predicted as another
    tn = sum(cm[row, col] for row in others for col in others)
    return tn / (tn + fp), tp / (tp + fp), tn / (tn + fn)


def macro_specificity(y_true, y_pred):
    """Unweighted mean of the per-class one-vs-rest specificities.

    The number of classes is taken from the distinct values of ``y_true``;
    classes are addressed by index 0..n-1.
    """
    class_count = len(set(y_true))
    per_class = [specificity_per_class(y_true, y_pred, k)[0] for k in range(class_count)]
    return sum(per_class) / len(per_class)

def micro_specificity(y_true, y_pred):
    """Micro-averaged specificity: pooled TN / (pooled TN + pooled FP) over all classes."""
    cm = confusion_matrix(y_true, y_pred)
    column_totals = cm.sum(axis=0)
    row_totals = cm.sum(axis=1)
    false_pos = column_totals - np.diag(cm)
    # Per class: everything that is neither a false positive nor in the class's own row.
    true_neg = cm.sum() - (false_pos + row_totals)
    pooled_tn = true_neg.sum()
    pooled_fp = false_pos.sum()
    return pooled_tn / (pooled_tn + pooled_fp)

def macro_ppv(y_true, y_pred):
    """Macro-averaged positive predictive value (precision).

    For each class index ``k`` in ``range(len(set(y_true)))`` the one-vs-rest
    PPV ``TP / (TP + FP)`` is computed and the unweighted mean is returned.

    False positives are read from the *column* of the confusion matrix
    (samples predicted as the class). The previous version used the row, which
    counts false negatives, so it actually returned macro recall.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.

    Returns:
        The macro-averaged PPV as a float.
    """
    num_classes = len(set(y_true))
    # Computed once; rows are true classes, columns are predicted classes.
    conf_matrix = confusion_matrix(y_true, y_pred)
    true_pos = np.diag(conf_matrix)

    class_ppvs = {}
    for class_label in range(num_classes):
        tp = true_pos[class_label]
        fp = conf_matrix[:, class_label].sum() - tp
        class_ppvs[f'Class {class_label}'] = tp / (tp + fp)

    return sum(class_ppvs.values()) / len(class_ppvs)


def macro_npv(y_true, y_pred):
    """Macro-averaged negative predictive value.

    For each class index ``k`` in ``range(len(set(y_true)))`` the one-vs-rest
    NPV ``TN / (TN + FN)`` is computed and the unweighted mean is returned.
    The confusion matrix is computed once instead of once per matrix cell.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.

    Returns:
        The macro-averaged NPV as a float.
    """
    num_classes = len(set(y_true))
    # Loop-invariant; rows are true classes, columns are predicted classes.
    conf_matrix = confusion_matrix(y_true, y_pred)
    true_pos = np.diag(conf_matrix)

    class_npvs = {}
    for class_label in range(num_classes):
        others = [k for k in range(num_classes) if k != class_label]
        tn = sum(conf_matrix[i, j] for i in others for j in others)
        # Samples of this class that were predicted as another class.
        fn = conf_matrix[class_label].sum() - true_pos[class_label]
        class_npvs[f'Class {class_label}'] = tn / (tn + fn)

    return sum(class_npvs.values()) / len(class_npvs)
  

def micro_ppv(y_true, y_pred):
    """Micro-averaged PPV: pooled TP / (pooled TP + pooled FP) over all classes."""
    cm = confusion_matrix(y_true, y_pred)
    diagonal = np.diagonal(cm)
    pooled_tp = diagonal.sum()
    # Per-class false positives: column total minus the correctly predicted count.
    false_pos = cm.sum(axis=0) - diagonal
    return pooled_tp / (pooled_tp + false_pos.sum())


def micro_npv(y_true, y_pred):
    """Micro-averaged negative predictive value.

    Pools the one-vs-rest true negatives and false negatives of every class and
    returns ``sum(TN) / (sum(TN) + sum(FN))``.

    Per class, ``TN = total - (column_total + row_total - TP)``. The previous
    version subtracted the sum of the whole diagonal instead of the class's own
    TP, which inflated TN for every class.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.

    Returns:
        The micro-averaged NPV as a float.
    """
    conf_matrix = confusion_matrix(y_true, y_pred)
    true_pos = np.diagonal(conf_matrix)

    tn = conf_matrix.sum() - (conf_matrix.sum(axis=0) + conf_matrix.sum(axis=1) - true_pos)
    fn = conf_matrix.sum(axis=1) - true_pos

    return tn.sum() / (tn.sum() + fn.sum())

from scipy.stats import norm
def AUC_CI(auc, label, alpha = 0.05):
    """Normal-approximation confidence interval for an AUC.

    Args:
        auc: The AUC value.
        label: Labels; entries equal to 1 are counted as positives and entries
            equal to 0 as negatives.
        alpha: Significance level; the interval has confidence ``1 - alpha``.

    Returns:
        Tuple ``(lower, upper)``. The bounds are not clipped to [0, 1].
    """
    # Guard against `label` not being an array already.
    label = np.array(label)
    n_pos = np.sum(label == 1)
    n_neg = np.sum(label == 0)
    auc_sq = auc ** 2
    q_pos = auc / (2 - auc)
    q_neg = (2 * auc_sq) / (1 + auc)
    variance = (auc * (1 - auc)
                + (n_pos - 1) * (q_pos - auc_sq)
                + (n_neg - 1) * (q_neg - auc_sq)) / (n_pos * n_neg)
    se = np.sqrt(variance)
    z_lower, z_upper = norm.interval(1 - alpha)
    return (auc + z_lower * se, auc + z_upper * se)




from sklearn.metrics import roc_curve, auc, mean_squared_error



def convert2onehot(data, n_classes):
    """One-hot encode integer class indices.

    Args:
        data: Array-like of integer class indices; it is flattened first.
        n_classes: Width of each one-hot row.

    Returns:
        ``np.ndarray`` with one row per element of ``data`` and a 1 in the
        column given by that element.
    """
    def _row(class_index):
        # A fresh zero row with the class position switched on.
        row = [0] * n_classes
        row[class_index] = 1
        return row

    flat = np.reshape(data, -1)
    return np.array([_row(class_index) for class_index in flat])

def mac_mic_auc(y_test, y_score, n_classes, include_spec_class: bool = True,
                       mapping=None):
    """Compute the macro- and micro-averaged one-vs-rest ROC AUC.

    Args:
        y_test: True class indices.
        y_score: Array of per-class scores with one column per class.
        n_classes: Number of classes.
        include_spec_class: Unused; kept for interface compatibility.
        mapping: Unused label mapping; kept for interface compatibility.

    Returns:
        Tuple ``(macro_auc, micro_auc)``. ``macro_auc`` is NaN when the
        per-class ROC curves could not be computed; the failure is logged.
    """
    # `logger` was never defined in this notebook, so the error paths raised
    # NameError; a module-level logger is obtained locally instead.
    import logging
    log = logging.getLogger(__name__)

    if mapping is None:
        mapping = {}
    y_test_binary = convert2onehot(y_test, n_classes=n_classes)
    # ROC curve points and AUC per class, plus the "micro"/"macro" averages.
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    # Micro average: pool every (sample, class) decision into one binary problem.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test_binary.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    try:
        for i in range(n_classes):
            try:
                fpr[i], tpr[i], _ = roc_curve(y_test_binary[:, i], y_score[:, i])
                roc_auc[i] = auc(fpr[i], tpr[i])
            except Exception as e:
                log.error(f'解析{i}类别出错, {e}')
        # First aggregate all false positive rates
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

        # Then interpolate all ROC curves at these points
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

        # Finally average it and compute AUC
        mean_tpr /= n_classes

        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    except Exception:
        log.error(f'解析每个类别的ROC出错，大概率是应为数据没有指定类别的样本！')
    micro_auc = roc_auc["micro"]
    # Best effort: report NaN instead of raising KeyError when the macro average failed.
    macro_auc = roc_auc.get("macro", float("nan"))
    return macro_auc, micro_auc





def _print_split_report(task, split, mname, y_true, y_pred, y_score, num_classes, class_stats):
    """Print the full evaluation report of one model on one evaluation set.

    Prints the sklearn classification report, the per-class
    (specificity, PPV, NPV) triples, their macro and micro averages, the
    one-vs-rest AUC of each class and the macro/micro AUC, each AUC with a 95% CI.

    Args:
        task: Label column name, used in the report header.
        split: Evaluation-set name ('train', 'test', 'val' or 'val1').
        mname: Model name.
        y_true: True labels of the evaluation set.
        y_pred: Predicted labels.
        y_score: Predicted class probabilities, one column per class.
        num_classes: Number of classes for the per-class specificity table.
        class_stats: Dict that receives the per-class triples (updated in place).
    """
    print(f"{task}-{split}:\n", mname, classification_report(y_true, y_pred, digits=5))
    for class_label in range(num_classes):
        class_stats[f'Class {class_label}'] = specificity_per_class(y_true, y_pred, class_label)
    print("Class- Specificities,PPV,NPV:")
    for class_name, values in class_stats.items():
        print(f"{class_name}: {values}")
    print('macro avg:', macro_specificity(y_true, y_pred), macro_ppv(y_true, y_pred), macro_npv(y_true, y_pred))
    # These are micro averages; they were previously printed under 'weighted avg'.
    print('micro avg:', micro_specificity(y_true, y_pred), micro_ppv(y_true, y_pred), micro_npv(y_true, y_pred))
    for i in range(n_classes):
        auc_i = roc_auc_score(y_true == i, y_score[:, i])
        print(f'auc{i}:', auc_i, '95%CI', AUC_CI(auc_i, y_true == i, alpha=0.05))
    auc_macro, auc_micro = mac_mic_auc(y_true, y_score, n_classes)
    # NOTE(review): AUC_CI only counts labels equal to 1 and 0, so with multi-class
    # labels these two intervals ignore the other classes — confirm the intended
    # sample sizes for the averaged AUCs.
    print('macro avg_auc:', auc_macro, '95%CI', AUC_CI(auc_macro, y_true, alpha=0.05))
    print('micro auc:', auc_micro, '95%CI', AUC_CI(auc_micro, y_true, alpha=0.05))


# Per-class (specificity, PPV, NPV) of the most recently reported model, per evaluation set.
class_specificities_train = {}
class_specificities_test = {}
class_specificities_val = {}
class_specificities_val1 = {}
predictions = [[(model.predict(X_train_sel), model.predict(X_test_sel),model.predict(X_val_data),model.predict(X_val1_data))
                for model in target] for label, target in zip(labels, targets)]
pred_scores = [[(model.predict_proba(X_train_sel), model.predict_proba(X_test_sel),model.predict_proba(X_val_data),model.predict_proba(X_val1_data))
                for model in target] for label, target in zip(labels, targets)]

for label, prediction, scores in zip(labels, predictions, pred_scores):
    # Derived from the current label's training targets instead of a `label`
    # variable left over from an earlier cell.
    num_classes = len(set(y_train_sel[label]))
    splits = (
        ('train', y_train_sel[label], class_specificities_train),
        ('test', y_test_sel[label], class_specificities_test),
        ('val', y_val_data[label], class_specificities_val),
        ('val1', y_val1_data[label], class_specificities_val1),
    )
    for mname, split_preds, split_scores in zip(model_names, prediction, scores):
        for (split, y_true, class_stats), y_pred, y_score in zip(splits, split_preds, split_scores):
            _print_split_report(label, split, mname, y_true, y_pred, y_score, num_classes, class_stats)


# 绘制概率柱状图和折线图曲线

In [None]:
import seaborn as sns

# Accuracy per model and task: bar chart on top, line chart below.
fig, (ax_bar, ax_line) = plt.subplots(2, 1, figsize=(10, 10))
sns.barplot(data=metric, x='model_name', y='Accuracy', hue='Task', ax=ax_bar)
sns.lineplot(data=metric, x='model_name', y='Accuracy', hue='Task', ax=ax_line)
fig.savefig(f'img/Rad_model_acc.svg', bbox_inches='tight')

# 绘制ROC曲线-train

In [None]:
# Names of the models to plot; a model's position in this list is used to index
# into pred_scores.
model_names = list(models.keys())
sel_model = model_names

for sm in sel_model:
    if sm in model_names:
        sel_model_idx = model_names.index(sm)
        # Plot all ROC curves
        plt.figure(figsize=(8, 8))
        for pred_score, label in zip(pred_scores, labels):
            # Index 0 of each score tuple holds the training-set probabilities.
            okcomp.comp1.draw_roc_per_class(np.array(y_train_sel[label]), pred_score[sel_model_idx][0], n_classes=n_classes,
                                            include_spec_class=True, title=f"Model: {sm}")
            # Saved under '_train_roc': the previous '_test_roc' name was
            # overwritten by the test-set ROC cell, so the training plot was lost.
            plt.savefig(f'img/model_{sm}_train_roc.svg', bbox_inches = 'tight')

# 绘制ROC曲线-test

In [None]:
sel_model = model_names

for sm in sel_model:
    if sm not in model_names:
        continue
    sel_model_idx = model_names.index(sm)
    # One figure per model, with the ROC curves of every label.
    plt.figure(figsize=(8, 8))
    for pred_score, label in zip(pred_scores, labels):
        # Index 1 of each score tuple holds the test-set probabilities.
        y_true = np.array(y_test_sel[label])
        y_prob = pred_score[sel_model_idx][1]
        okcomp.comp1.draw_roc_per_class(y_true, y_prob, n_classes=n_classes,
                                        include_spec_class=True, title=f"Model: {sm}")
        plt.savefig(f'img/model_{sm}_test_roc.svg', bbox_inches='tight')

# 绘制ROC曲线-val


In [None]:
sel_model = model_names

for sm in sel_model:
    if sm not in model_names:
        continue
    sel_model_idx = model_names.index(sm)
    # One figure per model, with the ROC curves of every label.
    plt.figure(figsize=(8, 8))
    for pred_score, label in zip(pred_scores, labels):
        # Index 2 of each score tuple holds the 'val' probabilities.
        y_true = np.array(y_val_data[label])
        y_prob = pred_score[sel_model_idx][2]
        okcomp.comp1.draw_roc_per_class(y_true, y_prob, n_classes=n_classes,
                                        include_spec_class=True, title=f"Model: {sm}")
        plt.savefig(f'img/model_{sm}_val_roc.svg', bbox_inches='tight')

# 绘制ROC曲线-val1

In [None]:
sel_model = model_names

for sm in sel_model:
    if sm not in model_names:
        continue
    sel_model_idx = model_names.index(sm)
    # One figure per model, with the ROC curves of every label.
    plt.figure(figsize=(8, 8))
    for pred_score, label in zip(pred_scores, labels):
        # Index 3 of each score tuple holds the 'val1' probabilities.
        y_true = np.array(y_val1_data[label])
        y_prob = pred_score[sel_model_idx][3]
        okcomp.comp1.draw_roc_per_class(y_true, y_prob, n_classes=n_classes,
                                        include_spec_class=True, title=f"Model: {sm}")
        plt.savefig(f'img/model_{sm}_val1_roc.svg', bbox_inches='tight')

# 混淆矩阵

In [None]:
# 设置绘制参数
sel_model = model_names
c_matrix = {}

for sm in sel_model:
    if sm in model_names:
        sel_model_idx = model_names.index(sm)
        for idx, label in enumerate(labels):
            cm = okcomp.comp1.calc_confusion_matrix(predictions[idx][sel_model_idx][-1], y_test_sel[label], num_classes=n_classes)
            c_matrix[label] = cm
            plt.figure(figsize=(5, 4))
            plt.title(f'Model:{sm}')
            okcomp.comp1.draw_matrix(cm, norm=False, annot=True, cmap='Blues', fmt='.3g')
            plt.savefig(f'img/model_{sm}_cm.svg', bbox_inches = 'tight')

# 保存模型结果
可以把模型预测的标签结果以及每个类别的概率都保存下来。

In [None]:
import numpy as np
import os

sel_model = model_names
os.makedirs('results', exist_ok=True)
for idx, label in enumerate(labels):
    for sm in sel_model:
        if sm in model_names:
            sel_model_idx = model_names.index(sm)
            target = targets[idx][sel_model_idx]
            # Look up the sample IDs of the selected train/test rows.
            # X_train_sel / X_test_sel are split from X_data, whose index was
            # reset to 0..n-1 within the training group, so their index values
            # are positions into `train_ids`. The previous `ids.loc[...]` used
            # them as labels of the full merged table and returned wrong IDs
            # whenever the training rows were not the first rows of that table.
            # NOTE(review): assumes get_bst_split keeps the DataFrame index of X_data — confirm.
            train_indexes = np.reshape(np.array(train_ids.iloc[list(X_train_sel.index)]), (-1, 1)).astype(str)
            test_indexes = np.reshape(np.array(train_ids.iloc[list(X_test_sel.index)]), (-1, 1)).astype(str)
            y_train_pred_scores = target.predict_proba(X_train_sel)
            y_test_pred_scores = target.predict_proba(X_test_sel)
            # One probability column per class, named '<label>-<class index>'.
            columns = ['ID'] + [f"{label}-{i}"for i in range(y_test_pred_scores.shape[1])]
            # Save the predicted probabilities of the train and test splits.
            # NOTE(review): the file name does not contain the label, so results of
            # different labels overwrite each other.
            result_train = pd.DataFrame(np.concatenate([train_indexes, y_train_pred_scores], axis=1), columns=columns)
            result_train.to_csv(f'./results/{sm}_train.csv', index=False)
            result_test = pd.DataFrame(np.concatenate([test_indexes, y_test_pred_scores], axis=1), columns=columns)
            result_test.to_csv(f'./results/{sm}_test.csv', index=False)