In [None]:
import pandas as pd
import numpy as np
import os
from pixelmed_calc.medical_imaging.RadiologyComponents.solution import convert_chinese_columns_to_numeric
# Create the output folders used throughout the notebook (figures, fitted
# models, per-split predictions). exist_ok=True makes the cell re-runnable.
for output_dir in ('results/img', 'results/model_weight', 'results/pred'):
    os.makedirs(output_dir, exist_ok=True)

# NOTE(review): absolute, machine-specific paths; consider a configurable
# data directory so the notebook runs on other machines.
inputfile = '/Users/y2k/workstation_y2k/139fat/clinicdata/clinic.csv'
labelfile = '/Users/y2k/workstation_y2k/139fat/clinicdata/group.csv'

# Clinical feature table. Its own 'group' column and the '住院号' (admission
# number) column are dropped; 'group' is taken from the label file instead
# via the merge below.
data_feature = pd.read_csv(inputfile).drop(columns=['group', '住院号'])

# Mean imputation of missing values for every column except 'ID'.
# numeric_only=True keeps this working on pandas >= 2.0, where .mean() raises
# on non-numeric columns instead of silently skipping them.
# NOTE(review): the means are computed over all rows, including rows that are
# later assigned to the val/test groups.
mean_values = data_feature.loc[:, data_feature.columns != 'ID'].mean(numeric_only=True)
data_feature = data_feature.fillna(mean_values)

# Attach label/group information by patient ID.
data_label = pd.read_csv(labelfile)
merged_data = pd.merge(data_feature, data_label, on='ID')
# '-' is replaced literally (no regex) so column names are valid identifiers.
merged_data.columns = merged_data.columns.str.replace('-', '_', regex=False)

# Keep only the modelling columns. .copy() gives an independent frame so the
# in-place scaling done in a later cell does not write into a slice of the
# merged table (SettingWithCopyWarning).
merged_data = merged_data[['ID', 'T012/34', 'N0/1', 'Rad_sign', 'label', 'group']].copy()
merged_data

数据集包含较多的异常值，因此使用稳健归一化（robust normalization）：  
value_result = (value - median) / (Q3 - Q1)  
Q1 的位置 = 1 × (n + 1) / 4  
Q3 的位置 = 3 × (n + 1) / 4  
n：数据的个数。  
median：中位数  
Q1：第 1 个四分位数（第 25 百分位数）  
Q3：第 3 个四分位数（第 75 百分位数）  


In [9]:
from sklearn.preprocessing import RobustScaler
# Robust scaling (median / IQR) of the feature columns.
# The scaler is fitted on the training rows only and then applied to every
# row, so that val/test statistics do not leak into the preprocessing.
scaler = RobustScaler()
# Feature columns: everything between the leading 'ID' column and the two
# trailing 'label' / 'group' columns.
scaler_cloum = merged_data.columns[1:-2]
train_rows = merged_data['group'] == 'train'
scaler.fit(merged_data.loc[train_rows, scaler_cloum])
merged_data[scaler_cloum] = scaler.transform(merged_data[scaler_cloum])

In [None]:
# Modelling table: the merged data without the patient identifier column.
data_sel = merged_data.drop(columns=['ID'])
data_sel

In [11]:
# Separate predictors from the target once, then slice both with the same
# boolean row masks derived from the 'group' column.
non_feature_cols = ['group', 'label']

x_all = data_sel.drop(columns=non_feature_cols)
y_all = data_sel['label']

is_train = data_sel['group'] == 'train'
is_val = data_sel['group'] == 'val'
is_test = data_sel['group'] == 'test'

x_train, y_train = x_all[is_train], y_all[is_train]
x_val, y_val = x_all[is_val], y_all[is_val]
x_test, y_test = x_all[is_test], y_all[is_test]

In [12]:

# Persist the feature matrices and label vectors of every split (no index column).
split_exports = [
    (x_train, 'results/pred/x_train.csv'),
    (x_val, 'results/pred/x_val.csv'),
    (x_test, 'results/pred/x_test.csv'),
    (y_train, 'results/pred/y_train.csv'),
    (y_val, 'results/pred/y_val.csv'),
    (y_test, 'results/pred/y_test.csv'),
]
for split_data, csv_path in split_exports:
    split_data.to_csv(csv_path, index=False)



In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
# Candidate classifiers as (display name, estimator) pairs. The display name is
# also used in output file names by the cells below.
# Every estimator that accepts a seed gets an explicit random_state so repeated
# runs of the notebook produce the same fitted models and metrics. This matters
# in particular for SVC(probability=True), whose probability calibration uses
# an internal randomised cross-validation.
models = [
    ('LR', LogisticRegression(random_state=0)),
    ('NB', GaussianNB()),
    ('linear_SVM', SVC(kernel='linear', class_weight='balanced', probability=True,
                       max_iter=1000, random_state=0)),
    ('poly_SVM', SVC(kernel='poly', class_weight='balanced', probability=True, random_state=0)),
    ('sigmoid_SVM', SVC(kernel='sigmoid', class_weight='balanced', probability=True, random_state=0)),
    ('rbf_SVM', SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=0)),
    ('DT', DecisionTreeClassifier(max_depth=3, min_samples_split=2, random_state=0)),
    ('RF', RandomForestClassifier(n_estimators=10, max_depth=3,
                                  min_samples_split=2, random_state=0)),
    ('ExtraTree', ExtraTreesClassifier(n_estimators=10, max_depth=3,
                                       min_samples_split=2, random_state=0)),
    # NOTE(review): use_label_encoder is deprecated in recent xgboost releases;
    # kept here for compatibility with older versions.
    ('XGBoost', XGBClassifier(n_estimators=10, objective='binary:logistic', max_depth=3,
                              use_label_encoder=False, eval_metric='error', random_state=0)),
    ('AdaBoost', AdaBoostClassifier(n_estimators=10, random_state=0)),
    ('MLP', MLPClassifier(hidden_layer_sizes=(128, 64, 32), max_iter=200, solver='adam', random_state=42)),
    ('GBM', GradientBoostingClassifier(n_estimators=10, random_state=0)),
    ('LightGBM', LGBMClassifier(n_estimators=10, max_depth=-1, objective='binary',
                                verbosity=-1, random_state=0)),
]


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, f1_score, precision_score, recall_score, roc_curve
import warnings
# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')
import logging
logging.getLogger('LightGBM').setLevel(logging.ERROR)  # only emit error-level messages
import joblib
from sklearn.utils import resample
from pixelmed_calc.medical_imaging.RadiologyComponents.components1 import calculate_metrics_with_ci
from imblearn.over_sampling import SMOTE, KMeansSMOTE
# Column layout of the point-estimate table and of the 95% CI table.
results_columns = ['Dataset', 'Model', 'Threshold', 'ACC', 'AUC', 'Sensitivity', 'Specificity', 'NPV', 'PPV', 'F1']
ci_results_columns = ['Dataset', 'Model', 'ACC', 'AUC', 'Sensitivity', 'Specificity', 'NPV', 'PPV', 'F1']
# Rows are collected as plain dicts and converted to DataFrames once after the
# loop; growing a DataFrame with pd.concat inside a loop is quadratic and
# concatenating onto an empty frame is deprecated in recent pandas.
results_rows = []
ci_results_rows = []

# Predicted probabilities (column 1 of predict_proba) keyed by model name,
# one dict per split.
proba_dict_train = {}
proba_dict_test = {}
proba_dict_val = {}
# Patient IDs per split, selected with the same 'group' masks that define
# x_train / x_val / x_test, so they line up row by row with the predictions.
train_ids = merged_data[merged_data['group'] == 'train']['ID'].values
val_ids = merged_data[merged_data['group'] == 'val']['ID'].values
test_ids = merged_data[merged_data['group'] == 'test']['ID'].values

# SMOTE oversampling is model-independent and seeded, so it is computed once
# instead of once per model.
# NOTE(review): x_smotem / y_smote are not used by model.fit below. If
# oversampling is meant to be used for training, it must be fitted on
# x_train / y_train only; resampling x_all mixes val/test rows into the
# synthetic samples.
smote_model = SMOTE(random_state=42)
x_smotem, y_smote = smote_model.fit_resample(x_all, y_all)

# Train and evaluate every model.
for name, model in models:
    print(f"Training {name}...")
    model.fit(x_train, y_train)
    joblib.dump(model, f"results/model_weight/{name}.pkl")

    # Predicted probability of the second class for every split.
    y_train_proba = model.predict_proba(x_train)[:, 1]
    y_val_proba = model.predict_proba(x_val)[:, 1]
    y_test_proba = model.predict_proba(x_test)[:, 1]

    # Point metrics and bootstrap confidence intervals per split.
    # NOTE(review): the returned metrics include a 'threshold'; presumably
    # calculate_metrics_with_ci picks it separately for each split - confirm.
    train_metrics, train_ci = calculate_metrics_with_ci(np.array(y_train), np.array(y_train_proba), n_bootstrap=100)
    val_metrics, val_ci = calculate_metrics_with_ci(np.array(y_val), np.array(y_val_proba), n_bootstrap=100)
    test_metrics, test_ci = calculate_metrics_with_ci(np.array(y_test), np.array(y_test_proba), n_bootstrap=100)

    proba_dict_train[name] = y_train_proba
    proba_dict_val[name] = y_val_proba
    proba_dict_test[name] = y_test_proba

    # Save per-patient probabilities; these files are read again by the
    # DeLong comparison cell.
    train_output = pd.DataFrame({'ID': train_ids, 'proba': y_train_proba})
    train_output.to_csv(f'results/pred/{name}_train_proba.csv', index=False)

    val_output = pd.DataFrame({'ID': val_ids, 'proba': y_val_proba})
    val_output.to_csv(f'results/pred/{name}_val_proba.csv', index=False)

    test_output = pd.DataFrame({'ID': test_ids, 'proba': y_test_proba})
    test_output.to_csv(f'results/pred/{name}_test_proba.csv', index=False)

    # Collect one metrics row and one CI row per split.
    for dataset, metrics, res_ci in zip(['Train', 'val', 'test'],
                                        [train_metrics, val_metrics, test_metrics],
                                        [train_ci, val_ci, test_ci]):
        results_rows.append({
            'Dataset': dataset,
            'Model': name,
            'Threshold': metrics['threshold'],
            'ACC': metrics['accuracy'],
            'AUC': metrics['auc'],
            'Sensitivity': metrics['sensitivity'],
            'Specificity': metrics['specificity'],
            'NPV': metrics['npv'],
            'PPV': metrics['ppv'],
            'F1': metrics['f1'],
        })
        # 95% confidence intervals
        ci_results_rows.append({
            'Dataset': dataset,
            'Model': name,
            'ACC': res_ci['accuracy'],
            'AUC': res_ci['auc'],
            'Sensitivity': res_ci['sensitivity'],
            'Specificity': res_ci['specificity'],
            'NPV': res_ci['npv'],
            'PPV': res_ci['ppv'],
            'F1': res_ci['f1'],
        })

results = pd.DataFrame(results_rows, columns=results_columns)
ci_results = pd.DataFrame(ci_results_rows, columns=ci_results_columns)

# Show the metrics table.
display(results)
# Save the performance metrics and their 95% confidence intervals as CSV files.
results.to_csv('results/model_performance_metrics.csv', index=False)
ci_results.to_csv('results/model_performance_metrics_CI.csv', index=False)

RF 最佳


In [None]:
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
# Draw and save one horizontal bar chart of feature importance per fitted model.
for name, model in models:
    print(f"Plotting feature importance for {name}...")
    if hasattr(model, 'feature_importances_'):
        # Estimators that expose importances directly.
        feature_importance = model.feature_importances_
    elif hasattr(model, 'coef_'):
        # Estimators with coefficients: absolute value of the first coefficient row.
        feature_importance = np.abs(model.coef_[0])
    else:
        # Fallback for estimators with neither attribute: permutation importance
        # on the training split.
        perm_result = permutation_importance(model, x_train, y_train, n_repeats=30, random_state=42, n_jobs=-1)
        # Use the mean importance values themselves. argsort() would return rank
        # indices, which are not importances and would mislabel the bars.
        feature_importance = perm_result.importances_mean

    feature_importance_df = pd.DataFrame({
        'Feature': x_train.columns,
        'Importance': feature_importance
    })
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
    plt.xlabel("Feature Importance")
    plt.ylabel("Feature")
    plt.title(f"{name} Feature Importance")
    # Largest importance at the top of the chart.
    plt.gca().invert_yaxis()
    plt.grid(False)
    plt.savefig(f"results/img/{name}_feature_importance.svg", bbox_inches='tight')
    plt.show()
    # Release the figure so that looping over many models does not accumulate
    # open figures.
    plt.close()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import auc
from pixelmed_calc.medical_imaging.Ploting.plot_metric import plot_multiple_ROCs
  
# One ROC figure per model, overlaying the train / val / test curves.
for name in proba_dict_test:
    y_scores_list = [
        split_probas[name]
        for split_probas in (proba_dict_train, proba_dict_val, proba_dict_test)
    ]
    y_truths_list = [y_train, y_val, y_test]
    plot_multiple_ROCs(
        y_truths_list,
        y_scores_list,
        models=['train', 'val', 'test'],
        title=name,
    )
    plt.savefig(f'results/img/{name}_roc.svg')
    plt.show()

In [None]:

# Overlay the training-set ROC curves of three selected models in one figure.
compared_models = ['LR', 'MLP', 'GBM']
y_truths_list = [y_train] * len(compared_models)
y_scores_list = [proba_dict_train[model_key] for model_key in compared_models]
plot_multiple_ROCs(y_truths_list, y_scores_list, models=compared_models)

In [None]:
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve
from pixelmed_calc.medical_imaging.Ploting.plot_metric import plot_calibration_curves
from pixelmed_calc.medical_imaging.RadiologyComponents.components1 import HosmerLemeshow
# Per model: calibration curves for the three splits, followed by a
# Hosmer-Lemeshow result (Q=5) printed for each split, then the figure is saved.
for name in proba_dict_test:
    y_truths_list = [y_train, y_val, y_test]
    y_scores_list = [proba_dict_train[name], proba_dict_val[name], proba_dict_test[name]]
    plot_calibration_curves(y_truths_list, y_scores_list, models=['train', 'val', 'test'], title=name)

    # (printed header, predicted probabilities, true labels) per split, in
    # train -> val -> test order.
    hl_inputs = (
        (f'{name} train:\n', proba_dict_train[name], y_train),
        (f'{name} val:\n', proba_dict_val[name], y_val),
        (f'{name} test:\n ', proba_dict_test[name], y_test),
    )
    for header, scores, truth in hl_inputs:
        hl_result = HosmerLemeshow(scores, truth, Q=5)
        print(f'{header}{hl_result}')

    plt.savefig(f'results/img/{name}_calibrated.svg')
    plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pixelmed_calc.medical_imaging.Ploting.plot_metric import plot_DCA_curve

# Decision-curve analysis on the test split: one figure per model, plus the
# intersection points reported by plot_DCA_curve.
for model_name, _estimator in models:
    test_scores = proba_dict_test[model_name]
    x_axis_crossings, model_vs_all_crossings = plot_DCA_curve(test_scores, y_test, model=model_name)
    print("Intersection points with x-axis:", x_axis_crossings)
    print("Intersection points between net_benefit_model and net_benefit_all:", model_vs_all_crossings)
    plt.savefig(f"results/img/{model_name}_dca.svg")
    plt.show()

DeLong 检验

In [None]:
from itertools import combinations
import pandas as pd
from pixelmed_calc.medical_imaging.RadiologyComponents.delong import delong_roc_test

# Pairwise DeLong comparison of all models, separately on each data split.
# Predictions are read from the per-model CSV files written by the training cell.

# Model names in definition order, and the splits to compare on.
model_names = [model_name for model_name, _model in models]
data_types = ['test', 'val', 'train']

# Ground-truth labels per split. Each split must be compared against its own
# labels: using the test labels for 'val' would pair validation predictions
# with labels of different patients.
ground_truth_test = y_test
ground_truth_val = y_val
ground_truth_train = y_train
ground_truth_by_split = {
    'test': ground_truth_test,
    'val': ground_truth_val,
    'train': ground_truth_train,
}

# Load every prediction file once instead of re-reading it for each model pair.
predictions_by_model_split = {
    (model_name, data_type): pd.read_csv(f'results/pred/{model_name}_{data_type}_proba.csv')['proba']
    for model_name in model_names
    for data_type in data_types
}

# One record per (model pair, split). A dedicated name is used so the metrics
# table `results` from the training cell is not overwritten.
delong_records = []
for sm1, sm2 in combinations(model_names, 2):
    for data_type in data_types:
        delong = delong_roc_test(
            ground_truth_by_split[data_type],
            predictions_by_model_split[(sm1, data_type)],
            predictions_by_model_split[(sm2, data_type)],
        )

        # NOTE(review): assumes delong_roc_test returns the p-value at [0][0]
        # and the z statistic at [1][0] - confirm against its implementation.
        delong_records.append({
            'Model 1': sm1,
            'Model 2': sm2,
            'Data Type': data_type,
            'P-Value': delong[0][0],
            'Z-Value': delong[1][0]
        })

# Collect all comparisons in one table, save it and display it.
results_df = pd.DataFrame(delong_records)
results_df.to_csv('results/delong_test.csv')
results_df
