In [None]:
import pycaret

In [None]:
import pandas as pd
data = pd.read_excel(r'Model optimization\2.input\development dataset.xlsx',sheet_name="Sheet 1")

In [None]:
# Binary categorical columns paired with their label -> 0/1 code tables.
# Series.map leaves NaN for any label that is not a key of the table.
_binary_code_tables = {
    'Halo_Sign': {'Exists': 1, 'Absent': 0},
    'Gender': {'Female': 1, 'Male': 0},
    'Composition': {'Solid': 1, 'Others': 0},
    'Shape': {'Microlobulated': 1, 'Others': 0},
    'Echogenicity': {'Hypoechogenicity': 1, 'Others': 0},
    'Echogenic_Foci': {'Microcalcification': 1, 'Others': 0},
    'Margin': {'Irregular': 1, 'Smooth': 0},
    'ATR': {'Taller_than_Wide': 1, 'Wider_than_Tall': 0},
    'Pathological_Diagnosis': {'Malignant': 1, 'Benign': 0},
}
for _column, _codes in _binary_code_tables.items():
    data[_column] = data[_column].map(_codes)

In [None]:
# One-hot encode Posterior_Echo. With prefix='' the dummy columns are named
# '_<category>', so the rename strips the leading underscore for the three
# expected categories before the indicator columns are cast to int.
columns_to_convert1 = ['Absent_of_Shadowing','Posterior_Attenuation','Shadowing']
df_encoded1 = pd.get_dummies(data, columns=['Posterior_Echo'], prefix='').rename(
    columns={'_' + plain: plain for plain in columns_to_convert1}
)
df_encoded1[columns_to_convert1] = df_encoded1[columns_to_convert1].astype(int)


# Same treatment for Location.
columns_to_convert2 = ['Right_Lobe','Left_Lobe','Isthmus']
df_encoded2 = pd.get_dummies(df_encoded1, columns=['Location'], prefix='').rename(
    columns={'_' + plain: plain for plain in columns_to_convert2}
)
df_encoded2[columns_to_convert2] = df_encoded2[columns_to_convert2].astype(int)


In [None]:
from sklearn.preprocessing import LabelEncoder

order_list = ['Intra_BFS','Peri_BFS']
label_encoder = LabelEncoder()

for column_name in order_list:
    # Replace the column with integer codes (the encoder is refit per column).
    df_encoded2[column_name] = label_encoder.fit_transform(df_encoded2[column_name])

    # Print the label -> code table; the position of a class in classes_ is
    # exactly the code transform() assigns to it.
    for code, original_label in enumerate(label_encoder.classes_):
        print(f"{original_label}: {code}")

In [None]:
# Root folder for every output written by this notebook.
# NOTE: this is a raw string, so each `\\` stays as two literal backslash
# characters in the path (the non-raw literals appended to it later contribute
# single backslashes).
dir_path0 = r'Model optimization\\3.output\\'

In [None]:
data_input_reconde = df_encoded2

In [None]:
from pycaret.classification import *
# Column roles handed to setup(): explicit numeric and categorical features,
# plus the columns that must be excluded from modelling.
_numeric_cols = ["Age", "Maximum_Diameter"]
_categorical_cols = [
    "Gender", "Composition", "Shape", "Echogenicity", "Echogenic_Foci",
    "Margin", "ATR",
]
_ignored_cols = [
    "ACR", "Kwak", "Data_Type",
    "BMI", "Halo_Sign", "Absent_of_Shadowing", "Posterior_Attenuation", "Shadowing",
    "Right_Lobe", "Left_Lobe", "Isthmus", 'Intra_BFS',
]

# 70/30 shuffled, stratified split with a fixed session seed.
exp_clf = setup(
    data_input_reconde,
    target='Pathological_Diagnosis',
    session_id=111,
    numeric_features=_numeric_cols,
    categorical_features=_categorical_cols,
    train_size=0.7,
    data_split_shuffle=True,
    data_split_stratify=True,
    ignore_features=_ignored_cols,
)


In [None]:
get_config()

In [None]:

# Unregister these metric ids; a custom set is registered by
# register_all_sklearn_metrics() in a later cell.
for _metric_id in ('acc', 'auc', 'recall', 'precision', 'f1', 'kappa', 'mcc'):
    remove_metric(_metric_id)

In [None]:
get_metrics()

In [None]:
from pycaret.classification import *
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, cohen_kappa_score, matthews_corrcoef,
    balanced_accuracy_score, log_loss, average_precision_score,
    jaccard_score, brier_score_loss, 
)

def register_all_sklearn_metrics():
    """Register the notebook's custom evaluation metrics through `add_metric`.

    Registered in this order: ROC-AUC and Brier Score are computed from
    predicted probabilities (target='pred_proba'); Balanced Accuracy,
    Precision, Recall and F1_score from predicted labels (target='pred').
    Brier Score is the only one flagged as lower-is-better.
    """
    # (id, display name, score function, target, greater_is_better)
    metric_specs = (
        ('auc', 'ROC-AUC', roc_auc_score, 'pred_proba', True),
        ('balanced_acc', 'Balanced Accuracy', balanced_accuracy_score, 'pred', True),
        ('precision', 'Precision', precision_score, 'pred', True),
        ('recall', 'Recall', recall_score, 'pred', True),
        ('f1_score', 'F1_score', f1_score, 'pred', True),
        ('brier', 'Brier Score', brier_score_loss, 'pred_proba', False),
    )
    for metric_id, display_name, scorer, target, higher_is_better in metric_specs:
        add_metric(metric_id, display_name, scorer,
                   target=target, greater_is_better=higher_is_better)



if __name__ == "__main__":
    
    register_all_sklearn_metrics()
    
    

In [None]:
get_metrics()

In [None]:
# model training
# Compare the available models except the excluded ids, ranked by ROC-AUC,
# and keep up to 30 of them.
best_model_list = compare_models(
    sort='ROC-AUC',
    n_select=30,
    exclude=['svm', "ridge", "lda", "nb", "qda", "dummy"],
)

In [None]:
from sklearn import svm
# probability=True makes SVC expose probability estimates.
SVM = create_model(svm.SVC(probability=True, random_state=111))

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import RidgeClassifier
import numpy as np
Ridge_1 = RidgeClassifier(random_state=111)

# Candidate alphas: a fine grid starting at 0.1 (step 0.05, stop 1.1) followed
# by a coarse grid starting at 1 (step 5, stop 101).
alpha_values = np.concatenate((
    np.arange(0.1, 1.1, 0.05),
    np.arange(1, 101, 5),
))
param_grid_Ridge = dict(alpha=alpha_values)

# Grid-search the alphas, optimising ROC-AUC.
Ridge_2 = tune_model(
    Ridge_1,
    choose_better=True,
    optimize="ROC-AUC",
    custom_grid=param_grid_Ridge,
    n_iter=100,
    search_library='scikit-learn',
    search_algorithm='grid',
)

# Wrap the tuned estimator in a 10-fold probability calibrator and evaluate it.
calibrated_model_Ridge = CalibratedClassifierCV(Ridge_2, cv=10, n_jobs=-1)
Ridge = create_model(calibrated_model_Ridge)


In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

LDA_1 = LinearDiscriminantAnalysis()

# Automatic shrinkage, tried with both listed solvers.
param_grid_LDA = dict(shrinkage=['auto'], solver=['lsqr', 'eigen'])
LDA_2 = tune_model(
    LDA_1,
    choose_better=True,
    optimize="ROC-AUC",
    custom_grid=param_grid_LDA,
    n_iter=100,
    search_library='scikit-learn',
    search_algorithm='grid',
)

# Wrap the tuned estimator in a 10-fold probability calibrator and evaluate it.
calibrated_model_LDA = CalibratedClassifierCV(LDA_2, cv=10, n_jobs=-1)
LDA = create_model(calibrated_model_LDA)


In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB

NB_1 = GaussianNB()

# var_smoothing candidates from 1e-9 up to 1e-1, one per decade.
param_grid_NB = dict(
    var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
)
NB_2 = tune_model(
    NB_1,
    choose_better=True,
    optimize="ROC-AUC",
    custom_grid=param_grid_NB,
    n_iter=100,
    search_library='scikit-learn',
    search_algorithm='grid',
)

# Wrap the tuned estimator in a 10-fold probability calibrator and evaluate it.
calibrated_model_NB = CalibratedClassifierCV(NB_2, cv=10, n_jobs=-1)
NB = create_model(calibrated_model_NB)


In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import numpy as np

QDA_1 = QuadraticDiscriminantAnalysis()

# reg_param candidates starting at 0.1 in steps of 0.05 (stop value 1.05).
param_grid_QDA = dict(reg_param=np.arange(0.1, 1.05, 0.05))
QDA_2 = tune_model(
    QDA_1,
    choose_better=True,
    optimize="ROC-AUC",
    custom_grid=param_grid_QDA,
    n_iter=100,
    search_library='scikit-learn',
    search_algorithm='grid',
)

# Wrap the tuned estimator in a 10-fold probability calibrator and evaluate it.
calibrated_model_QDA = CalibratedClassifierCV(QDA_2, cv=10, n_jobs=-1)
QDA = create_model(calibrated_model_QDA)


In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier
# Gaussian process classifier with library defaults and a fixed seed.
GPC = create_model(GaussianProcessClassifier(random_state=234))

In [None]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with library defaults and a fixed seed.
MLP = create_model(MLPClassifier(random_state=234))


In [None]:
leaderboard_df_1 = get_leaderboard()
leaderboard_df_new_1 = leaderboard_df_1.sort_values(by='ROC-AUC',ascending=False)
leaderboard_df_new_1


In [None]:
model_select_models_1 = leaderboard_df_new_1['Model']
model_select_models_1

In [None]:
model_select_names_1 = leaderboard_df_new_1['Model Name']
model_select_names_1

In [None]:
# Hyperparameter optimization
#
# Bayesian-tune every model currently on the leaderboard, optimising ROC-AUC.
# The Series is iterated directly (not via `series[i]`, which is a label
# lookup on a re-sorted index), so no model can be skipped by a missing label.
n_1 = len(model_select_models_1)

for _candidate_name, _candidate in zip(model_select_names_1, model_select_models_1):
    try:
        tuned_model = tune_model(
            _candidate,
            choose_better=True,
            optimize="ROC-AUC",
            n_iter=100,
            search_library='scikit-optimize',
            search_algorithm='bayesian',
        )
    except Exception as exc:
        # Best effort: a model that cannot be tuned is skipped, but the failure
        # is reported instead of being silently swallowed. KeyboardInterrupt
        # is deliberately not caught so the loop can still be stopped.
        print(f"Tuning skipped for {_candidate_name}: {type(exc).__name__}: {exc}")
        continue


In [None]:
leaderboard_df_2 = get_leaderboard()
leaderboard_df_new_2 = leaderboard_df_2.sort_values(by='ROC-AUC',ascending=False)
leaderboard_df_new_2

In [None]:
# Keep only the highest ROC-AUC row for each model name, then renumber rows.
leaderboard_df_sorted = leaderboard_df_new_2.sort_values('ROC-AUC', ascending=False)
leaderboard_df_new_3 = leaderboard_df_sorted.drop_duplicates(
    subset=['Model Name'],
    keep='first',
)
leaderboard_df_new_4 = leaderboard_df_new_3.reset_index(drop=True)
leaderboard_df_new_4

In [None]:
# Move the 'Model' column to the far right: pop() removes it and the
# assignment re-appends it as the last column.
leaderboard_df_new_4['Model'] = leaderboard_df_new_4.pop('Model')

leaderboard_df_new_4

In [None]:
leaderboard_df_new_4.set_index('Model Name', inplace=True)

In [None]:
# Record, per model, the stringified parameters of the last element of the
# object stored in 'Model' (indexed with [-1]).
leaderboard_df_new_4['parm'] = [
    str(fitted[-1].get_params()) for fitted in leaderboard_df_new_4['Model']
]


In [None]:
# # calculate Specificity
# Derived as 2 * Balanced Accuracy - Recall (for a binary target, Balanced
# Accuracy is the mean of Recall and Specificity), then moved to position 3.
_specificity = leaderboard_df_new_4['Balanced Accuracy'] * 2 - leaderboard_df_new_4['Recall']
leaderboard_df_new_4['Specificity'] = _specificity
cols = [c for c in leaderboard_df_new_4.columns if c != 'Specificity']
cols.insert(3, 'Specificity')
leaderboard_df_new_4 = leaderboard_df_new_4[cols]


In [None]:
leaderboard_df_new_4

In [None]:
#Save models

from pycaret.classification import save_model

# Persist every leaderboard model, using the frame's index value as the name
# passed to save_model.
for _save_name, _fitted in leaderboard_df_new_4['Model'].items():
    save_model(_fitted, _save_name)
    


In [None]:
probability_threshold_input = 0.5

In [None]:
import os

# Folder holding the per-model prediction workbooks; created when missing.
dir_path3 = dir_path0 + '3-Model-Result_prob_excel\\'
os.makedirs(dir_path3, exist_ok=True)

In [None]:
import os
# Sub-folder for hold-out predictions at the default probability threshold.
output_path_test = (
    dir_path0
    + '3-Model-Result_prob_excel\\'
    + f"3-1-Model-Result_prob_Test-Data-{probability_threshold_input}\\"
)
os.makedirs(output_path_test, exist_ok=True)

In [None]:
# Score every leaderboard model with predict_model (no `data=` argument) at the
# default probability threshold, collect the one-row metric summaries, and
# write the row-level predictions to disk.
_holdout_summaries = []

for index, row in leaderboard_df_new_4.iterrows():
    model_name = index

    pred_model_test_data = predict_model(
        row['Model'],
        raw_score=True,
        probability_threshold=probability_threshold_input,
    )
    pred_model_test_data_summary = pull()
    _holdout_summaries.append(pred_model_test_data_summary)

    # The Youden/cut-off cell further down reads
    # '<model>_prob_Test-Data-<threshold>.xlsx' (sheet "Sheet1") from
    # output_path_test; nothing else in the notebook creates those files,
    # so they are written here.
    pred_model_test_data.to_excel(
        output_path_test + f"{model_name}_prob_Test-Data-{probability_threshold_input}.xlsx",
        sheet_name="Sheet1",
        index=False,
    )

# Single concat instead of growing the frame inside the loop.
pred_model_test_data_summary_all = pd.concat(_holdout_summaries, axis=0)


In [None]:
pred_model_test_data_summary_all

In [None]:
# Rank by ROC-AUC (best first) and index the summary by model name.
pred_model_test_data_summary_all = (
    pred_model_test_data_summary_all
    .sort_values(by='ROC-AUC', ascending=False)
    .set_index('Model')
)
pred_model_test_data_summary_all


In [None]:
# Map each leaderboard index value (model name) to its 'parm' string.
model_params_dict = dict(
    zip(leaderboard_df_new_4.index, leaderboard_df_new_4['parm'])
)
    

In [None]:
# Attach the stored parameter string of every model to the summary table.
# A model name missing from model_params_dict raises KeyError.
pred_model_test_data_summary_all['parm'] = [
    model_params_dict[_name] for _name in pred_model_test_data_summary_all.index
]


In [None]:
# calculate Specificity
# Derived as 2 * Balanced Accuracy - Recall (for a binary target, Balanced
# Accuracy is the mean of Recall and Specificity), then moved to position 3.
_specificity = (
    pred_model_test_data_summary_all['Balanced Accuracy'] * 2
    - pred_model_test_data_summary_all['Recall']
)
pred_model_test_data_summary_all['Specificity'] = _specificity
cols = [c for c in pred_model_test_data_summary_all.columns if c != 'Specificity']
cols.insert(3, 'Specificity')
pred_model_test_data_summary_all = pred_model_test_data_summary_all[cols]

In [None]:
pred_model_test_data_summary_all

In [None]:
from sklearn.metrics import precision_recall_curve,roc_curve, auc

import pandas as pd
import numpy as np
# For every prediction workbook in output_path_test compute PR-AUC, ROC-AUC,
# the maximum Youden index (TPR - FPR) and the score threshold achieving it.
_youden_columns = ['Model Name','PR-AUC','Max Youden Index ','Cutoff Value', 'ROC-AUC-2']
_youden_records = []
# Same value as the former hard-coded '_prob_Test-Data-0.5.xlsx' while
# probability_threshold_input is 0.5, but it follows the configured threshold.
_test_file_suffix = f'_prob_Test-Data-{probability_threshold_input}.xlsx'

for filename in os.listdir(output_path_test):
    if not filename.endswith('.xlsx'):
        continue

    model_name = filename.split(_test_file_suffix)[0]

    file_path = os.path.join(output_path_test, filename)
    df = pd.read_excel(file_path, sheet_name="Sheet1")

    y_true = df['Pathological_Diagnosis']
    y_scores = df['prediction_score_1']

    precision, recall, thresholds_pr = precision_recall_curve(y_true, y_scores)
    pr_auc = auc(recall, precision)

    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)

    youden_values = tpr - fpr
    # thresholds[0] is roc_curve's sentinel for "predict nothing positive"
    # (np.inf or max score + 1), which is not a usable probability cut-off.
    # Its Youden index is 0, so skipping it only matters when no real
    # threshold beats chance.
    if len(youden_values) > 1:
        best_position = int(np.argmax(youden_values[1:])) + 1
    else:
        best_position = 0
    cutoff_value = thresholds[best_position]

    pr_auc = np.round(pr_auc, 4)
    youden_index_max = np.round(np.max(youden_values), 4)
    cutoff_value = np.round(cutoff_value, 4)
    roc_auc = np.round(roc_auc, 4)

    _youden_records.append({
        'Model Name': model_name,
        'PR-AUC': pr_auc,
        'Max Youden Index ': youden_index_max,
        'Cutoff Value': cutoff_value,
        'ROC-AUC-2': roc_auc,
    })

# Build the table once instead of concatenating onto an empty frame per file.
pd_youden = pd.DataFrame(_youden_records, columns=_youden_columns)


In [None]:
# Rank by ROC-AUC-2 (best first) and index the table by model name.
pd_youden = (
    pd_youden
    .sort_values(by='ROC-AUC-2', ascending=False)
    .set_index('Model Name')
)

pd_youden

In [None]:
# Write the Youden/cut-off table, including its index, to Excel.
# NOTE(review): `dir_path2` is not assigned in any cell visible in this
# notebook (only dir_path0 and dir_path3 are), so on a fresh kernel this line
# raises NameError. Define it before running.
pd_youden.to_excel(f'{dir_path2}2-3-Test-Data_Youden-Cutoff.xlsx', index=True)

In [None]:
import os
# Re-point output_path_test at the sub-folder for hold-out predictions made
# with each model's best cut-off; created when missing.
output_path_test = (
    dir_path0
    + '3-Model-Result_prob_excel\\'
    + "3-2-Model-Result_prob_Test-Data_Best-Cutoff\\"
)
os.makedirs(output_path_test, exist_ok=True)

In [None]:
# Reload the saved Youden table and build a model name -> cut-off lookup.
# NOTE(review): `dir_path2` is not assigned in any cell visible in this
# notebook; it must be defined before this cell runs.
pd_youden = pd.read_excel(
    f'{dir_path2}2-3-Test-Data_Youden-Cutoff.xlsx',
    sheet_name="Sheet1",
).set_index('Model Name')

dict_modelname_cutoff = dict(zip(pd_youden.index, pd_youden['Cutoff Value']))


In [None]:
# Re-score every leaderboard model with predict_model (no `data=` argument),
# this time using the model's own Youden cut-off as probability threshold.
_best_cutoff_summaries = []

for index, row in leaderboard_df_new_4.iterrows():
    model_name = index

    cutoff = dict_modelname_cutoff[index]
    pred_model_test_data = predict_model(
        row['Model'],
        raw_score=True,
        probability_threshold=float(cutoff),
    )

    pred_model_test_data_summary = pull()
    _best_cutoff_summaries.append(pred_model_test_data_summary)

    # The plotting cells read '<model>_*.xlsx' workbooks from the
    # 3-2-...Best-Cutoff folder (output_path_test at this point), and nothing
    # else in the notebook writes them. The suffix mirrors the
    # '_prob_Validation-Data-BestCutoff.xlsx' name used for the external data;
    # NOTE(review): adjust it if a different file name is expected.
    pred_model_test_data.to_excel(
        output_path_test + f"{model_name}_prob_Test-Data-BestCutoff.xlsx",
        sheet_name="Sheet1",
        index=False,
    )

# Single concat instead of growing the frame inside the loop.
pred_model_test_data_summary_all = pd.concat(_best_cutoff_summaries, axis=0)
    

In [None]:
# calculate Specificity
# Derived as 2 * Balanced Accuracy - Recall (for a binary target, Balanced
# Accuracy is the mean of Recall and Specificity), then moved to position 3.
_specificity = (
    pred_model_test_data_summary_all['Balanced Accuracy'] * 2
    - pred_model_test_data_summary_all['Recall']
)
pred_model_test_data_summary_all['Specificity'] = _specificity
cols = [c for c in pred_model_test_data_summary_all.columns if c != 'Specificity']
cols.insert(3, 'Specificity')

# Reorder the columns, rank by ROC-AUC (best first) and index by model name.
pred_model_test_data_summary_all = (
    pred_model_test_data_summary_all[cols]
    .sort_values(by='ROC-AUC', ascending=False)
    .set_index('Model')
)

pred_model_test_data_summary_all

In [None]:
# Join the Youden table onto the summary by model name. The default inner
# join drops models present in only one of the two tables.
pred_model_test_data_summary_all = pred_model_test_data_summary_all.merge(
    pd_youden,
    left_index=True,
    right_index=True,
)


In [None]:
# Attach the stored parameter string of every model to the summary table.
# A model name missing from model_params_dict raises KeyError.
pred_model_test_data_summary_all['parm'] = [
    model_params_dict[_name] for _name in pred_model_test_data_summary_all.index
]


In [None]:
pred_model_test_data_summary_all

In [None]:

# Write the best-cut-off evaluation table, including its index, to Excel.
# NOTE(review): `dir_path2` is not assigned in any cell visible in this
# notebook; it must be defined before this cell runs.
pred_model_test_data_summary_all.to_excel(f'{dir_path2}2-4-Model-Result_Evaluation-Metrics_Test-Data-BestCutoff.xlsx',index=True)


In [None]:
#External Validation 

data_validation = pd.read_excel(r'Model optimization\2.input\external validation dataset.xlsx',sheet_name="Sheet 1")

data_validation

In [None]:

# Binary categorical columns paired with their label -> 0/1 code tables (the
# same tables applied to the development data). Series.map leaves NaN for any
# label that is not a key of the table.
_binary_code_tables_validation = {
    'Halo_Sign': {'Exists': 1, 'Absent': 0},
    'Gender': {'Female': 1, 'Male': 0},
    'Composition': {'Solid': 1, 'Others': 0},
    'Shape': {'Microlobulated': 1, 'Others': 0},
    'Echogenicity': {'Hypoechogenicity': 1, 'Others': 0},
    'Echogenic_Foci': {'Microcalcification': 1, 'Others': 0},
    'Margin': {'Irregular': 1, 'Smooth': 0},
    'ATR': {'Taller_than_Wide': 1, 'Wider_than_Tall': 0},
    'Pathological_Diagnosis': {'Malignant': 1, 'Benign': 0},
}
for _column, _codes in _binary_code_tables_validation.items():
    data_validation[_column] = data_validation[_column].map(_codes)


In [None]:
# One-hot encode Posterior_Echo on the external validation data. With prefix=''
# the dummy columns are named '_<category>'; the rename strips the underscore.
columns_to_convert1 = ['Absent_of_Shadowing','Posterior_Attenuation','Shadowing']
df_encoded1 = pd.get_dummies(data_validation, columns=['Posterior_Echo'], prefix='')
df_encoded1.rename(columns={'_Absent_of_Shadowing':'Absent_of_Shadowing','_Posterior_Attenuation':'Posterior_Attenuation','_Shadowing':'Shadowing'},inplace=True)
# A category that never occurs in the external cohort produces no dummy column,
# which would make the selection below (and the later drop) raise KeyError.
# Add it as an all-zero indicator instead.
for _expected in columns_to_convert1:
    if _expected not in df_encoded1.columns:
        df_encoded1[_expected] = 0
df_encoded1[columns_to_convert1] = df_encoded1[columns_to_convert1].astype(int)


# Same treatment for Location.
columns_to_convert2 = ['Right_Lobe','Left_Lobe','Isthmus']
df_encoded2 = pd.get_dummies(df_encoded1, columns=['Location'], prefix='')
df_encoded2.rename(columns={'_Right_Lobe':'Right_Lobe','_Left_Lobe':'Left_Lobe','_Isthmus':'Isthmus'},inplace=True)
for _expected in columns_to_convert2:
    if _expected not in df_encoded2.columns:
        df_encoded2[_expected] = 0
df_encoded2[columns_to_convert2] = df_encoded2[columns_to_convert2].astype(int)

In [None]:
from sklearn.preprocessing import LabelEncoder

order_list = ['Intra_BFS','Peri_BFS']
label_encoder = LabelEncoder()

for column_name in order_list:
    # Fit on the development data's raw labels, not on the validation data.
    # Refitting on the validation column would renumber the codes whenever its
    # set of labels differs from the development set, so the models would
    # receive codes with a different meaning than the ones they were trained on.
    # This relies on `data` still being the development frame loaded at the top
    # of the notebook (its Intra_BFS/Peri_BFS columns are never re-encoded).
    label_encoder.fit(data[column_name])

    # transform() raises ValueError for a label the development data never
    # contained, instead of silently assigning it a code.
    df_encoded2[column_name] = label_encoder.transform(df_encoded2[column_name])

    # Print the label -> code table; the position of a class in classes_ is
    # exactly the code transform() assigns to it.
    for code, original_label in enumerate(label_encoder.classes_):
        print(f"{original_label}: {code}")

In [None]:
validation_data = df_encoded2

In [None]:
validation_data

In [None]:
# Columns removed from the external validation frame before prediction.
ignore_features = [
    "BMI", "Halo_Sign",
    "Absent_of_Shadowing", "Posterior_Attenuation", "Shadowing",
    "Right_Lobe", "Left_Lobe", "Isthmus",
    'Intra_BFS',
]

validation_data = validation_data.drop(columns=ignore_features)

validation_data

In [None]:
# Score every leaderboard model on the external validation data using the
# cut-off stored for it in dict_modelname_cutoff.
#
# The evaluation cell below lists `output_path_validation` and expects
# '<model>_prob_Validation-Data-BestCutoff.xlsx' workbooks (sheet "Sheet1"),
# and the plotting cells read the '3-3-Model-Result_prob_Validation-Data-cutoff'
# folder. Nothing else in the notebook defines that path or writes those files,
# so both are done here.
output_path_validation = dir_path0 + '3-Model-Result_prob_excel\\' + "3-3-Model-Result_prob_Validation-Data-cutoff\\"
os.makedirs(output_path_validation, exist_ok=True)

_external_summaries = []

for index,row in leaderboard_df_new_4.iterrows():
    model_name = index

    cutoff = dict_modelname_cutoff[index]

    prediction = predict_model(
        row['Model'],
        data=validation_data,
        raw_score=True,
        probability_threshold=float(cutoff),
    )

    pred_model_external_data_summary = pull()
    _external_summaries.append(pred_model_external_data_summary)

    prediction.to_excel(
        output_path_validation + f"{model_name}_prob_Validation-Data-BestCutoff.xlsx",
        sheet_name="Sheet1",
        index=False,
    )

# Single concat instead of growing the frame inside the loop.
pred_model_external_data_summary_all = pd.concat(_external_summaries, axis=0)


In [None]:
# calculate Specificity
# Derived as 2 * Balanced Accuracy - Recall (for a binary target, Balanced
# Accuracy is the mean of Recall and Specificity), then moved to position 3.
_specificity = (
    pred_model_external_data_summary_all['Balanced Accuracy'] * 2
    - pred_model_external_data_summary_all['Recall']
)
pred_model_external_data_summary_all['Specificity'] = _specificity
cols = [c for c in pred_model_external_data_summary_all.columns if c != 'Specificity']
cols.insert(3, 'Specificity')

# Reorder the columns, rank by ROC-AUC (best first) and index by model name.
pred_model_external_data_summary_all = (
    pred_model_external_data_summary_all[cols]
    .sort_values(by='ROC-AUC', ascending=False)
    .set_index('Model')
)

pred_model_external_data_summary_all


In [None]:
from sklearn.metrics import precision_recall_curve,roc_curve, auc

import pandas as pd
import numpy as np
# For every prediction workbook in output_path_validation compute PR-AUC,
# ROC-AUC and the maximum Youden index (TPR - FPR).
_youden_1_columns = ['Model Name','PR-AUC','Max Youden Index ','ROC-AUC-2']
_youden_1_records = []

for filename in os.listdir(output_path_validation):
    if not filename.endswith('.xlsx'):
        continue

    model_name = filename.split('_prob_Validation-Data-BestCutoff.xlsx')[0]

    file_path = os.path.join(output_path_validation, filename)
    df = pd.read_excel(file_path, sheet_name="Sheet1")

    y_true = df['Pathological_Diagnosis']
    y_scores = df['prediction_score_1']

    precision, recall, thresholds_pr = precision_recall_curve(y_true, y_scores)
    pr_auc = auc(recall, precision)

    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)

    pr_auc = np.round(pr_auc, 4)
    youden_index_max = np.round(np.max(tpr - fpr), 4)
    roc_auc = np.round(roc_auc, 4)

    _youden_1_records.append({
        'Model Name': model_name,
        'PR-AUC': pr_auc,
        'Max Youden Index ': youden_index_max,
        'ROC-AUC-2': roc_auc,
    })

# Build the table once instead of concatenating onto an empty frame per file.
pd_youden_1 = pd.DataFrame(_youden_1_records, columns=_youden_1_columns)


In [None]:
# Rank by ROC-AUC-2 (best first) and index the table by model name.
# The sorted frame was previously assigned to a misspelled name (`pd_youde_1`),
# so pd_youden_1 itself was never sorted.
pd_youden_1 = (
    pd_youden_1
    .sort_values(by='ROC-AUC-2', ascending=False)
    .set_index('Model Name')
)


In [None]:
pd_youden_1['Cutoff Value'] = pd_youden['Cutoff Value'].astype(float)
pd_youden_1


In [None]:
# Join the validation Youden table onto the summary by model name. The default
# inner join drops models present in only one of the two tables.
pred_model_external_data_summary_all = pred_model_external_data_summary_all.merge(
    pd_youden_1,
    left_index=True,
    right_index=True,
)


In [None]:
# Attach the stored parameter string of every model to the summary table.
# A model name missing from model_params_dict raises KeyError.
pred_model_external_data_summary_all['parm'] = [
    model_params_dict[_name] for _name in pred_model_external_data_summary_all.index
]


In [None]:
pred_model_external_data_summary_all 

In [None]:
#visualization

In [None]:

# Twenty distinct hex colours, one per model curve in the combined plots.
main_colors = [
    '#1F77B4', '#FF7F0E', '#2CA02C', '#D62728', '#9467BD',
    '#8C564B', '#E377C2', '#7F7F7F', '#BCBD22', '#17BECF',
    '#4E79A7', '#F28E2B', '#59A14F', '#E15759', '#B07AA1',
    '#9C755F', '#EDC948', '#76B7B2', '#FF9DA7', '#9F9F9F',
]

In [None]:
#confusion matrix

In [None]:
import os

In [None]:
# Output folder for the confusion-matrix figures.
# An empty string means "current working directory": the figure file names are
# built as f'{output_path}...', so nothing needs to be created in that case
# (os.makedirs("") would raise FileNotFoundError).
output_path = r""
if output_path:
    os.makedirs(output_path, exist_ok=True)

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap
import pandas as pd
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False  
plt.rcParams['svg.fonttype'] = 'none'
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42


COLOR_SET = ['#F0FFF0', '#1F77B4']  # colormap endpoints, low -> high
FONT_COLOR = 'black'                # colour of the cell annotations
ALPHA = 0.8                         # opacity of the heat-map image

name_list = ["3-2-Model-Result_prob_Test-Data_Best-Cutoff","3-3-Model-Result_prob_Validation-Data-cutoff"]

# Prediction folder -> (dataset wording used in the title, output file prefix)
_cm_variants = {
    '3-2-Model-Result_prob_Test-Data_Best-Cutoff': ('Test Data', 'Test-Data'),
    '3-3-Model-Result_prob_Validation-Data-cutoff': ('External Validation Data', 'Validation-Data'),
}

class_names = ['Benign', 'Malignant']
cmap = LinearSegmentedColormap.from_list('my_cmap', COLOR_SET, N=256)

for name in name_list:

    path_input = f'Model optimization\\3.output\\3-Model-Result_prob_excel\\{name}\\'
    dataset_title, file_prefix = _cm_variants[name]

    for file in os.listdir(path_input):
        if not file.endswith(".xlsx"):
            continue

        # The model name is the part of the file name before the first '_'.
        model_name = file.split('_')[0]

        # Read into its own variable so the development frame `data` loaded at
        # the top of the notebook is not overwritten.
        pred_frame = pd.read_excel(os.path.join(path_input, file))

        y_true = pred_frame['Pathological_Diagnosis']
        y_pred = pred_frame['prediction_label']

        # labels=[0, 1] keeps the matrix 2x2 (matching class_names) even when
        # one class is missing from both the truth and the predictions.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

        # Row-normalise; a class with no true samples gives a row of zeros
        # instead of NaN from a division by zero.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm_normalized = np.divide(
            cm, row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )

        plt.figure(figsize=(8, 6))
        plt.imshow(cm_normalized, cmap=cmap, alpha=ALPHA,
                   interpolation='nearest',
                   vmin=0, vmax=1)

        # Annotate each cell with the raw count and the row percentage.
        for r in range(cm.shape[0]):
            for c in range(cm.shape[1]):
                plt.text(c, r, f'{cm[r, c]}\n({cm_normalized[r, c]:.2%})', color=FONT_COLOR,va='center', ha='center',fontsize=12,fontweight='bold')

        plt.xticks(np.arange(cm.shape[1]), class_names)
        plt.yticks(np.arange(cm.shape[0]), class_names)
        plt.xlabel('Predicted Label', fontsize=12,labelpad=10)
        plt.ylabel('True Label', fontsize=12,labelpad=-6)
        plt.title(f'Confusion Matrix by {model_name} Model in {dataset_title}', pad=10, fontsize=14)

        cbar = plt.colorbar()
        cbar.set_label('Percentage', rotation=270, labelpad=14,fontsize=12)

        plt.grid(False)
        plt.tight_layout()

        # Save the figure as SVG and PDF.
        for ext in ('svg', 'pdf'):
            plt.savefig(f'{output_path}{file_prefix}_{model_name}_Confusion-Matrix.{ext}', format=ext, bbox_inches='tight', dpi=1200, transparent=True)

        plt.show()
        # Release the figure so a long model list does not accumulate open figures.
        plt.close()

In [None]:
# ROC curve

In [None]:
# Output folder for the ROC figures. An empty string means "current working
# directory" (file names are built as f'{output_path}...'), in which case
# there is nothing to create; os.makedirs("") would raise FileNotFoundError.
output_path = r""
if output_path:
    os.makedirs(output_path, exist_ok=True)


In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import precision_recall_curve, auc
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap
import pandas as pd
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False  
plt.rcParams['svg.fonttype'] = 'none'
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42


name_list = ["3-2-Model-Result_prob_Test-Data_Best-Cutoff","3-3-Model-Result_prob_Validation-Data-cutoff"]

# Prediction folder -> (figure title, suffix of the output file name)
_roc_variants = {
    '3-2-Model-Result_prob_Test-Data_Best-Cutoff': ('ROC Curve by All Model in Test Data', 'Test-Data'),
    '3-3-Model-Result_prob_Validation-Data-cutoff': ('ROC Curve by All Model in External Validation Data', 'Validation-Data'),
}

for name in name_list:

    path_input = f'Model optimization\\3.output\\3-Model-Result_prob_excel\\{name}\\'
    title_text, file_suffix = _roc_variants[name]

    # model name -> its own labels, scores and curve colour
    roc_dict = {}
    xlsx_files = [f for f in os.listdir(path_input) if f.endswith(".xlsx")]

    for i, file in enumerate(xlsx_files):
        # The model name is the part of the file name before the first '_'.
        model_name = file.split('_')[0]

        # Read into its own variable so the development frame `data` loaded at
        # the top of the notebook is not overwritten.
        pred_frame = pd.read_excel(os.path.join(path_input, file))

        roc_dict[model_name] = {
            # Labels are stored per model rather than reusing whichever file
            # happened to be read last.
            'y_true': pred_frame['Pathological_Diagnosis'],
            'y_proba': pred_frame['prediction_score_1'],
            # Cycle through the palette so more models than colours cannot
            # raise IndexError.
            'color': main_colors[i % len(main_colors)],
        }

    # Order the legend by ROC AUC, best model first.
    roc_dict = dict(sorted(
        roc_dict.items(),
        key=lambda item: roc_auc_score(item[1]['y_true'], item[1]['y_proba']),
        reverse=True,
    ))

    plt.figure(figsize=(8, 6), dpi=600)

    for model_name, model_data in roc_dict.items():
        fpr, tpr, thresholds = roc_curve(model_data['y_true'], model_data['y_proba'])
        roc_auc = auc(fpr, tpr)

        plt.plot(fpr, tpr, color=model_data['color'], lw=2, label=f'{model_name} (AUC = {roc_auc:.4f})')

    # Diagonal reference line for a random classifier.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random chance')

    plt.xlabel('False Positive Rate (FPR)', fontsize=12)
    plt.ylabel('True Positive Rate (TPR)', fontsize=12)
    plt.title(title_text, fontsize=12)

    plt.legend(labelspacing=0.5,loc='lower right',  fontsize=8, frameon=False)
    plt.grid(False)
    plt.tight_layout()

    # Save the figure as SVG and PDF.
    for ext in ('svg', 'pdf'):
        plt.savefig(f'{output_path}All-Model_ROC-Curve_{file_suffix}.{ext}', format=ext, bbox_inches='tight', dpi=1200)

    plt.show()



In [None]:
# PR curve

In [None]:
# Output folder for the PR figures. An empty string means "current working
# directory" (file names are built as f'{output_path}...'), in which case
# there is nothing to create; os.makedirs("") would raise FileNotFoundError.
output_path = r""
if output_path:
    os.makedirs(output_path, exist_ok=True)


In [None]:
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score
import matplotlib.pyplot as plt
import pandas as pd
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False  
plt.rcParams['svg.fonttype'] = 'none'
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42


name_list = ["3-2-Model-Result_prob_Test-Data_Best-Cutoff","3-3-Model-Result_prob_Validation-Data-cutoff"]

# Prediction folder -> (figure title, suffix of the output file name)
_pr_variants = {
    '3-2-Model-Result_prob_Test-Data_Best-Cutoff': ('PR Curve by All Models in Test Data', 'Test-Data'),
    '3-3-Model-Result_prob_Validation-Data-cutoff': ('PR Curve by All Models in External Validation Data', 'Validation-Data'),
}

for name in name_list:

    path_input = f'Model optimization\\3.output\\3-Model-Result_prob_excel\\{name}\\'
    title_text, file_suffix = _pr_variants[name]

    # model name -> its own labels, scores and curve colour
    pr_dict = {}
    xlsx_files = [f for f in os.listdir(path_input) if f.endswith(".xlsx")]

    for i, file in enumerate(xlsx_files):
        # The model name is the part of the file name before the first '_'.
        model_name = file.split('_')[0]

        # Read into its own variable so the development frame `data` loaded at
        # the top of the notebook is not overwritten.
        pred_frame = pd.read_excel(os.path.join(path_input, file))

        pr_dict[model_name] = {
            # Labels are stored per model rather than reusing whichever file
            # happened to be read last.
            'y_true': pred_frame['Pathological_Diagnosis'],
            'y_proba': pred_frame['prediction_score_1'],
            # Cycle through the palette so more models than colours cannot
            # raise IndexError.
            'color': main_colors[i % len(main_colors)],
        }

    # The legend is ordered by ROC AUC (not PR AUC), best model first.
    pr_dict = dict(sorted(
        pr_dict.items(),
        key=lambda item: roc_auc_score(item[1]['y_true'], item[1]['y_proba']),
        reverse=True,
    ))

    plt.figure( dpi=600)

    for model_name, model_data in pr_dict.items():
        precision, recall, thresholds = precision_recall_curve(model_data['y_true'], model_data['y_proba'])
        pr_auc = auc(recall, precision)

        plt.plot(recall, precision, color=model_data['color'], lw=2, label=f'{model_name} (AUC = {pr_auc:.4f})')

    plt.xlim([-0.1, 1.1])
    plt.ylim([-0.1, 1.1])
    plt.xlabel('Recall', fontsize=12 )
    plt.ylabel('Precision', fontsize=12)
    plt.title(title_text, fontsize=12)

    plt.legend(labelspacing=0.4,loc='lower right',  fontsize=8, frameon=False)
    plt.grid(False)
    plt.tight_layout()

    # Save the figure as SVG and PDF.
    for ext in ('svg', 'pdf'):
        plt.savefig(f'{output_path}All-Model_PR-Curve_{file_suffix}.{ext}', format=ext, bbox_inches='tight', dpi=1200)

    plt.show()

In [None]:
# calibration curve

In [None]:
# Output folder for the calibration figures. An empty string means "current
# working directory" (file names are built as f'{output_path}...'), in which
# case there is nothing to create; os.makedirs("") would raise FileNotFoundError.
output_path =  r""
if output_path:
    os.makedirs(output_path, exist_ok=True)
    

In [None]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss

# Publication settings: Arial text, ASCII minus sign, text kept as text in SVG,
# and TrueType (type 42) fonts embedded in PDF / PS output.
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['svg.fonttype'] = 'none'
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

name_list = ["3-2-Model-Result_prob_Test-Data_Best-Cutoff","3-3-Model-Result_prob_Validation-Data-cutoff"]

# Result folder -> (wording used in the title, wording used in the file name).
dataset_labels = {
    "3-2-Model-Result_prob_Test-Data_Best-Cutoff": ('Test Data', 'Test-Data'),
    "3-3-Model-Result_prob_Validation-Data-cutoff": ('External Validation Data', 'Validation-Data'),
}

# One calibration figure per model workbook and per dataset.
for name in name_list:

    path_input = f'Model optimization\\3.output\\3-Model-Result_prob_excel\\{name}\\'
    title_suffix, file_suffix = dataset_labels[name]

    for file in os.listdir(path_input):
        if not file.endswith(".xlsx"):
            continue

        # The text before the first underscore of the file name labels the plot.
        model_name = file.split('_')[0]

        data = pd.read_excel(os.path.join(path_input, file))

        y_true = data['Pathological_Diagnosis']
        y_proba = data['prediction_score_1']

        # Observed positive rate vs. mean predicted probability in 10 bins.
        fraction_of_positives, mean_predicted_value = calibration_curve(y_true, y_proba, n_bins=10)
        brier_score = round(brier_score_loss(y_true, y_proba), 4)

        fig, ax = plt.subplots(dpi=600)
        ax.plot(mean_predicted_value, fraction_of_positives, marker='o', color="#AD002AB2", lw=2, label='Model Calibration Curve')
        ax.plot([0, 1], [0, 1], color='blue', linestyle='--', label='Perfectly Calibrated Curve')
        # Position is given in axes coordinates (0-1), not data coordinates.
        ax.text(0.752, 0.135, f'Brier Score: {brier_score}', transform=ax.transAxes, fontsize=8, verticalalignment='top', horizontalalignment='left')

        ax.set_xlabel('Mean Predicted Value', fontsize=12)
        ax.set_ylabel('Fraction of Positives', fontsize=12)
        ax.set_title(f'Calibration Curve by {model_name} Model in {title_suffix}', fontsize=12)

        ax.legend(loc='lower right', fontsize=8, labelspacing=0.5, frameon=False)
        ax.grid(False)
        fig.tight_layout()

        for ext in ('svg', 'pdf'):
            fig.savefig(f'{output_path}{model_name}_Calibration-Curve_{file_suffix}.{ext}', format=ext, bbox_inches='tight', dpi=1200)

        plt.show()
        # Release the 600-dpi figure; without this, figures pile up on
        # backends that do not close them on show().
        plt.close(fig)


In [None]:
# DCA (decision curve analysis) curve

In [None]:
import os

# Folder prefix for the DCA figures saved by the next cell. It is prepended
# verbatim to every file name, so a non-empty value should end with a path
# separator; an empty string saves into the working directory.
output_path = r''
# os.makedirs('') raises FileNotFoundError, so only create a folder when one is
# actually configured. exist_ok=True keeps the cell safe to re-run.
if output_path:
    os.makedirs(output_path, exist_ok=True)
    

In [None]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

# Publication settings: Arial text, ASCII minus sign, text kept as text in SVG,
# and TrueType (type 42) fonts embedded in PDF / PS output.
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['svg.fonttype'] = 'none'
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42


def find_intersection_points(thresholds, net_benefit_model, net_benefit_all):
    """Locate where the model's net-benefit curve crosses the reference curves.

    A crossing is a strict sign change of the difference between two
    neighbouring thresholds; its position is found by linear interpolation
    inside that interval.

    Parameters
    ----------
    thresholds : 1-D sequence of threshold probabilities (x axis).
    net_benefit_model : 1-D sequence, net benefit of the model per threshold.
    net_benefit_all : 1-D sequence, net benefit of "treat all" per threshold.

    Returns
    -------
    tuple of two lists of (x, y) pairs:
        crossings with the "treat all" curve (y is the interpolated net
        benefit at the crossing), and crossings with the zero line (y is 0).
    """
    thresholds = np.asarray(thresholds, dtype=float)
    net_benefit_model = np.asarray(net_benefit_model, dtype=float)
    net_benefit_all = np.asarray(net_benefit_all, dtype=float)

    def interpolate_crossings(gap):
        # Left index of every interval in which `gap` changes sign.
        left = np.flatnonzero(gap[:-1] * gap[1:] < 0)
        # Fraction of the interval at which the straight line through the two
        # gap values hits zero (the denominator is non-zero on a sign change).
        frac = gap[left] / (gap[left] - gap[left + 1])
        x = thresholds[left] + frac * (thresholds[left + 1] - thresholds[left])
        y = net_benefit_model[left] + frac * (net_benefit_model[left + 1] - net_benefit_model[left])
        return x, y

    x_all, y_all = interpolate_crossings(net_benefit_model - net_benefit_all)
    x_zero, _ = interpolate_crossings(net_benefit_model)

    intersection_points1 = list(zip(x_all, y_all))
    intersection_points2 = [(x, 0) for x in x_zero]
    return intersection_points1, intersection_points2


name_list = ["3-2-Model-Result_prob_Test-Data_Best-Cutoff","3-3-Model-Result_prob_Validation-Data-cutoff"]

# Result folder -> (wording used in the title, wording used in the file name).
dataset_labels = {
    "3-2-Model-Result_prob_Test-Data_Best-Cutoff": ('Test Data', 'Test-Data'),
    "3-3-Model-Result_prob_Validation-Data-cutoff": ('External Validation Data', 'Validation-Data'),
}

# Threshold grid and the matching odds pt / (1 - pt); identical for every model.
thresholds = np.arange(0, 1, 0.0001)
threshold_odds = thresholds / (1 - thresholds)


for name in name_list:

    path_input = f'Model optimization\\3.output\\3-Model-Result_prob_excel\\{name}\\'
    title_suffix, file_suffix = dataset_labels[name]

    for file in os.listdir(path_input):
        if not file.endswith(".xlsx"):
            continue

        # The text before the first underscore of the file name labels the plot.
        model_name = file.split('_')[0]

        data = pd.read_excel(os.path.join(path_input, file))

        y_labels = np.array(data['Pathological_Diagnosis'])
        y_pred_scores = np.array(data['prediction_score_1'])
        n = len(y_labels)

        # (samples x thresholds) matrix: is the sample called positive at that threshold?
        y_pred_matrix = y_pred_scores[:, None] > thresholds
        # Explicit comparisons instead of a bitwise AND with the raw labels, so
        # the counts do not depend on the label column having an integer dtype.
        tp = (y_pred_matrix & (y_labels[:, None] == 1)).sum(axis=0)
        fp = (y_pred_matrix & (y_labels[:, None] == 0)).sum(axis=0)
        net_benefit_model = (tp / n) - (fp / n) * threshold_odds

        # "Treat all": every positive is a true positive, every negative a false positive.
        num_positive = np.sum(y_labels == 1)
        num_negative = np.sum(y_labels == 0)
        net_benefit_all = (num_positive / n) - (num_negative / n) * threshold_odds

        intersection_points1, intersection_points2 = find_intersection_points(thresholds, net_benefit_model, net_benefit_all)

        fig, ax = plt.subplots(dpi=600)
        ax.plot(thresholds, net_benefit_model, color='deepskyblue', label="Model")
        ax.plot(thresholds, net_benefit_all, color='black', label='Treat all')
        ax.plot((0, 1), (0, 0), color='#808080', label='Treat none')

        # Shade the region where the model beats the better of the two default strategies.
        best_default = np.maximum(net_benefit_all, 0)
        ax.fill_between(thresholds, np.maximum(net_benefit_model, best_default), best_default, color='deepskyblue', alpha=0.3)

        for points in (intersection_points1, intersection_points2):
            # Each further label in a group is shifted down by 0.05 so the texts do not overlap.
            for k, (x, y) in enumerate(points):
                ax.scatter(x, y, color='red', s=20, zorder=2.0)
                ax.text(x, y - 0.05 * k, f'Threshold Probability: {x:.4f}', fontsize=8, color='red')

        ax.set_xlim(0, 1)
        ax.set_ylim(-0.15, 1.15)
        ax.set_xlabel('Threshold Probability', fontsize=12)
        ax.set_ylabel('Net Benefit', fontsize=12)
        ax.grid(False)
        ax.legend(loc='upper right', fontsize=8, frameon=False)
        ax.set_title(f'DCA Curve by {model_name} in {title_suffix}', fontsize=12)

        fig.tight_layout()

        for ext in ('svg', 'pdf'):
            fig.savefig(f'{output_path}{model_name}_DCA-Curve_{file_suffix}.{ext}', format=ext, bbox_inches='tight', dpi=1200)

        plt.show()
        # Release the 600-dpi figure; without this, figures pile up on
        # backends that do not close them on show().
        plt.close(fig)

In [None]:
# KS curve

In [None]:
import os

# Folder prefix for the KS-curve figures saved by the next cell. It is
# prepended verbatim to every file name, so a non-empty value should end with
# a path separator; an empty string saves into the working directory.
output_path = r""

# os.makedirs("") raises FileNotFoundError, so only create a folder when one is
# actually configured. exist_ok=True keeps the cell safe to re-run.
if output_path:
    os.makedirs(output_path, exist_ok=True)

In [None]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import precision_recall_curve, auc

# Publication settings: Arial text, ASCII minus sign, text kept as text in SVG,
# and TrueType (type 42) fonts embedded in PDF / PS output.
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['svg.fonttype'] = 'none'
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42


name_list = ["3-2-Model-Result_prob_Test-Data_Best-Cutoff","3-3-Model-Result_prob_Validation-Data-cutoff"]

# Result folder -> (wording used in the title, wording used in the file name).
dataset_labels = {
    "3-2-Model-Result_prob_Test-Data_Best-Cutoff": ('Test Data', 'Test-Data'),
    "3-3-Model-Result_prob_Validation-Data-cutoff": ('External Validation Data', 'Validation-Data'),
}

# NOTE: `main_colors` is not defined in this cell; it must already exist in the
# notebook namespace (indices 0, 3 and 7 are used below).
for name in name_list:

    path_input = f'Model optimization\\3.output\\3-Model-Result_prob_excel\\{name}\\'
    title_suffix, file_suffix = dataset_labels[name]

    for file in os.listdir(path_input):
        if not file.endswith(".xlsx"):
            continue

        # The text before the first underscore of the file name labels the plot.
        model_name = file.split('_')[0]

        data = pd.read_excel(os.path.join(path_input, file))

        y_labels = data['Pathological_Diagnosis']
        y_pred_scores = data['prediction_score_1']

        fpr, tpr, thresholds = roc_curve(y_labels, y_pred_scores)

        # KS statistic: largest gap between TPR and FPR; argmax returns the
        # first position of that maximum.
        rate_gap = np.abs(tpr - fpr)
        idx = int(np.argmax(rate_gap))
        ks_value = rate_gap[idx]
        ks_thresholds = thresholds[idx]

        fig, ax = plt.subplots(dpi=600)
        ax.plot(thresholds, rate_gap, label='TPR-FPR', color=main_colors[3], linewidth=2.5, alpha=0.8)
        ax.plot(thresholds, tpr, label='True Positive Rate (TPR)', color=main_colors[0], linewidth=2.5, alpha=0.8)
        ax.plot(thresholds, fpr, label='False Positive Rate (FPR)', color=main_colors[7], linewidth=2.5, alpha=0.8)

        ax.set_xlabel('Thresholds', fontsize=12)
        ax.set_ylabel('Rate', fontsize=12)

        ax.set_xlim(0.0, 1.0)
        ax.set_ylim(0.0, 1.0)

        ax.set_title(f'KS Curve by {model_name} in {title_suffix}', fontsize=12)

        ax.grid(False)
        # NOTE(review): the legend is built before the KS marker line is drawn,
        # so that line's 'KS - ...' label never appears in the legend. The order
        # is kept so the figures stay unchanged; confirm whether it is intended.
        ax.legend(labelspacing=0.5, loc='lower left', fontsize=10, frameon=False)

        # Vertical dashed segment joining FPR and TPR at the KS threshold.
        ax.plot((ks_thresholds, ks_thresholds), (fpr[idx], tpr[idx]),
                label='KS - {:.4f}'.format(ks_value),
                color=main_colors[3], linestyle='--', linewidth=2.5, marker='o',
                markerfacecolor=main_colors[3], markersize=8)

        segment_mid = (fpr[idx] + tpr[idx]) / 2
        ax.annotate(f'KS Value: {ks_value:.4f}\nThreshold: {ks_thresholds:.4f}',
                    xy=(ks_thresholds, segment_mid),
                    xytext=(ks_thresholds + 0.05, segment_mid + 0.05),
                    arrowprops=dict(facecolor=main_colors[3], shrink=0.05),
                    fontsize=12)

        ax.scatter((ks_thresholds, ks_thresholds), (fpr[idx], tpr[idx]), color=main_colors[3], s=100, zorder=5)

        fig.tight_layout()

        for ext in ('svg', 'pdf'):
            fig.savefig(f'{output_path}{model_name}_KS-Curve_{file_suffix}.{ext}', format=ext, bbox_inches='tight', dpi=1200)

        plt.show()
        # Release the 600-dpi figure; without this, figures pile up on
        # backends that do not close them on show().
        plt.close(fig)


In [None]:
# learning curve

In [None]:
import os

# Folder prefix for the learning-curve figures saved further down. It is
# prepended verbatim to every file name, so a non-empty value should end with
# a path separator; an empty string saves into the working directory.
output_path = r''

# os.makedirs('') raises FileNotFoundError, so only create a folder when one is
# actually configured. exist_ok=True keeps the cell safe to re-run.
if output_path:
    os.makedirs(output_path, exist_ok=True)

In [None]:
import pandas as pd

# Read the two training workbooks from the working directory; index_col=0
# turns the first spreadsheet column into the row index of each frame.
X_train_transformed, y_train_transformed = (
    pd.read_excel(workbook, index_col=0)
    for workbook in (r"X_train_transformed.xlsx", r"y_train_transformed.xlsx")
)


In [None]:
# Folder that holds the saved model .pkl files; fill in before running.
# The learning-curve cell lists this folder and appends each model name to this
# string, so a trailing path separator is the safe choice.
model_path_input  =  r"" # Model_pkl path

In [None]:
from pycaret.classification import load_model


In [None]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
from sklearn.model_selection import learning_curve

# Publication settings: Arial text, ASCII minus sign, text kept as text in SVG,
# and TrueType (type 42) fonts embedded in PDF / PS output.
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['svg.fonttype'] = 'none'
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42


# NOTE: `main_colors`, `load_model`, `X_train_transformed` and
# `y_train_transformed` are not defined in this cell; they must already exist
# in the notebook namespace.
# An empty model_path_input is treated as the current working directory
# (os.listdir("") would raise FileNotFoundError).
for file in os.listdir(model_path_input or os.curdir):
    # splitext cuts only the final extension, so model names containing dots
    # survive (file.split('.')[0] would truncate them at the first dot).
    model_name, extension = os.path.splitext(file)
    if extension != '.pkl':
        continue

    # load_model receives the path without the ".pkl" suffix; os.path.join
    # works whether or not model_path_input ends with a separator.
    model_path = os.path.join(model_path_input, model_name)
    model = load_model(model_path)

    # 50 training-set sizes from 10% to 100%, 10-fold CV, scored by ROC AUC.
    train_sizes, train_scores, test_scores = learning_curve(
        model, X_train_transformed, y_train_transformed,
        train_sizes=np.linspace(0.1, 1.0, 50),
        cv=10,
        scoring='roc_auc',
        n_jobs=5,
        random_state=42,
        shuffle=True,
    )

    # Rows are training sizes, columns are CV folds.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Standard error across folds; the band drawn below is mean +/- 1.96 * SE.
    n_cv_folds = train_scores.shape[1]
    train_scores_se = train_scores_std / np.sqrt(n_cv_folds)
    test_scores_se = test_scores_std / np.sqrt(n_cv_folds)

    fig, ax = plt.subplots(figsize=(8, 6), dpi=600)
    ax.plot(train_sizes, train_scores_mean, 'o-', color=main_colors[0], label="Training Score")
    ax.plot(train_sizes, test_scores_mean, 'o-', color=main_colors[1], label="Cross Validation Score")

    ax.fill_between(train_sizes, train_scores_mean - 1.96*train_scores_se,
                    train_scores_mean + 1.96*train_scores_se, alpha=0.2, color=main_colors[0], label="Training Score 95% CI")
    ax.fill_between(train_sizes, test_scores_mean - 1.96*test_scores_se,
                    test_scores_mean + 1.96*test_scores_se, alpha=0.2, color=main_colors[1], label="Cross Validation Score 95% CI")

    ax.yaxis.set_major_formatter(plt.FormatStrFormatter('%.4f'))
    # Only one major locator can be active per axis. The MultipleLocator(0.02)
    # that used to be set first was immediately replaced by this one, so it
    # has been dropped.
    ax.yaxis.set_major_locator(plt.MaxNLocator(nbins=10))

    ax.set_xlabel('Data Size', fontsize=12)
    ax.set_ylabel('Area Under the ROC Curve', fontsize=12)
    ax.set_title(f"Learning Curve by {model_name} with 95% Confidence Interval", fontsize=12)
    ax.legend(loc="best", fontsize=12)
    ax.grid(False)
    fig.tight_layout()

    for ext in ('svg', 'pdf'):
        fig.savefig(f'{output_path}Learning_Curve_by_{model_name}.{ext}', format=ext, bbox_inches='tight', dpi=1200)

    plt.show()
    # Release the 600-dpi figure; without this, figures pile up on backends
    # that do not close them on show().
    plt.close(fig)
        


In [None]:
# SHAP
# The best model is Logistic Regression

In [None]:
import os

# Folder prefix for the SHAP figures saved by the cells below. It is prepended
# verbatim to every file name, so a non-empty value should end with a path
# separator; an empty string saves into the working directory.
output_path = r""
# os.makedirs("") raises FileNotFoundError, so only create a folder when one is
# actually configured. exist_ok=True keeps the cell safe to re-run.
if output_path:
    os.makedirs(output_path, exist_ok=True)

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

# Figure text settings for the SHAP plots, applied in one call.
# plt.rcParams and mpl.rcParams refer to the same settings object.
mpl.rcParams.update({
    'font.family': 'Arial',
    'axes.unicode_minus': False,
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

In [None]:
# Load test data: both workbooks are read from the working directory, and
# index_col=0 turns the first spreadsheet column into the row index.
X_test_transformed, y_test_transformed = (
    pd.read_excel(workbook, index_col=0)
    for workbook in (r"X_test_transformed.xlsx", r"y_test_transformed.xlsx")
)

In [None]:
from pycaret.classification import load_model

# Load the saved model artifact; the path is given without a file extension.
model_input = load_model(r"Web Application\Logistic Regression")
# Keep only the last element of the loaded object.
# NOTE(review): presumably model_input is a pipeline whose final step is the
# fitted estimator — confirm.
best_model = model_input[-1]

# Bare expression: shows the extracted model as the cell output.
best_model


In [None]:
import shap

# Wrap the model's `predict` method in a sampling-based SHAP explainer, with
# the transformed test set supplied as the explainer's data argument.
# NOTE(review): `predict` (not `predict_proba`) is explained, so attributions
# presumably refer to the predicted label rather than a probability — confirm.
explainer = shap.SamplingExplainer(best_model.predict, X_test_transformed) 
# SHAP values for every test row via the `shap_values` method.
# NOTE(review): n_jobs=-2 is passed as a keyword; verify that this explainer
# actually honours it.
shap_values = explainer.shap_values(X_test_transformed,n_jobs=-2)

# Same rows explained through the callable interface; this result is indexed
# per row and passed to shap.plots.bar in the following cells.
# NOTE(review): this looks like a second, independent computation of the same
# values (sampling-based, so results may differ slightly) — confirm both are
# needed.
shap_values2 = explainer(X_test_transformed) 


In [None]:
# Local (single-sample) SHAP bar plot for one test row.
# The row index is defined once so the plotted row and the file name cannot drift apart.
sample_idx = 14

# show=False stops shap from calling plt.show() itself, so the figure it drew
# on can be fetched and saved before it is displayed.
shap.plots.bar(shap_values2[sample_idx], show_data=True, max_display=20, show=False)
fig5 = plt.gcf()

for ext in ('svg', 'pdf'):
    fig5.savefig(f'{output_path}8-Local-Summary-Bar_Test-Data-{sample_idx}.{ext}', format=ext, bbox_inches='tight', dpi=1200)

plt.show()


In [None]:
# Local (single-sample) SHAP bar plot for one test row.
# The row index is defined once so the plotted row and the file name cannot drift apart.
sample_idx = 167

# show=False stops shap from calling plt.show() itself, so the figure it drew
# on can be fetched and saved before it is displayed.
shap.plots.bar(shap_values2[sample_idx], show_data=True, max_display=20, show=False)
fig55 = plt.gcf()

for ext in ('svg', 'pdf'):
    fig55.savefig(f'{output_path}8-Local-Summary-Bar_Test-Data-{sample_idx}.{ext}', format=ext, bbox_inches='tight', dpi=1200)

plt.show()
