In [None]:
import pycaret

In [None]:
import pandas as pd
from pathlib import Path

# Build the workbook path from its components so it resolves on any OS
# (the original literal used Windows-only backslash separators).
DATA_PATH = Path('Crude model') / '2.input' / 'development dataset.xlsx'

# Read the development cohort from the worksheet named "Sheet 1".
data = pd.read_excel(DATA_PATH, sheet_name="Sheet 1")


In [None]:

import warnings

# Text label -> 0/1 code for every two-level column (features and the target).
BINARY_ENCODINGS = {
    'Halo_Sign': {'Exists': 1, 'Absent': 0},
    'Gender': {'Female': 1, 'Male': 0},
    'Composition': {'Solid': 1, 'Others': 0},
    'Shape': {'Microlobulated': 1, 'Others': 0},
    'Echogenicity': {'Hypoechogenicity': 1, 'Others': 0},
    'Echogenic_Foci': {'Microcalcification': 1, 'Others': 0},
    'Margin': {'Irregular': 1, 'Smooth': 0},
    'ATR': {'Taller_than_Wide': 1, 'Wider_than_Tall': 0},
    'Pathological_Diagnosis': {'Malignant': 1, 'Benign': 0},
}

for column, mapping in BINARY_ENCODINGS.items():
    raw_values = data[column]
    # Values that are already 0/1 pass through unchanged, so re-running this
    # cell is a no-op instead of turning the whole column into NaN.
    # Any other unmapped label still becomes NaN, as with a plain .map(dict).
    encoded_values = raw_values.map(
        lambda value, mapping=mapping: mapping.get(
            value, value if value in (0, 1) else float('nan')
        )
    )
    # Surface labels that were lost instead of dropping them silently.
    unmapped = raw_values[raw_values.notna() & encoded_values.isna()]
    if not unmapped.empty:
        warnings.warn(
            f"Column {column!r}: unmapped labels set to NaN: "
            f"{sorted(map(str, unmapped.unique()))}"
        )
    data[column] = encoded_values



In [None]:
# One-hot encode Posterior_Echo. The empty prefix makes get_dummies emit
# '_<level>' column names, which are renamed to the bare level and cast to int.
columns_to_convert1 = ['Absent_of_Shadowing', 'Posterior_Attenuation', 'Shadowing']
df_encoded1 = pd.get_dummies(data, columns=['Posterior_Echo'], prefix='').rename(
    columns={'_' + level: level for level in columns_to_convert1}
)
df_encoded1 = df_encoded1.astype({level: int for level in columns_to_convert1})

# Same treatment for Location, applied on top of the previous result.
columns_to_convert2 = ['Right_Lobe', 'Left_Lobe', 'Isthmus']
df_encoded2 = pd.get_dummies(df_encoded1, columns=['Location'], prefix='').rename(
    columns={'_' + level: level for level in columns_to_convert2}
)
df_encoded2 = df_encoded2.astype({level: int for level in columns_to_convert2})



In [None]:

from sklearn.preprocessing import LabelEncoder

# Columns replaced by integer codes from a LabelEncoder.
order_list = ['Intra_BFS', 'Peri_BFS']
label_encoder = LabelEncoder()

for column in order_list:
    # The encoder is refit per column, so classes_ always describes the
    # column that was just encoded.
    df_encoded2[column] = label_encoder.fit_transform(df_encoded2[column])

    # Print the label -> code mapping; each class's code is its position in classes_.
    # NOTE(review): codes follow LabelEncoder's class ordering — confirm it
    # matches the intended order of these levels.
    for code, class_label in enumerate(label_encoder.classes_):
        print(f"{class_label}: {code}")

In [None]:

# Alias (not a copy) of the fully encoded frame; this is the table passed to PyCaret's setup().
data_input_reconde = df_encoded2

In [None]:

from pycaret.classification import *

# Column roles passed to the PyCaret experiment.
NUMERIC_FEATURES = ["BMI", "Age", "Maximum_Diameter"]
CATEGORICAL_FEATURES = [
    "Halo_Sign",
    "Gender", "Composition", "Shape", "Echogenicity", "Echogenic_Foci",
    "Margin", "ATR", "Absent_of_Shadowing", "Posterior_Attenuation", "Shadowing",
    "Right_Lobe", "Left_Lobe", "Isthmus",
]
IGNORED_FEATURES = ["ACR", "Kwak", "Data_Type"]

# Initialise the experiment: 70% training share, shuffled and stratified
# split, fixed session seed.
exp_clf = setup(
    data_input_reconde,
    target='Pathological_Diagnosis',
    session_id=111,
    numeric_features=NUMERIC_FEATURES,
    categorical_features=CATEGORICAL_FEATURES,
    train_size=0.7,
    data_split_shuffle=True,
    data_split_stratify=True,
    ignore_features=IGNORED_FEATURES,
)


In [None]:

# Called without a key; presumably lists the available experiment variables —
# confirm against the installed PyCaret version.
get_config()


In [None]:

# Pull the "X_train_transformed" / "y_train_transformed" entries from the PyCaret experiment.
X_train_transformed = get_config("X_train_transformed")
y_train_transformed = get_config("y_train_transformed")


In [None]:
import os
import pycaret

input_dir = r"" #Model_pkl path 
model_params_dict = {}

MODEL_SUFFIX = ".pkl"

# NOTE(review): load_model reads pickled pipelines; only point input_dir at
# files from a trusted source.
for root, dirs, files in os.walk(input_dir):
    for file in files:
        # Skip anything that is not a saved model (checkpoints, notes, ...),
        # which would otherwise make load_model fail.
        if not file.endswith(MODEL_SUFFIX):
            continue

        # The key is the file name without its extension.
        model_name = file[:-len(MODEL_SUFFIX)]
        # The path is passed without the extension, as in the original;
        # presumably load_model appends ".pkl" itself — confirm.
        file_path = os.path.join(root, model_name)
        model_init = load_model(file_path)

        # The loaded object is indexed like a pipeline; its last step is the
        # estimator whose hyper-parameters are recorded.
        parm = model_init[-1].get_params()

        model_params_dict[model_name] = parm

# Fail here with a clear message rather than later with a KeyError when the
# models are rebuilt from an empty dictionary.
if not model_params_dict:
    raise FileNotFoundError(
        f"No '{MODEL_SUFFIX}' model files found under input_dir={input_dir!r}; "
        "set input_dir to the folder containing the saved models."
    )


In [None]:
# Display the collected {model name: hyper-parameter dict} mapping.
model_params_dict 

In [None]:

import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


def create_models_with_optimized_hyperparameters(optimized_params_dict):
    """Instantiate one unfitted classifier per tuned model.

    Parameters
    ----------
    optimized_params_dict : dict
        Maps each model's display name (e.g. ``'CatBoost Classifier'``) to the
        keyword arguments passed to that model's constructor.

    Returns
    -------
    dict
        Display name -> estimator instance, inserted in the order of the
        table below.

    Raises
    ------
    KeyError
        If any of the expected display names is missing from
        ``optimized_params_dict``.
    """
    # (display name, estimator class) pairs, in construction order.
    estimator_classes = (
        ('CatBoost Classifier', CatBoostClassifier),
        ('Gradient Boosting Classifier', GradientBoostingClassifier),
        ('Extreme Gradient Boosting', XGBClassifier),
        ('Light Gradient Boosting Machine', LGBMClassifier),
        ('MLP Classifier', MLPClassifier),
        ('Random Forest Classifier', RandomForestClassifier),
        ('Extra Trees Classifier', ExtraTreesClassifier),
        ('Ada Boost Classifier', AdaBoostClassifier),
        ('Logistic Regression', LogisticRegression),
        ('Ridge Classifier', RidgeClassifier),
        ('Linear Discriminant Analysis', LinearDiscriminantAnalysis),
        ('Quadratic Discriminant Analysis', QuadraticDiscriminantAnalysis),
        ('Decision Tree Classifier', DecisionTreeClassifier),
        ('Naive Bayes', GaussianNB),
        ('K Neighbors Classifier', KNeighborsClassifier),
        ('SVM - Radial Kernel', SVC),
        ('Gaussian Process Classifier', GaussianProcessClassifier),
    )

    models_name_model = {}
    for display_name, estimator_cls in estimator_classes:
        hyperparameters = optimized_params_dict[display_name]
        models_name_model[display_name] = estimator_cls(**hyperparameters)

    return models_name_model


In [None]:
# Rebuild unfitted estimators from the hyper-parameters read from the saved models, then display them.
models_name_model = create_models_with_optimized_hyperparameters(model_params_dict)
models_name_model

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.base import clone


def feature_selection_with_cv(X_train, y_train, selected_features, models_name_model, cv_folds=10):
    """Cross-validate every model on growing prefixes of a ranked feature list.

    For k = 1 .. len(selected_features), each model is cloned and scored with
    stratified k-fold ROC-AUC on the first k features of ``selected_features``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; must contain every name in ``selected_features``.
    y_train : array-like
        Training labels.
    selected_features : list of str
        Feature names ordered from most to least important.
    models_name_model : dict
        Display name -> unfitted estimator.
    cv_folds : int, default 10
        Number of stratified folds (shuffled, seed 42).

    Returns
    -------
    mean_results_df : pandas.DataFrame
        One row per feature count; column ``Number_of_Features`` plus one
        mean-AUC column per model.
    std_results_df : pandas.DataFrame
        Same layout, holding the standard deviation of the fold AUCs.
    cv_results : dict
        Per model: lists ``mean_auc``, ``std_auc`` and ``all_cv_scores`` with
        one entry per feature count.

    Notes
    -----
    A model that raises during evaluation is recorded as NaN and a warning is
    emitted. NaN entries are also appended to ``cv_results`` so that list
    position ``k - 1`` always corresponds to ``k`` features.
    """
    import warnings

    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    cv_results = {
        model_name: {'mean_auc': [], 'std_auc': [], 'all_cv_scores': []}
        for model_name in models_name_model
    }
    mean_results = []
    std_results = []

    for num_features in range(1, len(selected_features) + 1):
        mean_row = {"Number_of_Features": num_features}
        std_row = {"Number_of_Features": num_features}

        # The subset is shared by all models at this feature count.
        top_features = selected_features[:num_features]
        X_train_subset = X_train[top_features]

        for model_name, model_config in models_name_model.items():
            try:
                # Clone so the configured estimator is never fitted itself.
                cv_scores = cross_val_score(
                    clone(model_config), X_train_subset, y_train,
                    cv=cv, scoring='roc_auc', n_jobs=-1
                )
            except Exception as exc:
                # Best effort: keep evaluating the other models, but say why
                # this one is missing instead of hiding the failure.
                warnings.warn(
                    f"{model_name} failed with {num_features} feature(s): "
                    f"{exc!r}; recording NaN."
                )
                cv_scores = np.full(cv_folds, np.nan)

            mean_auc = np.mean(cv_scores)
            std_auc = np.std(cv_scores)

            mean_row[model_name] = mean_auc
            std_row[model_name] = std_auc

            cv_results[model_name]['mean_auc'].append(mean_auc)
            cv_results[model_name]['std_auc'].append(std_auc)
            cv_results[model_name]['all_cv_scores'].append(cv_scores)

        mean_results.append(mean_row)
        std_results.append(std_row)

    mean_results_df = pd.DataFrame(mean_results)
    std_results_df = pd.DataFrame(std_results)

    return mean_results_df, std_results_df, cv_results

In [None]:


import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.base import clone

def feature_selection_with_cv_dict(X_train, y_train, shap_feature_importances, models_name_model, cv_folds=10):
    """Cross-validate each model on growing prefixes of its own feature ranking.

    Unlike a single shared ranking, every model uses the ordered feature list
    stored under its name in ``shap_feature_importances``. Feature counts run
    from 1 to the length of the shortest ranking.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; must contain every ranked feature name.
    y_train : array-like
        Training labels.
    shap_feature_importances : dict
        Model display name -> list of feature names, most important first.
    models_name_model : dict
        Display name -> unfitted estimator.
    cv_folds : int, default 10
        Number of stratified folds (shuffled, seed 42).

    Returns
    -------
    mean_results_df, std_results_df : pandas.DataFrame
        One row per feature count; ``Number_of_Features`` plus one column per
        model holding the mean / standard deviation of the fold ROC-AUCs.
    cv_results : dict
        Per model: lists ``mean_auc``, ``std_auc`` and ``all_cv_scores`` with
        one entry per feature count.

    Raises
    ------
    ValueError
        If ``shap_feature_importances`` is empty.
    KeyError
        If a model has no ranking, or a ranked feature is not in ``X_train``.

    Notes
    -----
    A model that raises during cross-validation is recorded as NaN and a
    warning is emitted. NaN entries are also appended to ``cv_results`` so
    that list position ``k - 1`` always corresponds to ``k`` features.
    """
    import warnings

    if not shap_feature_importances:
        raise ValueError("shap_feature_importances is empty; no feature rankings to evaluate.")

    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    cv_results = {
        model_name: {'mean_auc': [], 'std_auc': [], 'all_cv_scores': []}
        for model_name in models_name_model
    }
    mean_results = []
    std_results = []

    # Every model must be able to supply num_features features.
    max_features = min(len(features) for features in shap_feature_importances.values())

    for num_features in range(1, max_features + 1):
        mean_row = {"Number_of_Features": num_features}
        std_row = {"Number_of_Features": num_features}

        for model_name, model_config in models_name_model.items():
            # Model-specific ranking; lookup errors propagate to the caller.
            selected_features = shap_feature_importances[model_name]
            top_features = selected_features[:num_features]
            X_train_subset = X_train[top_features]

            try:
                # Clone so the configured estimator is never fitted itself.
                cv_scores = cross_val_score(
                    clone(model_config), X_train_subset, y_train,
                    cv=cv, scoring='roc_auc', n_jobs=-1
                )
            except Exception as exc:
                # Best effort: keep evaluating the other models, but say why
                # this one is missing instead of hiding the failure.
                warnings.warn(
                    f"{model_name} failed with {num_features} feature(s): "
                    f"{exc!r}; recording NaN."
                )
                cv_scores = np.full(cv_folds, np.nan)

            mean_auc = np.mean(cv_scores)
            std_auc = np.std(cv_scores)

            mean_row[model_name] = mean_auc
            std_row[model_name] = std_auc

            cv_results[model_name]['mean_auc'].append(mean_auc)
            cv_results[model_name]['std_auc'].append(std_auc)
            cv_results[model_name]['all_cv_scores'].append(cv_scores)

        mean_results.append(mean_row)
        std_results.append(std_row)

    mean_results_df = pd.DataFrame(mean_results)
    std_results_df = pd.DataFrame(std_results)

    return mean_results_df, std_results_df, cv_results

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Global figure defaults: Arial font, and unicode_minus disabled for minus signs on axes.
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False


In [None]:
# Training split used by the mRMR section, read from the PyCaret experiment.
X_train = get_config("X_train_transformed")
y_train = get_config("y_train_transformed")

In [None]:
# Join features and target column-wise; the target ends up as the last column.
df = pd.concat([X_train, y_train], axis=1)
df

In [None]:

import pymrmr
# The target is the last column of df; move it to the front, which is where
# pymrmr expects the class label.
target = df.columns[-1]
df_reordered = df[[target] + [col for col in df.columns if col != target]]

# Rank up to 19 features, but never ask for more than the table contains
# (the original hard-coded k = 19 assumed exactly 19 feature columns).
MAX_MRMR_FEATURES = 19
k = min(MAX_MRMR_FEATURES, df_reordered.shape[1] - 1)

# NOTE(review): pymrmr is documented as expecting discretised inputs; BMI,
# Age and Maximum_Diameter are declared numeric in setup() — confirm they are
# binned appropriately before this call.
selected_features = pymrmr.mRMR(df_reordered, 'MIQ', k)


print(selected_features)

In [None]:

# Show the mRMR-selected feature names as a single-column table, in selection order.
mRMR_feature_importances_df = pd.DataFrame(selected_features)
mRMR_feature_importances_df


In [None]:
# 10-fold CV of every model on the top-1, top-2, ... features of the mRMR ranking.
mean_results_df, std_results_df, cv_results =  feature_selection_with_cv(X_train, y_train, selected_features, models_name_model, cv_folds=10)

In [None]:
# Mean cross-validated AUC per model (columns) and feature count (rows).
mean_results_df

In [None]:
# Figure/export settings: Arial text, SVG text kept as text ('none'),
# and font type 42 for PDF/PS output.
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['svg.fonttype'] = 'none'
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

step_size = 1       # keep every step_size-th feature count
initial_value = 1   # smallest feature count shown

# Select the rows to plot and order them by descending feature count.
# Chained (no inplace sort) so re-running the cell always starts from mean_results_df.
filtered_results_df = (
    mean_results_df[mean_results_df["Number_of_Features"] >= initial_value]
    .iloc[::step_size, :]
    .reset_index(drop=True)
    .sort_values(by="Number_of_Features", ascending=False)
)

# One colour per model curve. The original list had 16 entries, so a 17th
# model wrapped around and reused the first colour; the two trailing entries
# keep every curve distinguishable.
lancet_colors = [
    '#00468B', '#ED0000', '#42B540', '#0099B4', '#925E9F', '#FDAF91', '#AD002A',
    '#ADB6B6', '#1B1919', '#7C7C7C', '#4DBBD5', '#E64B35', '#00A087', '#3C5488',
    '#F39B7F', '#8491B4', '#91D1C2', '#7E6148'
]

plt.figure(figsize=(8, 6))
# Every column after "Number_of_Features" is one model's mean-AUC curve.
for i, column in enumerate(filtered_results_df.columns[1:]):
    plt.plot(
        filtered_results_df["Number_of_Features"],
        filtered_results_df[column],
        label=column,
        color=lancet_colors[i % len(lancet_colors)],
        marker='o',
        linewidth=1.5
    )

# NOTE(review): hard-coded marker position; it is not computed from the results.
optimal_features = 9
plt.axvline(
    x=optimal_features,
    color='black',
    linestyle='--',
    label='Optimal Features',
    alpha=0.5
)

# NOTE(review): the curve comes from adding ranked features one at a time;
# confirm that 'Recursive Feature Elimination' is the intended title.
plt.title('Recursive Feature Elimination', fontsize=10)
plt.xlabel('Number of Features', fontsize=10)
plt.ylabel('Area Under the ROC Curve (AUC)', fontsize=10)

plt.xticks(
    ticks=filtered_results_df["Number_of_Features"],
    fontsize=8
)
plt.yticks(fontsize=8)
plt.legend(title="Models", fontsize=8, loc="best")
plt.tight_layout()
# Grid is off in the final figure (the earlier y-grid call was immediately
# overridden by this one, so it has been dropped).
plt.grid(False)

# plt.savefig(r'mRMR-ROC.pdf', format='pdf', bbox_inches='tight', dpi=1200)
# plt.savefig(r'mRMR-ROC.svg', format='svg', bbox_inches='tight', dpi=1200)

plt.show()

In [None]:
#Boruta

In [None]:
from boruta import BorutaPy
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Global figure defaults for the Boruta section: Arial font, unicode_minus disabled.
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False

In [None]:
# Training split used by the Boruta section, read from the PyCaret experiment.
X_train = get_config("X_train_transformed")
y_train = get_config("y_train_transformed")


In [None]:
import pandas as pd
import numpy as np
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch


def boruta_random_forest_feature_selection(X_train, y_train, 
                                         n_estimators=100, 
                                         max_iter=100, 
                                         random_state=42):
    """Run Boruta with a random-forest estimator and summarise the outcome.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features (``.values`` is passed to Boruta; column names label
        the results).
    y_train : pandas.Series
        Training labels (``.values`` is passed to Boruta).
    n_estimators : int, default 100
        ``n_estimators`` of the random forest handed to Boruta.
    max_iter : int, default 100
        ``max_iter`` passed to BorutaPy.
    random_state : int, default 42
        Seed for both the forest and BorutaPy.

    Returns
    -------
    sorted_features : list
        Column names ordered by ascending Boruta ranking (ties keep column order).
    feature_states : dict
        Column name -> 'Accepted', 'Tentative' or 'Rejected'.
    boruta_selector : BorutaPy
        The fitted selector.
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=5,
        class_weight='balanced',
        random_state=random_state,
        n_jobs=-1,
    )

    # NOTE(review): BorutaPy is given n_estimators='auto', which presumably
    # overrides the forest's n_estimators set above — confirm whether the
    # function's n_estimators argument has any effect.
    boruta_selector = BorutaPy(
        estimator=forest,
        n_estimators='auto',
        max_iter=max_iter,
        random_state=random_state,
        verbose=0,
    )
    boruta_selector.fit(X_train.values, y_train.values)

    column_names = X_train.columns

    # Stable sort on the rank only, so equally ranked features keep column order.
    ranked_pairs = sorted(
        zip(column_names, boruta_selector.ranking_),
        key=lambda pair: pair[1],
    )
    sorted_features = [name for name, _rank in ranked_pairs]

    def _state_at(position):
        # support_ marks accepted features; support_weak_ (when present) marks tentative ones.
        if boruta_selector.support_[position]:
            return 'Accepted'
        if hasattr(boruta_selector, 'support_weak_') and boruta_selector.support_weak_[position]:
            return 'Tentative'
        return 'Rejected'

    feature_states = {
        name: _state_at(position) for position, name in enumerate(column_names)
    }

    return sorted_features, feature_states, boruta_selector

In [None]:
# Run Boruta on the training split; returns the rank-ordered names, per-feature state and the fitted selector.
sorted_features, feature_states, boruta_selector = boruta_random_forest_feature_selection(X_train, y_train, n_estimators=100, max_iter=100, random_state=42)

In [None]:

# One row per feature with its Boruta decision, in feature_states order.
Boruta_feature = pd.DataFrame([
    {'Feature_Name': feature, 'State': status}
    for feature, status in feature_states.items()
])

# Ordered categorical so sorting yields Accepted -> Tentative -> Rejected.
Boruta_feature['State'] = pd.Categorical(Boruta_feature['State'], 
                            categories=['Accepted', 'Tentative', 'Rejected'], 
                            ordered=True)

# Stable sort: features sharing a state keep their original relative order.
# The default sort algorithm gives no such guarantee, and this order decides
# which features enter the incremental cross-validation first.
Boruta_feature_sorted = Boruta_feature.sort_values('State', kind='stable').reset_index(drop=True)
Boruta_feature_sorted


In [None]:
# Feature names in the sorted-table order (Accepted first, then Tentative, then Rejected).
Boruta_feature_list = Boruta_feature_sorted['Feature_Name'].to_list()
Boruta_feature_list

In [None]:
# Incremental 10-fold CV using the Boruta ordering.
# NOTE: this rebinds mean_results_df / std_results_df / cv_results, replacing the mRMR results held under the same names.
mean_results_df, std_results_df, cv_results = feature_selection_with_cv(X_train, y_train, Boruta_feature_list, models_name_model, cv_folds=10)

In [None]:
# Figure/export settings: Arial text, SVG text kept as text ('none'),
# and font type 42 for PDF/PS output.
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['svg.fonttype'] = 'none'
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

step_size = 1       # keep every step_size-th feature count
initial_value = 1   # smallest feature count shown

# Select the rows to plot and order them by descending feature count.
# Chained (no inplace sort) so re-running the cell always starts from mean_results_df.
filtered_results_df = (
    mean_results_df[mean_results_df["Number_of_Features"] >= initial_value]
    .iloc[::step_size, :]
    .reset_index(drop=True)
    .sort_values(by="Number_of_Features", ascending=False)
)

# One colour per model curve. The original list had 16 entries, so a 17th
# model wrapped around and reused the first colour; the two trailing entries
# keep every curve distinguishable.
lancet_colors = [
    '#00468B', '#ED0000', '#42B540', '#0099B4', '#925E9F', '#FDAF91', '#AD002A',
    '#ADB6B6', '#1B1919', '#7C7C7C', '#4DBBD5', '#E64B35', '#00A087', '#3C5488',
    '#F39B7F', '#8491B4', '#91D1C2', '#7E6148'
]

plt.figure(figsize=(8, 6))
# Every column after "Number_of_Features" is one model's mean-AUC curve.
for i, column in enumerate(filtered_results_df.columns[1:]):
    plt.plot(
        filtered_results_df["Number_of_Features"],
        filtered_results_df[column],
        label=column,
        color=lancet_colors[i % len(lancet_colors)],
        marker='o',
        linewidth=1.5
    )

# NOTE(review): hard-coded marker position; it is not computed from the results.
optimal_features = 10
plt.axvline(
    x=optimal_features,
    color='black',
    linestyle='--',
    label='Optimal Features',
    alpha=0.5
)

# NOTE(review): the curve comes from adding ranked features one at a time;
# confirm that 'Recursive Feature Elimination' is the intended title.
plt.title('Recursive Feature Elimination', fontsize=10)
plt.xlabel('Number of Features', fontsize=10)
plt.ylabel('Area Under the ROC Curve (AUC)', fontsize=10)
plt.xticks(
    ticks=filtered_results_df["Number_of_Features"],
    fontsize=8
)
plt.yticks(fontsize=8)
plt.legend(title="Models", fontsize=8, loc="best")
plt.tight_layout()
# Grid is off in the final figure (the earlier y-grid call was immediately
# overridden by this one, so it has been dropped).
plt.grid(False)

# plt.savefig(r'Boruta-ROC.pdf', format='pdf', bbox_inches='tight', dpi=1200)
# plt.savefig(r'Boruta-ROC.svg', format='svg', bbox_inches='tight', dpi=1200)

plt.show()

In [None]:
# BorutaShap 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from BorutaShap import BorutaShap
# Global figure defaults for the BorutaShap section: Arial font, unicode_minus disabled.
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False

In [None]:
# Training split used by the BorutaShap section, read from the PyCaret experiment.
X_train = get_config("X_train_transformed")
y_train = get_config("y_train_transformed")


In [None]:
# Base learner for BorutaShap: a shallow, class-balanced random forest.
rf = RandomForestClassifier(
    class_weight='balanced',
    max_depth=5,
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)

# SHAP-based Boruta on the training split, 100 trials, fixed seed.
Feature_Selector = BorutaShap(
    model=rf,
    importance_measure='shap',
    classification=True,
)
Feature_Selector.fit(
    X=X_train,
    y=y_train,
    n_trials=100,
    sample=False,
    train_or_test='train',
    normalize=True,
    verbose=True,
    random_state=42,
)


In [None]:
# BorutaShap's built-in plot for all features, with a log-scaled y axis.
Feature_Selector.plot(y_scale='log', which_features='all')

In [None]:
import pandas as pd

def process_boruta_features(accepted_features, tentative_features, rejected_features):
    """Combine the three Boruta outcome lists into a table and a flat list.

    Parameters
    ----------
    accepted_features, tentative_features, rejected_features : list
        Feature names for each outcome.

    Returns
    -------
    features_df : pandas.DataFrame
        One row per feature with columns ``Feature_Name`` and ``Status``
        ('Accepted', 'Tentative' or 'Rejected'), accepted rows first.
    all_features_list : list
        Accepted, then tentative, then rejected names, concatenated.
    """
    status_groups = (
        ('Accepted', accepted_features),
        ('Tentative', tentative_features),
        ('Rejected', rejected_features),
    )

    features_df = pd.DataFrame([
        {'Feature_Name': feature, 'Status': status}
        for status, group in status_groups
        for feature in group
    ])

    all_features_list = accepted_features + tentative_features + rejected_features

    return features_df, all_features_list

In [None]:
# Collect BorutaShap's accepted / tentative / rejected names into a table and an ordered list.
BS_features_df, BS_features_list = process_boruta_features(Feature_Selector.accepted, Feature_Selector.tentative, Feature_Selector.rejected)

In [None]:
# Incremental 10-fold CV using the BorutaShap ordering; results are kept under *_BS names.
mean_results_df_BS, std_results_df_BS, cv_results_BS =  feature_selection_with_cv(X_train, y_train, BS_features_list, models_name_model, cv_folds=10)

In [None]:
# Figure/export settings: Arial text, SVG text kept as text ('none'),
# and font type 42 for PDF/PS output.
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['svg.fonttype'] = 'none'
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

step_size = 1       # keep every step_size-th feature count
initial_value = 1   # smallest feature count shown

# Select the rows to plot and order them by descending feature count.
# Chained (no inplace sort) so re-running the cell always starts from mean_results_df_BS.
filtered_results_df = (
    mean_results_df_BS[mean_results_df_BS["Number_of_Features"] >= initial_value]
    .iloc[::step_size, :]
    .reset_index(drop=True)
    .sort_values(by="Number_of_Features", ascending=False)
)

# One colour per model curve. The original list had 16 entries, so a 17th
# model wrapped around and reused the first colour; the two trailing entries
# keep every curve distinguishable.
lancet_colors = [
    '#00468B', '#ED0000', '#42B540', '#0099B4', '#925E9F', '#FDAF91', '#AD002A',
    '#ADB6B6', '#1B1919', '#7C7C7C', '#4DBBD5', '#E64B35', '#00A087', '#3C5488',
    '#F39B7F', '#8491B4', '#91D1C2', '#7E6148'
]

plt.figure(figsize=(8, 6))
# Every column after "Number_of_Features" is one model's mean-AUC curve.
for i, column in enumerate(filtered_results_df.columns[1:]):
    plt.plot(
        filtered_results_df["Number_of_Features"],
        filtered_results_df[column],
        label=column,
        color=lancet_colors[i % len(lancet_colors)],
        marker='o',
    )

# NOTE(review): hard-coded marker position; it is not computed from the results.
optimal_features = 12
plt.axvline(
    x=optimal_features,
    color='black',
    linestyle='--',
    label='Optimal Features',
    alpha=0.5
)

# NOTE(review): the curve comes from adding ranked features one at a time;
# confirm that 'Recursive Feature Elimination' is the intended title.
plt.title('Recursive Feature Elimination', fontsize=10)
plt.xlabel('Number of Features', fontsize=10)
plt.ylabel('Area Under the ROC Curve (AUC)', fontsize=10)
plt.xticks(
    ticks=filtered_results_df["Number_of_Features"],
    fontsize=8
)
plt.yticks(fontsize=8)
plt.legend(title="Models", fontsize=8, loc="best")
plt.tight_layout()
# Grid is off in the final figure (the earlier y-grid call was immediately
# overridden by this one, so it has been dropped).
plt.grid(False)

# plt.savefig(r'BorutaShap-ROC.pdf', format='pdf', bbox_inches='tight', dpi=1200)
# plt.savefig(r'BorutaShap-ROC.svg', format='svg', bbox_inches='tight', dpi=1200)

plt.show()

In [None]:
# Lasso

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV
from sklearn.model_selection import RepeatedKFold
import matplotlib.pyplot as plt

# Figure/export defaults for the Lasso section: Arial font, SVG font type 'none', PDF/PS font type 42.
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False  
plt.rcParams['svg.fonttype'] = 'none'
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42


In [None]:
# Training split used by the Lasso section, read from the PyCaret experiment.
X_train = get_config("X_train_transformed")
y_train = get_config("y_train_transformed")


In [None]:
# Standardise every training column; the result is a NumPy array, so column names are taken from X_train later.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [None]:
import numpy as np

feature_names = X_train.columns

# Candidate regularisation strengths: 1000 log-spaced values from 1e-4 to 1e4.
alphas = np.logspace(-4, 4, 1000)

# 10-fold CV repeated 10 times over the whole alpha grid.
# NOTE(review): max_iter=100 is low for coordinate descent and may trigger
# convergence warnings; the path plot in the next cell uses 10000 — confirm
# which is intended.
lasso_cv = LassoCV(alphas=alphas, cv=RepeatedKFold(n_splits=10, n_repeats=10, random_state=42), random_state=42,max_iter=100,n_jobs=-1)
lasso_cv.fit(X_train_scaled, y_train)

# mse_path_ is indexed (alpha, fold): average / spread across folds per alpha.
mse_path = lasso_cv.mse_path_.mean(axis=1)
mse_std = lasso_cv.mse_path_.std(axis=1)

# λ_min: the alpha with the lowest mean cross-validated MSE.
best_alpha_index = np.argmin(mse_path)
best_alpha = lasso_cv.alphas_[best_alpha_index]

print(f"Best alpha (λ_min): {best_alpha}")

# Fit a single Lasso at λ_min with the same solver settings. The original
# wrapped this in a second LassoCV with one candidate alpha, which ran 100
# extra cross-validation fits without changing the selected model.
lasso_best_alpha = Lasso(alpha=best_alpha, max_iter=100, random_state=42)
lasso_best_alpha.fit(X_train_scaled, y_train)

# Features whose coefficient is non-zero at λ_min.
selected_features_best = [feature_names[i] for i in np.where(lasso_best_alpha.coef_ != 0)[0]]
print(f"Selected features with λ_min: {selected_features_best}")


In [None]:
# Refit a plain Lasso at every alpha on the grid to trace the coefficient paths.
coefs = [
    Lasso(alpha=a, max_iter=10000).fit(X_train_scaled, y_train).coef_
    for a in alphas
]

ax = plt.gca()
ax.set_xscale('log')

# One line per feature: coefficient value as a function of alpha.
ax.plot(alphas, coefs)

# Dashed marker at the cross-validated λ_min.
lambda_min_label = r'$\lambda_{min}$=' + str(round(best_alpha, 4))
ax.axvline(
    lasso_cv.alphas_[best_alpha_index],
    linestyle='--',
    color='black',
    label=lambda_min_label,
)

ax.set_xlabel('Alpha (α) value', fontsize=10)
ax.set_ylabel('Coefficients', fontsize=10)
ax.set_title('Lasso Paths', fontsize=10)
ax.axis('tight')

plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
ax.legend(fontsize=8)
plt.tight_layout()
ax.grid(False)

# plt.savefig(r'Lasso.pdf', format='pdf', bbox_inches='tight', dpi=1200)
# plt.savefig(r'Lasso.svg', format='svg', bbox_inches='tight', dpi=1200)

plt.show()


In [None]:
# ElasticNet

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedKFold

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Figure/export defaults for the ElasticNet section: Arial font, SVG font type 'none', PDF/PS font type 42.
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False  
plt.rcParams['svg.fonttype'] = 'none'
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42


In [None]:
# Training split used by the ElasticNet section, read from the PyCaret experiment.
X_train = get_config("X_train_transformed")
y_train = get_config("y_train_transformed")


In [None]:
# Standardise every training column; the result is a NumPy array, so column names are taken from X_train later.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [None]:
from sklearn.linear_model import ElasticNetCV
# Search grids: 100 L1 ratios in [0.01, 1] and 100 log-spaced alphas in [1e-4, 1e4].
enet_l1_ratio_grid = np.linspace(0.01, 1, 100)
enet_alpha_grid = np.logspace(-4, 4, 100)

# 10-fold cross-validated ElasticNet over both grids.
elastic_net = ElasticNetCV(
    l1_ratio=enet_l1_ratio_grid,
    alphas=enet_alpha_grid,
    cv=10,
    max_iter=100,
    random_state=42,
)
elastic_net.fit(X_train_scaled, y_train)

print("Best alpha:", elastic_net.alpha_)
print("Best l1_ratio:", elastic_net.l1_ratio_)

# Keep the features whose coefficient is non-zero in the refitted model.
# Note: this rebinds selected_features, which earlier held the mRMR ranking.
feature_coef = elastic_net.coef_
selected_features = X_train.columns[feature_coef != 0].tolist()
print(selected_features)


In [None]:
from sklearn.linear_model import ElasticNet

# Alpha grid for the path plot, defined here so the cell no longer depends on
# `alphas` left over from the Lasso section (same 1000-point grid as there).
enet_path_alphas = np.logspace(-4, 4, 1000)

# Refit an ElasticNet at every alpha, holding the L1 ratio at the CV optimum.
coefs = []
for alpha in enet_path_alphas:
    elastic_net_1 = ElasticNet(alpha=alpha, l1_ratio=elastic_net.l1_ratio_, max_iter=100)
    elastic_net_1.fit(X_train_scaled, y_train)
    coefs.append(elastic_net_1.coef_)

ax = plt.gca()

plt.xscale('log')

# One line per feature: coefficient value as a function of alpha.
ax.plot(enet_path_alphas, coefs)

# The label now reports the ElasticNet optimum shown by the line itself; the
# original printed `best_alpha`, which is the Lasso section's λ_min.
plt.axvline(elastic_net.alpha_ , linestyle='--', color='black', label=r'$\lambda_{min}$='+str(round(elastic_net.alpha_, 4)))
plt.xlabel('Alpha (α) value', fontsize=10)
plt.ylabel('Coefficients', fontsize=10)
plt.title('Coefficient Path Using ElasticNet with Best L1 ratio', fontsize=10)
plt.axis('tight')


plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.legend(fontsize=8)
plt.tight_layout()
plt.grid(False)

# plt.savefig(r'ElasticNet.pdf', format='pdf', bbox_inches='tight', dpi=1200)
# plt.savefig(r'ElasticNet.svg', format='svg', bbox_inches='tight', dpi=1200)

plt.show()


