In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from scipy.stats import sem, t, ttest_1samp
from sklearn import svm
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso, ElasticNet
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score, 
                             precision_score, recall_score, roc_auc_score, 
                             confusion_matrix, brier_score_loss, classification_report)
from sklearn.pipeline import Pipeline
from sklearn.exceptions import ConvergenceWarning
from xgboost import XGBClassifier
from lifelines import KaplanMeierFitter
from sklearn.utils import resample


In [None]:
# Load the full dataset into a DataFrame.
# NOTE(review): the file name looks like a placeholder (possibly a typo of
# 'yourdata.csv') — point it at the real CSV before running.
data = pd.read_csv('yourdara.csv')


In [None]:
# Hold out one centre as the external test set; all other centres form the
# training set. A single constant keeps the two filters complementary: the
# original compared against 'xxxx' for training but 'xxx' for testing, so the
# held-out rows were never actually removed from the training data.
HOLDOUT_CENTRE = 'xxx'  # placeholder — replace with the real centre identifier
is_holdout = data['centre'] == HOLDOUT_CENTRE
train_data = data[~is_holdout]
test_data = data[is_holdout]

In [None]:
# Columns holding categorical values that must be one-hot encoded
categorical_cols = ['cat1', 'cat2', 'etc']

# One-hot encode each split separately
train_encoded = pd.get_dummies(train_data, columns=categorical_cols)
test_encoded = pd.get_dummies(test_data, columns=categorical_cols)

# Outer-join on the column axis so both splits expose the same columns;
# a column missing from one split is created there and filled with 0
train_aligned, test_aligned = train_encoded.align(
    test_encoded,
    join='outer',
    axis=1,
    fill_value=0,
)

# The imputer learns from the training split only and is then applied to both
mice_imputer = IterativeImputer()
mice_imputer.fit(train_aligned)

# transform() returns a bare array, so wrap it back into a labelled DataFrame
train_imputed = pd.DataFrame(
    mice_imputer.transform(train_aligned),
    columns=train_aligned.columns,
)
test_imputed = pd.DataFrame(
    mice_imputer.transform(test_aligned),
    columns=test_aligned.columns,
)



In [None]:
from sklearn.preprocessing import StandardScaler

# Continuous columns to standardise.
# NOTE(review): this is a placeholder string — presumably it has to be replaced
# by a list of real column names before the cell can run; confirm.
continuous_cols = 'list of continuous cols'

# Learn the scaling parameters from the training split only
scaler = StandardScaler()
scaler.fit(train_imputed[continuous_cols])

# Apply the same fitted scaler to both splits, overwriting the columns in place
for split in (train_imputed, test_imputed):
    split[continuous_cols] = scaler.transform(split[continuous_cols])




In [None]:
from sklearn.exceptions import ConvergenceWarning
# Silence sklearn convergence warnings for the rest of the session.
# Calling ConvergenceWarning('ignore') only builds an exception object and
# discards it; the warnings filter is what actually suppresses the messages.
warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [None]:
# Define the feature and target variables.
# The outcome column must be removed from the feature matrix, otherwise every
# selector and model downstream sees the label itself (target leakage).
# The original line was also missing its closing parenthesis.
y = train_imputed['Outcome Col']
x = train_imputed.drop(columns=['Outcome Col'])
np.random.seed(123)

# L1-penalised logistic regression used as an embedded feature selector
model = LogisticRegression(penalty='l1', solver='saga')

# Search 50 log-spaced values of C with 10-fold CV, ranked by ROC AUC
param_grid = {'C': np.logspace(-4, 4, 50)}
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=10,
    scoring='roc_auc',
)
grid_search.fit(x, y)

# Continue with the estimator refitted on the best C
model = grid_search.best_estimator_

# In-sample predictions of the tuned model
pred = model.predict(x)

# Features whose coefficient is non-zero count as selected
coef = model.coef_[0]
Lasso_selected_features = np.asarray(x.columns)[coef != 0]

print(len(Lasso_selected_features), Lasso_selected_features, sep='\n')


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

np.random.seed(123)

# Elastic-net logistic regression used as an embedded feature selector
model = LogisticRegression(penalty='elasticnet', solver='saga')

# Joint grid over C (50 log-spaced values) and the L1/L2 mixing ratio
# (10 evenly spaced values), evaluated with 10-fold CV on ROC AUC
param_grid = dict(
    C=np.logspace(-4, 4, 50),
    l1_ratio=np.linspace(0, 1, 10),
)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=10,
    scoring='roc_auc',
)
grid_search.fit(x, y)

# Continue with the estimator refitted on the best parameter pair
model = grid_search.best_estimator_

# In-sample predictions of the tuned model
pred = model.predict(x)

# Features whose coefficient is non-zero count as selected
coef = model.coef_[0]
Elastic_selected_features = np.asarray(x.columns)[coef != 0]

print(len(Elastic_selected_features), Elastic_selected_features, sep='\n')


In [None]:
np.random.seed(321)

# Integer-encode the outcome for the classifier
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Random forest drives the elimination
model = RandomForestClassifier()

# Recursive feature elimination, one feature per step, 10-fold CV
selector = RFECV(model, step=1, cv=10).fit(x, y_encoded)

# Boolean keep-mask, then the ranking assigned to every feature
print(selector.support_, selector.ranking_, sep='\n')

# Column names retained by the selector
RFE_selected_features = np.asarray(x.columns)[selector.support_]

print(len(RFE_selected_features), RFE_selected_features, sep='\n')




In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Fit a full PCA (all components kept) on the feature matrix
pca = PCA()
pca.fit(x)

# Per-component and running-total share of variance
var_explained = pca.explained_variance_ratio_
cum_var_explained = np.cumsum(var_explained)

print("Variance explained by each component:", var_explained, sep='\n')
print("Cumulative variance explained:", cum_var_explained, sep='\n')

# Scree plot: variance share against component number
component_numbers = range(1, len(var_explained) + 1)
plt.figure(figsize=(10, 7))
plt.plot(component_numbers, var_explained, 'o-')
plt.title('Scree plot')
plt.xlabel('Number of components')
plt.ylabel('Proportional variance explained')
plt.show()

# Keep the features whose loading on the first component is non-zero.
# NOTE(review): loadings are rarely exactly zero, so this mask probably keeps
# every column — confirm this is the intended selection rule.
first_component = pca.components_[0]
PCA_selected_features = np.asarray(x.columns)[first_component != 0]

print(len(PCA_selected_features), PCA_selected_features, sep='\n')


In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

# Shallow, class-balanced random forest used as Boruta's base estimator
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# Boruta wrapper around the forest, seeded for repeatability
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=0, random_state=123)

# Boruta is given plain arrays rather than pandas objects
feat_selector.fit(x.values, y.values.ravel())

# Names of the columns flagged by the support mask
Boruta_selected_features = [
    column for column, keep in zip(x.columns, feat_selector.support_) if keep
]

print(len(Boruta_selected_features), Boruta_selected_features, sep='\n')


In [None]:
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

np.random.seed(123)

# Fit a 1000-tree random forest on all features
model = RandomForestClassifier(n_estimators=1000)
model.fit(x, y)

# Importance of each feature as reported by the fitted forest
importances = model.feature_importances_

# Horizontal bar chart, one bar per feature
plt.figure(figsize=(10, 7))
plt.barh(x.columns, importances)
plt.xlabel('Importance')
plt.title('Feature importances from Random Forest')
plt.show()

# Every feature with strictly positive importance is kept
has_importance = importances > 0
RF_selected_features = x.columns[has_importance]

print(len(RF_selected_features), RF_selected_features, sep='\n')


In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# Rank features by mutual information with the outcome and keep the top 10
selector = SelectKBest(score_func=mutual_info_classif, k=10).fit(x, y)

# Translate the boolean support mask into column names
mim_mask = selector.get_support()
MIM_selected_features = x.columns[mim_mask]

print(len(MIM_selected_features), MIM_selected_features, sep='\n')


In [None]:
# Magnitude of each feature's Pearson correlation with the outcome
pearson_corr = abs(x.corrwith(y))

# Number of features making up the top 40 % (rounded down); the Spearman and
# Kendall cells reuse this count
top_40_percent = int(0.4 * len(pearson_corr))

# Names of the most strongly correlated features
pearson_selected_features = pearson_corr.nlargest(n=top_40_percent).index

print(len(pearson_selected_features), pearson_selected_features, sep='\n')


In [None]:
# Magnitude of each feature's Spearman rank correlation with the outcome
spearman_corr = abs(x.corrwith(y, method='spearman'))

# Keep the same number of features as computed in the Pearson cell
spearman_selected_features = spearman_corr.nlargest(n=top_40_percent).index

print(len(spearman_selected_features), spearman_selected_features, sep='\n')


In [None]:
# Magnitude of each feature's Kendall tau correlation with the outcome
kendall_corr = abs(x.corrwith(y, method='kendall'))

# Keep the same number of features as computed in the Pearson cell
kendall_selected_features = kendall_corr.nlargest(n=top_40_percent).index

print(len(kendall_selected_features), kendall_selected_features, sep='\n')


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Rank features by ANOVA F-value against the outcome and keep the top 10
selector = SelectKBest(score_func=f_classif, k=10).fit(x, y)

# Translate the boolean support mask into column names
fvalue_mask = selector.get_support()
fvalue_selected_features = x.columns[fvalue_mask]

print(len(fvalue_selected_features), fvalue_selected_features, sep='\n')


In [None]:
from sklearn.feature_selection import VarianceThreshold

# Default threshold: only features that never vary are dropped
selector = VarianceThreshold().fit(x)

# Translate the boolean support mask into column names
variance_mask = selector.get_support()
variance_selected_features = x.columns[variance_mask]

print(len(variance_selected_features), variance_selected_features, sep='\n')


In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LogisticRegression

# Greedy forward selection of 10 features around a logistic regression,
# scored on accuracy with cross-validation switched off (cv=0)
sfs = SFS(
    LogisticRegression(),
    k_features=10,
    forward=True,
    floating=False,
    scoring='accuracy',
    cv=0,
).fit(x, y)

# Selected feature names as a plain list
sfs_selected_features = [name for name in sfs.k_feature_names_]

print(len(sfs_selected_features), sfs_selected_features, sep='\n')


In [None]:

from xgboost import XGBClassifier

# Map each feature-selection method to the feature names it retained;
# the insertion order here is the order the comparison loop iterates in
features_select = dict(
    pearson=pearson_selected_features,
    spearman=spearman_selected_features,
    kendall=kendall_selected_features,
    mutual_info=MIM_selected_features,
    lasso=Lasso_selected_features,
    elastic_net=Elastic_selected_features,
    rfe=RFE_selected_features,
    pca=PCA_selected_features,
    boruta=Boruta_selected_features,
    random_forest=RF_selected_features,
)

In [None]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, roc_curve, auc
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import RidgeClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

# Train every model on every selected feature set and record train /
# validation / external-test metrics plus the ROC curves.
#
# Outputs of this cell:
#   results          list of {'feature_set', 'results': [per-model metric dicts]}
#   roc_curves_list  list of {'feature_set', 'roc_curves': [per-model curves]}
# The y_pred_* variables of the last fitted model remain defined afterwards.

# Per-feature-set ROC curves collected by the loop below
roc_curves_list = []

le = LabelEncoder()
smote = SMOTE(random_state=42)  # NOTE(review): created but never applied in this cell

models = {
    # liblinear supports both the 'l1' and 'l2' penalties searched below;
    # the default lbfgs solver cannot fit the 'l1' candidates.
    'Logistic Regression': LogisticRegression(solver='liblinear'),
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'Neural Network': MLPClassifier(verbose=False),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(probability=True),
    'XGBoost': XGBClassifier(),
    'Ridge': CalibratedClassifierCV(RidgeClassifier())
}

# An empty grid means the model is fitted as-is, without a grid search
hyperparameters = {
    'Logistic Regression': {'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10]},
    'Naive Bayes': {},
    'KNN': {'n_neighbors': [5, 10, 15], 'weights': ['uniform', 'distance']},
    'Neural Network': {'hidden_layer_sizes': [(10,), (20,), (30,)], 'alpha': [0.0001, 0.01, 0.1]},
    'Random Forest': {'n_estimators': [50, 100, 200, 400 ], 'max_depth': [5, 10, None], 'min_samples_split': [2, 10]},
    'Support Vector Machine': {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01, 0.001]},
    'XGBoost': {'n_estimators': [100, 200], 'learning_rate': [0.1, 0.2], 'max_depth': [3, 5]},
    'Ridge': {}
}

results = []

# Kept for compatibility with other cells; not filled in here
roc_curves = {}
feature_importance = {}

# Loop-invariant pieces: the CV splitter and the integer-encoded outcomes
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)
y_train_full = le.fit_transform(train_imputed['outcome var']).astype(int)
y_test = le.transform(test_imputed['outcome var']).astype(int)

for feature_set_name, feature_set in features_select.items():
    roc_curves_per_feature_set = []
    results_per_feature_set = []

    # 80/20 split of the training centres; the held-out centre is the test set
    X_train, X_val, y_train, y_val = train_test_split(
        train_imputed[feature_set], y_train_full, test_size=0.2, random_state=42
    )
    X_test = test_imputed[feature_set]

    for model_name, model in models.items():
        # Hyperparameter tuning ('roc_auc' scores on probabilities / decision
        # values and matches the scorer used by the feature-selection cells)
        if hyperparameters[model_name]:
            model = GridSearchCV(model, param_grid=hyperparameters[model_name], cv=cv, scoring='roc_auc')

        model.fit(X_train, y_train)

        # Hard class labels
        y_pred_train_labels = model.predict(X_train)
        y_pred_val_labels = model.predict(X_val)
        y_pred_test_labels = model.predict(X_test)

        # Probability of the positive class
        y_pred_train_proba = model.predict_proba(X_train)[:, 1]
        y_pred_val_proba = model.predict_proba(X_val)[:, 1]
        y_pred_test_proba = model.predict_proba(X_test)[:, 1]

        precision_train = precision_score(y_train, y_pred_train_labels)
        precision_val = precision_score(y_val, y_pred_val_labels)
        precision_test = precision_score(y_test, y_pred_test_labels)

        recall_train = recall_score(y_train, y_pred_train_labels)
        recall_val = recall_score(y_val, y_pred_val_labels)
        recall_test = recall_score(y_test, y_pred_test_labels)

        # f1_score handles precision + recall == 0, where the manual
        # 2PR / (P + R) formula divides by zero
        f1_train = f1_score(y_train, y_pred_train_labels)
        f1_val = f1_score(y_val, y_pred_val_labels)
        f1_test = f1_score(y_test, y_pred_test_labels)

        # ROC curve and area, computed once per split
        fpr_train, tpr_train, _ = roc_curve(y_train, y_pred_train_proba)
        roc_auc_train = auc(fpr_train, tpr_train)

        fpr_val, tpr_val, _ = roc_curve(y_val, y_pred_val_proba)
        roc_auc_val = auc(fpr_val, tpr_val)

        fpr_test, tpr_test, _ = roc_curve(y_test, y_pred_test_proba)
        roc_auc_test = auc(fpr_test, tpr_test)

        # labels=[0, 1] guarantees a 2x2 matrix even if a class is absent
        cm_train = confusion_matrix(y_train, y_pred_train_labels, labels=[0, 1])
        cm_val = confusion_matrix(y_val, y_pred_val_labels, labels=[0, 1])
        cm_test = confusion_matrix(y_test, y_pred_test_labels, labels=[0, 1])

        # sklearn lays the binary matrix out as [[tn, fp], [fn, tp]], so the
        # flattened order is tn, fp, fn, tp
        tn_train, fp_train, fn_train, tp_train = cm_train.ravel()
        tn_val, fp_val, fn_val, tp_val = cm_val.ravel()
        tn_test, fp_test, fn_test, tp_test = cm_test.ravel()

        sensitivity_train = tp_train / (tp_train + fn_train)
        sensitivity_val = tp_val / (tp_val + fn_val)
        sensitivity_test = tp_test / (tp_test + fn_test)

        specificity_train = tn_train / (tn_train + fp_train)
        specificity_val = tn_val / (tn_val + fp_val)
        specificity_test = tn_test / (tn_test + fp_test)

        roc_curves_per_feature_set.append({
            'model': model_name,
            'train': {'fpr': fpr_train, 'tpr': tpr_train},
            'val': {'fpr': fpr_val, 'tpr': tpr_val},
            'test': {'fpr': fpr_test, 'tpr': tpr_test}
        })

        # One metrics record per (feature set, model) pair
        results_per_feature_set.append({
            'model': model_name,
            'roc_auc_train': roc_auc_train,
            'roc_auc_val': roc_auc_val,
            'roc_auc_test': roc_auc_test,
            'f1_train': f1_train,
            'f1_val': f1_val,
            'f1_test': f1_test,
            'recall_train': recall_train,
            'recall_val': recall_val,
            'recall_test': recall_test,
            'precision_train': precision_train,
            'precision_val': precision_val,
            'precision_test': precision_test,
            'sensitivity_train': sensitivity_train,
            'sensitivity_val': sensitivity_val,
            'sensitivity_test': sensitivity_test,
            'specificity_train': specificity_train,
            'specificity_val': specificity_val,
            'specificity_test': specificity_test,
        })

    results.append({
        'feature_set': feature_set_name,
        'results': results_per_feature_set
    })
    roc_curves_list.append({
        'feature_set': feature_set_name,
        'roc_curves': roc_curves_per_feature_set
    })



In [None]:
# Flatten the nested results (feature set -> model) into one sequence of
# per-model metric dicts, preserving the original iteration order
_flat_model_results = [
    model_result
    for result in results
    for model_result in result['results']
]

# One flat list per metric and split, in the same order as the flattened records
train_auc = [m['roc_auc_train'] for m in _flat_model_results]
val_auc = [m['roc_auc_val'] for m in _flat_model_results]
test_auc = [m['roc_auc_test'] for m in _flat_model_results]

train_f1 = [m['f1_train'] for m in _flat_model_results]
val_f1 = [m['f1_val'] for m in _flat_model_results]
test_f1 = [m['f1_test'] for m in _flat_model_results]

train_recall = [m['recall_train'] for m in _flat_model_results]
val_recall = [m['recall_val'] for m in _flat_model_results]
test_recall = [m['recall_test'] for m in _flat_model_results]

train_precision = [m['precision_train'] for m in _flat_model_results]
val_precision = [m['precision_val'] for m in _flat_model_results]
test_precision = [m['precision_test'] for m in _flat_model_results]

train_sensitivity = [m['sensitivity_train'] for m in _flat_model_results]
val_sensitivity = [m['sensitivity_val'] for m in _flat_model_results]
test_sensitivity = [m['sensitivity_test'] for m in _flat_model_results]

train_specificity = [m['specificity_train'] for m in _flat_model_results]
val_specificity = [m['specificity_val'] for m in _flat_model_results]
test_specificity = [m['specificity_test'] for m in _flat_model_results]


In [None]:

# Pick the (model, feature-selection) pair with the highest validation
# ROC AUC + validation F1.
#
# The variable names say "test" for historical reasons, but the values are
# validation metrics; the external test set is not used for selection.

# Axis labels are taken from `results` itself, so this cell does not depend on
# what `models` / `features_select` are currently bound to.
feature_set_names = [result['feature_set'] for result in results]
model_names = [model_result['model'] for model_result in results[0]['results']]

roc_auc_test_results = []
f1_test_results = []

# `results` is ordered feature set first, then model
for result in results:
    for model_result in result['results']:
        roc_auc_test_results.append(model_result['roc_auc_val'])
        f1_test_results.append(model_result['f1_val'])

# The flat lists are feature-set-major, so they must be reshaped to
# (n_feature_sets, n_models). Reshaping straight to (n_models, n_feature_sets)
# would pair scores with the wrong model and feature set. Transposing afterwards
# keeps the matrices in the (model, feature set) orientation.
grid_shape = (len(feature_set_names), len(model_names))
roc_auc_test_matrix = np.array(roc_auc_test_results, dtype=float).reshape(grid_shape).T
f1_test_matrix = np.array(f1_test_results, dtype=float).reshape(grid_shape).T

# Combine the ROC AUC and F1-score matrices
combined_matrix = roc_auc_test_matrix + f1_test_matrix

# nanargmax skips undefined (NaN) scores; plain argmax would pick a NaN cell
best_model_index, best_feature_index = np.unravel_index(
    np.nanargmax(combined_matrix), combined_matrix.shape
)

# Get the corresponding best model and feature selection technique
best_model = model_names[best_model_index]
best_feature = feature_set_names[best_feature_index]

# Print the best model x feature selection combination
print("Best Model x Feature Selection Combination based on AUC and F1-score:")
print("Model:", best_model)
print("Feature Selection Technique:", best_feature)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Draw one heatmap per metric and split (6 metrics x 3 splits = 18 panels),
# with models on the y axis and feature-selection methods on the x axis.

# Axis labels come from the stored results. The names go into `model_names`
# rather than `models`: rebinding `models` to a list would clobber the
# estimator dictionary that earlier cells rely on when re-run.
# Assumes the same models were used for each feature selection method.
model_names = [model_result['model'] for model_result in results[0]['results']]
feature_select_methods = [result['feature_set'] for result in results]

# One (n_models, n_feature_sets) matrix per '<metric>_<split>' key
metric_names = [
    f'{metric}_{split}'
    for metric in ('roc_auc', 'f1', 'precision', 'recall', 'sensitivity', 'specificity')
    for split in ('train', 'val', 'test')
]
results_dict = {
    name: np.zeros((len(model_names), len(feature_select_methods)))
    for name in metric_names
}

# results is indexed [feature set][model]; the matrices are [model, feature set]
for i, result in enumerate(results):
    for j, model_result in enumerate(result['results']):
        for metric in results_dict:
            results_dict[metric][j, i] = model_result[metric]

# 6 x 3 grid, flattened so it can be zipped with the metrics
fig, axes = plt.subplots(nrows=6, ncols=3, figsize=(40, 40))
axes = axes.flatten()

for (metric, matrix), ax in zip(results_dict.items(), axes):
    sns.heatmap(matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax, cbar=False)
    ax.set_title(metric.replace('_', ' ').title())
    ax.set_ylabel("Model")
    ax.set_xlabel("Feature Selection Technique")
    ax.set_yticklabels(model_names, rotation=0, fontsize=14)
    ax.set_xticklabels(feature_select_methods, rotation=90, fontsize=14)
    ax.tick_params(axis='both', which='both', length=0)  # hide tick marks
    ax.set_aspect('equal')  # square cells in every panel

plt.tight_layout()
plt.show()


In [None]:

# Rebuild the mapping from feature-selection method to the feature names it
# retained (same content and order as the earlier definition)
features_select = dict(
    pearson=pearson_selected_features,
    spearman=spearman_selected_features,
    kendall=kendall_selected_features,
    mutual_info=MIM_selected_features,
    lasso=Lasso_selected_features,
    elastic_net=Elastic_selected_features,
    rfe=RFE_selected_features,
    pca=PCA_selected_features,
    boruta=Boruta_selected_features,
    random_forest=RF_selected_features,
)

In [None]:
# Refit Logistic Regression on the Lasso-selected features, then plot its ROC
# curves and coefficients. The predictions computed here (labels and
# probabilities) are the ones the later CI / metrics-table cells should use.
from sklearn.metrics import precision_score, recall_score, roc_curve, auc
feature_set_name = 'lasso'  # Replace this with the correct key if it's different
feature_set = features_select[feature_set_name]

# Get the training, validation, and test sets
X_train, X_val, y_train, y_val = train_test_split(
    train_imputed[feature_set],
    le.fit_transform(train_imputed['Success.Y.N.Lost_Y']).astype(int),
    test_size=0.2,
    random_state=42,
)
X_test = test_imputed[feature_set]
y_test = le.transform(test_imputed['Success.Y.N.Lost_Y']).astype(int)

# liblinear supports both the 'l1' and 'l2' penalties in the search grid;
# the default lbfgs solver cannot fit the 'l1' candidates.
model_name = 'Logistic Regression'
model = LogisticRegression(solver='liblinear')

# Tune only when a grid is defined for this model
if hyperparameters[model_name]:
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)
    model = GridSearchCV(model, param_grid=hyperparameters[model_name], cv=cv, scoring='roc_auc')

# Fit the model
model.fit(X_train, y_train)

# Class labels from THIS model. Without these lines the metrics-table cell
# would silently reuse labels left over from the comparison loop's last model.
y_pred_train_labels = model.predict(X_train)
y_pred_val_labels = model.predict(X_val)
y_pred_test_labels = model.predict(X_test)

# Probability of the positive class
y_pred_train_proba = model.predict_proba(X_train)[:, 1]
y_pred_val_proba = model.predict_proba(X_val)[:, 1]
y_pred_test_proba = model.predict_proba(X_test)[:, 1]

# Calculate the ROC curves
fpr_train, tpr_train, _ = roc_curve(y_train, y_pred_train_proba)
fpr_val, tpr_val, _ = roc_curve(y_val, y_pred_val_proba)
fpr_test, tpr_test, _ = roc_curve(y_test, y_pred_test_proba)

# Plot ROC curves
plt.figure(figsize=(10, 8))
plt.plot(fpr_train, tpr_train, label='Train')
plt.plot(fpr_val, tpr_val, label='Validation')
plt.plot(fpr_test, tpr_test, label='Test')
plt.title('ROC Curves')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

# Logistic Regression has no feature_importances_; its coefficients are used
# as the importance indication instead. The fitted estimator is the grid
# search's best_estimator_ when tuning ran, otherwise the model itself.
fitted_estimator = getattr(model, 'best_estimator_', model)
plt.figure(figsize=(10, 8))
plt.barh(feature_set, fitted_estimator.coef_[0])
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()


In [None]:
from sklearn.utils import resample
from scipy.stats import sem, t

def compute_auc_ci(y_true, y_score, n_bootstraps=1000, alpha=0.05, random_state=None):
    """
    Compute the ROC AUC and a percentile-bootstrap confidence interval.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True binary labels.
    y_score : array-like of shape (n_samples,)
        Target scores.
    n_bootstraps : int, default=1000
        Number of bootstrap resamples to draw.
    alpha : float, default=0.05
        Significance level; the interval covers 1 - alpha
        (0.05 gives a 95% confidence interval).
    random_state : int or None, default=None
        Seed for the bootstrap. With None the resampling uses numpy's global
        random state, as it did before this parameter existed.

    Returns
    -------
    auc : float
        AUC score on the full sample.
    ci_lower : float
        Lower bound of the confidence interval (NaN if no usable resample).
    ci_upper : float
        Upper bound of the confidence interval (NaN if no usable resample).
    """
    assert len(y_true) == len(y_score), "Lengths of y_true and y_score should be equal."

    auc_value = roc_auc_score(y_true, y_score)

    # One RandomState shared by all draws: passing the integer seed to every
    # resample() call would reproduce the same bootstrap sample each time.
    rng = None if random_state is None else np.random.RandomState(random_state)

    bootstrapped_scores = []
    for _ in range(n_bootstraps):
        # Bootstrap by sampling with replacement
        y_true_resampled, y_score_resampled = resample(y_true, y_score, random_state=rng)
        # AUC is undefined when a resample contains a single class, and
        # roc_auc_score raises in that case, so skip such draws.
        if np.unique(y_true_resampled).size < 2:
            continue
        bootstrapped_scores.append(roc_auc_score(y_true_resampled, y_score_resampled))

    if not bootstrapped_scores:
        return auc_value, float('nan'), float('nan')

    # Percentile interval; np.percentile avoids the out-of-range index that
    # manual indexing into the sorted scores can produce at the upper tail.
    confidence_lower, confidence_upper = np.percentile(
        bootstrapped_scores, [100 * alpha / 2.0, 100 * (1 - alpha / 2.0)]
    )

    return auc_value, confidence_lower, confidence_upper


In [None]:
# AUC with bootstrap confidence interval for each split (train, validation, test)
auc_train, ci_lower_train, ci_upper_train = compute_auc_ci(y_train, y_pred_train_proba)
auc_val, ci_lower_val, ci_upper_val = compute_auc_ci(y_val, y_pred_val_proba)
auc_test, ci_lower_test, ci_upper_test = compute_auc_ci(y_test, y_pred_test_proba)

# Overlay the three ROC curves; each legend entry carries its AUC and CI
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr_train, tpr_train, label=f'Train (AUC = {auc_train:.2f}, CI = [{ci_lower_train:.2f}, {ci_upper_train:.2f}])')
ax.plot(fpr_val, tpr_val, label=f'Validation (AUC = {auc_val:.2f}, CI = [{ci_lower_val:.2f}, {ci_upper_val:.2f}])')
ax.plot(fpr_test, tpr_test, label=f'Test (AUC = {auc_test:.2f}, CI = [{ci_lower_test:.2f}, {ci_upper_test:.2f}])')
ax.set_title('ROC Curves')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc='lower right')
plt.show()


In [None]:

def generate_metrics_table(y_true, y_pred, y_score, n_bootstraps=1000, alpha=0.05, random_state=None):
    """
    Build a one-column table of classification metrics for a single data split.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True binary labels (0 is treated as the negative class).
    y_pred : array-like of shape (n_samples,)
        Predicted class labels.
    y_score : array-like of shape (n_samples,)
        Predicted probability of the positive class (used for the Brier score).
    n_bootstraps : int, default=1000
        Number of bootstrap resamples for the accuracy confidence interval.
    alpha : float, default=0.05
        Significance level; the interval covers 1 - alpha.
    random_state : int or None, default=None
        Seed for the bootstrap. With None the resampling uses numpy's global
        random state, as it did before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Metrics as rows, with a single 'Value' column. The accuracy interval
        is stored as a formatted string '[lower, upper]'.
    """
    # Ensure input lengths match
    assert len(y_true) == len(y_pred) == len(y_score), "Input lengths should be equal."

    metrics_dict = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Balanced Accuracy': balanced_accuracy_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
        'Sensitivity / Recall': recall_score(y_true, y_pred),
        # Specificity is the recall of the negative class
        'Specificity': recall_score(y_true, y_pred, pos_label=0),
        'PPV / Precision': precision_score(y_true, y_pred),
        'Brier Score': brier_score_loss(y_true, y_score),
    }

    # One RandomState shared by all draws: passing the integer seed to every
    # resample() call would reproduce the same bootstrap sample each time.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Bootstrap the accuracy by sampling (label, prediction) pairs with replacement
    bootstrapped_acc_scores = [
        accuracy_score(*resample(y_true, y_pred, random_state=rng))
        for _ in range(n_bootstraps)
    ]

    if bootstrapped_acc_scores:
        # Percentile interval; np.percentile avoids the out-of-range index that
        # manual indexing into the sorted scores can produce at the upper tail.
        acc_ci_lower, acc_ci_upper = np.percentile(
            bootstrapped_acc_scores, [100 * alpha / 2.0, 100 * (1 - alpha / 2.0)]
        )
    else:
        acc_ci_lower = acc_ci_upper = float('nan')

    # The label follows alpha; the default 0.05 yields 'Accuracy 95% CI'
    ci_label = f'Accuracy {100 * (1 - alpha):g}% CI'
    metrics_dict[ci_label] = f'[{acc_ci_lower:.2f}, {acc_ci_upper:.2f}]'

    metrics_df = pd.DataFrame(metrics_dict, index=['Value']).transpose()

    return metrics_df

# Metric tables for the validation and external test splits, built from the
# labels and probabilities currently held in the y_pred_* variables
metrics_val = generate_metrics_table(y_val, y_pred_val_labels, y_pred_val_proba)
metrics_test = generate_metrics_table(y_test, y_pred_test_labels, y_pred_test_proba)

# Place the two tables side by side under 'Validation' / 'Test' headers
metrics_table = pd.concat(
    {'Validation': metrics_val, 'Test': metrics_test},
    axis=1,
)
print(metrics_table)

