In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, roc_curve
from pyswarm import pso

### Random Forest (without feature selection)

In [16]:
def evaluate_dataset(df, dataset_name):
    # Classify the malignancy (0 = benign, 1 = malignant)
    df['malignancy'] = df['malignancy'].replace({1: 0, 2: 0, 4: 1, 5: 1})  

    # Separate Features and Target
    X = df.drop('malignancy', axis=1)  # Features
    y = df['malignancy']  # Target

    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize Random Forest Classifier
    rf = RandomForestClassifier(n_estimators=100, random_state=42)

    # Define multiple metrics to evaluate
    scoring = {
        'accuracy': 'accuracy',
        'f1': make_scorer(f1_score),
        'roc_auc': 'roc_auc',
        'precision': make_scorer(precision_score),
        'recall': make_scorer(recall_score)
    }

    # Perform 10-fold cross-validation with multiple metrics
    cv_results = cross_validate(rf, X_train, y_train, cv=10, scoring=scoring, return_train_score=False)

    # Print the mean and standard deviation of each metric
    print(f"Results for {dataset_name}:")
    print(f"Accuracy (10-fold CV): {cv_results['test_accuracy'].mean():.4f} ± {cv_results['test_accuracy'].std():.4f}")
    print(f"F1 Score (10-fold CV): {cv_results['test_f1'].mean():.4f} ± {cv_results['test_f1'].std():.4f}")
    print(f"ROC-AUC (10-fold CV): {cv_results['test_roc_auc'].mean():.4f} ± {cv_results['test_roc_auc'].std():.4f}")
    print(f"Precision (10-fold CV): {cv_results['test_precision'].mean():.4f} ± {cv_results['test_precision'].std():.4f}")
    print(f"Recall (10-fold CV): {cv_results['test_recall'].mean():.4f} ± {cv_results['test_recall'].std():.4f}")
    print('-' * 50)

# Load datasets
df_combined = pd.read_csv('all_features_cleaned.csv')
df_radiomic = pd.read_csv('radiomic_features.csv')
df_pylidc = pd.read_csv('pylidc_features.csv')

# Evaluate each dataset without feature selection
evaluate_dataset(df_combined, "Combined Features")
evaluate_dataset(df_radiomic, "Radiomic Features")
evaluate_dataset(df_pylidc, "PyLidc Features")


Results for Combined Features:
Accuracy (10-fold CV): 0.8624 ± 0.0148
F1 Score (10-fold CV): 0.8144 ± 0.0207
ROC-AUC (10-fold CV): 0.9276 ± 0.0096
Precision (10-fold CV): 0.8459 ± 0.0210
Recall (10-fold CV): 0.7851 ± 0.0218
--------------------------------------------------
Results for Radiomic Features:
Accuracy (10-fold CV): 0.8113 ± 0.0161
F1 Score (10-fold CV): 0.7514 ± 0.0197
ROC-AUC (10-fold CV): 0.8811 ± 0.0109
Precision (10-fold CV): 0.7626 ± 0.0295
Recall (10-fold CV): 0.7416 ± 0.0275
--------------------------------------------------
Results for PyLidc Features:
Accuracy (10-fold CV): 0.8736 ± 0.0122
F1 Score (10-fold CV): 0.8337 ± 0.0163
ROC-AUC (10-fold CV): 0.9378 ± 0.0102
Precision (10-fold CV): 0.8438 ± 0.0194
Recall (10-fold CV): 0.8241 ± 0.0200
--------------------------------------------------


### Random Forest Classifier (with feature selection using Random Forest)

In [15]:
def evaluate_dataset_with_feature_selection(df, dataset_name, threshold=0.01):
    # Classify the malignancy (0 = benign, 1 = malignant)
    df['malignancy'] = df['malignancy'].replace({1: 0, 2: 0, 4: 1, 5: 1})  

    # Separate Features and Target
    X = df.drop('malignancy', axis=1)  # Features
    y = df['malignancy']  # Target

    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize Random Forest Classifier
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Get feature importances
    feature_importances = rf.feature_importances_

    # Create a DataFrame to store feature names and their importances
    importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_importances
    })

    # Select features with importance greater than the threshold
    selected_features = importance_df[importance_df['Importance'] > threshold]['Feature'].tolist()
    print(f"Selected features (importance > {threshold}) for {dataset_name}: {selected_features}")

    # Reduce the dataset to the selected features
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]

    # Initialize a new Random Forest Classifier for the selected features
    rf_selected = RandomForestClassifier(n_estimators=100, random_state=42)

    # Define multiple metrics to evaluate
    scoring = {
        'accuracy': 'accuracy',
        'f1': make_scorer(f1_score),
        'roc_auc': 'roc_auc',
        'precision': make_scorer(precision_score),
        'recall': make_scorer(recall_score)
    }

    # Perform 10-fold cross-validation with the selected features
    cv_results = cross_validate(rf_selected, X_train_selected, y_train, cv=10, scoring=scoring, return_train_score=False)

    # Print the mean and standard deviation of each metric
    print(f"Results for {dataset_name} after feature selection:")
    print(f"Accuracy (10-fold CV): {cv_results['test_accuracy'].mean():.4f} ± {cv_results['test_accuracy'].std():.4f}")
    print(f"F1 Score (10-fold CV): {cv_results['test_f1'].mean():.4f} ± {cv_results['test_f1'].std():.4f}")
    print(f"ROC-AUC (10-fold CV): {cv_results['test_roc_auc'].mean():.4f} ± {cv_results['test_roc_auc'].std():.4f}")
    print(f"Precision (10-fold CV): {cv_results['test_precision'].mean():.4f} ± {cv_results['test_precision'].std():.4f}")
    print(f"Recall (10-fold CV): {cv_results['test_recall'].mean():.4f} ± {cv_results['test_recall'].std():.4f}")
    print('-' * 50)

# Load datasets
df_combined = pd.read_csv('all_features_cleaned.csv')
df_radiomic = pd.read_csv('radiomic_features.csv')
df_pylidc = pd.read_csv('pylidc_features.csv')

# Evaluate each dataset with feature selection
evaluate_dataset_with_feature_selection(df_combined, "Combined Features")
evaluate_dataset_with_feature_selection(df_radiomic, "Radiomic Features")
evaluate_dataset_with_feature_selection(df_pylidc, "PyLidc Features")


Selected features (importance > 0.01) for Combined Features: ['original_shape_Maximum2DDiameterSlice', 'original_shape_MeshVolume', 'original_shape_MinorAxisLength', 'original_shape_SurfaceArea', 'original_shape_VoxelVolume', 'original_firstorder_Minimum', 'original_glcm_Idn', 'original_glcm_Imc1', 'original_glcm_Imc2', 'original_gldm_DependenceNonUniformity', 'original_gldm_GrayLevelNonUniformity', 'original_gldm_SmallDependenceLowGrayLevelEmphasis', 'original_glrlm_GrayLevelNonUniformity', 'original_glrlm_RunLengthNonUniformity', 'original_glszm_GrayLevelNonUniformity', 'subtlety', 'calcification', 'margin', 'lobulation', 'spiculation', 'diameter', 'surface_area', 'volume', 'fourier_hist_bin_9']
Results for Combined Features after feature selection:
Accuracy (10-fold CV): 0.8724 ± 0.0137
F1 Score (10-fold CV): 0.8294 ± 0.0182
ROC-AUC (10-fold CV): 0.9375 ± 0.0112
Precision (10-fold CV): 0.8540 ± 0.0241
Recall (10-fold CV): 0.8065 ± 0.0227
---------------------------------------------

### Random Forest Classifier (with feature selection using LASSO)

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, roc_auc_score, accuracy_score

def evaluate_dataset_with_lasso(df, dataset_name, lasso_alpha=0.01):
    """
    Standard function to evaluate Random Forest with 10-fold cross-validation
    after selecting features using Lasso for feature selection.
    
    Parameters:
    - df: DataFrame containing features and target
    - dataset_name: String, name of the dataset (for printing results)
    - lasso_alpha: Float, regularization strength for Lasso (default 0.01)
    
    Returns:
    - Prints the performance metrics of the model after feature selection with Lasso
    """
    # Classify the malignancy (0 = benign, 1 = malignant)
    df['malignancy'] = df['malignancy'].replace({1: 0, 2: 0, 4: 1, 5: 1})  

    # Separate Features and Target
    X = df.drop('malignancy', axis=1)  # Features
    y = df['malignancy']  # Target

    # Standardize features for Lasso (important for regularization)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Apply Lasso for feature selection (L1 regularization)
    lasso = LogisticRegression(penalty='l1', solver='liblinear', C=1/lasso_alpha, max_iter=1000)
    lasso.fit(X_train, y_train)

    # Get the non-zero coefficients (selected features)
    selected_features_indices = lasso.coef_ != 0
    selected_features = X.columns[selected_features_indices[0]]
    
    print(f"Selected features using Lasso for {dataset_name}: {selected_features.tolist()}")

    # Reduce the dataset to the selected features
    X_train_selected = X_train[:, selected_features_indices[0]]
    X_test_selected = X_test[:, selected_features_indices[0]]

    # Train a Random Forest Classifier on the selected features
    rf = RandomForestClassifier(n_estimators=100, random_state=42)

    # Define multiple metrics to evaluate
    scoring = {
        'accuracy': 'accuracy',
        'f1': make_scorer(f1_score),
        'roc_auc': 'roc_auc',
        'precision': make_scorer(precision_score),
        'recall': make_scorer(recall_score)
    }

    # Perform 10-fold cross-validation with the selected features
    cv_results = cross_validate(rf, X_train_selected, y_train, cv=10, scoring=scoring, return_train_score=False)

    # Print the mean and standard deviation of each metric
    print(f"Results for {dataset_name} after Lasso feature selection:")
    print(f"Accuracy (10-fold CV): {cv_results['test_accuracy'].mean():.4f} ± {cv_results['test_accuracy'].std():.4f}")
    print(f"F1 Score (10-fold CV): {cv_results['test_f1'].mean():.4f} ± {cv_results['test_f1'].std():.4f}")
    print(f"ROC-AUC (10-fold CV): {cv_results['test_roc_auc'].mean():.4f} ± {cv_results['test_roc_auc'].std():.4f}")
    print(f"Precision (10-fold CV): {cv_results['test_precision'].mean():.4f} ± {cv_results['test_precision'].std():.4f}")
    print(f"Recall (10-fold CV): {cv_results['test_recall'].mean():.4f} ± {cv_results['test_recall'].std():.4f}")
    print('-' * 50)

# Load datasets
df_combined = pd.read_csv('all_features_cleaned.csv')
df_radiomic = pd.read_csv('radiomic_features.csv')
df_pylidc = pd.read_csv('pylidc_features.csv')

# Evaluate each dataset with Lasso feature selection and Random Forest classification
evaluate_dataset_with_lasso(df_combined, "Combined Features")
evaluate_dataset_with_lasso(df_radiomic, "Radiomic Features")
evaluate_dataset_with_lasso(df_pylidc, "PyLidc Features")



Selected features using Lasso for Combined Features: ['original_shape_Elongation', 'original_shape_Flatness', 'original_shape_LeastAxisLength', 'original_shape_MajorAxisLength', 'original_shape_Maximum2DDiameterColumn', 'original_shape_Maximum2DDiameterRow', 'original_shape_Maximum2DDiameterSlice', 'original_shape_Maximum3DDiameter', 'original_shape_MeshVolume', 'original_shape_MinorAxisLength', 'original_shape_Sphericity', 'original_shape_SurfaceArea', 'original_shape_SurfaceVolumeRatio', 'original_shape_VoxelVolume', 'original_firstorder_10Percentile', 'original_firstorder_90Percentile', 'original_firstorder_Energy', 'original_firstorder_Entropy', 'original_firstorder_InterquartileRange', 'original_firstorder_Kurtosis', 'original_firstorder_MeanAbsoluteDeviation', 'original_firstorder_Mean', 'original_firstorder_Median', 'original_firstorder_Minimum', 'original_firstorder_Range', 'original_firstorder_RobustMeanAbsoluteDeviation', 'original_firstorder_RootMeanSquared', 'original_first

### Random Forest Classifier (with feature selection using PCA)

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler

def evaluate_dataset_with_pca(df, dataset_name, n_components=0.95):
    # Classify the malignancy (0 = benign, 1 = malignant)
    df['malignancy'] = df['malignancy'].replace({1: 0, 2: 0, 4: 1, 5: 1})  

    # Separate Features and Target
    X = df.drop('malignancy', axis=1)  # Features
    y = df['malignancy']  # Target

    # Standardize features for PCA
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Apply PCA for dimensionality reduction
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    print(f"Number of components selected by PCA for {dataset_name}: {X_train_pca.shape[1]}")

    # Train a Random Forest Classifier on the PCA-transformed features
    rf = RandomForestClassifier(n_estimators=100, random_state=42)

    # Define multiple metrics to evaluate
    scoring = {
        'accuracy': 'accuracy',
        'f1': make_scorer(f1_score),
        'roc_auc': 'roc_auc',
        'precision': make_scorer(precision_score),
        'recall': make_scorer(recall_score)
    }

    # Perform 10-fold cross-validation with the PCA components
    cv_results = cross_validate(rf, X_train_pca, y_train, cv=10, scoring=scoring, return_train_score=False)

    # Print the mean and standard deviation of each metric
    print(f"Results for {dataset_name} after PCA feature selection:")
    print(f"Accuracy (10-fold CV): {cv_results['test_accuracy'].mean():.4f} ± {cv_results['test_accuracy'].std():.4f}")
    print(f"F1 Score (10-fold CV): {cv_results['test_f1'].mean():.4f} ± {cv_results['test_f1'].std():.4f}")
    print(f"ROC-AUC (10-fold CV): {cv_results['test_roc_auc'].mean():.4f} ± {cv_results['test_roc_auc'].std():.4f}")
    print(f"Precision (10-fold CV): {cv_results['test_precision'].mean():.4f} ± {cv_results['test_precision'].std():.4f}")
    print(f"Recall (10-fold CV): {cv_results['test_recall'].mean():.4f} ± {cv_results['test_recall'].std():.4f}")
    print('-' * 50)

# Load datasets
df_combined = pd.read_csv('all_features_cleaned.csv')
df_radiomic = pd.read_csv('radiomic_features.csv')
df_pylidc = pd.read_csv('pylidc_features.csv')

# Evaluate each dataset with PCA feature selection and Random Forest classification
evaluate_dataset_with_pca(df_combined, "Combined Features")
evaluate_dataset_with_pca(df_radiomic, "Radiomic Features")
evaluate_dataset_with_pca(df_pylidc, "PyLidc Features")


Number of components selected by PCA for Combined Features: 30
Results for Combined Features after PCA feature selection:
Accuracy (10-fold CV): 0.8621 ± 0.0086
F1 Score (10-fold CV): 0.8148 ± 0.0096
ROC-AUC (10-fold CV): 0.9275 ± 0.0104
Precision (10-fold CV): 0.8437 ± 0.0218
Recall (10-fold CV): 0.7882 ± 0.0107
--------------------------------------------------
Number of components selected by PCA for Radiomic Features: 24
Results for Radiomic Features after PCA feature selection:
Accuracy (10-fold CV): 0.8113 ± 0.0188
F1 Score (10-fold CV): 0.7479 ± 0.0219
ROC-AUC (10-fold CV): 0.8809 ± 0.0134
Precision (10-fold CV): 0.7708 ± 0.0360
Recall (10-fold CV): 0.7271 ± 0.0193
--------------------------------------------------
Number of components selected by PCA for PyLidc Features: 8
Results for PyLidc Features after PCA feature selection:
Accuracy (10-fold CV): 0.8683 ± 0.0162
F1 Score (10-fold CV): 0.8304 ± 0.0223
ROC-AUC (10-fold CV): 0.9321 ± 0.0100
Precision (10-fold CV): 0.8213 ± 0.