In [8]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
import statsmodels.api as sm

# Load Diabetes dataset
data = load_diabetes()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = (data.target > data.target.mean()).astype(int)  # Convert the regression target to binary for classification

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
# --- 1. Logistic Regression with p-values (Select Top 5) ---
print("1. Logistic Regression Feature Selection (Top 5 Features by p-value)")
X_with_const = sm.add_constant(X_scaled)  # Add intercept term

# Fit the model and calculate p-values without printing the optimization process
log_reg_model = sm.Logit(y, X_with_const).fit(maxiter=500, method='lbfgs', disp=False)
p_values = log_reg_model.pvalues[1:]  # Ignore the constant term

# Select the top 5 features based on p-value ranking
top_5_log_reg = np.argsort(p_values)[:5]  # Get indices of 5 smallest p-values
selected_features_log_reg = X.columns[top_5_log_reg]
print(f"Top 5 features by Logistic Regression (p-value): {selected_features_log_reg.tolist()}\n")

1. Logistic Regression Feature Selection (Top 5 Features by p-value)
Top 5 features by Logistic Regression (p-value): ['bp', 'bmi', 's5', 'sex', 's1']



In [10]:
# --- 2. Random Forest (Select Top 5 Features) ---
print("2. Random Forest Feature Selection (Top 5 Features by Importance)")
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_scaled, y)
importances_rf = rf_model.feature_importances_

# Select the top 5 features based on feature importance
top_5_rf = np.argsort(importances_rf)[-5:]  # Get indices of top 5 features
selected_features_rf = X.columns[top_5_rf]
print(f"Top 5 features by Random Forest: {selected_features_rf.tolist()}\n")

2. Random Forest Feature Selection (Top 5 Features by Importance)
Top 5 features by Random Forest: ['s1', 's3', 'bp', 'bmi', 's5']



In [11]:
# --- 3. Gaussian Process with ARD (Increased Upper Bound for Length Scale) ---
print("3. Gaussian Process Feature Selection (Top 5 Features by ARD)")

# Increase the upper bound of the length scale to 1e3
kernel = C(1.0, (1e-2, 1e2)) * RBF(length_scale=np.ones(X_scaled.shape[1]), length_scale_bounds=(1e-2, 1e9))

# Train Gaussian Process Classifier
gp_model = GaussianProcessClassifier(kernel=kernel, random_state=42)
gp_model.fit(X_scaled, y)

# Get length scales from the RBF kernel
fitted_kernel = gp_model.kernel_
rbf_kernel = fitted_kernel.k2  # Access the RBF kernel
length_scales_gp = rbf_kernel.length_scale

# Select the top 5 features based on smallest length scales
top_5_gp = np.argsort(length_scales_gp)[:5]
selected_features_gp = X.columns[top_5_gp]
print(f"Top 5 features by Gaussian Process (ARD): {selected_features_gp.tolist()}\n")

3. Gaussian Process Feature Selection (Top 5 Features by ARD)
Top 5 features by Gaussian Process (ARD): ['s5', 'sex', 'bp', 'bmi', 's3']



In [12]:
# --- 4. Lasso Regression (Select Top 5 Features) ---
print("4. Lasso Regression Feature Selection (Top 5 Features by Coefficient Magnitude)")
lasso_model = Lasso(alpha=0.01, random_state=42)
lasso_model.fit(X_scaled, y)

# Select the top 5 features based on coefficient magnitudes (absolute values)
coef_lasso = np.abs(lasso_model.coef_)
top_5_lasso = np.argsort(coef_lasso)[-5:]  # Get indices of top 5 features
selected_features_lasso = X.columns[top_5_lasso]
print(f"Top 5 features by Lasso Regression: {selected_features_lasso.tolist()}\n")

4. Lasso Regression Feature Selection (Top 5 Features by Coefficient Magnitude)
Top 5 features by Lasso Regression: ['sex', 's3', 'bp', 'bmi', 's5']

