## Feature selection of PyRadiomics features
 #### As referred to in article 2, the most common algorithms for feature selection are random forest, PCA (principal component analysis) and LASSO (least absolute shrinkage and selection operator). We'll use all three and choose the one that gives the best results across several metrics.

Library imports and dataset handling (to be commented further).

In [4]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the radiomic feature table; adjust if the CSV lives elsewhere.
dataset_path = 'radiomic_features_pylidc_and_fourier.csv'
data = pd.read_csv(dataset_path)

# Quick sanity check: (rows, columns) followed by per-column summary statistics.
print(data.shape)
print(data.describe())

# 'malignancy_mode' is the target. 'malignancy_mean' is dropped from the
# features as well -- presumably because it is derived from the same
# annotations and would leak the label (TODO confirm).
X = data.drop(columns=['malignancy_mode', 'malignancy_mean'])
y = data['malignancy_mode']

# Hold out 20% of the rows for evaluation. The seed is fixed so that the split,
# and therefore every accuracy reported below, is the same on each
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


#### 1.1 Random forest

In [None]:
# Baseline model: a 100-tree random forest fitted on every available feature.
# The *_rf aliases point at the same training split; later cells read them.
X_train_rf, y_train_rf = X_train, y_train
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_rf, y_train_rf
)

# Accuracy of the full-feature model on the held-out split. This is the
# reference value printed again next to the reduced-feature accuracy.
y_pred = rf.predict(X_test)
initial_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(initial_accuracy)


In [None]:
# Per-feature importance scores exposed by the fitted forest.
feature_importances = rf.feature_importances_

# One row per feature, ranked from most to least important.
feature_importance_df = pd.DataFrame(
    {'Feature': X_train_rf.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranking, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importance')
plt.show()


In [None]:
# Number of highest-ranked features to keep. (The previous comment said 5 while
# the code selected 10; the value actually used is kept and named here.)
N_TOP_FEATURES = 10

# NOTE: this relies on feature_importance_df holding the random-forest ranking.
# The PCA cell further down reassigns that name, so run this cell before it
# (or re-run the importance cell first).
top_features = feature_importance_df.head(N_TOP_FEATURES)['Feature'].values

# Restrict both splits to the selected columns.
X_train_selected = X_train_rf[top_features]
X_test_selected = X_test[top_features]

# Retrain with the same hyper-parameters and seed as the baseline forest so the
# only difference between the two models is the feature set.
rf_selected = RandomForestClassifier(n_estimators=100, random_state=42)
rf_selected.fit(X_train_selected, y_train)

# Evaluate the reduced-feature model on the same held-out split.
y_pred_selected = rf_selected.predict(X_test_selected)
selected_accuracy = accuracy_score(y_test, y_pred_selected)

print(f'Accuracy before feature selection: {initial_accuracy:.4f}')
print(f'Accuracy after feature selection: {selected_accuracy:.4f}')


#### 1.2 LASSO

In [None]:
# Convert the target into binary labels (benign/malignant).
# NOTE(review): y_binary is computed here but is NOT used by the fit below,
# which trains on y_train (the un-thresholded labels). Confirm which target is
# intended, and whether 0.5 is a sensible cut-off for this label scale, before
# relying on the selected features.
threshold = 0.5  # Assuming values >= 0.5 are malignant
y_binary = np.where(y >= threshold, 1, 0)  # 1 for malignant, 0 for benign

# Standardize the features (important for LASSO: the L1 penalty acts on raw
# coefficient magnitudes, so features must share a common scale). The scaler is
# fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression with an L1 (LASSO) penalty. random_state is fixed because
# the liblinear solver shuffles the data, which would otherwise let the set of
# surviving coefficients vary between runs.
lasso = LogisticRegression(
    penalty='l1', solver='liblinear', C=1.0, max_iter=1000, random_state=42
)
lasso.fit(X_train_scaled, y_train)

# coef_ is 2-D with one column per feature.
coefficients = lasso.coef_

# Keep a feature if its coefficient is non-zero in at least one row; the result
# is a sorted array of unique column indices.
non_zero_features = np.flatnonzero(np.any(coefficients != 0, axis=0))
print("Selected features:", non_zero_features)

# Map the column indices back to feature names (the scaler preserves the column
# order of X_train).
selected_feature_names = X_train.columns[non_zero_features]
print("Selected feature names:", selected_feature_names)

#### 1.3 PCA

In [None]:
# NOTE(review): this cell reassigns `scaler`, `feature_importance_df` and
# `top_features`, which earlier cells also define. Re-running the random-forest
# or LASSO cells after this one will pick up the PCA versions.

# Step 1: Standardize the data so that every feature contributes on the same
# scale (PCA is driven by variance).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Apply PCA keeping every component. n_components is left at its
# default, which caps it at min(n_samples, n_features); requesting X.shape[1]
# explicitly raises a ValueError when there are fewer rows than columns.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Step 3: Loadings table -- one row per principal component, one column per
# original feature.
pca_df = pd.DataFrame(pca.components_, columns=X.columns)

# Step 4: Score each feature by its absolute loadings, weighting every
# component by the fraction of variance it explains. An unweighted sum would
# count a component that explains almost nothing as heavily as the first one.
feature_importance = (
    np.abs(pca_df)
    .mul(pca.explained_variance_ratio_, axis=0)
    .sum(axis=0)
)

# Step 5: Create a DataFrame to see feature importance
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importance
})

# Sort features by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Step 6: Print the feature importance
print("Feature Importance:")
print(feature_importance_df)

# Optional: Select top N features
N = 10  # Change this to the number of top features you want
top_features = feature_importance_df.head(N)
print(f"\nTop {N} features based on PCA:")
print(top_features)