### Import Libraries

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,cross_validate
from sklearn.metrics import classification_report,make_scorer,confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from lightgbm import LGBMClassifier

import seaborn as sns
import matplotlib.pyplot as plt

import os

outdir = r'./eval/'
if not os.path.exists(outdir):
    os.mkdir(outdir)
    
outdir = r'./eval/rfe'
if not os.path.exists(outdir):
    os.mkdir(outdir)

In [4]:
random_state = 23873

dt = pd.read_excel(r'./data/CA_BC.xlsx').T
dt.columns = dt.iloc[0]
dt = dt.iloc[1:]
dt['Label'] = dt.Label.map({'E':1,'A':0})

dt.drop_duplicates(keep='first',inplace=True)


unknown_counter = 1
new_columns = []

for column_name in dt.columns:
    if not column_name or pd.isnull(column_name):
        # Assign "unknown" followed by a number as the column name
        new_columns.append(f"unknown_{unknown_counter}")
        unknown_counter += 1
    else:
        new_columns.append(column_name)

# Update the column names in the DataFrame
dt.columns = new_columns

for x in dt.select_dtypes(include=['object']).columns:
    dt[x] = dt[x].astype('float64')
dt.columns = [str(x) for x in dt.columns]
dt = dt.select_dtypes(include=['float64','int64'])


## Handle Duplicate Columns

# Columns to not rename
excluded = dt.columns[~dt.columns.duplicated(keep=False)]

# An incrementer
import itertools
inc = itertools.count().__next__

# A renamer
def ren(name):
    return f"{name}{inc()}" if name not in excluded else name

# Use inside rename()
for x in range(0,2):
    dt.rename(columns=ren,inplace=True)

In [6]:
dt.to_csv(r'data/refined_data.csv',index=False)

In [7]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

# Assuming your data is in a pandas DataFrame called 'dt'
X = dt.drop('Label', axis=1)
y = dt['Label']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)

# Initialize classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(solver='liblinear', class_weight='balanced', random_state=42),
    'Support Vector Machine': SVC(kernel='linear', class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42)
}

# Initialize RFE with the desired number of features to select
num_features = 10  # Adjust this number based on your specific requirements

# Initialize an empty dictionary to store the selected features for each classifier
selected_features = {}

# Initialize an empty list to store the results
results = []

def stability_selection(X, y, estimator, n_trials=5, threshold=0.6):
    n_samples, n_features = X.shape
    feature_scores = np.zeros(n_features)
    
    for trial in range(n_trials):
        # Generate bootstrap sample
        bootstrap_idx = np.random.choice(n_samples, size=n_samples, replace=True)
        X_bootstrap = X[bootstrap_idx]
        y_bootstrap = y[bootstrap_idx]

        # Fit estimator
        estimator.fit(X_bootstrap, y_bootstrap)
        
        # Update feature scores
        feature_scores += estimator.get_support().astype(int)

        # Print progress
        print(f"Trial {trial + 1}/{n_trials} completed")

    # Normalize scores and select features above the threshold
    feature_scores = feature_scores / n_trials
    selected = np.where(feature_scores >= threshold)[0]

    return selected

for clf_name, clf in classifiers.items():
    print(f"Performing Stability Selection for {clf_name}:")
    rfe = RFE(clf, n_features_to_select=num_features)

    # Perform Stability Selection
    selected = stability_selection(X_train, y_train, rfe)

    # Save selected features
    selected_features[clf_name] = selected

    # Create a new DataFrame with the selected features
    X_selected = dt.iloc[:, selected]

    print(f"Selected features for {clf_name}:")
    print(X_selected.columns)
    print()

    # Evaluate classifier performance using cross-validation
    scores = cross_val_score(clf, X_selected, y, cv=5, scoring='balanced_accuracy')
    print(f"Mean balanced accuracy for {clf_name}: {np.mean(scores):.3f}")
    print("-" * 40)
    results.append({'Model Name': clf_name, 'Accuracy': np.mean(scores), 'Features': X_selected.columns})

# Create the DataFrame from the results list
df_results = pd.DataFrame(results)

# Print the final DataFrame
print(df_results)

df_results.to_csv(os.path.join(outdir, 'model_eval.csv'))

Performing Stability Selection for Logistic Regression:
Trial 1/5 completed
Trial 2/5 completed
Trial 3/5 completed
Trial 4/5 completed
Trial 5/5 completed
Selected features for Logistic Regression:
Index(['PDRC1', 'UQCRBP1', 'AOC4P'], dtype='object')

Mean balanced accuracy for Logistic Regression: 0.408
----------------------------------------
Performing Stability Selection for Support Vector Machine:
Trial 1/5 completed
Trial 2/5 completed
Trial 3/5 completed
Trial 4/5 completed
Trial 5/5 completed
Selected features for Support Vector Machine:
Index(['PDRC1', 'AOC4P'], dtype='object')

Mean balanced accuracy for Support Vector Machine: 0.486
----------------------------------------
Performing Stability Selection for Random Forest:
Trial 1/5 completed
Trial 2/5 completed
Trial 3/5 completed
Trial 4/5 completed
Trial 5/5 completed
Selected features for Random Forest:
Index(['TTC29', 'DKFZP564G196', 'TUBB6', 'PLEKHD1'], dtype='object')

Mean balanced accuracy for Random Forest: 0.665
-