### Import Libraries

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,cross_validate
from sklearn.metrics import classification_report,make_scorer,confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from lightgbm import LGBMClassifier

import seaborn as sns
import matplotlib.pyplot as plt


import os

outdir = r'./eval/'
if not os.path.exists(outdir):
    os.mkdir(outdir)
    
outdir = r'./eval/sfs'
if not os.path.exists(outdir):
    os.mkdir(outdir)

In [2]:
random_state = 23873

dt = pd.read_excel(r'./data/CA_BC.xlsx').T
dt.columns = dt.iloc[0]
dt = dt.iloc[1:]
dt['Label'] = dt.Label.map({'E':1,'A':0})

dt.drop_duplicates(keep='first',inplace=True)


unknown_counter = 1
new_columns = []

for column_name in dt.columns:
    if not column_name or pd.isnull(column_name):
        # Assign "unknown" followed by a number as the column name
        new_columns.append(f"unknown_{unknown_counter}")
        unknown_counter += 1
    else:
        new_columns.append(column_name)

# Update the column names in the DataFrame
dt.columns = new_columns

for x in dt.select_dtypes(include=['object']).columns:
    dt[x] = dt[x].astype('float64')
dt.columns = [str(x) for x in dt.columns]
dt = dt.select_dtypes(include=['float64','int64'])


## Handle Duplicate Columns

# Columns to not rename
excluded = dt.columns[~dt.columns.duplicated(keep=False)]

# An incrementer
import itertools
inc = itertools.count().__next__

# A renamer
def ren(name):
    return f"{name}{inc()}" if name not in excluded else name

# Use inside rename()
for x in range(0,2):
    dt.rename(columns=ren,inplace=True)

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

# Assuming your data is in a pandas DataFrame called 'dt'
X = dt.drop('Label', axis=1)
y = dt['Label']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)

# Initialize classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(solver='liblinear', class_weight='balanced', random_state=42),
    'Support Vector Machine': SVC(kernel='linear', class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42)
}

# Initialize the desired number of features to select
num_features = 10  # Adjust this number based on your specific requirements

# Initialize an empty dictionary to store the selected features for each classifier
selected_features = {}

# Initialize an empty list to store the results
results = []

for clf_name, clf in classifiers.items():
    print(f"Performing Sequential Feature Selection for {clf_name}:")

    # Create a custom scoring function to track the progress
    def progress_score(estimator, X, y):
        print(f"Feature {X.shape[1]}/{num_features} selected")
        return estimator.score(X, y)

    # Create a Sequential Feature Selector with the desired number of features
    sfs = SequentialFeatureSelector(clf, n_features_to_select=num_features, direction='forward', cv=5, n_jobs=-1, scoring=progress_score)

    # Fit the Sequential Feature Selector
    sfs.fit(X_train, y_train)

    # Save selected features
    selected_features[clf_name] = sfs.get_support(indices=True)

    # Create a new DataFrame with the selected features
    X_selected = dt.iloc[:, selected_features[clf_name]]

    print(f"Selected features for {clf_name}:")
    print(X_selected.columns)
    print()

    # Evaluate classifier performance using cross-validation
    scores = cross_val_score(clf, X_selected, y, cv=5, scoring='balanced_accuracy')
    print(f"Mean balanced accuracy for {clf_name}: {np.mean(scores):.3f}")
    print("-" * 40)
    results.append({'Model Name': clf_name, 'Accuracy': np.mean(scores), 'Features': X_selected.columns})

# Create the DataFrame from the results list
df_results = pd.DataFrame(results)

# Print the final DataFrame
print(df_results)

df_results.to_csv(os.path.join(outdir, 'model_eval.csv'))

Performing Sequential Feature Selection for Logistic Regression:
Selected features for Logistic Regression:
Index(['TIMM23', 'RNU12-2P', 'HSPB1P1', 'AKAP14', 'LOC286073', 'LOC154309',
       'PDRC1', 'DAOA', 'GUSBP1', 'LRFN4'],
      dtype='object')

Mean balanced accuracy for Logistic Regression: 0.614
----------------------------------------
Performing Sequential Feature Selection for Support Vector Machine:
Selected features for Support Vector Machine:
Index(['ACOX3', 'ALS2CL', 'ANKRD20B', 'LOC100506900', 'TEX40', 'C17ORF97',
       'TIMMDC1', 'KRTAP10-11', 'UQCRBP1', 'OR2B9'],
      dtype='object')

Mean balanced accuracy for Support Vector Machine: 0.686
----------------------------------------
Performing Sequential Feature Selection for Random Forest:
