## Load 3 Datasets

In [3]:
import pandas as pd

In [4]:
# Load datasets
TB_HC_OD = pd.read_csv(r'..\..\Datasets\processed\TB_HC_OD.csv')
PTB_EPTB = pd.read_csv(r'..\..\Datasets\processed\PTB_EPTB.csv')
ATB_LTB = pd.read_csv(r'..\..\Datasets\processed\ATB_LTB.csv')

## Separate features and target

In [5]:
# Separate features and target in each dataset
X_TB_HC_OD = TB_HC_OD.drop(columns=['TB_Status'])
y_TB_HC_OD = TB_HC_OD['TB_Status']

X_PTB_EPTB = PTB_EPTB.drop(columns=['TB_Status'])
y_PTB_EPTB = PTB_EPTB['TB_Status']

X_ATB_LTB = ATB_LTB.drop(columns=['TB_Status'])
y_ATB_LTB = ATB_LTB['TB_Status']

## Select Best Feature Selection Algorithm

In [6]:
# Import necessary libraries
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split

# Encode target labels if they are categorical
def encode_target(y):
    le = LabelEncoder()
    return le.fit_transform(y)

# Define a function to compute top features based on correlation
def select_top_k_correlation(X, y, k):
    """
    Select top k features based on the absolute value of the correlation coefficient with the target.
    """
    correlations = X.apply(lambda col: np.corrcoef(col, y)[0, 1])
    top_features = correlations.abs().sort_values(ascending=False).index[:k]
    return X[top_features]

# Define the function to evaluate feature selection methods
def evaluate_feature_selection(X, y, feature_counts, cv_splits=5):
    # Encode target labels
    y_encoded = encode_target(y)

    # Initialize list to store results
    results = []

    # Define feature selection methods
    selection_algorithms = {
        'SelectKBest_f_classif': SelectKBest(score_func=f_classif),
        'SelectKBest_mutual_info': SelectKBest(score_func=mutual_info_classif),
        'Lasso': LassoCV(cv=cv_splits, random_state=0),
        'GradientBoosting': GradientBoostingClassifier(n_estimators=100, 
                                                        learning_rate=0.1, 
                                                        random_state=0)
    }

    # Set up K-Fold cross-validation
    kf = KFold(n_splits=cv_splits, shuffle=True, random_state=0)

    # Loop through feature selection algorithms
    for algorithm_name, selector in selection_algorithms.items():
        for k in feature_counts:
            try:
                # Feature selection
                if algorithm_name == 'Lasso':
                    selector.fit(X, y_encoded)
                    top_k_features = X.columns[np.argsort(np.abs(selector.coef_))[-k:]]
                    X_selected = X[top_k_features]

                elif algorithm_name.startswith('SelectKBest'):
                    selector.k = k
                    X_selected = selector.fit_transform(X, y_encoded)

                elif algorithm_name == 'GradientBoosting':
                    # Train a gradient boosting model and get feature importances
                    X_train, X_valid, y_train, y_valid = train_test_split(
                        X, y_encoded, test_size=0.2, random_state=0)
                    
                    selector.fit(X_train, y_train)
                    top_k_features = X.columns[np.argsort(selector.feature_importances_)[-k:]]
                    X_selected = X[top_k_features]

                # Evaluate with cross-validation
                score = cross_val_score(RandomForestClassifier(random_state=0),
                                        X_selected, y_encoded, cv=kf, n_jobs=-1).mean()

                # Store results
                results.append({
                    'Feature Selection Algorithm': algorithm_name,
                    'Feature Count': k,
                    'Cross Validation Score': score
                })

            except ValueError as e:
                print(f"Skipping {algorithm_name} with k={k} due to error: {e}")
                continue

    # Convert results to DataFrame
    result_df = pd.DataFrame(results)

    # Identify the best algorithm and feature count
    best_row = result_df.loc[result_df['Cross Validation Score'].idxmax()]
    print("\nBest Algorithm and Feature Count:")
    print(f"Algorithm: {best_row['Feature Selection Algorithm']}")
    print(f"Feature Count: {best_row['Feature Count']}")
    print(f"Cross Validation Score: {best_row['Cross Validation Score']:.4f}")

    return result_df

# Define feature counts to evaluate
feature_counts = [10, 20, 30]


In [7]:
# Select Best Feature Selection Algorithm for 'TB_HC_OD' dataset
result_df_TB_HC_OD = evaluate_feature_selection(X_TB_HC_OD, y_TB_HC_OD, feature_counts)

result_df_TB_HC_OD

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(



Best Algorithm and Feature Count:
Algorithm: GradientBoosting
Feature Count: 30
Cross Validation Score: 0.7446


Unnamed: 0,Feature Selection Algorithm,Feature Count,Cross Validation Score
0,SelectKBest_f_classif,10,0.619108
1,SelectKBest_f_classif,20,0.651592
2,SelectKBest_f_classif,30,0.67707
3,SelectKBest_mutual_info,10,0.645223
4,SelectKBest_mutual_info,20,0.679618
5,SelectKBest_mutual_info,30,0.697452
6,Lasso,10,0.657325
7,Lasso,20,0.684076
8,Lasso,30,0.689172
9,GradientBoosting,10,0.699363


In [8]:
# Select Best Feature Selection Algorithm for 'PTB_EPTB' dataset
result_df_PTB_EPTB = evaluate_feature_selection(X_PTB_EPTB, y_PTB_EPTB, feature_counts)

result_df_PTB_EPTB


Best Algorithm and Feature Count:
Algorithm: GradientBoosting
Feature Count: 30
Cross Validation Score: 0.7029


Unnamed: 0,Feature Selection Algorithm,Feature Count,Cross Validation Score
0,SelectKBest_f_classif,10,0.649724
1,SelectKBest_f_classif,20,0.649529
2,SelectKBest_f_classif,30,0.639403
3,SelectKBest_mutual_info,10,0.631743
4,SelectKBest_mutual_info,20,0.667316
5,SelectKBest_mutual_info,30,0.664784
6,Lasso,10,0.659786
7,Lasso,20,0.647128
8,Lasso,30,0.654755
9,GradientBoosting,10,0.675073


In [9]:
# Select Best Feature Selection Algorithm for 'ATB_LTB' dataset
result_df_ATB_LTB = evaluate_feature_selection(X_ATB_LTB, y_ATB_LTB, feature_counts)

result_df_ATB_LTB


Best Algorithm and Feature Count:
Algorithm: Lasso
Feature Count: 30
Cross Validation Score: 0.9122


Unnamed: 0,Feature Selection Algorithm,Feature Count,Cross Validation Score
0,SelectKBest_f_classif,10,0.837864
1,SelectKBest_f_classif,20,0.841142
2,SelectKBest_f_classif,30,0.87028
3,SelectKBest_mutual_info,10,0.85394
4,SelectKBest_mutual_info,20,0.860391
5,SelectKBest_mutual_info,30,0.870122
6,Lasso,10,0.905976
7,Lasso,20,0.902485
8,Lasso,30,0.912216
9,GradientBoosting,10,0.870333
