## Load 3 Datasets

In [2]:
import pandas as pd

# Load the three processed datasets used by the feature-selection cells below.
# Paths are relative to the notebook's working directory and use forward
# slashes so they resolve on Windows, Linux and macOS alike (a backslash is a
# path separator on Windows only).
DATA_DIR = '../../Datasets/processed'

TB_HC_OD = pd.read_csv(f'{DATA_DIR}/TB_HC_OD.csv')
PTB_EPTB = pd.read_csv(f'{DATA_DIR}/PTB_EPTB.csv')
ATB_LTB = pd.read_csv(f'{DATA_DIR}/ATB_LTB.csv')

## Separate features and target

In [3]:
# Split each dataset into a feature matrix (every column except the label)
# and the label column. `drop` returns a new frame, so the loaded DataFrames
# themselves are left untouched.
TARGET_COLUMN = 'TB_Status'

X_TB_HC_OD, y_TB_HC_OD = TB_HC_OD.drop(columns=TARGET_COLUMN), TB_HC_OD[TARGET_COLUMN]

X_PTB_EPTB, y_PTB_EPTB = PTB_EPTB.drop(columns=TARGET_COLUMN), PTB_EPTB[TARGET_COLUMN]

X_ATB_LTB, y_ATB_LTB = ATB_LTB.drop(columns=TARGET_COLUMN), ATB_LTB[TARGET_COLUMN]

## Feature Selection for 'TB_HC_OD'

In [4]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder

# --- Feature Selection for TB_HC_OD using Gradient Boosting ---
# Rank every feature by gradient-boosting importance and keep the
# N_TOP_FEATURES_TB_HC_OD best-ranked features that are not on the exclusion
# list. The result is written to CSV and displayed.
N_TOP_FEATURES_TB_HC_OD = 30
# Features that must never be selected (the reason is not recorded in this notebook).
EXCLUDED_FEATURES_TB_HC_OD = ['ILMN_1848843', 'ILMN_1860051']

# Encode the target variable (y_TB_HC_OD) to numeric values
label_encoder_TB_HC_OD = LabelEncoder()
y_TB_HC_OD_encoded = label_encoder_TB_HC_OD.fit_transform(y_TB_HC_OD)

# Initialize Gradient Boosting Classifier (random_state fixed so reruns give the same ranking)
gb_model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=42)

# Fit the model to the data
gb_model.fit(X_TB_HC_OD, y_TB_HC_OD_encoded)

# Use feature importances from the model for selection
feature_importances = gb_model.feature_importances_

# Column positions ordered from most to least important
sorted_indices = feature_importances.argsort()[::-1]

# Raw top-N ranking *before* exclusions; kept so any cell that references
# these names still finds them.
selected_features_indices = sorted_indices[:N_TOP_FEATURES_TB_HC_OD]
final_selected_features = X_TB_HC_OD.columns[selected_features_indices]

# Remove the excluded features from the FULL ranking first and truncate
# afterwards. Truncating first (as was done previously) meant that an excluded
# feature inside the top N left a hole that could not be refilled, so fewer
# than N features were saved.
ranked_allowed_features = [
    feature
    for feature in X_TB_HC_OD.columns[sorted_indices]
    if feature not in EXCLUDED_FEATURES_TB_HC_OD
]
filtered_features = ranked_allowed_features[:N_TOP_FEATURES_TB_HC_OD]

# Exactly N features, unless the dataset has fewer than N allowed columns
final_exact_features = filtered_features[:N_TOP_FEATURES_TB_HC_OD]

# Save selected features to CSV (forward slashes keep the path portable across OSes)
selected_features_df_TB_HC_OD = pd.DataFrame(final_exact_features, columns=['Selected Features'])
save_path = '../../Saved_files/selected_features_TB_HC_OD.csv'
selected_features_df_TB_HC_OD.to_csv(save_path, index=False)

# Show the selected features
selected_features_df_TB_HC_OD


Unnamed: 0,Selected Features
0,ILMN_2114568
1,ILMN_2176063
2,ILMN_2388547
3,ILMN_1811489
4,ILMN_2082209
5,ILMN_1667408
6,ILMN_1806040
7,ILMN_1657871
8,ILMN_2167426
9,ILMN_1803476


## Feature Selection for 'PTB_EPTB'

In [8]:
# --- Feature Selection for 'PTB_EPTB' using Gradient Boosting ---
# Rank every feature by gradient-boosting importance, keep the
# N_TOP_FEATURES_PTB_EPTB most important ones, write them to CSV and display them.
# Relies on LabelEncoder and GradientBoostingClassifier imported in the
# TB_HC_OD cell above, so that cell must have been run first.
N_TOP_FEATURES_PTB_EPTB = 20

# Encode the target variable (y_PTB_EPTB) to numeric values
label_encoder_PTB_EPTB = LabelEncoder()
y_PTB_EPTB_encoded = label_encoder_PTB_EPTB.fit_transform(y_PTB_EPTB)

# Initialize Gradient Boosting Classifier (random_state fixed so reruns give the same ranking)
# NOTE: this rebinds `gb_model`, replacing the model fitted on TB_HC_OD.
gb_model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=42)

# Fit the model to the data
gb_model.fit(X_PTB_EPTB, y_PTB_EPTB_encoded)

# Use feature importances from the model for selection
feature_importances = gb_model.feature_importances_

# Column positions ordered from most to least important, then the top N of them
sorted_indices = feature_importances.argsort()[::-1]
selected_features_indices = sorted_indices[:N_TOP_FEATURES_PTB_EPTB]
selected_features_PTB_EPTB = X_PTB_EPTB.columns[selected_features_indices]

# Save selected features to CSV (forward slashes keep the path portable across OSes)
selected_features_df_PTB_EPTB = pd.DataFrame(selected_features_PTB_EPTB, columns=['Selected Features'])
save_path = '../../Saved_files/selected_features_PTB_EPTB.csv'
selected_features_df_PTB_EPTB.to_csv(save_path, index=False)

# Show the selected features
selected_features_df_PTB_EPTB


Unnamed: 0,Selected Features
0,ILMN_2079655
1,ILMN_1658399
2,ILMN_1710734
3,ILMN_2402272
4,ILMN_1728676
5,ILMN_1671992
6,ILMN_2225144
7,ILMN_1685441
8,ILMN_1652660
9,ILMN_2194561


## Feature Selection for 'ATB_LTB'

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

# --- Feature Selection for ATB_LTB using Lasso ---
# Fit a cross-validated Lasso on standardized features, keep the features
# with non-zero coefficients, drop the excluded one, and save the
# N_TOP_FEATURES_ATB_LTB features with the largest absolute coefficients.
# Relies on LabelEncoder imported in the TB_HC_OD cell above.
N_TOP_FEATURES_ATB_LTB = 10
# Feature that must never be selected (the reason is not recorded in this notebook).
EXCLUDED_FEATURE_ATB_LTB = 'ILMN_1821270'

# --- Encode categorical target variable y_ATB_LTB using LabelEncoder ---
label_encoder = LabelEncoder()
y_ATB_LTB_encoded = label_encoder.fit_transform(y_ATB_LTB)

# --- Scale the input features so coefficient magnitudes are comparable ---
scaler = StandardScaler()
X_ATB_LTB_scaled = scaler.fit_transform(X_ATB_LTB)

# Fit the Lasso model with 5-fold cross-validation to determine the optimal alpha.
# NOTE(review): LassoCV is a regression estimator fitted here on label-encoded
# classes; confirm this is intended rather than an L1-penalised classifier.
lasso = LassoCV(cv=5, random_state=0)
lasso.fit(X_ATB_LTB_scaled, y_ATB_LTB_encoded)

# Get the absolute values of coefficients
coefficients = abs(lasso.coef_)

# Positions of the features Lasso kept (non-zero coefficients)
selected_indices = [
    index for index, coefficient in enumerate(lasso.coef_) if coefficient != 0
]

# (feature name, |coefficient|) pairs for the kept features
selected_features_and_coefficients = [
    (X_ATB_LTB.columns[i], coefficients[i]) for i in selected_indices
]

# Drop the excluded feature, then sort by coefficient magnitude in descending order
filtered_features = [
    feature
    for feature in selected_features_and_coefficients
    if feature[0] != EXCLUDED_FEATURE_ATB_LTB
]
filtered_features_sorted = sorted(filtered_features, key=lambda x: x[1], reverse=True)

# Keep the top N names; fewer are returned if Lasso kept fewer than N features
top_10_features = [feature[0] for feature in filtered_features_sorted[:N_TOP_FEATURES_ATB_LTB]]

# Save selected features to CSV (forward slashes keep the path portable across OSes)
output_path = '../../Saved_files/selected_features_ATB_LTB.csv'
selected_features_df_ATB_LTB = pd.DataFrame(top_10_features, columns=['Selected Features'])
selected_features_df_ATB_LTB.to_csv(output_path, index=False)

# Show the selected features
selected_features_df_ATB_LTB


Unnamed: 0,Selected Features
0,ILMN_2114568
1,ILMN_1680388
2,ILMN_2315569
3,ILMN_2147105
4,ILMN_1769550
5,ILMN_1780368
6,ILMN_1712719
7,ILMN_1792799
8,ILMN_2094261
9,ILMN_1776125
