# Imports

In [1]:
import pandas as pd
import numpy as np
import os
import ast

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

import shap
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import RFE

import ollama

from tqdm import tqdm

# Parameters

In [2]:
# List of ml models
ml_models = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('SVM', SVC(random_state=42)),
    ('KNN', KNeighborsClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42))
]

In [3]:
# List of LLMs
model_names = ['llama3.2:1b','llama3.2:3b', 'llama3.1','gemma3:1b','gemma3:4b', 'dolphin3', 'mistral','deepseek-llm']

In [4]:
# Number of features to select list
n_features_list = [5, 10, 15]

In [5]:
# CSV file names
base_data_prep_name = "tabular_data_preprocessed_2025_04_04.csv"
llm_data_prep_name = "tabular_data_llm_preprocessed_2025_04_03.csv"

In [6]:
# Force CUDA usage
os.environ["OLLAMA_BACKEND"] = "cuda"
os.environ["OLLAMA_NUM_THREADS"] = "16"

# Load Data

In [7]:
df = pd.read_csv(base_data_prep_name)
llm_df = pd.read_csv(llm_data_prep_name).drop("nlg", axis=1)

In [8]:
df.head()

Unnamed: 0,age,workclass,education_num,occupation,capital_gain,capital_loss,hours_per_week,native_country,income,relationship_not_in_family,...,marital_status_married_civ_spouse,marital_status_married_spouse_absent,marital_status_never_married,marital_status_separated,marital_status_widowed,race_asian_pac_islander,race_black,race_other,race_white,sex_male
0,0.025996,2.137359,1.136512,-1.31846,0.146932,-0.217127,-0.034087,0.289462,0,1.697524,...,-0.919604,-0.114128,1.424944,-0.179829,-0.1791,-0.179161,-0.325728,-0.091554,0.411743,0.70422
1,0.828308,1.454401,1.136512,-0.609318,-0.144804,-0.217127,-2.213032,0.289462,0,-0.589093,...,1.087425,-0.114128,-0.701782,-0.179829,-0.1791,-0.179161,-0.325728,-0.091554,0.411743,0.70422
2,-0.046942,0.088484,-0.419335,-0.136557,-0.144804,-0.217127,-0.034087,0.289462,0,1.697524,...,-0.919604,-0.114128,-0.701782,-0.179829,-0.1791,-0.179161,-0.325728,-0.091554,0.411743,0.70422
3,1.047121,0.088484,-1.197259,-0.136557,-0.144804,-0.217127,-0.034087,0.289462,0,-0.589093,...,1.087425,-0.114128,-0.701782,-0.179829,-0.1791,-0.179161,3.070047,-0.091554,-2.428701,0.70422
4,-0.776316,0.088484,1.136512,0.808965,-0.144804,-0.217127,-0.034087,-4.08338,0,-0.589093,...,1.087425,-0.114128,-0.701782,-0.179829,-0.1791,-0.179161,3.070047,-0.091554,-2.428701,-1.42001


In [9]:
llm_df.head()

Unnamed: 0,age,workclass,education_num,occupation,capital_gain,capital_loss,hours_per_week,native_country,income,career_stage_classification_llama3.2_1b,...,marital_status_married_civ_spouse,marital_status_married_spouse_absent,marital_status_never_married,marital_status_separated,marital_status_widowed,race_asian_pac_islander,race_black,race_other,race_white,sex_male
0,-1.51325,0.001008,-0.425643,0.369591,-0.14031,-0.208399,-1.625939,0.297816,0,1.059612,...,-0.900885,-0.106428,1.39264,-0.168474,-0.184733,-0.183572,-0.326251,-0.087511,0.412478,0.707213
1,-1.587107,0.001008,-0.815427,1.320918,-0.14031,-0.208399,-1.625939,0.297816,0,0.661561,...,-0.900885,-0.106428,1.39264,-0.168474,-0.184733,-0.183572,-0.326251,-0.087511,0.412478,-1.414001
2,-0.996253,-0.839358,-0.425643,-0.106073,-0.14031,-0.208399,-0.038309,0.297816,0,1.059612,...,-0.900885,-0.106428,1.39264,-0.168474,-0.184733,-0.183572,3.065128,-0.087511,-2.424372,0.707213
3,-1.365537,0.001008,-0.03586,1.320918,-0.14031,-0.208399,-0.832124,0.297816,0,0.661561,...,-0.900885,-0.106428,1.39264,-0.168474,-0.184733,-0.183572,-0.326251,-0.087511,0.412478,-1.414001
4,0.628595,0.001008,-0.425643,0.131759,-0.14031,-0.208399,1.231794,-0.479163,0,-2.124796,...,-0.900885,-0.106428,1.39264,-0.168474,-0.184733,-0.183572,-0.326251,-0.087511,-2.424372,0.707213


# Experimential setup

In [10]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('income', axis=1), df['income'], test_size=0.2, random_state=42)

# Print the shapes of the sets
print(f"Training Set: X_train shape = {X_train.shape}, y_train shape = {y_train.shape}")
print(f"Test Set: X_test shape = {X_test.shape}, y_test shape = {y_test.shape}")

Training Set: X_train shape = (39073, 24), y_train shape = (39073,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)


In [11]:
# Split the data into training and test sets
X_train_llm, X_test_llm, y_train_llm, y_test_llm = train_test_split(llm_df.drop('income', axis=1), llm_df['income'], test_size=0.2, random_state=42)

# Print the shapes of the sets
print(f"Training Set: X_train shape = {X_train_llm.shape}, y_train shape = {y_train_llm.shape}")
print(f"Test Set: X_test shape = {X_test_llm.shape}, y_test shape = {y_test_llm.shape}")

Training Set: X_train shape = (4000, 88), y_train shape = (4000,)
Test Set: X_test shape = (1000, 88), y_test shape = (1000,)


# Modeling and Performance metrics

In [12]:
# Function to train and evaluate models with multiple metrics
def evaluate_models_with_metrics(models, X_train, y_train, X_test, y_test):
    results = []
    
    for name, model in models:
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict on test set
        y_test_pred = model.predict(X_test)

        # Store results for the model
        model_results = {
            'Model': name,
            'Test Accuracy': accuracy_score(y_test, y_test_pred),
            'Test Precision': precision_score(y_test, y_test_pred),
            'Test Recall': recall_score(y_test, y_test_pred),
            'Test F1-Score': f1_score(y_test, y_test_pred)
        }
        
        results.append(model_results)

    # Convert results to a pandas DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df

In [13]:
# Evaluate models with multiple metrics and print results
results = evaluate_models_with_metrics(ml_models, X_train, y_train, X_test, y_test)
display(results)

Unnamed: 0,Model,Test Accuracy,Test Precision,Test Recall,Test F1-Score
0,Logistic Regression,0.841744,0.725097,0.553291,0.627649
1,Random Forest,0.847989,0.719033,0.606369,0.657913
2,SVM,0.845634,0.760615,0.524841,0.621106
3,KNN,0.830075,0.675594,0.567728,0.616982
4,Gradient Boosting,0.866414,0.792969,0.603397,0.685315


In [14]:
# Evaluate models with multiple metrics and print results
results_llm = evaluate_models_with_metrics(ml_models, X_train_llm, y_train_llm, X_test_llm, y_test_llm)
display(results_llm)

Unnamed: 0,Model,Test Accuracy,Test Precision,Test Recall,Test F1-Score
0,Logistic Regression,0.838,0.732323,0.570866,0.641593
1,Random Forest,0.837,0.763006,0.519685,0.618267
2,SVM,0.843,0.759358,0.559055,0.643991
3,KNN,0.809,0.622568,0.629921,0.626223
4,Gradient Boosting,0.847,0.751244,0.594488,0.663736


# Feature selection experiment

Simple experiment where we compare SOTA feature selection vs LLM selected features based on prompt
1. Shapley values
2. RFE
3. SFS

## Shapley values

In [15]:
def shapley(X_train, y_train, n_features):
    # Train Gradient Boosting model
    gbr = GradientBoostingClassifier(random_state=42)
    gbr.fit(X_train, y_train)
    
    # Compute SHAP values using TreeExplainer
    explainer = shap.TreeExplainer(gbr)
    shap_values = explainer.shap_values(X_train)

    # Convert SHAP values to mean absolute importance per feature
    shap_importance = np.abs(shap_values).mean(axis=0)

    # Store feature names with importance
    shap_feature_importance = pd.DataFrame({
        "Feature": X_train.columns,
        "SHAP Importance": shap_importance
    })

    # Sort by importance and select top features
    shap_feature_importance = shap_feature_importance.sort_values(by="SHAP Importance", ascending=False)
    selected_features = shap_feature_importance["Feature"].head(n_features).values

    return selected_features

## RFE

In [16]:
def rfe(X_train, y_train, n_features):
    # Train Gradient Boosting model
    gbr = GradientBoostingClassifier(random_state=42)
    
    # Apply RFE
    rfe = RFE(estimator=gbr, n_features_to_select=n_features)
    rfe.fit(X_train, y_train)

    # Get selected features
    selected_features = X_train.columns[rfe.support_].values
    
    return selected_features

## SFS

In [17]:
def sfs(X_train, y_train, n_features):
    # Train Gradient Boosting model
    gbr = GradientBoostingClassifier(random_state=42)
    
    # Apply SFS (Forward Selection)
    sfs = SequentialFeatureSelector(gbr, n_features_to_select=n_features, direction="forward", cv=5)
    sfs.fit(X_train, y_train)

    # Get selected features
    selected_features = X_train.columns[sfs.get_support()].values
    
    return selected_features

# Comparison feature selection
We use the best model from our prior experiment (Gradient Boosting)

In [18]:
# Model
models = [('Gradient Boosting', GradientBoostingClassifier(random_state=42))]

# Base results
results_no_fs = results.copy()
display(results_no_fs)

Unnamed: 0,Model,Test Accuracy,Test Precision,Test Recall,Test F1-Score
0,Logistic Regression,0.841744,0.725097,0.553291,0.627649
1,Random Forest,0.847989,0.719033,0.606369,0.657913
2,SVM,0.845634,0.760615,0.524841,0.621106
3,KNN,0.830075,0.675594,0.567728,0.616982
4,Gradient Boosting,0.866414,0.792969,0.603397,0.685315


In [19]:
def evaluate_with_feature_selection(X_train, y_train, X_test, y_test, ml_models, n_features, feature_selector, display_results = True):
    # Feature selection
    if feature_selector == 'shapley':
        features_selected = shapley(X_train, y_train, n_features)
    elif feature_selector == 'rfe':
        features_selected = rfe(X_train, y_train, n_features)
    elif feature_selector == 'sfs':
        features_selected = sfs(X_train, y_train, n_features)
    else:
        raise ValueError("Invalid feature selection algorithm. Choose 'shapley', 'rfe', or 'sfs'.")

    # Evaluate models with selected features
    results_fs = evaluate_models_with_metrics(models, X_train[features_selected], y_train, X_test[features_selected], y_test).set_index("Model")

    # Difference feature selection vs no feature selection, abosolute and % difference
    difference = results_fs - results_no_fs[results_no_fs['Model'] == 'Gradient Boosting'].reset_index(drop=True).set_index("Model")
    percentage_diff = (difference / results_no_fs[results_no_fs['Model'] == 'Gradient Boosting'].reset_index(drop=True).set_index("Model")) * 100
    
    # Display info  
    if(display_results):
        print(f"Selected features using {feature_selector}: " + ', '.join(features_selected))
        display(results_fs)
        print()  
        print("Absolute difference table: feature selection vs no feature selection")
        display(difference)
        print()
        print("Percentage difference table: feature selection vs no feature selection")
        display(percentage_diff)
        print()
        
    return difference, features_selected

In [20]:
def select_best_n_features(X_train, y_train, X_test, y_test, models, n_features_list):
    best_n_features_shap = None
    best_n_features_rfe = None
    best_n_features_sfs = None
    best_features_shap = None
    best_features_rfe = None
    best_features_sfs = None
    best_f1_shap = -float('inf')
    best_f1_rfe = -float('inf')
    best_f1_sfs = -float('inf')

    for n_features in n_features_list:
        # Evaluate with Shapley
        results_shap, features_shap = evaluate_with_feature_selection(X_train, y_train, X_test, y_test, models, n_features, 'shapley')
        test_f1_shap = results_shap['Test F1-Score'].max() # Only one value so max is fine

        # Evaluate with RFE
        results_rfe, features_rfe = evaluate_with_feature_selection(X_train, y_train, X_test, y_test, models, n_features, 'rfe')
        test_f1_rfe = results_rfe['Test F1-Score'].max() # Only one value so max is fine

        # Evaluate with SFS
        results_sfs, features_sfs = evaluate_with_feature_selection(X_train, y_train, X_test, y_test, models, n_features, 'sfs')
        test_f1_sfs = results_sfs['Test F1-Score'].max() # Only one value so max is fine

        # Update best feature selection method based on highest F1 score
        if test_f1_shap > best_f1_shap:
            best_f1_shap = test_f1_shap
            best_n_features_shap = n_features
            best_features_shap = features_shap

        if test_f1_rfe > best_f1_rfe:
            best_f1_rfe = test_f1_rfe
            best_n_features_rfe = n_features
            best_features_rfe = features_rfe

        if test_f1_sfs > best_f1_sfs:
            best_f1_sfs = test_f1_sfs
            best_n_features_sfs = n_features
            best_features_sfs = features_sfs
   
    # Print all results with features
    print("\n=== Shapley Feature Selection Results ===")
    print(f"Best n_features: {best_n_features_shap}")
    print(f"Best F1-score: {best_f1_shap:.4f}")
    print("Selected features:", best_features_shap)
    
    print("\n=== RFE Feature Selection Results ===")
    print(f"Best n_features: {best_n_features_rfe}")
    print(f"Best F1-score: {best_f1_rfe:.4f}")
    print("Selected features:", best_features_rfe)
    
    print("\n=== SFS Feature Selection Results ===")
    print(f"Best n_features: {best_n_features_sfs}")
    print(f"Best F1-score: {best_f1_sfs:.4f}")
    print("Selected features:", best_features_sfs)

    return (best_n_features_shap, best_features_shap, best_n_features_rfe, best_features_rfe, best_n_features_sfs, best_features_sfs)

In [21]:
# Base data set
best_n_features_shap, best_features_shap, best_n_features_rfe, best_features_rfe, best_n_features_sfs, best_features_sfs = select_best_n_features(X_train, y_train, X_test, y_test, ml_models, n_features_list)

Selected features using shapley: marital_status_married_civ_spouse, age, education_num, capital_gain, hours_per_week


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.854847,0.75393,0.590658,0.662381



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.011567,-0.039039,-0.012739,-0.022934



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-1.335066,-4.923171,-2.111189,-3.346453



Selected features using rfe: age, education_num, capital_gain, capital_loss, marital_status_married_civ_spouse


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.857099,0.789378,0.555414,0.652044



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.009315,-0.00359,-0.047983,-0.033271



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-1.075142,-0.452774,-7.952146,-4.854823



Selected features using sfs: education_num, occupation, capital_gain, capital_loss, marital_status_married_civ_spouse


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.859044,0.797808,0.556263,0.655492



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.00737,0.004839,-0.047134,-0.029823



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.850662,0.610213,-7.8114,-4.351733



Selected features using shapley: marital_status_married_civ_spouse, age, education_num, capital_gain, hours_per_week, occupation, capital_loss, sex_male, workclass, relationship_wife


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.864981,0.791339,0.597452,0.680861



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.001433,-0.00163,-0.005945,-0.004453



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.165406,-0.205578,-0.985222,-0.649822



Selected features using rfe: age, workclass, education_num, occupation, capital_gain, capital_loss, hours_per_week, relationship_wife, marital_status_married_civ_spouse, sex_male


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.864981,0.791339,0.597452,0.680861



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.001433,-0.00163,-0.005945,-0.004453



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.165406,-0.205578,-0.985222,-0.649822



Selected features using sfs: age, workclass, education_num, occupation, capital_gain, capital_loss, relationship_own_child, marital_status_married_civ_spouse, marital_status_married_spouse_absent, marital_status_never_married


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.863548,0.794016,0.585987,0.674322



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.002866,0.001047,-0.01741,-0.010993



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.330813,0.132081,-2.885292,-1.604033



Selected features using shapley: marital_status_married_civ_spouse, age, education_num, capital_gain, hours_per_week, occupation, capital_loss, sex_male, workclass, relationship_wife, relationship_own_child, race_white, native_country, marital_status_married_af_spouse, relationship_unmarried


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.86406,0.787032,0.597877,0.679537



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.002354,-0.005937,-0.00552,-0.005778



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.271739,-0.748691,-0.914849,-0.843117



Selected features using rfe: age, workclass, education_num, occupation, capital_gain, capital_loss, hours_per_week, native_country, relationship_own_child, relationship_wife, marital_status_married_af_spouse, marital_status_married_civ_spouse, marital_status_separated, race_white, sex_male


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.864879,0.788301,0.600849,0.681928



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.001535,-0.004668,-0.002548,-0.003387



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.177221,-0.588663,-0.422238,-0.494222



Selected features using sfs: age, workclass, education_num, occupation, capital_gain, capital_loss, hours_per_week, relationship_other_relative, relationship_own_child, relationship_wife, marital_status_married_af_spouse, marital_status_married_civ_spouse, marital_status_married_spouse_absent, marital_status_never_married, marital_status_widowed


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.8658,0.792273,0.600849,0.68341



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.000614,-0.000696,-0.002548,-0.001905



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.070888,-0.08771,-0.422238,-0.277957




=== Shapley Feature Selection Results ===
Best n_features: 10
Best F1-score: -0.0045
Selected features: ['marital_status_married_civ_spouse' 'age' 'education_num' 'capital_gain'
 'hours_per_week' 'occupation' 'capital_loss' 'sex_male' 'workclass'
 'relationship_wife']

=== RFE Feature Selection Results ===
Best n_features: 15
Best F1-score: -0.0034
Selected features: ['age' 'workclass' 'education_num' 'occupation' 'capital_gain'
 'capital_loss' 'hours_per_week' 'native_country' 'relationship_own_child'
 'relationship_wife' 'marital_status_married_af_spouse'
 'marital_status_married_civ_spouse' 'marital_status_separated'
 'race_white' 'sex_male']

=== SFS Feature Selection Results ===
Best n_features: 15
Best F1-score: -0.0019
Selected features: ['age' 'workclass' 'education_num' 'occupation' 'capital_gain'
 'capital_loss' 'hours_per_week' 'relationship_other_relative'
 'relationship_own_child' 'relationship_wife'
 'marital_status_married_af_spouse' 'marital_status_married_civ_spouse'

In [22]:
# LLM data set
best_n_features_shap_llm, best_features_shap_llm, best_n_features_rfe_llm, best_features_rfe_llm, best_n_features_sfs_llm, best_features_sfs_llm = select_best_n_features(X_train_llm, y_train_llm, X_test_llm, y_test_llm, ml_models, n_features_list)

Selected features using shapley: marital_status_married_civ_spouse, capital_gain, career_stage_classification_gemma3_4b, age, education_num


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.842,0.735294,0.590551,0.655022



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.024414,-0.057675,-0.012846,-0.030293



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-2.81784,-7.273254,-2.128921,-4.420283



Selected features using rfe: education_num, capital_gain, career_stage_classification_gemma3_4b, years_of_experience_gemma3_4b, marital_status_married_civ_spouse


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.838,0.744681,0.551181,0.633484



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.028414,-0.048288,-0.052216,-0.051831



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-3.279513,-6.089508,-8.65366,-7.563025



Selected features using sfs: education_num, capital_gain, capital_loss, career_stage_classification_gemma3_4b, marital_status_married_civ_spouse


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.85,0.768041,0.586614,0.665179



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.016414,-0.024928,-0.016783,-0.020136



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-1.894494,-3.143568,-2.781395,-2.938229



Selected features using shapley: marital_status_married_civ_spouse, capital_gain, career_stage_classification_gemma3_4b, age, education_num, hours_per_week, years_of_experience_gemma3_4b, marital_status_never_married, career_stage_classification_mistral, job_vs_education_match_mistral


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.833,0.720812,0.559055,0.629712



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.033414,-0.072157,-0.044342,-0.055603



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-3.856604,-9.099547,-7.348712,-8.113489



Selected features using rfe: age, education_num, capital_gain, capital_loss, hours_per_week, career_stage_classification_gemma3_4b, years_of_experience_gemma3_4b, socio_economic_mobility_gemma3_4b, career_stage_classification_mistral, marital_status_married_civ_spouse


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.846,0.742718,0.602362,0.665217



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.020414,-0.05025,-0.001035,-0.020097



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-2.356167,-6.336984,-0.171499,-2.932564



Selected features using sfs: education_num, capital_gain, capital_loss, socio_economic_mobility_llama3.2_1b, cultural_integration_score_llama3.2_3b, job_vs_education_match_gemma3_1b, career_stage_classification_gemma3_4b, occupational_demand_outlook_mistral, marital_status_married_civ_spouse, marital_status_widowed


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.857,0.793651,0.590551,0.677201



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.009414,0.000682,-0.012846,-0.008114



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-1.086567,0.086011,-2.128921,-1.18395



Selected features using shapley: marital_status_married_civ_spouse, capital_gain, career_stage_classification_gemma3_4b, age, education_num, hours_per_week, years_of_experience_gemma3_4b, marital_status_never_married, career_stage_classification_mistral, job_vs_education_match_mistral, capital_loss, job_vs_education_match_gemma3_1b, education_roi_llama3.1, occupation, socio_economic_mobility_gemma3_4b


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.855,0.763285,0.622047,0.685466



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.011414,-0.029684,0.01865,0.000152



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-1.317403,-3.743366,3.09087,0.022135



Selected features using rfe: age, education_num, occupation, capital_gain, capital_loss, hours_per_week, years_of_experience_llama3.2_3b, career_stage_classification_gemma3_4b, years_of_experience_gemma3_4b, socio_economic_mobility_gemma3_4b, job_vs_education_match_gemma3_4b, career_stage_classification_mistral, job_vs_education_match_mistral, marital_status_married_civ_spouse, marital_status_never_married


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.852,0.759804,0.610236,0.676856



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.014414,-0.033165,0.006839,-0.008459



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-1.663658,-4.182363,1.133448,-1.234293



Selected features using sfs: education_num, capital_gain, capital_loss, socio_economic_mobility_llama3.2_1b, cultural_integration_score_llama3.2_3b, job_vs_education_match_gemma3_1b, career_stage_classification_gemma3_4b, cultural_integration_score_gemma3_4b, occupational_demand_outlook_mistral, relationship_not_in_family, relationship_other_relative, marital_status_married_af_spouse, marital_status_married_civ_spouse, marital_status_widowed, race_other


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,0.855,0.791444,0.582677,0.671202



Absolute difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-0.011414,-0.001525,-0.02072,-0.014113



Percentage difference table: feature selection vs no feature selection


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gradient Boosting,-1.317403,-0.192303,-3.433869,-2.059327




=== Shapley Feature Selection Results ===
Best n_features: 15
Best F1-score: 0.0002
Selected features: ['marital_status_married_civ_spouse' 'capital_gain'
 'career_stage_classification_gemma3_4b' 'age' 'education_num'
 'hours_per_week' 'years_of_experience_gemma3_4b'
 'marital_status_never_married' 'career_stage_classification_mistral'
 'job_vs_education_match_mistral' 'capital_loss'
 'job_vs_education_match_gemma3_1b' 'education_roi_llama3.1' 'occupation'
 'socio_economic_mobility_gemma3_4b']

=== RFE Feature Selection Results ===
Best n_features: 15
Best F1-score: -0.0085
Selected features: ['age' 'education_num' 'occupation' 'capital_gain' 'capital_loss'
 'hours_per_week' 'years_of_experience_llama3.2_3b'
 'career_stage_classification_gemma3_4b' 'years_of_experience_gemma3_4b'
 'socio_economic_mobility_gemma3_4b' 'job_vs_education_match_gemma3_4b'
 'career_stage_classification_mistral' 'job_vs_education_match_mistral'
 'marital_status_married_civ_spouse' 'marital_status_never_marr

## LLM Feature selection

## Base data

In [23]:
prompt = '''Instruction:
Rank the top 20 features that optimize the prediction of whether a person earns more or less than $50K per year. 

IMPORTANT: 
YOU MUST RETURN EXACTLY 20 FEATURES!
YOU MUST ONLY RETURN THE LIST. DO NOT RETURN ANYTHING ELSE BUT THE LIST! 
DO NOT CHANGE THE NAME OF THE GIVEN COLUMN NAMES!
YOU MUST RETURN THE FEATURES FROM MOST IMPORTANT TO LEAST IMPORTANT!

Columns with discriptions:
age: the individual's age in years.
workclass: employment sector/category.
education_num: numerical representation of education level.
occupation: job role or profession
capital_gain: income from non_wage sources.
capital_loss: financial loss from non_wage sources.
hours_per_week: number of hours worked weekly.
native_country: country of birth.

The following columns are all binary (0 or 1):
relationship_not_in_family: not part of a family.
relationship_other_relative: extended family.
relationship_own_child: biological/adopted child of the householder.
relationship_unmarried: unmarried partner.
relationship_wife: spouse.
marital_status_married_af_spouse: married to an armed forces member.
marital_status_married_civ_spouse: married to a civilian spouse.
marital_status_married_spouse_absent: married but separated.
marital_status_never_married: no history of marriage.
marital_status_separated: legally separated.
marital_status_widowed: spouse has died.
race_asian_pac_islander: asian or pacific islander heritage.
race_black: black/african_american heritage.
race_other: race not listed in other categories.
race_white: white/caucasian heritage.
sex_male: male sex.'''

In [24]:
llm_prompt = '''Instruction:
Rank the top 20 features that optimize the prediction of whether a person earns more or less than $50K per year. 

IMPORTANT: 
YOU MUST RETURN EXACTLY 20 FEATURES!
YOU MUST ONLY RETURN THE LIST. DO NOT RETURN ANYTHING ELSE BUT THE LIST! 
DO NOT CHANGE THE NAME OF THE GIVEN COLUMN NAMES!
YOU MUST RETURN THE FEATURES FROM MOST IMPORTANT TO LEAST IMPORTANT!

Columns with discriptions:
age: the individual's age in years.
workclass: employment sector/category.
education_num: numerical representation of education level.
occupation: job role or profession
capital_gain: income from non_wage sources.
capital_loss: financial loss from non_wage sources.
hours_per_week: number of hours worked weekly.
native_country: country of birth.

The following columns are all binary (0 or 1):
relationship_not_in_family: not part of a family.
relationship_other_relative: extended family.
relationship_own_child: biological/adopted child of the householder.
relationship_unmarried: unmarried partner.
relationship_wife: spouse.
marital_status_married_af_spouse: married to an armed forces member.
marital_status_married_civ_spouse: married to a civilian spouse.
marital_status_married_spouse_absent: married but separated.
marital_status_never_married: no history of marriage.
marital_status_separated: legally separated.
marital_status_widowed: spouse has died.
race_asian_pac_islander: asian or pacific islander heritage.
race_black: black/african_american heritage.
race_other: race not listed in other categories.
race_white: white/caucasian heritage.
sex_male: male sex.

The following columns are all from a scale of 1 to 10:
career_stage_classification: categorizes professional progression.
occupational_demand_outlook: forecasts job market demand for a profession.
education_roi: estimates the financial/opportunity return on investment of an educational qualification.
years_of_experience: total time (in years) actively working in a field or role.
socio_economic_mobility: measures potential for upward economic movement tied to career/education choices.
job_vs_education_match: indicates alignment between an individual’s education and their current occupation.
job_security_rating: predicts stability/risk of unemployment in a given role.
cultural_integration_score: assesses ease of adapting to workplace cultural norms.'''

### LLM Feature selection method

In [25]:
# Save the names of the models which struggled with the feature selection
failed_models = []

def generate_llm_feature_selections(model_names, n_features_list, prompt, df):
    # This will store {n_features: {model_name: selected_features}}
    llm_feature_gen_dict = {}  
    for n_feature in n_features_list:
        llm_feature_gen_dict[n_feature] = {}

    # Use tqdm to show progress bar
    for model in tqdm(model_names, desc=f"Processing models"):     
        # Suffix for LLM-generated columns
        model_suffix = f"_{model.replace(':', '_').replace('-', '_')}"  
        
        max_attempts = 20
        attempt = 0
        success = False
        last_exception = None
        last_selected_features = None

        # Keep retrying until valid response or max attempts reached
        while attempt < max_attempts and not success:  
            attempt += 1
            try:
                # Generate response
                response = ollama.generate(model=model, prompt=prompt)['response']
        
                # Extract columns from response
                selected_features = [col for col in df.columns if col in response]
                
                # Length of features must be correct
                assert len(selected_features) >= max(n_features_list), "Too few features"
                
                # Assert that the LLM only selected valid features                
                for llm_generated_col in selected_features:
                    if llm_generated_col in df.columns.difference(['income']): # Exclude income columns
                        pass
                    else:
                        if llm_generated_col in [col.replace(model_suffix, "") for col in df.columns]:
                            selected_features[llm_generated_col] = llm_generated_col + model_suffix
                        else:
                            raise Exception(f"Invalid column: '{llm_generated_col}' '.")       

                # Valid features where output --> Save features 
                for n_feature in n_features_list:
                    llm_feature_gen_dict[n_feature][model] = selected_features[:n_feature]
                
                success = True
                
            except Exception as e:
                last_exception = e
                last_selected_features = selected_features if 'selected_features' in locals() else None
                print(f"Error with model {model} (attempt {attempt}/{max_attempts}): {str(e)}")
        
        # If all attempts failed, save whatever we have (if anything)
        if not success:
            failed_models.append(model)
            print(f"Failed to get valid features for model {model} after {max_attempts} attempts")
            
            # Drop income col if the LLM halucinated 
            if 'income' in last_selected_features:
                last_selected_features = last_selected_features.remove('income')
                
            if last_selected_features:
                print("Saving partial features")
                for n_feature in n_features_list:
                    # Take as many features as we have, up to n_feature
                    llm_feature_gen_dict[n_feature][model] = last_selected_features[:min(n_feature, len(last_selected_features))]

    return llm_feature_gen_dict

### Display the results method

In [26]:
def print_feature_selections(dict):
    # For each number of features configuration
    for feature_num, models in dict.items():
        print(f"Number of features {feature_num}:")

        # For each model
        for model, features in models.items():
            print(f"    Model: {model}")
            print("        Features:") # 1 Indent

            # For each feature
            for feature in features:
                print(f"            - {feature}") # 2 Indents

            print() 

## Execute on base and LLM data sets

In [27]:
llm_features_dict_base_data = generate_llm_feature_selections(model_names, n_features_list, prompt, df)

Processing models:  88%|█████████████████████████████████████████████████████████████████████████████████▍           | 7/8 [03:32<00:40, 40.89s/it]

Error with model deepseek-llm (attempt 1/20): Too few features
Error with model deepseek-llm (attempt 2/20): Too few features


Processing models: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [04:46<00:00, 35.80s/it]


In [28]:
print_feature_selections(llm_features_dict_base_data)

Number of features 5:
    Model: llama3.2:1b
        Features:
            - age
            - workclass
            - education_num
            - occupation
            - capital_gain

    Model: llama3.2:3b
        Features:
            - age
            - workclass
            - education_num
            - occupation
            - capital_gain

    Model: llama3.1
        Features:
            - age
            - workclass
            - education_num
            - occupation
            - capital_gain

    Model: gemma3:1b
        Features:
            - age
            - education_num
            - occupation
            - capital_gain
            - capital_loss

    Model: gemma3:4b
        Features:
            - age
            - workclass
            - education_num
            - occupation
            - capital_gain

    Model: dolphin3
        Features:
            - age
            - workclass
            - education_num
            - occupation
            - capital_gain

 

In [29]:
# Base failed models
print(failed_models)
failed_models = [] # Reset

[]


In [30]:
llm_features_dict_llm_data = generate_llm_feature_selections(model_names, n_features_list, llm_prompt, llm_df)

Processing models:   0%|                                                                                                     | 0/8 [00:00<?, ?it/s]

Error with model llama3.2:1b (attempt 1/20): Too few features
Error with model llama3.2:1b (attempt 2/20): Too few features
Error with model llama3.2:1b (attempt 3/20): Too few features
Error with model llama3.2:1b (attempt 4/20): Too few features


Processing models:  12%|███████████▋                                                                                 | 1/8 [00:21<02:29, 21.29s/it]

Error with model llama3.2:3b (attempt 1/20): Too few features
Error with model llama3.2:3b (attempt 2/20): Too few features
Error with model llama3.2:3b (attempt 3/20): Too few features
Error with model llama3.2:3b (attempt 4/20): Too few features
Error with model llama3.2:3b (attempt 5/20): Too few features
Error with model llama3.2:3b (attempt 6/20): Too few features
Error with model llama3.2:3b (attempt 7/20): Too few features
Error with model llama3.2:3b (attempt 8/20): Too few features
Error with model llama3.2:3b (attempt 9/20): Too few features
Error with model llama3.2:3b (attempt 10/20): Too few features
Error with model llama3.2:3b (attempt 11/20): Too few features
Error with model llama3.2:3b (attempt 12/20): Too few features
Error with model llama3.2:3b (attempt 13/20): Too few features
Error with model llama3.2:3b (attempt 14/20): Too few features
Error with model llama3.2:3b (attempt 15/20): Too few features
Error with model llama3.2:3b (attempt 16/20): Too few features
E

Processing models:  25%|███████████████████████▎                                                                     | 2/8 [02:42<09:10, 91.82s/it]

Error with model llama3.1 (attempt 1/20): Too few features
Error with model llama3.1 (attempt 2/20): Too few features
Error with model llama3.1 (attempt 3/20): Too few features
Error with model llama3.1 (attempt 4/20): Too few features
Error with model llama3.1 (attempt 5/20): Too few features
Error with model llama3.1 (attempt 6/20): Too few features
Error with model llama3.1 (attempt 7/20): Too few features
Error with model llama3.1 (attempt 8/20): Too few features
Error with model llama3.1 (attempt 9/20): Too few features
Error with model llama3.1 (attempt 10/20): Too few features
Error with model llama3.1 (attempt 11/20): Too few features
Error with model llama3.1 (attempt 12/20): Too few features
Error with model llama3.1 (attempt 13/20): Too few features
Error with model llama3.1 (attempt 14/20): Too few features
Error with model llama3.1 (attempt 15/20): Too few features
Error with model llama3.1 (attempt 16/20): Too few features
Error with model llama3.1 (attempt 17/20): Too fe

Processing models:  38%|██████████████████████████████████▌                                                         | 3/8 [08:25<17:11, 206.31s/it]

Error with model llama3.1 (attempt 20/20): Too few features
Failed to get valid features for model llama3.1 after 20 attempts
Saving partial features
Error with model gemma3:1b (attempt 1/20): Too few features
Error with model gemma3:1b (attempt 2/20): Too few features
Error with model gemma3:1b (attempt 3/20): Too few features
Error with model gemma3:1b (attempt 4/20): Too few features
Error with model gemma3:1b (attempt 5/20): Too few features
Error with model gemma3:1b (attempt 6/20): Too few features
Error with model gemma3:1b (attempt 7/20): Too few features
Error with model gemma3:1b (attempt 8/20): Too few features
Error with model gemma3:1b (attempt 9/20): Too few features
Error with model gemma3:1b (attempt 10/20): Too few features
Error with model gemma3:1b (attempt 11/20): Too few features
Error with model gemma3:1b (attempt 12/20): Too few features
Error with model gemma3:1b (attempt 13/20): Too few features
Error with model gemma3:1b (attempt 14/20): Too few features
Error

Processing models:  50%|██████████████████████████████████████████████                                              | 4/8 [10:43<11:58, 179.67s/it]

Error with model gemma3:1b (attempt 20/20): Too few features
Failed to get valid features for model gemma3:1b after 20 attempts
Saving partial features
Error with model gemma3:4b (attempt 1/20): Too few features
Error with model gemma3:4b (attempt 2/20): Too few features
Error with model gemma3:4b (attempt 3/20): Too few features
Error with model gemma3:4b (attempt 4/20): Too few features
Error with model gemma3:4b (attempt 5/20): Too few features
Error with model gemma3:4b (attempt 6/20): Too few features
Error with model gemma3:4b (attempt 7/20): Too few features
Error with model gemma3:4b (attempt 8/20): list indices must be integers or slices, not str
Error with model gemma3:4b (attempt 9/20): Too few features
Error with model gemma3:4b (attempt 10/20): list indices must be integers or slices, not str
Error with model gemma3:4b (attempt 11/20): list indices must be integers or slices, not str
Error with model gemma3:4b (attempt 12/20): Too few features
Error with model gemma3:4b (a

Processing models:  62%|█████████████████████████████████████████████████████████▌                                  | 5/8 [15:17<10:40, 213.66s/it]

Error with model gemma3:4b (attempt 20/20): list indices must be integers or slices, not str
Failed to get valid features for model gemma3:4b after 20 attempts
Error with model dolphin3 (attempt 1/20): Too few features
Error with model dolphin3 (attempt 2/20): Too few features
Error with model dolphin3 (attempt 3/20): Too few features
Error with model dolphin3 (attempt 4/20): Too few features
Error with model dolphin3 (attempt 5/20): Too few features
Error with model dolphin3 (attempt 6/20): list indices must be integers or slices, not str
Error with model dolphin3 (attempt 7/20): Too few features
Error with model dolphin3 (attempt 8/20): Too few features
Error with model dolphin3 (attempt 9/20): Too few features
Error with model dolphin3 (attempt 10/20): Too few features
Error with model dolphin3 (attempt 11/20): Too few features
Error with model dolphin3 (attempt 12/20): Too few features
Error with model dolphin3 (attempt 13/20): Too few features
Error with model dolphin3 (attempt 14

Processing models:  88%|████████████████████████████████████████████████████████████████████████████████▌           | 7/8 [23:04<03:29, 209.52s/it]

Error with model deepseek-llm (attempt 1/20): Too few features
Error with model deepseek-llm (attempt 2/20): Too few features
Error with model deepseek-llm (attempt 3/20): Too few features


Processing models: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [24:29<00:00, 183.74s/it]


In [31]:
print_feature_selections(llm_features_dict_llm_data)

Number of features 5:
    Model: llama3.2:1b
        Features:
            - age
            - workclass
            - education_num
            - occupation
            - capital_gain

    Model: llama3.2:3b
        Features:
            - age
            - education_num
            - occupation
            - capital_gain
            - capital_loss

    Model: llama3.1
        Features:
            - age
            - workclass
            - education_num
            - occupation
            - capital_gain

    Model: gemma3:1b
        Features:
            - age

    Model: dolphin3
        Features:
            - age
            - workclass
            - education_num
            - occupation
            - capital_gain

    Model: mistral
        Features:
            - age
            - workclass
            - education_num
            - occupation
            - capital_gain

    Model: deepseek-llm
        Features:
            - age
            - workclass
            - education

In [32]:
# LLM failed models
print(failed_models)

['llama3.1', 'gemma3:1b', 'gemma3:4b']


# Comparison LLM Feature selection vs SOTA feature selection methods
Accuracy is our the metric used here

In [33]:
def generate_feature_selection_comparison(X_train, y_train, X_test, y_test, models, n_features_list, features_llm_model):
    comparison_tables = [] 

    for number_of_features in tqdm(n_features_list, desc="Evaluating feature counts"):    
        # Initialize a single-row DataFrame with number_of_features as index
        table = pd.DataFrame(index=[f"{number_of_features} features"])
        
        # Get relevant features
        _, features_shapley = evaluate_with_feature_selection(X_train, y_train, X_test, y_test, models, number_of_features, 'shapley', False)
        _, features_rfe = evaluate_with_feature_selection(X_train, y_train, X_test, y_test, models, number_of_features, 'rfe', False)
        _, features_sfs = evaluate_with_feature_selection(X_train, y_train, X_test, y_test, models, number_of_features, 'sfs', False)
        
        # Base row (all features)
        table['base'] = round(evaluate_models_with_metrics(models, X_train, y_train, X_test, y_test)['Test Accuracy'].iloc[0], 3)
        
        # Feature selection methods
        table['shap'] = round(evaluate_models_with_metrics(models, X_train[features_shapley], y_train, X_test[features_shapley], y_test)['Test Accuracy'].iloc[0], 3)
        table['rfe'] = round(evaluate_models_with_metrics(models, X_train[features_rfe], y_train, X_test[features_rfe], y_test)['Test Accuracy'].iloc[0], 3)
        table['sfs'] = round(evaluate_models_with_metrics(models, X_train[features_sfs], y_train, X_test[features_sfs], y_test)['Test Accuracy'].iloc[0], 3)
        
        # LLM model columns
        for model_name, features in features_llm_model[number_of_features].items():
            table[model_name] = round(evaluate_models_with_metrics(models, X_train[features], y_train, X_test[features], y_test)['Test Accuracy'].iloc[0], 3)
        
        # Add difference row
        base_values = table.loc[f"{number_of_features} features", 'base']
        diff_row = round(table.loc[f"{number_of_features} features"].copy() - base_values, 3)
        diff_row.name = f"{number_of_features} features difference with base"

        # Add percentage difference row
        pct_diff_row = round((table.loc[f"{number_of_features} features"].copy() - base_values) / base_values * 100, 2)
        pct_diff_row.name = f"{number_of_features} features percentage difference with base"
        
        # Use pd.concat()
        table = pd.concat([table, diff_row.to_frame().T, pct_diff_row.to_frame().T])
        
        # Store the table
        comparison_tables.append(table)
    
    # Combine all tables into one final table
    final_comparison_table = pd.concat(comparison_tables, axis=0)
    
    return final_comparison_table

In [34]:
# Base data
final_comparison_table_base_data = generate_feature_selection_comparison(
    X_train, y_train, X_test, y_test,
    models, n_features_list, llm_features_dict_base_data
)

display(final_comparison_table_base_data)

Evaluating feature counts: 100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [34:25<00:00, 688.60s/it]


Unnamed: 0,base,shap,rfe,sfs,llama3.2:1b,llama3.2:3b,llama3.1,gemma3:1b,gemma3:4b,dolphin3,mistral,deepseek-llm
5 features,0.866,0.855,0.857,0.859,0.827,0.827,0.827,0.837,0.827,0.827,0.837,0.836
5 features difference with base,0.0,-0.011,-0.009,-0.007,-0.039,-0.039,-0.039,-0.029,-0.039,-0.039,-0.029,-0.03
5 features percentage difference with base,0.0,-1.27,-1.04,-0.81,-4.5,-4.5,-4.5,-3.35,-4.5,-4.5,-3.35,-3.46
10 features,0.866,0.865,0.865,0.864,0.85,0.85,0.85,0.849,0.85,0.853,0.851,0.848
10 features difference with base,0.0,-0.001,-0.001,-0.002,-0.016,-0.016,-0.016,-0.017,-0.016,-0.013,-0.015,-0.018
10 features percentage difference with base,0.0,-0.12,-0.12,-0.23,-1.85,-1.85,-1.85,-1.96,-1.85,-1.5,-1.73,-2.08
15 features,0.866,0.864,0.865,0.866,0.865,0.863,0.867,0.866,0.859,0.865,0.866,0.864
15 features difference with base,0.0,-0.002,-0.001,0.0,-0.001,-0.003,0.001,0.0,-0.007,-0.001,0.0,-0.002
15 features percentage difference with base,0.0,-0.23,-0.12,0.0,-0.12,-0.35,0.12,0.0,-0.81,-0.12,0.0,-0.23


In [35]:
# LLM data
final_comparison_table_llm_data = generate_feature_selection_comparison(
    X_train_llm, y_train_llm, X_test_llm, y_test_llm,
    models, n_features_list, llm_features_dict_llm_data
)

display(final_comparison_table_llm_data)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Evaluating feature counts: 100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [28:15<00:00, 565.18s/it]


Unnamed: 0,base,shap,rfe,sfs,llama3.2:1b,llama3.2:3b,llama3.1,gemma3:1b,dolphin3,mistral,deepseek-llm
5 features,0.847,0.842,0.838,0.85,0.805,0.822,0.805,0.746,0.805,0.805,0.805
5 features difference with base,0.0,-0.005,-0.009,0.003,-0.042,-0.025,-0.042,-0.101,-0.042,-0.042,-0.042
5 features percentage difference with base,0.0,-0.59,-1.06,0.35,-4.96,-2.95,-4.96,-11.92,-4.96,-4.96,-4.96
10 features,0.847,0.833,0.846,0.857,0.838,0.856,0.827,0.746,0.845,0.838,0.814
10 features difference with base,0.0,-0.014,-0.001,0.01,-0.009,0.009,-0.02,-0.101,-0.002,-0.009,-0.033
10 features percentage difference with base,0.0,-1.65,-0.12,1.18,-1.06,1.06,-2.36,-11.92,-0.24,-1.06,-3.9
15 features,0.847,0.855,0.852,0.855,0.853,0.855,0.858,0.746,0.845,0.853,0.844
15 features difference with base,0.0,0.008,0.005,0.008,0.006,0.008,0.011,-0.101,-0.002,0.006,-0.003
15 features percentage difference with base,0.0,0.94,0.59,0.94,0.71,0.94,1.3,-11.92,-0.24,0.71,-0.35
