# Imports

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Parameters

In [4]:
# List of ml models
ml_models = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('SVM', SVC(random_state=42)),
    ('KNN', KNeighborsClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42))
]

In [5]:
# LLMs
models = ['llama3.2:1b', 'llama3.2:3b', 'gemma3:1b', 'gemma3:4b', 'llama3.1', 'dolphin3', 'mistral', 'deepseek_llm']

In [6]:
# CSV file names
base_data_prep_name = "tabular_data_preprocessed_2025_04_03.csv"
llm_data_prep_name = "tabular_data_llm_preprocessed_2025_04_03.csv"

# Load Data

In [8]:
df = pd.read_csv(base_data_prep_name)
llm_df = pd.read_csv(llm_data_prep_name)

In [9]:
df.head()

Unnamed: 0,age,workclass,education_num,occupation,capital_gain,capital_loss,hours_per_week,native_country,income,relationship_not_in_family,...,marital_status_married_civ_spouse,marital_status_married_spouse_absent,marital_status_never_married,marital_status_separated,marital_status_widowed,race_asian_pac_islander,race_black,race_other,race_white,sex_male
0,0.025996,2.137359,1.136512,-1.31846,0.146932,-0.217127,-0.034087,0.289462,0,1.697524,...,-0.919604,-0.114128,1.424944,-0.179829,-0.1791,-0.179161,-0.325728,-0.091554,0.411743,0.70422
1,0.828308,1.454401,1.136512,-0.609318,-0.144804,-0.217127,-2.213032,0.289462,0,-0.589093,...,1.087425,-0.114128,-0.701782,-0.179829,-0.1791,-0.179161,-0.325728,-0.091554,0.411743,0.70422
2,-0.046942,0.088484,-0.419335,-0.136557,-0.144804,-0.217127,-0.034087,0.289462,0,1.697524,...,-0.919604,-0.114128,-0.701782,-0.179829,-0.1791,-0.179161,-0.325728,-0.091554,0.411743,0.70422
3,1.047121,0.088484,-1.197259,-0.136557,-0.144804,-0.217127,-0.034087,0.289462,0,-0.589093,...,1.087425,-0.114128,-0.701782,-0.179829,-0.1791,-0.179161,3.070047,-0.091554,-2.428701,0.70422
4,-0.776316,0.088484,1.136512,0.808965,-0.144804,-0.217127,-0.034087,-4.08338,0,-0.589093,...,1.087425,-0.114128,-0.701782,-0.179829,-0.1791,-0.179161,3.070047,-0.091554,-2.428701,-1.42001


In [10]:
llm_df.head()

Unnamed: 0,age,workclass,education_num,occupation,capital_gain,capital_loss,hours_per_week,native_country,income,nlg,...,marital_status_married_civ_spouse,marital_status_married_spouse_absent,marital_status_never_married,marital_status_separated,marital_status_widowed,race_asian_pac_islander,race_black,race_other,race_white,sex_male
0,-1.51325,0.001008,-0.425643,0.369591,-0.14031,-0.208399,-1.625939,0.297816,0,"18-year-old Male from United-States, Never-mar...",...,-0.900885,-0.106428,1.39264,-0.168474,-0.184733,-0.183572,-0.326251,-0.087511,0.412478,0.707213
1,-1.587107,0.001008,-0.815427,1.320918,-0.14031,-0.208399,-1.625939,0.297816,0,"17-year-old Female from United-States, Never-m...",...,-0.900885,-0.106428,1.39264,-0.168474,-0.184733,-0.183572,-0.326251,-0.087511,0.412478,-1.414001
2,-0.996253,-0.839358,-0.425643,-0.106073,-0.14031,-0.208399,-0.038309,0.297816,0,"25-year-old Male from United-States, Never-mar...",...,-0.900885,-0.106428,1.39264,-0.168474,-0.184733,-0.183572,3.065128,-0.087511,-2.424372,0.707213
3,-1.365537,0.001008,-0.03586,1.320918,-0.14031,-0.208399,-0.832124,0.297816,0,"20-year-old Female from United-States, Never-m...",...,-0.900885,-0.106428,1.39264,-0.168474,-0.184733,-0.183572,-0.326251,-0.087511,0.412478,-1.414001
4,0.628595,0.001008,-0.425643,0.131759,-0.14031,-0.208399,1.231794,-0.479163,0,"47-year-old Male from Puerto-Rico, Never-marri...",...,-0.900885,-0.106428,1.39264,-0.168474,-0.184733,-0.183572,-0.326251,-0.087511,-2.424372,0.707213


# Experimential setup

In [12]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('income', axis=1), df['income'], test_size=0.2, random_state=42)

# Print the shapes of the sets
print(f"Training Set: X_train shape = {X_train.shape}, y_train shape = {y_train.shape}")
print(f"Test Set: X_test shape = {X_test.shape}, y_test shape = {y_test.shape}")

Training Set: X_train shape = (39073, 24), y_train shape = (39073,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)


In [13]:
# Split the data into training and test sets
X_train_llm, X_test_llm, y_train_llm, y_test_llm = train_test_split(llm_df.drop('income', axis=1), llm_df['income'], test_size=0.2, random_state=42)

# Print the shapes of the sets
print(f"Training Set: X_train shape = {X_train_llm.shape}, y_train shape = {y_train_llm.shape}")
print(f"Test Set: X_test shape = {X_test_llm.shape}, y_test shape = {y_test_llm.shape}")

Training Set: X_train shape = (4000, 89), y_train shape = (4000,)
Test Set: X_test shape = (1000, 89), y_test shape = (1000,)


# Modeling and Performance metrics

## Base data set

In [16]:
# Function to train and evaluate models with multiple metrics
def evaluate_models_with_metrics(models, X_train, y_train, X_test, y_test):
    results = []
    
    for name, model in models:
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict on test set
        y_test_pred = model.predict(X_test)

        # Store results for the model
        model_results = {
            'Model': name,
            'Test Accuracy': accuracy_score(y_test, y_test_pred),
            'Test Precision': precision_score(y_test, y_test_pred),
            'Test Recall': recall_score(y_test, y_test_pred),
            'Test F1-Score': f1_score(y_test, y_test_pred)
        }
        
        results.append(model_results)

    # Convert results to a pandas DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df

In [17]:
# Evaluate models with multiple metrics and print results
results = evaluate_models_with_metrics(ml_models, X_train, y_train, X_test, y_test)
display(results)

Unnamed: 0,Model,Test Accuracy,Test Precision,Test Recall,Test F1-Score
0,Logistic Regression,0.841744,0.725097,0.553291,0.627649
1,Random Forest,0.847989,0.719033,0.606369,0.657913
2,SVM,0.845634,0.760615,0.524841,0.621106
3,KNN,0.829358,0.672518,0.569427,0.616693
4,Gradient Boosting,0.866414,0.792969,0.603397,0.685315


## LLM data set
We will make all the model-template partitions and check for each model what the LLM generated feature engineering does for the performance metrics

In [19]:
# LLM generated columns
templates = [
    "career_stage_classification",
    "occupational_demand_outlook",
    "education_roi",
    "years_of_experience",
    "socio_economic_mobility",
    "job_vs_education_match",
    "job_security_rating",
    "cultural_integration_score",
]

# Create a regex pattern that matches any of the template strings
pattern = '|'.join(templates)

# Filter columns - keep only those that DON'T match any template
base_cols = [col for col in llm_df.columns if not any(template in col for template in templates)]

# NLG is not used in classification and income is target variable
base_cols.remove('nlg')
base_cols.remove('income') 
print(base_cols)

['age', 'workclass', 'education_num', 'occupation', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'relationship_not_in_family', 'relationship_other_relative', 'relationship_own_child', 'relationship_unmarried', 'relationship_wife', 'marital_status_married_af_spouse', 'marital_status_married_civ_spouse', 'marital_status_married_spouse_absent', 'marital_status_never_married', 'marital_status_separated', 'marital_status_widowed', 'race_asian_pac_islander', 'race_black', 'race_other', 'race_white', 'sex_male']


In [20]:
# For each model
for model in models:
    # Get the LLM generated columns per model
    model_llm_col_list = []
    
    # For each template
    for template in templates:
        column_name = f"{template}_{model.replace(':', '_')}"
        model_llm_col_list.append(column_name)

    # Append LLM generated columns to the columns of the base data set 
    selected_cols = base_cols + model_llm_col_list
    assert len(model_llm_col_list) == 8

    # Evaluate models with multiple metrics
    results_base = evaluate_models_with_metrics(ml_models, X_train_llm[base_cols], y_train_llm, X_test_llm[base_cols], y_test_llm)
    results_llm = evaluate_models_with_metrics(ml_models, X_train_llm[selected_cols], y_train_llm, X_test_llm[selected_cols], y_test_llm)
    
    # Set index of the data frames
    results_llm = results_llm.set_index("Model")
    results_base = results_base.set_index("Model")
    
    # Compare llm results with base results
    numerical_cols = results_llm.select_dtypes(include=['number']).columns
    result_diff = round(results_llm[numerical_cols] - results_base[numerical_cols], 3)

    # Compute percentage change
    result_pct_change = round((result_diff / results_base[numerical_cols]) * 100, 2)
    
    # Display(results
    print(model)
    print("Base df:")
    display(results_base)
    print("LLM df:")
    display(results_llm)
    print("Difference df:")
    display(result_diff)
    print()
    print("Percentage Change df:")
    display(result_pct_change)

llama3.2:1b
Base df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.832,0.712871,0.566929,0.631579
Random Forest,0.834,0.711538,0.582677,0.640693
SVM,0.821,0.711864,0.496063,0.584687
KNN,0.796,0.627551,0.484252,0.546667
Gradient Boosting,0.858,0.77451,0.622047,0.689956


LLM df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.831,0.713568,0.559055,0.626932
Random Forest,0.833,0.751445,0.511811,0.608899
SVM,0.819,0.726708,0.46063,0.563855
KNN,0.783,0.6,0.437008,0.505695
Gradient Boosting,0.857,0.770732,0.622047,0.688453


Difference df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.001,0.001,-0.008,-0.005
Random Forest,-0.001,0.04,-0.071,-0.032
SVM,-0.002,0.015,-0.035,-0.021
KNN,-0.013,-0.028,-0.047,-0.041
Gradient Boosting,-0.001,-0.004,0.0,-0.002



Percentage Change df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.12,0.14,-1.41,-0.79
Random Forest,-0.12,5.62,-12.19,-4.99
SVM,-0.24,2.11,-7.06,-3.59
KNN,-1.63,-4.46,-9.71,-7.5
Gradient Boosting,-0.12,-0.52,0.0,-0.29


llama3.2:3b
Base df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.832,0.712871,0.566929,0.631579
Random Forest,0.834,0.711538,0.582677,0.640693
SVM,0.821,0.711864,0.496063,0.584687
KNN,0.796,0.627551,0.484252,0.546667
Gradient Boosting,0.858,0.77451,0.622047,0.689956


LLM df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.834,0.711538,0.582677,0.640693
Random Forest,0.841,0.759563,0.547244,0.636156
SVM,0.825,0.718232,0.511811,0.597701
KNN,0.798,0.634021,0.484252,0.549107
Gradient Boosting,0.855,0.768473,0.614173,0.682713


Difference df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.002,-0.001,0.016,0.009
Random Forest,0.007,0.048,-0.035,-0.005
SVM,0.004,0.006,0.016,0.013
KNN,0.002,0.006,0.0,0.002
Gradient Boosting,-0.003,-0.006,-0.008,-0.007



Percentage Change df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.24,-0.14,2.82,1.42
Random Forest,0.84,6.75,-6.01,-0.78
SVM,0.49,0.84,3.23,2.22
KNN,0.25,0.96,0.0,0.37
Gradient Boosting,-0.35,-0.77,-1.29,-1.01


gemma3:1b
Base df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.832,0.712871,0.566929,0.631579
Random Forest,0.834,0.711538,0.582677,0.640693
SVM,0.821,0.711864,0.496063,0.584687
KNN,0.796,0.627551,0.484252,0.546667
Gradient Boosting,0.858,0.77451,0.622047,0.689956


LLM df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.83,0.716495,0.547244,0.620536
Random Forest,0.829,0.71066,0.551181,0.620843
SVM,0.825,0.718232,0.511811,0.597701
KNN,0.812,0.644737,0.57874,0.609959
Gradient Boosting,0.854,0.77,0.606299,0.678414


Difference df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.002,0.004,-0.02,-0.011
Random Forest,-0.005,-0.001,-0.031,-0.02
SVM,0.004,0.006,0.016,0.013
KNN,0.016,0.017,0.094,0.063
Gradient Boosting,-0.004,-0.005,-0.016,-0.012



Percentage Change df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.24,0.56,-3.53,-1.74
Random Forest,-0.6,-0.14,-5.32,-3.12
SVM,0.49,0.84,3.23,2.22
KNN,2.01,2.71,19.41,11.52
Gradient Boosting,-0.47,-0.65,-2.57,-1.74


gemma3:4b
Base df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.832,0.712871,0.566929,0.631579
Random Forest,0.834,0.711538,0.582677,0.640693
SVM,0.821,0.711864,0.496063,0.584687
KNN,0.796,0.627551,0.484252,0.546667
Gradient Boosting,0.858,0.77451,0.622047,0.689956


LLM df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.84,0.730392,0.586614,0.650655
Random Forest,0.833,0.708134,0.582677,0.639309
SVM,0.843,0.765027,0.551181,0.640732
KNN,0.816,0.671569,0.53937,0.598253
Gradient Boosting,0.849,0.756219,0.598425,0.668132


Difference df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.008,0.018,0.02,0.019
Random Forest,-0.001,-0.003,0.0,-0.001
SVM,0.022,0.053,0.055,0.056
KNN,0.02,0.044,0.055,0.052
Gradient Boosting,-0.009,-0.018,-0.024,-0.022



Percentage Change df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.96,2.52,3.53,3.01
Random Forest,-0.12,-0.42,0.0,-0.16
SVM,2.68,7.45,11.09,9.58
KNN,2.51,7.01,11.36,9.51
Gradient Boosting,-1.05,-2.32,-3.86,-3.19


llama3.1
Base df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.832,0.712871,0.566929,0.631579
Random Forest,0.834,0.711538,0.582677,0.640693
SVM,0.821,0.711864,0.496063,0.584687
KNN,0.796,0.627551,0.484252,0.546667
Gradient Boosting,0.858,0.77451,0.622047,0.689956


LLM df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.832,0.712871,0.566929,0.631579
Random Forest,0.844,0.755208,0.570866,0.650224
SVM,0.826,0.729885,0.5,0.593458
KNN,0.779,0.573991,0.503937,0.536688
Gradient Boosting,0.845,0.751269,0.582677,0.656319


Difference df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.0,0.0,0.0,0.0
Random Forest,0.01,0.044,-0.012,0.01
SVM,0.005,0.018,0.004,0.009
KNN,-0.017,-0.054,0.02,-0.01
Gradient Boosting,-0.013,-0.023,-0.039,-0.034



Percentage Change df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.0,0.0,0.0,0.0
Random Forest,1.2,6.18,-2.06,1.56
SVM,0.61,2.53,0.81,1.54
KNN,-2.14,-8.6,4.13,-1.83
Gradient Boosting,-1.52,-2.97,-6.27,-4.93


dolphin3
Base df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.832,0.712871,0.566929,0.631579
Random Forest,0.834,0.711538,0.582677,0.640693
SVM,0.821,0.711864,0.496063,0.584687
KNN,0.796,0.627551,0.484252,0.546667
Gradient Boosting,0.858,0.77451,0.622047,0.689956


LLM df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.827,0.701493,0.555118,0.61978
Random Forest,0.844,0.763441,0.559055,0.645455
SVM,0.827,0.731429,0.503937,0.596737
KNN,0.794,0.618812,0.492126,0.548246
Gradient Boosting,0.849,0.756219,0.598425,0.668132


Difference df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.005,-0.011,-0.012,-0.012
Random Forest,0.01,0.052,-0.024,0.005
SVM,0.006,0.02,0.008,0.012
KNN,-0.002,-0.009,0.008,0.002
Gradient Boosting,-0.009,-0.018,-0.024,-0.022



Percentage Change df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.6,-1.54,-2.12,-1.9
Random Forest,1.2,7.31,-4.12,0.78
SVM,0.73,2.81,1.61,2.05
KNN,-0.25,-1.43,1.65,0.37
Gradient Boosting,-1.05,-2.32,-3.86,-3.19


mistral
Base df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.832,0.712871,0.566929,0.631579
Random Forest,0.834,0.711538,0.582677,0.640693
SVM,0.821,0.711864,0.496063,0.584687
KNN,0.796,0.627551,0.484252,0.546667
Gradient Boosting,0.858,0.77451,0.622047,0.689956


LLM df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.834,0.715686,0.574803,0.637555
Random Forest,0.834,0.734043,0.543307,0.624434
SVM,0.833,0.730159,0.543307,0.623025
KNN,0.816,0.676768,0.527559,0.59292
Gradient Boosting,0.853,0.763547,0.610236,0.678337


Difference df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.002,0.003,0.008,0.006
Random Forest,0.0,0.023,-0.039,-0.016
SVM,0.012,0.018,0.047,0.038
KNN,0.02,0.049,0.043,0.046
Gradient Boosting,-0.005,-0.011,-0.012,-0.012



Percentage Change df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.24,0.42,1.41,0.95
Random Forest,0.0,3.23,-6.69,-2.5
SVM,1.46,2.53,9.47,6.5
KNN,2.51,7.81,8.88,8.41
Gradient Boosting,-0.58,-1.42,-1.93,-1.74


deepseek_llm
Base df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.832,0.712871,0.566929,0.631579
Random Forest,0.834,0.711538,0.582677,0.640693
SVM,0.821,0.711864,0.496063,0.584687
KNN,0.796,0.627551,0.484252,0.546667
Gradient Boosting,0.858,0.77451,0.622047,0.689956


LLM df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.83,0.707921,0.562992,0.627193
Random Forest,0.834,0.75,0.519685,0.613953
SVM,0.823,0.72,0.496063,0.587413
KNN,0.768,0.550459,0.472441,0.508475
Gradient Boosting,0.856,0.775,0.610236,0.682819


Difference df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.002,-0.005,-0.004,-0.004
Random Forest,0.0,0.038,-0.063,-0.027
SVM,0.002,0.008,0.0,0.003
KNN,-0.028,-0.077,-0.012,-0.038
Gradient Boosting,-0.002,0.0,-0.012,-0.007



Percentage Change df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.24,-0.7,-0.71,-0.63
Random Forest,0.0,5.34,-10.81,-4.21
SVM,0.24,1.12,0.0,0.51
KNN,-3.52,-12.27,-2.48,-6.95
Gradient Boosting,-0.23,0.0,-1.93,-1.01


## Fun Experiment
For fun we will give it all the LLM generated columns from each LLM to see how well it does, we will compare it to the base model.

In [22]:
# Evaluate models with multiple metrics and print results
results_llm = evaluate_models_with_metrics(ml_models, X_train_llm.drop(['nlg'], axis=1), y_train_llm, X_test_llm.drop(['nlg'], axis=1), y_test_llm)
results_base = evaluate_models_with_metrics(ml_models, X_train, y_train, X_test, y_test)

# Set index of the data frames
results_llm = results_llm.set_index("Model")
results_base = results_base.set_index("Model")

# Compare llm results with base results
numerical_cols = results_llm.select_dtypes(include=['number']).columns
result_diff = round(results_llm[numerical_cols] - results_base[numerical_cols], 3)

# Compute percentage change
result_pct_change = round((result_diff / results_base[numerical_cols]) * 100, 2)

# Display results
print("Base df:")
display(results_base)
print("LLM df:")
display(results_llm)
print("Difference df:")
display(result_diff)
print("Percentage Change df:")
display(result_pct_change)

Base df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.841744,0.725097,0.553291,0.627649
Random Forest,0.847989,0.719033,0.606369,0.657913
SVM,0.845634,0.760615,0.524841,0.621106
KNN,0.829358,0.672518,0.569427,0.616693
Gradient Boosting,0.866414,0.792969,0.603397,0.685315


LLM df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.838,0.732323,0.570866,0.641593
Random Forest,0.837,0.763006,0.519685,0.618267
SVM,0.843,0.759358,0.559055,0.643991
KNN,0.809,0.622568,0.629921,0.626223
Gradient Boosting,0.847,0.751244,0.594488,0.663736


Difference df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.004,0.007,0.018,0.014
Random Forest,-0.011,0.044,-0.087,-0.04
SVM,-0.003,-0.001,0.034,0.023
KNN,-0.02,-0.05,0.06,0.01
Gradient Boosting,-0.019,-0.042,-0.009,-0.022


Percentage Change df:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.48,0.97,3.25,2.23
Random Forest,-1.3,6.12,-14.35,-6.08
SVM,-0.35,-0.13,6.48,3.7
KNN,-2.41,-7.43,10.54,1.62
Gradient Boosting,-2.19,-5.3,-1.49,-3.21
