# Imports

In [2]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

import ollama

from tqdm import tqdm
from collections import defaultdict

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from scipy.spatial.distance import pdist
from sklearn.decomposition import PCA

# Parameters

In [4]:
# List of models
ml_models = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('SVM', SVC(random_state=42)),
    ('KNN', KNeighborsClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42))
]

In [5]:
# Number of rows to synthesize
n_rows = 5000

In [6]:
# Different temperatures for LLM at which data will be synthesized
temperatures = [5, 10, 15]

In [7]:
# CSV file names
base_data_prep_name = "tabular_data_preprocessed_2025_04_03.csv"

In [8]:
# Force CUDA usage
os.environ["OLLAMA_BACKEND"] = "cuda"
os.environ["OLLAMA_NUM_THREADS"] = "16"

# Load Data

In [10]:
df = pd.read_csv(base_data_prep_name)

In [11]:
df.head()

Unnamed: 0,age,workclass,education_num,occupation,capital_gain,capital_loss,hours_per_week,native_country,income,relationship_not_in_family,...,marital_status_married_civ_spouse,marital_status_married_spouse_absent,marital_status_never_married,marital_status_separated,marital_status_widowed,race_asian_pac_islander,race_black,race_other,race_white,sex_male
0,0.025996,2.137359,1.136512,-1.31846,0.146932,-0.217127,-0.034087,0.289462,0,1.697524,...,-0.919604,-0.114128,1.424944,-0.179829,-0.1791,-0.179161,-0.325728,-0.091554,0.411743,0.70422
1,0.828308,1.454401,1.136512,-0.609318,-0.144804,-0.217127,-2.213032,0.289462,0,-0.589093,...,1.087425,-0.114128,-0.701782,-0.179829,-0.1791,-0.179161,-0.325728,-0.091554,0.411743,0.70422
2,-0.046942,0.088484,-0.419335,-0.136557,-0.144804,-0.217127,-0.034087,0.289462,0,1.697524,...,-0.919604,-0.114128,-0.701782,-0.179829,-0.1791,-0.179161,-0.325728,-0.091554,0.411743,0.70422
3,1.047121,0.088484,-1.197259,-0.136557,-0.144804,-0.217127,-0.034087,0.289462,0,-0.589093,...,1.087425,-0.114128,-0.701782,-0.179829,-0.1791,-0.179161,3.070047,-0.091554,-2.428701,0.70422
4,-0.776316,0.088484,1.136512,0.808965,-0.144804,-0.217127,-0.034087,-4.08338,0,-0.589093,...,1.087425,-0.114128,-0.701782,-0.179829,-0.1791,-0.179161,3.070047,-0.091554,-2.428701,-1.42001


### Proxy meassures for homogeneity of base data

In [13]:
def euclidean(df):
    distances = pdist(df, metric='euclidean')
    avg_distance = np.mean(distances)
    print("Average pairwise distance:", avg_distance)


def pca(df):
    pca = PCA(n_components=3)
    transformed = pca.fit_transform(df)
    explained_variance = pca.explained_variance_ratio_
    print("Explained variance ratio:", explained_variance)
    
    cumulative_variance = np.sum(pca.explained_variance_ratio_)
    print("Cumulative explained variance:", cumulative_variance)

In [14]:
euclidean(df)
pca(df)

Average pairwise distance: 6.391776639787022
Explained variance ratio: [0.12171479 0.08500064 0.07121668]
Cumulative explained variance: 0.2779321020922514


# Experimential setup

In [16]:
# Split the data into training test set
X_train, X_test, y_train, y_test = train_test_split(df.drop('income', axis=1), df['income'], test_size=0.2, random_state=42)

# Print the shapes of the sets
print(f"Training Set: X_train shape = {X_train.shape}, y_train shape = {y_train.shape}")
print(f"Test Set: X_test shape = {X_test.shape}, y_test shape = {y_test.shape}")

Training Set: X_train shape = (39073, 24), y_train shape = (39073,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)


# Modeling and Performance metrics

In [18]:
# Function to train and evaluate models with multiple metrics
def evaluate_models_with_metrics(models, X_train, y_train, X_test, y_test):
    results = []
    
    for name, model in models:
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict on test set
        y_test_pred = model.predict(X_test)

        # Store results for the model
        model_results = {
            'Model': name,
            'Test Accuracy': accuracy_score(y_test, y_test_pred),
            'Test Precision': precision_score(y_test, y_test_pred),
            'Test Recall': recall_score(y_test, y_test_pred),
            'Test F1-Score': f1_score(y_test, y_test_pred)
        }
        
        results.append(model_results)

    # Convert results to a pandas DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df

In [19]:
# Evaluate models with multiple metrics and print results
results_normal = evaluate_models_with_metrics(ml_models, X_train, y_train, X_test, y_test)
display(results_normal)

Unnamed: 0,Model,Test Accuracy,Test Precision,Test Recall,Test F1-Score
0,Logistic Regression,0.841744,0.725097,0.553291,0.627649
1,Random Forest,0.847989,0.719033,0.606369,0.657913
2,SVM,0.845634,0.760615,0.524841,0.621106
3,KNN,0.829358,0.672518,0.569427,0.616693
4,Gradient Boosting,0.866414,0.792969,0.603397,0.685315


# Data synthesis experiment

In the case where you have little data you can use LLMs to generate data (with controlled feature variability)

To prove that this works we can set up the following experiment
1. Train a model with 100 rows (small sample size)
2. Train another model with 200 rows where 100 rows from the existing set and then 100 rows generated from the LLM
3. Test first model on test set and validation set
4. Test second model on test set and validation set
5. Compare performance metrics

### Let's quickly make the base DF with our low sample size (100 samples)

In [23]:
# Split the data into training and test sets
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(df.drop('income', axis=1), df['income'], test_size=0.2, random_state=42)
X_train_base = X_train_base[0:99]
y_train_base = y_train_base[0:99]

# Print the shapes of the sets
print(f"Training Set: X_train shape = {X_train_base.shape}, y_train shape = {y_train_base.shape}")
print(f"Test Set: X_test shape = {X_test_base.shape}, y_test shape = {y_test_base.shape}")

base_results = []
  
for name, ml_model in ml_models:
    # Train the model
    ml_model.fit(X_train_base, y_train_base)
    
    # Predict on test set
    y_test_pred_base = ml_model.predict(X_test_base)

    # Store results for the model
    model_results = {
        'Model': name,
        'Test Accuracy': accuracy_score(y_test_base, y_test_pred_base),
        'Test Precision': precision_score(y_test_base, y_test_pred_base),
        'Test Recall': recall_score(y_test_base, y_test_pred_base),
        'Test F1-Score': f1_score(y_test_base, y_test_pred_base)
    }
    
    base_results.append(model_results)

# Convert results to a pandas DataFrame
base_results = pd.DataFrame(base_results).set_index("Model")
display(base_results)

Training Set: X_train shape = (99, 24), y_train shape = (99,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.810523,0.636957,0.497665,0.55876
Random Forest,0.813594,0.655413,0.478132,0.552909
SVM,0.80305,0.64551,0.405945,0.498436
KNN,0.791995,0.592338,0.439915,0.504873
Gradient Boosting,0.804893,0.603504,0.555839,0.578691


### Models that we will test

In [25]:
prompt = '''Generate ONE realistic data row where ALL fields align with the variable constraints and logically justify the Target (does the person earn more or less than 50k dollars). 
Return ONLY the pipe-separated row with NO additional text or explanations.

Mandatory Format (pipe-separated, exact order):
Age|Sex|Country|Marital-Status|Occupation|Workclass|Education|Race|Relationship|Hours-Weekly|Capital-Gain|Capital-Loss|Target

Variable Constraints (NO deviations allowed):
Age: Integer (17-90)
Sex: Male or Female
Country: Exact values from: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands
Marital-Status: Exact values from: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse
Occupation: Exact values from: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces
Workclass: Exact values from: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
Education: Exact values from: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
Race: Exact values from: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black
Relationship: Exact values from: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried
Hours-Weekly: Integer (1-99)
Capital-Gain: Non-negative integer (0-99999)
Capital-Loss: Non-negative integer (0-99999)
Target: >50K or <50K
'''

In [26]:
def validate_input(input_string):
    # Split the input string by pipe character
    fields = input_string.split('|')
    assert len(fields) == 13, "Input must have exactly 13 fields separated by |"
    
    # Unpack all fields
    age, sex, country, marital_status, occupation, workclass, education, race, relationship, hours_weekly, capital_gain, capital_loss, target = fields
    
    # Validate Age
    age = int(age)
    assert 17 <= age <= 90, "Age must be between 17 and 90"
    
    # Validate Sex
    assert sex in {'Male', 'Female'}, "Sex must be either Male or Female"
    
    # Validate Country
    valid_countries = {
        'United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany',
        'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece', 'China', 'Cuba',
        'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica', 'Vietnam',
        'Mexico', 'Portugal', 'Ireland', 'France', 'Dominican-Republic', 'Laos',
        'Ecuador', 'Haiti', 'Columbia', 'Hungary', 'Guatemala', 'Nicaragua',
        'Scotland', 'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago',
        'Peru', 'Hong', 'Holand-Netherlands'
    }
    assert country in valid_countries, f"Invalid country: {country}"
    
    # Validate Marital-Status
    valid_marital_statuses = {
        'Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed',
        'Married-spouse-absent', 'Married-AF-spouse'
    }
    assert marital_status in valid_marital_statuses, f"Invalid marital status: {marital_status}"
    
    # Validate Occupation
    valid_occupations = {
        'Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial',
        'Prof-specialty', 'Handlers-cleaners', 'Machine-op-inspct', 'Adm-clerical',
        'Farming-fishing', 'Transport-moving', 'Priv-house-serv', 'Protective-serv',
        'Armed-Forces'
    }
    assert occupation in valid_occupations, f"Invalid occupation: {occupation}"
    
    # Validate Workclass
    valid_workclasses = {
        'Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 'Local-gov',
        'State-gov', 'Without-pay', 'Never-worked'
    }
    assert workclass in valid_workclasses, f"Invalid workclass: {workclass}"
    
    # Validate Education
    valid_educations = {
        'Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school', 'Assoc-acdm',
        'Assoc-voc', '9th', '7th-8th', '12th', 'Masters', '1st-4th', '10th',
        'Doctorate', '5th-6th', 'Preschool'
    }
    assert education in valid_educations, f"Invalid education: {education}"
    
    # Validate Race
    valid_races = {
        'White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black'
    }
    assert race in valid_races, f"Invalid race: {race}"
    
    # Validate Relationship
    valid_relationships = {
        'Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried'
    }
    assert relationship in valid_relationships, f"Invalid relationship: {relationship}"
    
    # Validate Hours-Weekly
    hours_weekly = int(hours_weekly)
    assert 1 <= hours_weekly <= 99, "Hours per week must be between 1 and 99"
    
    # Validate Capital-Gain
    capital_gain = int(capital_gain)
    assert 0 <= capital_gain <= 99999, "Capital gain must be between 0 and 99999"
    
    # Validate Capital-Loss
    capital_loss = int(capital_loss)
    assert 0 <= capital_loss <= 99999, "Capital loss must be between 0 and 99999"
    
    # Validate Target
    assert target in {'>50K', '<50K'}, "Target must be either >50K or <50K"
    
    return True

### Demonstration that "dumb" models have too much error

In [28]:
models = ['llama3.2:1b', 'llama3.2:3b']

In [29]:
# Initialize a dictionary to store results and errors per model
results = {}
error_counts_per_model = defaultdict(lambda: defaultdict(int))

# Iterate through models
for index, model in enumerate(models):
    print("Processing Model: " + model + " (Model " + str(index + 1) + "/" + str(len(models)) + ")")
    results[model] = []
    retries = 0
    valid_samples = 0
    total_samples = 0

    # Loop through the n_rows instances for each model
    for _ in tqdm(range(n_rows), desc=f"Generating Samples for {model}", leave=True):
        total_samples += 1
        try:
            # Generate data
            response = ollama.generate(model=model, prompt=prompt)['response']

            # Check if the response is valid
            validate_input(response)

            # Store the valid response
            results[model].append(response)
            valid_samples += 1
        except Exception as e:
            # Store error per model
            error_counts_per_model[model][str(e)] += 1
            retries += 1

            # Stop early for this model if too many errors (>10% errors)
            if retries >= int(n_rows*0.1):
                print(f"Too many errors for model {model}, stopping early.")
                break  

    # Calculate error rate
    error_rate = (retries / total_samples) * 100
    print(f"Model {model} Error Rate: {error_rate:.2f}%")

    # Print error summary for the current model
    print(f"Error Distribution Summary for Model {model}:")
    for error_msg, count in error_counts_per_model[model].items():
        print(f"  {error_msg}: {count} occurrences")
    print()

Processing Model: llama3.2:1b (Model 1/2)


Generating Samples for llama3.2:1b:  10%|████████████████████▌                                                                                                                                                                                        | 501/5000 [06:04<54:33,  1.37it/s]


Too many errors for model llama3.2:1b, stopping early.
Model llama3.2:1b Error Rate: 99.60%
Error Distribution Summary for Model llama3.2:1b:
  Input must have exactly 13 fields separated by |: 356 occurrences
  Invalid workclass: Other-service: 3 occurrences
  Invalid workclass: Exec-managerial: 7 occurrences
  Age must be between 17 and 90: 56 occurrences
  invalid literal for int() with base 10: '$100000': 1 occurrences
  Sex must be either Male or Female: 16 occurrences
  invalid literal for int() with base 10: '0.00000+': 1 occurrences
  Capital loss must be between 0 and 99999: 17 occurrences
  Target must be either >50K or <50K: 5 occurrences
  Invalid workclass: Prof-specialty: 6 occurrences
  Invalid marital status: Marry-civ-spouse: 8 occurrences
  Invalid race: None: 1 occurrences
  Invalid education: Education:Bachelors: 6 occurrences
  Capital gain must be between 0 and 99999: 6 occurrences
  Invalid education: Education:Prof-specialty: 1 occurrences
  Invalid marital stat

Generating Samples for llama3.2:3b:  13%|██████████████████████████▍                                                                                                                                                                                | 650/5000 [13:20<1:29:18,  1.23s/it]

Too many errors for model llama3.2:3b, stopping early.
Model llama3.2:3b Error Rate: 76.80%
Error Distribution Summary for Model llama3.2:3b:
  Input must have exactly 13 fields separated by |: 187 occurrences
  Sex must be either Male or Female: 280 occurrences
  Invalid marital status: Maiden-head-of-house-hold: 1 occurrences
  Invalid marital status: Maiden-civ-spouse: 2 occurrences
  Age must be between 17 and 90: 7 occurrences
  Capital loss must be between 0 and 99999: 13 occurrences
  Invalid marital status: Maiden-Married-civ-spouse: 1 occurrences
  Capital gain must be between 0 and 99999: 1 occurrences
  Invalid occupation: Farmers-fishers-and-related-occupations: 1 occurrences
  Invalid marital status: Merged-civ-spouse: 1 occurrences
  Target must be either >50K or <50K: 3 occurrences
  Invalid relationship: Married-spouse-absent: 1 occurrences
  Invalid education: Private: 1 occurrences
  Invalid marital status: Maiden-head-of-household: 1 occurrences






### Hence we use smart enough models

In [31]:
models = ['llama3.1', 'mistral']

### Lets generate some data
We will use different temperature and use it as a paramter to see if we can get less homogeneous data

In [33]:
# Initialize a dictionary to store results and errors per model
results = {}
error_counts_per_model = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

# Iterate through models
for index, model in enumerate(models):
    print("Processing Model: " + model + " (Model " + str(index + 1) + "/" + str(len(models)) + ")")
    results[model] = {}

    # Loop through the different temperatures
    for temperature in temperatures:
        print(f"  Using Temperature: {temperature}")
        results[model][temperature] = []
        retries = 0
        valid_samples = 0
        total_samples = 0
        
        # Loop through the x instances for each temperature setting
        for _ in tqdm(range(n_rows), desc=f"Generating Samples for {model} with Temp {temperature}", leave=True):
            total_samples += 1
            try:
                # Generate data with the specified temperature
                response = ollama.generate(model=model, prompt=prompt, options={"temperature": temperature})['response']

                # Check if the response is valid
                validate_input(response)

                # Store the valid response
                results[model][temperature].append(response)
                valid_samples += 1
            except Exception as e:
                # Store error per model and temperature
                error_counts_per_model[model][temperature][str(e)] += 1
                retries += 1

        # Calculate error rate
        error_rate = (retries / total_samples) * 100
        print(f"  Model {model} at Temp {temperature} - Error Rate: {error_rate:.2f}%")

        # Print error summary for the current model and temperature
        print(f"  Error Distribution Summary for Model {model} at Temp {temperature}:")
        for error_msg, count in error_counts_per_model[model][temperature].items():
            print(f"    {error_msg}: {count} occurrences")    
        print()

Processing Model: llama3.1 (Model 1/2)
  Using Temperature: 5


Generating Samples for llama3.1 with Temp 5: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [6:21:19<00:00,  4.58s/it]


  Model llama3.1 at Temp 5 - Error Rate: 26.50%
  Error Distribution Summary for Model llama3.1 at Temp 5:
    Input must have exactly 13 fields separated by |: 251 occurrences
    Capital loss must be between 0 and 99999: 260 occurrences
    invalid literal for int() with base 10: '1231>': 1 occurrences
    Sex must be either Male or Female: 251 occurrences
    Invalid country: MEXICO: 83 occurrences
    invalid literal for int() with base 10: '': 33 occurrences
    invalid literal for int() with base 10: 'Age': 174 occurrences
    Invalid country: MEX: 60 occurrences
    invalid literal for int() with base 10: '1022>': 1 occurrences
    invalid literal for int() with base 10: '1507>': 2 occurrences
    invalid literal for int() with base 10: '1001>': 2 occurrences
    invalid literal for int() with base 10: '1021>': 1 occurrences
    Capital gain must be between 0 and 99999: 17 occurrences
    invalid literal for int() with base 10: '2304>': 1 occurrences
    invalid literal for int(

Generating Samples for llama3.1 with Temp 10: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [7:53:11<00:00,  5.68s/it]


  Model llama3.1 at Temp 10 - Error Rate: 27.28%
  Error Distribution Summary for Model llama3.1 at Temp 10:
    Capital loss must be between 0 and 99999: 295 occurrences
    invalid literal for int() with base 10: 'Age': 121 occurrences
    Sex must be either Male or Female: 215 occurrences
    Input must have exactly 13 fields separated by |: 253 occurrences
    Invalid country: MEX: 68 occurrences
    Capital gain must be between 0 and 99999: 22 occurrences
    invalid literal for int() with base 10: '': 44 occurrences
    Invalid country: MEXICAN-HISPANIC-SPANISH ORIG, SPC 1ORIGIN-MOREAN-CUBAN-BAYESIAN-DONUT SPARROWHEADS-SI AM-PRT-LTH-OTHER-CARA CAB ROLLERS-TURCO: 1 occurrences
    invalid literal for int() with base 10: '3204>': 2 occurrences
    Invalid country: MEXICOFemale : 1 occurrences
    Invalid country: MEXICO: 95 occurrences
    Invalid country: MEXICAN-HISPANIC: 1 occurrences
    invalid literal for int() with base 10: '3201>': 2 occurrences
    invalid literal for int(

Generating Samples for llama3.1 with Temp 15: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [7:18:44<00:00,  5.26s/it]


  Model llama3.1 at Temp 15 - Error Rate: 29.24%
  Error Distribution Summary for Model llama3.1 at Temp 15:
    invalid literal for int() with base 10: 'Age': 120 occurrences
    Invalid country: MEXICAN: 4 occurrences
    Input must have exactly 13 fields separated by |: 300 occurrences
    Capital loss must be between 0 and 99999: 298 occurrences
    Sex must be either Male or Female: 239 occurrences
    Invalid country: MEXICOFederal-gov: 1 occurrences
    invalid literal for int() with base 10: '1022>': 1 occurrences
    Invalid country: MEX: 107 occurrences
    invalid literal for int() with base 10: '1234>': 3 occurrences
    Invalid country: MEXICO: 94 occurrences
    Capital gain must be between 0 and 99999: 23 occurrences
    invalid literal for int() with base 10: '2407>': 1 occurrences
    invalid literal for int() with base 10: '': 47 occurrences
    invalid literal for int() with base 10: '2304>': 1 occurrences
    invalid literal for int() with base 10: '4201>': 2 occurr

Generating Samples for mistral with Temp 5: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [3:18:43<00:00,  2.38s/it]


  Model mistral at Temp 5 - Error Rate: 10.22%
  Error Distribution Summary for Model mistral at Temp 5:
    Invalid occupation: Exectutive: 15 occurrences
    Invalid relationship: Other: 23 occurrences
    Invalid occupation: Exectutive-managerial: 15 occurrences
    Invalid occupation: Execthinktank: 2 occurrences
    invalid literal for int() with base 10: '15000<': 1 occurrences
    Invalid marital status: Married-spouse: 21 occurrences
    Input must have exactly 13 fields separated by |: 211 occurrences
    Invalid occupation: Exectec-managerial: 30 occurrences
    Invalid occupation: Exect-managerial: 40 occurrences
    Invalid occupation: Executive: 16 occurrences
    Invalid occupation: Executive-managerial: 26 occurrences
    Invalid occupation: Execthys-managerial: 1 occurrences
    Invalid occupation: Exectemp-managerial: 13 occurrences
    Invalid education: Professional-school: 1 occurrences
    Invalid occupation: Exectechnicalmanagerial: 7 occurrences
    Invalid occup

Generating Samples for mistral with Temp 10: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [11:48:57<00:00,  8.51s/it]


  Model mistral at Temp 10 - Error Rate: 10.88%
  Error Distribution Summary for Model mistral at Temp 10:
    Input must have exactly 13 fields separated by |: 194 occurrences
    Invalid occupation: Exectemp-managerial: 18 occurrences
    Invalid occupation: Exectutive-managerial: 15 occurrences
    Invalid occupation: Executive: 24 occurrences
    Invalid occupation: Executive-managerial: 30 occurrences
    Invalid relationship: Other: 37 occurrences
    Invalid occupation: Execthespetiality: 1 occurrences
    Invalid marital status: Married-spouse: 21 occurrences
    Invalid occupation: Exectutive: 21 occurrences
    Invalid occupation: Execthic-managerial: 1 occurrences
    Invalid occupation: Execthicalspecialty: 1 occurrences
    Capital loss must be between 0 and 99999: 2 occurrences
    Invalid occupation: Exect-managerial: 40 occurrences
    Invalid occupation: Exectechnicalsupportmanager: 3 occurrences
    Invalid occupation: Exectechnical: 11 occurrences
    Invalid occupat

Generating Samples for mistral with Temp 15: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [5:44:32<00:00,  4.13s/it]

  Model mistral at Temp 15 - Error Rate: 12.00%
  Error Distribution Summary for Model mistral at Temp 15:
    Invalid relationship: Other: 52 occurrences
    Input must have exactly 13 fields separated by |: 225 occurrences
    invalid literal for int() with base 10: '15000<': 1 occurrences
    Invalid occupation: Exectec-managerial: 40 occurrences
    Invalid occupation: Executive: 26 occurrences
    Invalid occupation: Exectech-managerial: 24 occurrences
    Invalid occupation: Execthehelp-managerial: 1 occurrences
    Invalid occupation: Exectutive-managerial: 17 occurrences
    Invalid occupation: Exect-managerial: 37 occurrences
    Invalid occupation: Executive-managerial: 18 occurrences
    Invalid occupation: Exectechnicalsupport: 3 occurrences
    Invalid occupation: Exectemp-managerial: 13 occurrences
    invalid literal for int() with base 10: '30000<': 1 occurrences
    Invalid education: Prof-specialty: 16 occurrences
    Invalid occupation: Exectech-support: 6 occurrence




# Process and Evaluate results

In [35]:
def preprocess(df):
    # Correct column names
    df.columns = df.columns.str.lower().str.replace('-', '_')

    # Clean the income column by stripping spaces and removing periods
    df['target'] = df['target'].str.replace('.', '', regex=False).str.strip()
    
    # Encode '>50K' as 1 and '<=50K' as 0
    df['target'] = df['target'].map({'>50K': 1, '<50K': 0})

    # The dataset has native class for missing data: "?"
    df = df.fillna('?')

    # One-Hot Encoding for categorical variables
    df = pd.get_dummies(df, columns=['relationship', 'marital_status', 'race', 'sex'], drop_first=True)
    df.columns = df.columns.str.lower().str.replace('-', '_') # lower case and use _ instead of -
    
    # Integer Encoding for workclass, occupation, and native-country
    encoder = LabelEncoder()

    # col name missmatches
    df.rename(columns={'country': 'native_country', 'education': 'education_num', 'hours_weekly': 'hours_per_week'}, inplace=True)
    
    # Apply integer encoding
    df['workclass'] = encoder.fit_transform(df['workclass'])
    df['occupation'] = encoder.fit_transform(df['occupation'])
    df['native_country'] = encoder.fit_transform(df['native_country'])
    df['education_num'] = encoder.fit_transform(df['education_num'])
        
    # Convert boolean columns to integers
    df = df.astype(int)

    # HOT ENCODING ISSUE IF YOU MISS CERTAIN VALUES THE COLUMN WONT BE GENERATED
    # Get the columns in X_train_base that are missing in df
    missing_cols = set(X_train_base.columns) - set(df.columns)
    
    # Add the missing columns to df with 0 as the default value
    for col in missing_cols:
        df[col] = 0
    
    # REORDER COLUMNS
    ordered_columns = [col for col in X_train_base.columns if col != 'target']  
    
    # Add 'target' at the end
    ordered_columns += ['target']  
    
    # Reorder
    df = df[ordered_columns]
    
    # List of columns to scale (all columns except 'income')
    numeric_cols = df.columns[df.columns != 'target']
  
    # Initialize the scaler
    scaler = StandardScaler()
    
    # Scale the numerical features and update the DataFrame
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return df

In [36]:
# Define the column structure
columns = [
    "Age", "Sex", "Country", "Marital-Status", "Occupation", "Workclass",  "Education",
    "Race", "Relationship", "Hours-Weekly", "Capital-Gain", "Capital-Loss", "Target"
]

for model in models:
    for temp in temperatures:
        # Get the list of pipe-separated strings
        rows = results[model][temp]
        
        # Split each string into a list of values
        split_rows = [row.split('|') for row in rows]
        
        # Create DataFrame
        df = pd.DataFrame(split_rows, columns=columns)
        df = preprocess(df)

        # Split the data into X and y
        X_train = df.drop('target', axis=1)
        y_train = df['target']
        
        # Print the shapes of the sets
        print("Model: " + model + ", Temperature = " + str(temp))
        print(f"Training Set: X_train shape = {X_train.shape}, y_train shape = {y_train.shape}")
        print(f"Test Set: X_test shape = {X_test_base.shape}, y_test shape = {y_test_base.shape}") # We use the test data from base!
        print()
        
        temp_result = []
        for name, ml_model in ml_models:
            # Train the model
            ml_model.fit(X_train, y_train)
        
            # Predict on test set
            y_test_pred_base = ml_model.predict(X_test_base)

            # Store results for the model
            model_results = {
                'Model': name,
                'Test Accuracy': accuracy_score(y_test_base, y_test_pred_base),
                'Test Precision': precision_score(y_test_base, y_test_pred_base),
                'Test Recall': recall_score(y_test_base, y_test_pred_base),
                'Test F1-Score': f1_score(y_test_base, y_test_pred_base)
            }
        
            temp_result.append(model_results)

        # Convert results to a pandas DataFrame
        results_df = pd.DataFrame(temp_result).set_index("Model")

        # Calculate difference dfs (absolute and % difference)
        result_diff = results_df - base_results
        result_pct_diff = (result_diff / base_results) * 100

        print("Results Data Synthesis:")
        display(results_df)
        print("Absolute Difference with base:")
        display(result_diff)
        print("Percentage Difference with base (%):")
        display(result_pct_diff)
        print()
        euclidean(df)
        pca(df)
        print()

Model: llama3.1, Temperature = 5
Training Set: X_train shape = (3675, 24), y_train shape = (3675,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)

Results Data Synthesis:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.293582,0.237347,0.872187,0.373149
Random Forest,0.391545,0.241836,0.7138,0.361272
SVM,0.268605,0.241918,0.953291,0.385905
KNN,0.416931,0.225472,0.58259,0.325118
Gradient Boosting,0.26492,0.23766,0.928238,0.37843


Absolute Difference with base:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.516941,-0.39961,0.374522,-0.185611
Random Forest,-0.422049,-0.413578,0.235669,-0.191637
SVM,-0.534446,-0.403592,0.547346,-0.112531
KNN,-0.375064,-0.366866,0.142675,-0.179755
Gradient Boosting,-0.539973,-0.365844,0.372399,-0.200262


Percentage Difference with base (%):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-63.778732,-62.737348,75.255973,-33.218384
Random Forest,-51.874685,-63.101799,49.28952,-34.659766
SVM,-66.551944,-62.522938,134.832636,-22.576881
KNN,-47.356857,-61.935197,32.432432,-35.603945
Gradient Boosting,-67.086354,-60.619914,66.997708,-34.605935



Average pairwise distance: 5.395649650763941
Explained variance ratio: [0.22420605 0.09137142 0.07945485]
Cumulative explained variance: 0.39503231599062827

Model: llama3.1, Temperature = 10
Training Set: X_train shape = (3636, 24), y_train shape = (3636,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)

Results Data Synthesis:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.385198,0.242851,0.732059,0.364713
Random Forest,0.471389,0.247256,0.583439,0.347321
SVM,0.459617,0.271776,0.739278,0.397443
KNN,0.478248,0.247048,0.568577,0.344437
Gradient Boosting,0.406695,0.242015,0.68535,0.357713


Absolute Difference with base:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.425325,-0.394105,0.234395,-0.194047
Random Forest,-0.342205,-0.408158,0.105308,-0.205589
SVM,-0.343433,-0.373733,0.333333,-0.100993
KNN,-0.313748,-0.345291,0.128662,-0.160436
Gradient Boosting,-0.398198,-0.361489,0.129512,-0.220979


Percentage Difference with base (%):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-52.475373,-61.873205,47.098976,-34.728136
Random Forest,-42.060896,-62.274839,22.024867,-37.183104
SVM,-42.766093,-57.897392,82.112971,-20.261916
KNN,-39.614838,-58.29277,29.247104,-31.777477
Gradient Boosting,-49.472212,-59.898306,23.300229,-38.185922



Average pairwise distance: 5.403706230133437
Explained variance ratio: [0.23574284 0.08675399 0.07921573]
Cumulative explained variance: 0.401712562871683

Model: llama3.1, Temperature = 15
Training Set: X_train shape = (3538, 24), y_train shape = (3538,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)

Results Data Synthesis:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.492374,0.305672,0.869639,0.452347
Random Forest,0.47661,0.275553,0.718896,0.3984
SVM,0.339851,0.254144,0.898514,0.396218
KNN,0.457058,0.25231,0.637792,0.361579
Gradient Boosting,0.324701,0.238729,0.82293,0.370095


Absolute Difference with base:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.318149,-0.331285,0.371975,-0.106414
Random Forest,-0.336984,-0.37986,0.240764,-0.15451
SVM,-0.4632,-0.391366,0.492569,-0.102218
KNN,-0.334937,-0.340029,0.197877,-0.143294
Gradient Boosting,-0.480192,-0.364775,0.267091,-0.208597


Percentage Difference with base (%):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-39.252336,-52.010595,74.744027,-19.044595
Random Forest,-41.419225,-57.957308,50.35524,-27.94483
SVM,-57.680051,-60.629002,121.338912,-20.507811
KNN,-42.290293,-57.404462,44.980695,-28.382189
Gradient Boosting,-59.659163,-60.442883,48.051948,-36.046308



Average pairwise distance: 5.428985269476137
Explained variance ratio: [0.23204106 0.08782905 0.07876423]
Cumulative explained variance: 0.3986343308650695

Model: mistral, Temperature = 5
Training Set: X_train shape = (4489, 24), y_train shape = (4489,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results Data Synthesis:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.751459,0.152381,0.006794,0.013008
Random Forest,0.713891,0.127119,0.031847,0.050934
SVM,0.758931,0.0,0.0,0.0
KNN,0.654622,0.202569,0.147346,0.1706
Gradient Boosting,0.754632,0.243902,0.008493,0.016414


Absolute Difference with base:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.059064,-0.484576,-0.49087,-0.545752
Random Forest,-0.099703,-0.528295,-0.446285,-0.501976
SVM,-0.044119,-0.64551,-0.405945,-0.498436
KNN,-0.137373,-0.38977,-0.292569,-0.334273
Gradient Boosting,-0.050261,-0.359601,-0.547346,-0.562278


Percentage Difference with base (%):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-7.287194,-76.076711,-98.634812,-97.671966
Random Forest,-12.254655,-80.604811,-93.339254,-90.788041
SVM,-5.493945,-100.0,-100.0,-100.0
KNN,-17.345224,-65.801885,-66.505792,-66.209383
Gradient Boosting,-6.244436,-59.585608,-98.472116,-97.163666



Average pairwise distance: 5.940508158478192
Explained variance ratio: [0.14749869 0.07429649 0.0638824 ]
Cumulative explained variance: 0.2856775775177582

Model: mistral, Temperature = 10
Training Set: X_train shape = (4456, 24), y_train shape = (4456,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results Data Synthesis:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.753199,0.166667,0.005945,0.01148
Random Forest,0.751152,0.05814,0.002123,0.004097
SVM,0.758931,0.0,0.0,0.0
KNN,0.687583,0.179393,0.082803,0.113306
Gradient Boosting,0.758727,0.0,0.0,0.0


Absolute Difference with base:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.057324,-0.47029,-0.49172,-0.54728
Random Forest,-0.062442,-0.597274,-0.476008,-0.548813
SVM,-0.044119,-0.64551,-0.405945,-0.498436
KNN,-0.104412,-0.412946,-0.357113,-0.391567
Gradient Boosting,-0.046166,-0.603504,-0.555839,-0.578691


Percentage Difference with base (%):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-7.072493,-73.833902,-98.805461,-97.945432
Random Forest,-7.674887,-91.129332,-99.55595,-99.259068
SVM,-5.493945,-100.0,-100.0,-100.0
KNN,-13.183404,-69.714474,-81.177606,-77.557494
Gradient Boosting,-5.735724,-100.0,-100.0,-100.0



Average pairwise distance: 6.042585665656159
Explained variance ratio: [0.14922874 0.07005927 0.06487497]
Cumulative explained variance: 0.2841629725394981

Model: mistral, Temperature = 15
Training Set: X_train shape = (4400, 24), y_train shape = (4400,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results Data Synthesis:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.751868,0.210084,0.010616,0.02021
Random Forest,0.748797,0.143885,0.008493,0.016038
SVM,0.758931,0.0,0.0,0.0
KNN,0.717781,0.27567,0.104883,0.151953
Gradient Boosting,0.751356,0.13,0.00552,0.010591


Absolute Difference with base:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.058655,-0.426872,-0.487049,-0.53855
Random Forest,-0.064797,-0.511528,-0.469639,-0.536871
SVM,-0.044119,-0.64551,-0.405945,-0.498436
KNN,-0.074214,-0.316669,-0.335032,-0.35292
Gradient Boosting,-0.053537,-0.473504,-0.550318,-0.568101


Percentage Difference with base (%):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-7.236676,-67.017524,-97.866894,-96.383032
Random Forest,-7.964268,-78.046692,-98.223801,-97.099255
SVM,-5.493945,-100.0,-100.0,-100.0
KNN,-9.370557,-53.460791,-76.158301,-69.902697
Gradient Boosting,-6.651405,-78.459129,-99.006875,-98.1699



Average pairwise distance: 6.076252105833049
Explained variance ratio: [0.15446225 0.07023303 0.06447521]
Cumulative explained variance: 0.28917048568284953



In [64]:
# Get unique classes and counts
unique_classes, counts = np.unique(y_test, return_counts=True)
class_balance = dict(zip(unique_classes, counts))

print("Class Balance (Counts):", class_balance)
print("Class Balance (%):", {cls: count / len(y_test) * 100 for cls, count in class_balance.items()})

Class Balance (Counts): {0: 7414, 1: 2355}
Class Balance (%): {0: 75.89313133381104, 1: 24.106868666188966}
