# Imports

In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

import ollama

from tqdm import tqdm
from collections import defaultdict

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from scipy.spatial.distance import pdist
from sklearn.decomposition import PCA

# Parameters

In [2]:
# List of models
ml_models = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('SVM', SVC(random_state=42)),
    ('KNN', KNeighborsClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42))
]

In [3]:
# Number of rows to synthesize
n_rows = 5000

In [4]:
# Different temperatures for LLM at which data will be synthesized
temperatures = [5, 10, 15]

In [5]:
# CSV file names
base_data_prep_name = "tabular_data_preprocessed_2025_04_03.csv"

In [6]:
# Force CUDA usage
os.environ["OLLAMA_BACKEND"] = "cuda"
os.environ["OLLAMA_NUM_THREADS"] = "16"

# Load Data

In [7]:
df = pd.read_csv(base_data_prep_name)

In [8]:
df.head()

Unnamed: 0,age,workclass,education_num,occupation,capital_gain,capital_loss,hours_per_week,native_country,income,relationship_not_in_family,...,marital_status_married_civ_spouse,marital_status_married_spouse_absent,marital_status_never_married,marital_status_separated,marital_status_widowed,race_asian_pac_islander,race_black,race_other,race_white,sex_male
0,0.025996,2.137359,1.136512,-1.31846,0.146932,-0.217127,-0.034087,0.289462,0,1.697524,...,-0.919604,-0.114128,1.424944,-0.179829,-0.1791,-0.179161,-0.325728,-0.091554,0.411743,0.70422
1,0.828308,1.454401,1.136512,-0.609318,-0.144804,-0.217127,-2.213032,0.289462,0,-0.589093,...,1.087425,-0.114128,-0.701782,-0.179829,-0.1791,-0.179161,-0.325728,-0.091554,0.411743,0.70422
2,-0.046942,0.088484,-0.419335,-0.136557,-0.144804,-0.217127,-0.034087,0.289462,0,1.697524,...,-0.919604,-0.114128,-0.701782,-0.179829,-0.1791,-0.179161,-0.325728,-0.091554,0.411743,0.70422
3,1.047121,0.088484,-1.197259,-0.136557,-0.144804,-0.217127,-0.034087,0.289462,0,-0.589093,...,1.087425,-0.114128,-0.701782,-0.179829,-0.1791,-0.179161,3.070047,-0.091554,-2.428701,0.70422
4,-0.776316,0.088484,1.136512,0.808965,-0.144804,-0.217127,-0.034087,-4.08338,0,-0.589093,...,1.087425,-0.114128,-0.701782,-0.179829,-0.1791,-0.179161,3.070047,-0.091554,-2.428701,-1.42001


### Proxy meassures for homogeneity of base data

In [9]:
def euclidean(df):
    distances = pdist(df, metric='euclidean')
    avg_distance = np.mean(distances)
    print("Average pairwise distance:", avg_distance)


def pca(df):
    pca = PCA(n_components=3)
    transformed = pca.fit_transform(df)
    explained_variance = pca.explained_variance_ratio_
    print("Explained variance ratio:", explained_variance)
    
    cumulative_variance = np.sum(pca.explained_variance_ratio_)
    print("Cumulative explained variance:", cumulative_variance)

In [10]:
euclidean(df)
pca(df)

Average pairwise distance: 6.391776639787022
Explained variance ratio: [0.12171943 0.08500747 0.07145037]
Cumulative explained variance: 0.27817726369402135


# Experimential setup

In [11]:
# Split the data into training test set
X_train, X_test, y_train, y_test = train_test_split(df.drop('income', axis=1), df['income'], test_size=0.2, random_state=42)

# Print the shapes of the sets
print(f"Training Set: X_train shape = {X_train.shape}, y_train shape = {y_train.shape}")
print(f"Test Set: X_test shape = {X_test.shape}, y_test shape = {y_test.shape}")

Training Set: X_train shape = (39073, 24), y_train shape = (39073,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)


# Modeling and Performance metrics

In [12]:
# Function to train and evaluate models with multiple metrics
def evaluate_models_with_metrics(models, X_train, y_train, X_test, y_test):
    results = []
    
    for name, model in models:
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict on test set
        y_test_pred = model.predict(X_test)

        # Store results for the model
        model_results = {
            'Model': name,
            'Test Accuracy': accuracy_score(y_test, y_test_pred),
            'Test Precision': precision_score(y_test, y_test_pred),
            'Test Recall': recall_score(y_test, y_test_pred),
            'Test F1-Score': f1_score(y_test, y_test_pred)
        }
        
        results.append(model_results)

    # Convert results to a pandas DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df

In [13]:
# Evaluate models with multiple metrics and print results
results_normal = evaluate_models_with_metrics(ml_models, X_train, y_train, X_test, y_test)
display(results_normal)

Unnamed: 0,Model,Test Accuracy,Test Precision,Test Recall,Test F1-Score
0,Logistic Regression,0.841744,0.725097,0.553291,0.627649
1,Random Forest,0.847989,0.719033,0.606369,0.657913
2,SVM,0.845634,0.760615,0.524841,0.621106
3,KNN,0.829358,0.672518,0.569427,0.616693
4,Gradient Boosting,0.866414,0.792969,0.603397,0.685315


# Data synthesis experiment

In the case where you have little data you can use LLMs to generate data (with controlled feature variability)

To prove that this works we can set up the following experiment
1. Train a model with 100 rows (small sample size)
2. Train another model with 200 rows where 100 rows from the existing set and then 100 rows generated from the LLM
3. Test first model on test set and validation set
4. Test second model on test set and validation set
5. Compare performance metrics

### Let's quickly make the base DF with our low sample size (100 samples)

In [14]:
# Split the data into training and test sets
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(df.drop('income', axis=1), df['income'], test_size=0.2, random_state=42)
X_train_base = X_train_base[0:99]
y_train_base = y_train_base[0:99]

# Print the shapes of the sets
print(f"Training Set: X_train shape = {X_train_base.shape}, y_train shape = {y_train_base.shape}")
print(f"Test Set: X_test shape = {X_test_base.shape}, y_test shape = {y_test_base.shape}")

base_results = []
  
for name, ml_model in ml_models:
    # Train the model
    ml_model.fit(X_train_base, y_train_base)
    
    # Predict on test set
    y_test_pred_base = ml_model.predict(X_test_base)

    # Store results for the model
    model_results = {
        'Model': name,
        'Test Accuracy': accuracy_score(y_test_base, y_test_pred_base),
        'Test Precision': precision_score(y_test_base, y_test_pred_base),
        'Test Recall': recall_score(y_test_base, y_test_pred_base),
        'Test F1-Score': f1_score(y_test_base, y_test_pred_base)
    }
    
    base_results.append(model_results)

# Convert results to a pandas DataFrame
base_results = pd.DataFrame(base_results).set_index("Model")
display(base_results)

Training Set: X_train shape = (99, 24), y_train shape = (99,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.810523,0.636957,0.497665,0.55876
Random Forest,0.813594,0.655413,0.478132,0.552909
SVM,0.80305,0.64551,0.405945,0.498436
KNN,0.791995,0.592338,0.439915,0.504873
Gradient Boosting,0.804893,0.603504,0.555839,0.578691


### Models that we will test

In [15]:
prompt = '''Generate ONE realistic data row where ALL fields align with the variable constraints and logically justify the Target (does the person earn more or less than 50k dollars). 
Return ONLY the pipe-separated row with NO additional text or explanations.

Mandatory Format (pipe-separated, exact order):
Age|Sex|Country|Marital-Status|Occupation|Workclass|Education|Race|Relationship|Hours-Weekly|Capital-Gain|Capital-Loss|Target

Variable Constraints (NO deviations allowed):
Age: Integer (17-90)
Sex: Male or Female
Country: Exact values from: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands
Marital-Status: Exact values from: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse
Occupation: Exact values from: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces
Workclass: Exact values from: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
Education: Exact values from: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
Race: Exact values from: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black
Relationship: Exact values from: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried
Hours-Weekly: Integer (1-99)
Capital-Gain: Non-negative integer (0-99999)
Capital-Loss: Non-negative integer (0-99999)
Target: >50K or <50K
'''

In [16]:
def validate_input(input_string):
    # Split the input string by pipe character
    fields = input_string.split('|')
    assert len(fields) == 13, "Input must have exactly 13 fields separated by |"
    
    # Unpack all fields
    age, sex, country, marital_status, occupation, workclass, education, race, relationship, hours_weekly, capital_gain, capital_loss, target = fields
    
    # Validate Age
    age = int(age)
    assert 17 <= age <= 90, "Age must be between 17 and 90"
    
    # Validate Sex
    assert sex in {'Male', 'Female'}, "Sex must be either Male or Female"
    
    # Validate Country
    valid_countries = {
        'United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany',
        'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece', 'China', 'Cuba',
        'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica', 'Vietnam',
        'Mexico', 'Portugal', 'Ireland', 'France', 'Dominican-Republic', 'Laos',
        'Ecuador', 'Haiti', 'Columbia', 'Hungary', 'Guatemala', 'Nicaragua',
        'Scotland', 'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago',
        'Peru', 'Hong', 'Holand-Netherlands'
    }
    assert country in valid_countries, f"Invalid country: {country}"
    
    # Validate Marital-Status
    valid_marital_statuses = {
        'Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed',
        'Married-spouse-absent', 'Married-AF-spouse'
    }
    assert marital_status in valid_marital_statuses, f"Invalid marital status: {marital_status}"
    
    # Validate Occupation
    valid_occupations = {
        'Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial',
        'Prof-specialty', 'Handlers-cleaners', 'Machine-op-inspct', 'Adm-clerical',
        'Farming-fishing', 'Transport-moving', 'Priv-house-serv', 'Protective-serv',
        'Armed-Forces'
    }
    assert occupation in valid_occupations, f"Invalid occupation: {occupation}"
    
    # Validate Workclass
    valid_workclasses = {
        'Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 'Local-gov',
        'State-gov', 'Without-pay', 'Never-worked'
    }
    assert workclass in valid_workclasses, f"Invalid workclass: {workclass}"
    
    # Validate Education
    valid_educations = {
        'Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school', 'Assoc-acdm',
        'Assoc-voc', '9th', '7th-8th', '12th', 'Masters', '1st-4th', '10th',
        'Doctorate', '5th-6th', 'Preschool'
    }
    assert education in valid_educations, f"Invalid education: {education}"
    
    # Validate Race
    valid_races = {
        'White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black'
    }
    assert race in valid_races, f"Invalid race: {race}"
    
    # Validate Relationship
    valid_relationships = {
        'Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried'
    }
    assert relationship in valid_relationships, f"Invalid relationship: {relationship}"
    
    # Validate Hours-Weekly
    hours_weekly = int(hours_weekly)
    assert 1 <= hours_weekly <= 99, "Hours per week must be between 1 and 99"
    
    # Validate Capital-Gain
    capital_gain = int(capital_gain)
    assert 0 <= capital_gain <= 99999, "Capital gain must be between 0 and 99999"
    
    # Validate Capital-Loss
    capital_loss = int(capital_loss)
    assert 0 <= capital_loss <= 99999, "Capital loss must be between 0 and 99999"
    
    # Validate Target
    assert target in {'>50K', '<50K'}, "Target must be either >50K or <50K"
    
    return True

### Demonstration that "dumb" models have too much error

In [17]:
models = ['llama3.2:1b', 'llama3.2:3b']

In [18]:
# Initialize a dictionary to store results and errors per model
results = {}
error_counts_per_model = defaultdict(lambda: defaultdict(int))

# Iterate through models
for index, model in enumerate(models):
    print("Processing Model: " + model + " (Model " + str(index + 1) + "/" + str(len(models)) + ")")
    results[model] = []
    retries = 0
    valid_samples = 0
    total_samples = 0

    # Loop through the n_rows instances for each model
    for _ in tqdm(range(n_rows), desc=f"Generating Samples for {model}", leave=True):
        total_samples += 1
        try:
            # Generate data
            response = ollama.generate(model=model, prompt=prompt)['response']

            # Check if the response is valid
            validate_input(response)

            # Store the valid response
            results[model].append(response)
            valid_samples += 1
        except Exception as e:
            # Store error per model
            error_counts_per_model[model][str(e)] += 1
            retries += 1

            # Stop early for this model if too many errors (>10% errors)
            if retries >= int(n_rows*0.1):
                print(f"Too many errors for model {model}, stopping early.")
                break  

    # Calculate error rate
    error_rate = (retries / total_samples) * 100
    print(f"Model {model} Error Rate: {error_rate:.2f}%")

    # Print error summary for the current model
    print(f"Error Distribution Summary for Model {model}:")
    for error_msg, count in error_counts_per_model[model].items():
        print(f"  {error_msg}: {count} occurrences")
    print()

Processing Model: llama3.2:1b (Model 1/2)


Generating Samples for llama3.2:1b:  10%|████████████████████▍                                                                                                                                                                                      | 503/5000 [08:04<1:12:14,  1.04it/s]


Too many errors for model llama3.2:1b, stopping early.
Model llama3.2:1b Error Rate: 99.21%
Error Distribution Summary for Model llama3.2:1b:
  Input must have exactly 13 fields separated by |: 356 occurrences
  Sex must be either Male or Female: 24 occurrences
  Age must be between 17 and 90: 48 occurrences
  Invalid education: Education:Bachelors: 10 occurrences
  invalid literal for int() with base 10: 'Bachelors': 1 occurrences
  invalid literal for int() with base 10: '50000+': 1 occurrences
  Invalid education: Education=Bachelors: 2 occurrences
  Invalid education: Bachelor's: 1 occurrences
  Invalid marital status: MARRIED-civ-spouse: 2 occurrences
  Capital loss must be between 0 and 99999: 11 occurrences
  Invalid workclass: Exec-managerial: 4 occurrences
  Capital gain must be between 0 and 99999: 3 occurrences
  invalid literal for int() with base 10: 'M14': 1 occurrences
  Invalid marital status: Marr- Civ-Spouse: 1 occurrences
  Invalid marital status: Marry-civ-spouse: 7

Generating Samples for llama3.2:3b:  13%|███████████████████████████▏                                                                                                                                                                               | 670/5000 [14:24<1:33:04,  1.29s/it]

Too many errors for model llama3.2:3b, stopping early.
Model llama3.2:3b Error Rate: 74.52%
Error Distribution Summary for Model llama3.2:3b:
  Input must have exactly 13 fields separated by |: 198 occurrences
  Capital loss must be between 0 and 99999: 10 occurrences
  Sex must be either Male or Female: 268 occurrences
  Target must be either >50K or <50K: 3 occurrences
  invalid literal for int() with base 10: '8.0': 1 occurrences
  Invalid relationship: Married-spouse-absent: 1 occurrences
  Invalid marital status: Maiden-civ-spouse: 3 occurrences
  Invalid marital status: Merged-civ-spouse: 1 occurrences
  Age must be between 17 and 90: 6 occurrences
  Invalid education: Adm-clerical: 2 occurrences
  Invalid occupation: Farmers-and-fishermen: 1 occurrences
  Invalid occupation: Farmers-fishers-and-related-occupations: 1 occurrences
  Hours per week must be between 1 and 99: 1 occurrences
  Invalid relationship: Married-civ-spouse: 1 occurrences
  Invalid marital status: Marry-civil




### Hence we use smart enough models

In [19]:
models = ['llama3.1', 'mistral']

### Lets generate some data
We will use different temperature and use it as a paramter to see if we can get less homogeneous data

In [20]:
# Initialize a dictionary to store results and errors per model
results = {}
error_counts_per_model = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

# Iterate through models
for index, model in enumerate(models):
    print("Processing Model: " + model + " (Model " + str(index + 1) + "/" + str(len(models)) + ")")
    results[model] = {}

    # Loop through the different temperatures
    for temperature in temperatures:
        print(f"  Using Temperature: {temperature}")
        results[model][temperature] = []
        retries = 0
        valid_samples = 0
        total_samples = 0
        pbar = tqdm(total=n_rows, desc=f"Generating Samples for {model} with Temp {temperature}", leave=True)

        # Loop through the x instances for each temperature setting
        while valid_samples < n_rows:            
            total_samples += 1
            
            try:
                # Generate data with the specified temperature
                response = ollama.generate(model=model, prompt=prompt, options={"temperature": temperature})['response']
                
                # Check if the response is valid
                validate_input(response)

                # Store the valid response
                results[model][temperature].append(response)
                valid_samples += 1
                pbar.update(1) # Update tqdm
            except Exception as e:
                # Store error per model and temperature
                error_counts_per_model[model][temperature][str(e)] += 1
                retries += 1

        # Close tqdm
        pbar.close()
        
        # Calculate error rate
        error_rate = (retries / total_samples) * 100
        print(f"  Model {model} at Temp {temperature} - Error Rate: {error_rate:.2f}%")

        # Print error summary for the current model and temperature
        print(f"  Error Distribution Summary for Model {model} at Temp {temperature}:")
        for error_msg, count in error_counts_per_model[model][temperature].items():
            print(f"    {error_msg}: {count} occurrences")    
        print()

Processing Model: llama3.1 (Model 1/2)
  Using Temperature: 5


Generating Samples for llama3.1 with Temp 5: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [8:16:57<00:00,  5.96s/it]


  Model llama3.1 at Temp 5 - Error Rate: 24.35%
  Error Distribution Summary for Model llama3.1 at Temp 5:
    Invalid country: MEXICO: 86 occurrences
    invalid literal for int() with base 10: 'Age': 214 occurrences
    Sex must be either Male or Female: 258 occurrences
    Capital loss must be between 0 and 99999: 359 occurrences
    Input must have exactly 13 fields separated by |: 297 occurrences
    invalid literal for int() with base 10: '3203>': 2 occurrences
    invalid literal for int() with base 10: '30-39': 1 occurrences
    Invalid country: MEXICCO: 7 occurrences
    Invalid country: MEXICAN-FEMALES: 1 occurrences
    invalid literal for int() with base 10: '-2143>': 1 occurrences
    invalid literal for int() with base 10: '4219>': 1 occurrences
    Invalid country: MEX: 74 occurrences
    invalid literal for int() with base 10: '4207>': 2 occurrences
    invalid literal for int() with base 10: '3007>': 1 occurrences
    invalid literal for int() with base 10: '3432>': 1 

Generating Samples for llama3.1 with Temp 10: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [8:40:49<00:00,  6.25s/it]


  Model llama3.1 at Temp 10 - Error Rate: 28.14%
  Error Distribution Summary for Model llama3.1 at Temp 10:
    Invalid country: MEX: 89 occurrences
    Invalid country: MEXICO: 141 occurrences
    Sex must be either Male or Female: 331 occurrences
    invalid literal for int() with base 10: 'Age': 205 occurrences
    invalid literal for int() with base 10: '': 69 occurrences
    Input must have exactly 13 fields separated by |: 367 occurrences
    invalid literal for int() with base 10: '5434>': 1 occurrences
    Capital loss must be between 0 and 99999: 399 occurrences
    invalid literal for int() with base 10: '1234>': 3 occurrences
    invalid literal for int() with base 10: '3213>': 1 occurrences
    invalid literal for int() with base 10: '2271>': 1 occurrences
    invalid literal for int() with base 10: '1503>': 1 occurrences
    Invalid country: MEXICOCountry: 1 occurrences
    Target must be either >50K or <50K: 35 occurrences
    invalid literal for int() with base 10: '143

Generating Samples for llama3.1 with Temp 15: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [8:42:38<00:00,  6.27s/it]


  Model llama3.1 at Temp 15 - Error Rate: 28.67%
  Error Distribution Summary for Model llama3.1 at Temp 15:
    Invalid country: MEXICCO: 19 occurrences
    Invalid country: MEXICO: 142 occurrences
    Capital loss must be between 0 and 99999: 378 occurrences
    Input must have exactly 13 fields separated by |: 339 occurrences
    invalid literal for int() with base 10: '2109>': 4 occurrences
    invalid literal for int() with base 10: '-1106>': 1 occurrences
    Sex must be either Male or Female: 364 occurrences
    invalid literal for int() with base 10: '1507>': 1 occurrences
    invalid literal for int() with base 10: '2102>': 1 occurrences
    Invalid country: MEX: 134 occurrences
    invalid literal for int() with base 10: '1089>': 1 occurrences
    invalid literal for int() with base 10: '2305>': 1 occurrences
    invalid literal for int() with base 10: 'Age': 198 occurrences
    invalid literal for int() with base 10: '2101>': 2 occurrences
    invalid literal for int() with 

Generating Samples for mistral with Temp 5: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [3:50:37<00:00,  2.77s/it]


  Model mistral at Temp 5 - Error Rate: 10.51%
  Error Distribution Summary for Model mistral at Temp 5:
    Invalid occupation: Exectechnicalanalyst: 2 occurrences
    Invalid occupation: Exect-managerial: 58 occurrences
    Input must have exactly 13 fields separated by |: 222 occurrences
    Invalid occupation: Exectutive: 11 occurrences
    Invalid relationship: Other: 36 occurrences
    Invalid occupation: Exectempmanagerial: 17 occurrences
    Invalid occupation: Exectec-managerial: 31 occurrences
    Invalid occupation: Exectemp-managerial: 17 occurrences
    Invalid occupation: Execthroughmanagement: 1 occurrences
    Invalid occupation: Exectech-support: 7 occurrences
    Invalid occupation: Exectechnicalworker: 2 occurrences
    Invalid occupation: Exectechnicalsupportanalyst: 2 occurrences
    Invalid occupation: Exectechnicalspecialist: 4 occurrences
    Invalid occupation: Exectechnicalspecialty: 4 occurrences
    Invalid occupation: Exectech-managerial: 28 occurrences
   

Generating Samples for mistral with Temp 10: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [3:53:17<00:00,  2.80s/it]


  Model mistral at Temp 10 - Error Rate: 11.60%
  Error Distribution Summary for Model mistral at Temp 10:
    Input must have exactly 13 fields separated by |: 256 occurrences
    invalid literal for int() with base 10: '2000<': 1 occurrences
    Invalid occupation: Exectutive: 22 occurrences
    Invalid relationship: Other: 44 occurrences
    Capital loss must be between 0 and 99999: 2 occurrences
    Invalid occupation: Executive-management: 13 occurrences
    Invalid occupation: Exectechnicalmanagerial: 7 occurrences
    Invalid occupation: Execthe-managerial: 1 occurrences
    Invalid occupation: Exectechnicalsupportanalyst: 3 occurrences
    Invalid occupation: Exectech-managerial: 30 occurrences
    Invalid occupation: Exectechnical: 8 occurrences
    Invalid occupation: Exect-managerial: 47 occurrences
    Invalid occupation: Exectutive-managerial: 31 occurrences
    Invalid marital status: Married-spouse: 19 occurrences
    Invalid education: Prof-specialty: 9 occurrences
    

Generating Samples for mistral with Temp 15: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [3:56:08<00:00,  2.83s/it]

  Model mistral at Temp 15 - Error Rate: 12.08%
  Error Distribution Summary for Model mistral at Temp 15:
    Input must have exactly 13 fields separated by |: 235 occurrences
    Invalid occupation: Exectech-support: 6 occurrences
    Invalid marital status: Married-spouse: 25 occurrences
    Invalid occupation: Executive: 32 occurrences
    Invalid occupation: Exectech-managerial: 30 occurrences
    Invalid occupation: Execthical-management: 1 occurrences
    Invalid occupation: Executive-managerial: 40 occurrences
    Invalid occupation: Exect-managerial: 62 occurrences
    Invalid relationship: Other: 44 occurrences
    Invalid occupation: Exectec-managerial: 37 occurrences
    Invalid occupation: Exectechnicalsupportanalyst: 5 occurrences
    Invalid occupation: Exectechnicalanalyst: 4 occurrences
    Invalid occupation: Exectechnicalsupportmanager: 2 occurrences
    Invalid occupation: Execthreader: 1 occurrences
    Invalid occupation: Exectutive-managerial: 14 occurrences
    




# Process and Evaluate results

In [21]:
def preprocess(df):
    # Correct column names
    df.columns = df.columns.str.lower().str.replace('-', '_')

    # Clean the income column by stripping spaces and removing periods
    df['target'] = df['target'].str.replace('.', '', regex=False).str.strip()
    
    # Encode '>50K' as 1 and '<=50K' as 0
    df['target'] = df['target'].map({'>50K': 1, '<50K': 0})

    # The dataset has native class for missing data: "?"
    df = df.fillna('?')

    # One-Hot Encoding for categorical variables
    df = pd.get_dummies(df, columns=['relationship', 'marital_status', 'race', 'sex'], drop_first=True)
    df.columns = df.columns.str.lower().str.replace('-', '_') # lower case and use _ instead of -
    
    # Integer Encoding for workclass, occupation, and native-country
    encoder = LabelEncoder()

    # col name missmatches
    df.rename(columns={'country': 'native_country', 'education': 'education_num', 'hours_weekly': 'hours_per_week'}, inplace=True)
    
    # Apply integer encoding
    df['workclass'] = encoder.fit_transform(df['workclass'])
    df['occupation'] = encoder.fit_transform(df['occupation'])
    df['native_country'] = encoder.fit_transform(df['native_country'])
    df['education_num'] = encoder.fit_transform(df['education_num'])
        
    # Convert boolean columns to integers
    df = df.astype(int)

    # HOT ENCODING ISSUE IF YOU MISS CERTAIN VALUES THE COLUMN WONT BE GENERATED
    # Get the columns in X_train_base that are missing in df
    missing_cols = set(X_train_base.columns) - set(df.columns)
    
    # Add the missing columns to df with 0 as the default value
    for col in missing_cols:
        df[col] = 0
    
    # REORDER COLUMNS
    ordered_columns = [col for col in X_train_base.columns if col != 'target']  
    
    # Add 'target' at the end
    ordered_columns += ['target']  
    
    # Reorder
    df = df[ordered_columns]
    
    # List of columns to scale (all columns except 'income')
    numeric_cols = df.columns[df.columns != 'target']
  
    # Initialize the scaler
    scaler = StandardScaler()
    
    # Scale the numerical features and update the DataFrame
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return df

In [22]:
# Define the column structure
columns = [
    "Age", "Sex", "Country", "Marital-Status", "Occupation", "Workclass",  "Education",
    "Race", "Relationship", "Hours-Weekly", "Capital-Gain", "Capital-Loss", "Target"
]

for model in models:
    for temp in temperatures:
        # Get the list of pipe-separated strings
        rows = results[model][temp]
        
        # Split each string into a list of values
        split_rows = [row.split('|') for row in rows]
        
        # Create DataFrame
        df = pd.DataFrame(split_rows, columns=columns)
        df = preprocess(df)

        # Split the data into X and y
        X_train = df.drop('target', axis=1)
        y_train = df['target']
        
        # Print the shapes of the sets
        print("Model: " + model + ", Temperature = " + str(temp))
        print(f"Training Set: X_train shape = {X_train.shape}, y_train shape = {y_train.shape}")
        print(f"Test Set: X_test shape = {X_test_base.shape}, y_test shape = {y_test_base.shape}") # We use the test data from base!
        print()
        
        temp_result = []
        for name, ml_model in ml_models:
            # Train the model
            ml_model.fit(X_train, y_train)
        
            # Predict on test set
            y_test_pred_base = ml_model.predict(X_test_base)

            # Store results for the model
            model_results = {
                'Model': name,
                'Test Accuracy': accuracy_score(y_test_base, y_test_pred_base),
                'Test Precision': precision_score(y_test_base, y_test_pred_base),
                'Test Recall': recall_score(y_test_base, y_test_pred_base),
                'Test F1-Score': f1_score(y_test_base, y_test_pred_base)
            }
        
            temp_result.append(model_results)

        # Convert results to a pandas DataFrame
        results_df = pd.DataFrame(temp_result).set_index("Model")

        # Calculate difference dfs (absolute and % difference)
        result_diff = results_df - base_results
        result_pct_diff = (result_diff / base_results) * 100

        print("Results Data Synthesis:")
        display(results_df)
        print("Absolute Difference with base:")
        display(result_diff)
        print("Percentage Difference with base (%):")
        display(result_pct_diff)
        print()
        euclidean(df)
        pca(df)
        print()

Model: llama3.1, Temperature = 5
Training Set: X_train shape = (5000, 24), y_train shape = (5000,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)

Results Data Synthesis:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.404545,0.278871,0.926964,0.428754
Random Forest,0.508036,0.282751,0.677282,0.398949
SVM,0.292456,0.249918,0.966879,0.397174
KNN,0.449995,0.263405,0.713376,0.384748
Gradient Boosting,0.274235,0.238311,0.915499,0.378179


Absolute Difference with base:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.405978,-0.358086,0.429299,-0.130007
Random Forest,-0.305558,-0.372662,0.199151,-0.15396
SVM,-0.510595,-0.395592,0.560934,-0.101262
KNN,-0.342,-0.328933,0.273461,-0.120126
Gradient Boosting,-0.530658,-0.365193,0.35966,-0.200512


Percentage Difference with base (%):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-50.088406,-56.218249,86.262799,-23.26697
Random Forest,-37.556618,-56.859085,41.651865,-27.845417
SVM,-63.581899,-61.283673,138.179916,-20.315877
KNN,-43.182112,-55.53126,62.162162,-23.793254
Gradient Boosting,-65.929035,-60.512097,64.705882,-34.649236



Average pairwise distance: 5.381076505087187
Explained variance ratio: [0.22623504 0.08725215 0.08004693]
Cumulative explained variance: 0.3935341243906264

Model: llama3.1, Temperature = 10
Training Set: X_train shape = (5000, 24), y_train shape = (5000,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)

Results Data Synthesis:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.329205,0.244709,0.854352,0.380448
Random Forest,0.366568,0.243337,0.77155,0.369986
SVM,0.275258,0.241153,0.934607,0.383383
KNN,0.431774,0.247072,0.662845,0.359968
Gradient Boosting,0.283038,0.240018,0.911253,0.379958


Absolute Difference with base:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.481318,-0.392247,0.356688,-0.178312
Random Forest,-0.447026,-0.412076,0.293418,-0.182924
SVM,-0.527792,-0.404357,0.528662,-0.115053
KNN,-0.360221,-0.345267,0.22293,-0.144906
Gradient Boosting,-0.521855,-0.363486,0.355414,-0.198734


Percentage Difference with base (%):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-59.383683,-61.581472,71.672355,-31.912118
Random Forest,-54.94464,-62.872685,61.367673,-33.083839
SVM,-65.723391,-62.641523,130.230126,-23.082846
KNN,-45.482745,-58.288737,50.675676,-28.701375
Gradient Boosting,-64.835305,-60.229273,63.94194,-34.34195



Average pairwise distance: 5.399144464757518
Explained variance ratio: [0.23384967 0.08594825 0.07841923]
Cumulative explained variance: 0.39821715212913855

Model: llama3.1, Temperature = 15
Training Set: X_train shape = (5000, 24), y_train shape = (5000,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)

Results Data Synthesis:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.408025,0.25584,0.762633,0.383147
Random Forest,0.351827,0.243717,0.802972,0.373937
SVM,0.328795,0.215928,0.678132,0.327556
KNN,0.476507,0.244111,0.558811,0.339788
Gradient Boosting,0.299621,0.230898,0.81741,0.360082


Absolute Difference with base:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.402498,-0.381116,0.264968,-0.175614
Random Forest,-0.461767,-0.411696,0.324841,-0.178972
SVM,-0.474255,-0.429582,0.272187,-0.17088
KNN,-0.315488,-0.348228,0.118896,-0.165085
Gradient Boosting,-0.505272,-0.372606,0.261571,-0.218609


Percentage Difference with base (%):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-49.659005,-59.833922,53.242321,-31.42917
Random Forest,-56.756417,-62.814764,67.939609,-32.369188
SVM,-59.056724,-66.549302,67.050209,-34.283191
KNN,-39.834561,-58.788672,27.027027,-32.698306
Gradient Boosting,-62.775022,-61.740364,47.058824,-37.776457



Average pairwise distance: 5.424702196909009
Explained variance ratio: [0.2391896  0.08620023 0.07784094]
Cumulative explained variance: 0.40323076771481875

Model: mistral, Temperature = 5
Training Set: X_train shape = (5000, 24), y_train shape = (5000,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results Data Synthesis:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.754837,0.22973,0.007219,0.013998
Random Forest,0.752585,0.296053,0.019108,0.035899
SVM,0.758931,0.0,0.0,0.0
KNN,0.722797,0.292108,0.105308,0.154806
Gradient Boosting,0.753711,0.363636,0.028875,0.053501


Absolute Difference with base:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.055686,-0.407227,-0.490446,-0.544763
Random Forest,-0.061009,-0.359361,-0.459023,-0.51701
SVM,-0.044119,-0.64551,-0.405945,-0.498436
KNN,-0.069198,-0.30023,-0.334607,-0.350067
Gradient Boosting,-0.051182,-0.239868,-0.526964,-0.52519


Percentage Difference with base (%):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-6.870422,-63.933216,-98.549488,-97.494896
Random Forest,-7.498742,-54.829625,-96.003552,-93.507167
SVM,-5.493945,-100.0,-100.0,-100.0
KNN,-8.737237,-50.685567,-76.061776,-69.337556
Gradient Boosting,-6.358896,-39.745816,-94.805195,-90.7548



Average pairwise distance: 5.920811158639295
Explained variance ratio: [0.15224094 0.06934319 0.06413392]
Cumulative explained variance: 0.28571804918251786

Model: mistral, Temperature = 10
Training Set: X_train shape = (5000, 24), y_train shape = (5000,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results Data Synthesis:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.752687,0.123457,0.004246,0.00821
Random Forest,0.732317,0.193396,0.03482,0.059014
SVM,0.758931,0.0,0.0,0.0
KNN,0.694339,0.213962,0.100212,0.136495
Gradient Boosting,0.758317,0.125,0.000425,0.000846


Absolute Difference with base:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.057836,-0.5135,-0.493418,-0.55055
Random Forest,-0.081278,-0.462017,-0.443312,-0.493895
SVM,-0.044119,-0.64551,-0.405945,-0.498436
KNN,-0.097656,-0.378377,-0.339703,-0.368378
Gradient Boosting,-0.046576,-0.478504,-0.555414,-0.577845


Percentage Difference with base (%):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-7.13564,-80.617705,-99.146758,-98.530644
Random Forest,-9.989935,-70.492476,-92.717584,-89.326636
SVM,-5.493945,-100.0,-100.0,-100.0
KNN,-12.330361,-63.878436,-77.220077,-72.964487
Gradient Boosting,-5.786595,-79.287624,-99.923606,-99.853742



Average pairwise distance: 6.04305757034766
Explained variance ratio: [0.15367553 0.06962515 0.06591316]
Cumulative explained variance: 0.2892138408226211

Model: mistral, Temperature = 15
Training Set: X_train shape = (5000, 24), y_train shape = (5000,)
Test Set: X_test shape = (9769, 24), y_test shape = (9769,)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results Data Synthesis:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.750026,0.121739,0.005945,0.011336
Random Forest,0.732931,0.21267,0.039915,0.067215
SVM,0.758931,0.0,0.0,0.0
KNN,0.707851,0.196102,0.068365,0.101385
Gradient Boosting,0.758829,0.333333,0.000425,0.000848


Absolute Difference with base:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-0.060497,-0.515217,-0.49172,-0.547424
Random Forest,-0.080663,-0.442744,-0.438217,-0.485695
SVM,-0.044119,-0.64551,-0.405945,-0.498436
KNN,-0.084144,-0.396236,-0.37155,-0.403488
Gradient Boosting,-0.046064,-0.270171,-0.555414,-0.577843


Percentage Difference with base (%):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-7.464006,-80.887372,-98.805461,-97.971218
Random Forest,-9.914444,-67.551819,-91.651865,-87.84342
SVM,-5.493945,-100.0,-100.0,-100.0
KNN,-10.624273,-66.893538,-84.459459,-79.918647
Gradient Boosting,-5.723006,-44.766998,-99.923606,-99.853432



Average pairwise distance: 6.088627431210506
Explained variance ratio: [0.15339238 0.07029152 0.06398613]
Cumulative explained variance: 0.2876700405669278



In [23]:
# Get unique classes and counts
unique_classes, counts = np.unique(y_test, return_counts=True)
class_balance = dict(zip(unique_classes, counts))

print("Class Balance (Counts):", class_balance)
print("Class Balance (%):", {cls: count / len(y_test) * 100 for cls, count in class_balance.items()})

Class Balance (Counts): {0: 7414, 1: 2355}
Class Balance (%): {0: 75.89313133381104, 1: 24.106868666188966}


In [25]:
# Iterate through models
for index, model in enumerate(models):
    # Loop through the different temperatures
    for temperature in temperatures:
        # Get the list of pipe-separated strings
        rows = results[model][temperature]

        # Split each string into a list of values
        split_rows = [row.split('|') for row in rows]

        # Create DataFrame
        df = pd.DataFrame(split_rows, columns=columns)

        # Get class balance (counts and percentages)
        counts = df['Target'].value_counts()
        percentages = df['Target'].value_counts(normalize=True) * 100

        # Combine into a single DataFrame for clarity
        balance_df = pd.DataFrame({
            'Count': counts,
            'Percentage': percentages.round(2)
        })

        # Print results
        print(f"\nModel: {model}, Temperature: {temperature}")
        print(balance_df)


Model: llama3.1, Temperature: 5
        Count  Percentage
Target                   
>50K     2826       56.52
<50K     2174       43.48

Model: llama3.1, Temperature: 10
        Count  Percentage
Target                   
>50K     2715        54.3
<50K     2285        45.7

Model: llama3.1, Temperature: 15
        Count  Percentage
Target                   
>50K     2682       53.64
<50K     2318       46.36

Model: mistral, Temperature: 5
        Count  Percentage
Target                   
<50K     4036       80.72
>50K      964       19.28

Model: mistral, Temperature: 10
        Count  Percentage
Target                   
<50K     4005        80.1
>50K      995        19.9

Model: mistral, Temperature: 15
        Count  Percentage
Target                   
<50K     4025        80.5
>50K      975        19.5
