# Imports

In [1]:
import os
import numpy as np
import pandas as pd

from PIL import Image
import base64
import io

import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dense, Concatenate, Rescaling
from tensorflow.keras.models import Model
from tensorflow.keras.applications import ResNet50

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

from tensorflow.keras import backend as K
import gc

import ollama

from tqdm import tqdm

import psutil
import subprocess
import time

In [2]:
# Force CUDA usage
os.environ["OLLAMA_BACKEND"] = "cuda"
os.environ["OLLAMA_NUM_THREADS"] = "16"

# Parameters

In [3]:
# Ollama visual models
llms = ['gemma3:4b', 'llava:7b', 'llava-llama3:8b']

In [4]:
# Number of rows to generate compressions for
n_rows = 2500

In [5]:
# NN Parameters
epochs = 25
batch_size = 16

In [6]:
# Initialize CountVectorizer for Bag-of-Words
bow_max_features = 10000
vectorizer = CountVectorizer(stop_words='english', max_features=bow_max_features, ngram_range=(1, 2))

# Loading data

In [7]:
# Work df is randomly sampled df with the size of n_rows which is the number of rows that will be processed by LLMs
_, work_df = train_test_split(pd.read_csv("houses_preprocessed.csv"), test_size=n_rows, shuffle=True, random_state=42)

In [8]:
work_df.head()

Unnamed: 0,n_citi,bed,bath,sqft,price,image
4772,-0.530372,-0.489366,-0.472771,-0.47392,898000,houses_preprocessed/4793.jpg
3707,1.018093,-0.489366,-0.472771,-0.753836,554900,houses_preprocessed/3727.jpg
14159,-1.286806,0.477001,0.570296,0.513102,969000,houses_preprocessed/14333.jpg
6934,0.145969,1.443367,0.570296,0.771561,634900,houses_preprocessed/7055.jpg
13453,0.341752,-1.455732,-1.515838,-1.502881,397000,houses_preprocessed/13627.jpg


# LLM Compression Generation

## Method to allign image for Ollama visual models

In [9]:
def df_image_path_to_base64(image_path):
    # Memory management
    with Image.open(image_path) as img:
        with io.BytesIO() as buffered:
            img.save(buffered, format="JPEG")

            return base64.b64encode(buffered.getvalue()).decode('utf-8')

In [10]:
def check_memory(llm):
    # Code to fix memory leak, if above 85% memory usage reload Ollama model
    if psutil.virtual_memory().percent > 85:
        print("Reseting Memory...")
        subprocess.run(['ollama', 'stop', llm])
        time.sleep(5)
        subprocess.run(['ollama', 'run', llm])

## Qualified loop

In [11]:
prompt = '''Analyze the given image of the house in great detail, focusing on key features that influence real estate value.
Do not attempt to estimate or provide a price! Only describe observable attributes that an appraiser or real estate agent would use to determine value.
Provide a neutral, detailed description, without pricing opinions. Provide only objective observations for valuation purposes.'''

In [12]:
# Iterate through models
for index, model in enumerate(llms):
    print("Processing Model: " + model + " (Model " + str(index + 1) + "/" + str(len(llms)) + ")")

    # For each row
    for index, row in tqdm(work_df.iterrows(), total=len(work_df), desc="Parsing rows"):     
        # Memory leak fix
        check_memory(model)  
        
        # Do the necessary image conversion
        image = df_image_path_to_base64(row['image'])
        
        # Regress the price, get rid of commas and periods 
        response = ollama.generate(model=model, prompt=prompt, images=[image])['response']               
       
        # Store response
        work_df.at[index, f"{model}_summary"] = response

Processing Model: gemma3:4b (Model 1/3)


Parsing rows:  19%|█████████████████████████████████████████▍                                                                                                                                                                                    | 467/2500 [6:26:55<23:27:40, 41.54s/it]

Reseting Memory...


Parsing rows:  35%|█████████████████████████████████████████████████████████████████████████████▎                                                                                                                                               | 874/2500 [11:28:49<20:05:45, 44.49s/it]

Reseting Memory...


Parsing rows:  51%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                           | 1282/2500 [16:37:56<14:53:11, 44.00s/it]

Reseting Memory...


Parsing rows:  68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                       | 1689/2500 [21:41:45<10:00:30, 44.43s/it]

Reseting Memory...


Parsing rows:  84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                    | 2090/2500 [26:48:52<5:21:05, 46.99s/it]

Reseting Memory...


Parsing rows:  97%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 2433/2500 [31:02:35<1:01:07, 54.73s/it]

Reseting Memory...


Parsing rows: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2500/2500 [31:57:07<00:00, 46.01s/it]


Processing Model: llava:7b (Model 2/3)


Parsing rows: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2500/2500 [32:01:05<00:00, 46.11s/it]


Processing Model: llava-llama3:8b (Model 3/3)


Parsing rows:  32%|███████████████████████████████████████████████████████████████████████▌                                                                                                                                                      | 806/2500 [7:56:41<32:51:53, 69.84s/it]

Reseting Memory...


Parsing rows:  32%|███████████████████████████████████████████████████████████████████████▋                                                                                                                                                      | 807/2500 [7:58:00<34:06:21, 72.52s/it]

Reseting Memory...


Parsing rows:  32%|███████████████████████████████████████████████████████████████████████▊                                                                                                                                                      | 808/2500 [7:59:32<36:49:24, 78.35s/it]

Reseting Memory...


Parsing rows:  32%|███████████████████████████████████████████████████████████████████████▊                                                                                                                                                      | 809/2500 [8:00:54<37:21:24, 79.53s/it]

Reseting Memory...


Parsing rows:  32%|███████████████████████████████████████████████████████████████████████▉                                                                                                                                                      | 810/2500 [8:01:53<34:21:16, 73.18s/it]

Reseting Memory...


Parsing rows:  32%|████████████████████████████████████████████████████████████████████████                                                                                                                                                      | 811/2500 [8:03:12<35:15:06, 75.14s/it]

Reseting Memory...


Parsing rows:  32%|████████████████████████████████████████████████████████████████████████                                                                                                                                                      | 812/2500 [8:04:34<36:09:54, 77.13s/it]

Reseting Memory...


Parsing rows: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2500/2500 [33:54:44<00:00, 48.83s/it]


In [13]:
work_df.head()

Unnamed: 0,n_citi,bed,bath,sqft,price,image,gemma3:4b_summary,llava:7b_summary,llava-llama3:8b_summary
4772,-0.530372,-0.489366,-0.472771,-0.47392,898000,houses_preprocessed/4793.jpg,"Okay, here’s a detailed, objective description...",The image shows a single-story residential ho...,The image presents a serene residential scene ...
3707,1.018093,-0.489366,-0.472771,-0.753836,554900,houses_preprocessed/3727.jpg,"Okay, here's a detailed, objective description...",The image displays a single-family residentia...,The image presents a single-story house painte...
14159,-1.286806,0.477001,0.570296,0.513102,969000,houses_preprocessed/14333.jpg,"Okay, here’s a detailed, objective description...",This image features a two-story residential h...,The image presents a two-story house painted i...
6934,0.145969,1.443367,0.570296,0.771561,634900,houses_preprocessed/7055.jpg,"Okay, here's a detailed, objective description...",The image shows a two-story residential house...,"The image captures a serene suburban scene, do..."
13453,0.341752,-1.455732,-1.515838,-1.502881,397000,houses_preprocessed/13627.jpg,"Here's a detailed, objective description of th...",The image shows a single-story house with sev...,The image captures a quaint scene of a house p...


# Experimental set up

## Train and Test the models on the same data partioning

In [14]:
# Split the df with LLM generated compressions into train and test
X_work_df = work_df[work_df.columns.difference(['price'])]
y_work_df = work_df['price']
X_train, X_test, y_train, y_test = train_test_split(X_work_df, y_work_df, test_size=0.2, random_state=42)

### Compression dfs

In [15]:
# Train data with compression cols
X_train_tab_compression = X_train[X_train.columns.difference(['image'])] # pd
X_train_img = X_train['image'] # pd

# Test data ith compression cols
X_test_tab_compression = X_test[X_test.columns.difference(['image'])] # pd 
X_test_img = X_test['image'] # pd

# Print shapes
print("Compression Training Data Shapes:")
print(f"Tabular features: {X_train_tab_compression.shape}")
print(f"Image features: {X_train_img.shape}")
print(f"Target prices: {y_train.shape}")
print("\nCompression Test Data Shapes:")
print(f"Tabular features: {X_test_tab_compression.shape}")
print(f"Image features: {X_test_img.shape}")
print(f"Target prices: {y_test.shape}")

Compression Training Data Shapes:
Tabular features: (2000, 7)
Image features: (2000,)
Target prices: (2000,)

Compression Test Data Shapes:
Tabular features: (500, 7)
Image features: (500,)
Target prices: (500,)


### Base dfs

In [16]:
base_cols = ['n_citi', 'bed', 'bath', 'sqft']

# Train and Test data - no compression cols
X_train_tab = X_train[base_cols].values 
X_test_tab = X_test[base_cols].values 

# Print shapes
print("Training Data Shapes:")
print(f"Tabular features: {X_train_tab.shape}")
print("\nTest Data Shapes:")
print(f"Tabular features: {X_test_tab.shape}")

Training Data Shapes:
Tabular features: (2000, 4)

Test Data Shapes:
Tabular features: (500, 4)


# Neural Networks and Models

## Base NN and Resnet

In [17]:
def base_nn(input_size_tabular):
    # Image processing branch
    img_input = Input(shape=(311, 415, 3), name='image_input')
    x = Conv2D(32, (3, 3), activation='relu')(img_input)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(128, (3, 3), activation='relu')(x)
    x = GlobalAveragePooling2D()(x)
    x = Dense(128, activation='relu')(x)
    
    # Tabular data processing branch
    tabular_input = Input(shape=(input_size_tabular,), name='tabular_input')
    y = Dense(64, activation='relu')(tabular_input)
    y = Dense(32, activation='relu')(y)
    
    # Combine both branches
    combined = Concatenate()([x, y])
    z = Dense(64, activation='relu')(combined)
    output = Dense(1)(z)  # Regression output for price prediction
    
    nn_model = Model(inputs=[img_input, tabular_input], outputs=output)
    
    # Compile the model
    nn_model.compile(optimizer='adam',
                  loss='mae',
                  metrics=['mae', 'R2Score'])
    
    # Display model summary debug
    # nn_model.summary()

    return nn_model

In [18]:
def resnet_nn(input_size_tabular):
    # Image processing branch with pre-trained ResNet50
    res_net = ResNet50(weights='imagenet', include_top=False, input_shape=(311, 415, 3))
    
    # Unfreeze only the last 10 layers of resnet (fine-tuning) 
    res_net.trainable = False 
    for layer in res_net.layers[-10:]:
        layer.trainable = True

    # Image processing branch
    img_input = Input(shape=(311, 415, 3), name='image_input')
    x = res_net(img_input)
    x = GlobalAveragePooling2D()(x)
    x = Dense(128, activation='relu')(x)
    
    # Tabular data processing branch
    tabular_input = Input(shape=(input_size_tabular,), name='tabular_input')
    y = Dense(64, activation='relu')(tabular_input)
    y = Dense(32, activation='relu')(y)
    
    # Combine both branches
    combined = Concatenate()([x, y])
    z = Dense(64, activation='relu')(combined)
    output = Dense(1)(z)  # Regression output for price prediction
    
    # Define the model
    res_net_model = Model(inputs=[img_input, tabular_input], outputs=output)
    
    # Compile the model
    res_net_model.compile(optimizer='adam', 
                          loss='mae',
                          metrics=['mae', 'R2Score'])
   
    # Display model summary debug
    # res_net_model.summary()

    return res_net_model

In [19]:
'''
I did not write this code, the code is from: https://www.tensorflow.org/tutorials/load_data/images
It helps us train the NN more dynamically, it loads images on the go, such that not all RAM is used up.
It does try to maximise RAM usage this is basically what the tf.data.AUTOTUNE does.
'''

# Loads an image and normalizes it from [0,1]
def process_example(image_path, tabular_features, label):
    # Load raw bytes and convert to RGB
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)

    # Normalize image to [0, 1] and convert to float32
    image = tf.image.convert_image_dtype(image, tf.float32)

    return (image, tabular_features), label


# Creates on the fly data sets to train/test the model, we need this to not exceed memory
def create_dataset(image_paths, tabular_data, labels, shuffle=True):
    # Convert to tensors
    image_paths = tf.convert_to_tensor(image_paths)
    tabular_data = tf.convert_to_tensor(tabular_data, dtype=tf.float32)
    labels = tf.convert_to_tensor(labels, dtype=tf.float32)

    # Build dataset
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, tabular_data, labels))
    dataset = dataset.map(lambda img, tab, lbl: process_example(img, tab, lbl), num_parallel_calls=tf.data.AUTOTUNE)
    
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(image_paths))
    
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    
    return dataset


def train_and_evaluate_nn(nn, 
                          X_train_img_paths, X_train_tab, y_train,
                          X_test_img_paths, X_test_tab, y_test,
                          verbose=1):

    # Dynamic dataset loading
    train_ds = create_dataset(X_train_img_paths, X_train_tab, y_train, shuffle=True) # Shuffle to break ordering
    test_ds = create_dataset(X_test_img_paths, X_test_tab, y_test, shuffle=False) # No shuffle, we arent learning, just predicting

    # Train and Test
    history = nn.fit(train_ds, epochs=epochs, verbose=verbose)
    test_loss, test_mae, r2 = nn.evaluate(test_ds, verbose=0)

    return history, test_loss, test_mae, r2

## Logistic regression

In [20]:
def train_and_evaluate_lin_model(model, X_train_tab, y_train, X_test_tab, y_test):
    # Train the model
    model.fit(X_train_tab, y_train)
    
    # Evaluate the model
    y_test_pred = model.predict(X_test_tab)
    mae_test = mean_absolute_error(y_test, y_test_pred)
    r2 = r2_score(y_test, y_test_pred)
    
    return mae_test, r2

### Train and Evaluate

In [21]:
# Create NNs with tabular features = 4 (n_citi, bed, bath, sqft)
nn_base = base_nn(4)
nn_resnet = resnet_nn(4)
lin = LinearRegression()

In [22]:
# NN
print("Training Base NN")
nn_base_hist, _, nn_base_mae, nn_base_r2 = train_and_evaluate_nn(nn_base, X_train_img, X_train_tab, y_train, X_test_img, X_test_tab, y_test)
print(f"NN Base MAE: {nn_base_mae:.0f}\nNN Base R2: {nn_base_r2:.2f}")

# Try to clear NN from memory
K.clear_session()
gc.collect()

Training Base NN
Epoch 1/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 1s/step - R2Score: -3.0475 - loss: 627222.6250 - mae: 627222.6250
Epoch 2/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 1s/step - R2Score: -0.1603 - loss: 268338.7500 - mae: 268338.7500
Epoch 3/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 1s/step - R2Score: -0.1530 - loss: 267106.5938 - mae: 267106.5938
Epoch 4/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 1s/step - R2Score: -0.1470 - loss: 271209.4688 - mae: 271209.4688
Epoch 5/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 1s/step - R2Score: -0.1206 - loss: 279016.1250 - mae: 279016.1250
Epoch 6/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 1s/step - R2Score: -0.0982 - loss: 275680.7812 - mae: 275680.7812
Epoch 7/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 1s/step - R2Score: -0.0657 - loss:

0

In [23]:
# Resnet
print("Training Resnet")
nn_resnet_hist, _, nn_resnet_mae, nn_resnet_r2 = train_and_evaluate_nn(nn_resnet, X_train_img, X_train_tab, y_train, X_test_img, X_test_tab, y_test)
print(f"Resnet MAE: {nn_resnet_mae:.0f}\nResnet R2: {nn_resnet_r2:.2f}")

# Try to clear NN from memory
K.clear_session()
gc.collect()

Training Resnet
Epoch 1/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 2s/step - R2Score: -3.8917 - loss: 686153.0625 - mae: 686153.0625
Epoch 2/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 3s/step - R2Score: -3.3183 - loss: 667710.2500 - mae: 667710.2500
Epoch 3/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 3s/step - R2Score: -1.8100 - loss: 453433.2812 - mae: 453433.2812
Epoch 4/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 3s/step - R2Score: -0.0618 - loss: 268738.9688 - mae: 268738.9688
Epoch 5/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 3s/step - R2Score: 0.0785 - loss: 248846.0000 - mae: 248846.0000
Epoch 6/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 3s/step - R2Score: 0.0991 - loss: 247852.4375 - mae: 247852.4375
Epoch 7/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 3s/step - R2Score: 0.1297 - loss: 239

0

In [24]:
# LR
print("Training LR")
lr_mae, lr_r2 = train_and_evaluate_lin_model(lin, X_train_tab, y_train, X_test_tab, y_test)
print(f"LR MAE: {lr_mae:.0f}\nLR R2: {lr_r2:.2f}")

Training LR
LR MAE: 223539
LR R2: 0.39


# Measure Compression

In [25]:
for llm in llms:
    text_col = f"{llm}_summary"

    image_sizes = []
    summary_sizes = []

    for idx, row in work_df.iterrows():
        image_path = row['image']
        summary_text = row[text_col]

        image_size = os.path.getsize(image_path)
        summary_size = len(summary_text.encode('utf-8'))

        image_sizes.append(image_size)
        summary_sizes.append(summary_size)

    # Totals
    total_image_size = sum(image_sizes)
    total_summary_size = sum(summary_sizes)
    total_abs_compression = total_image_size - total_summary_size

    # Averages
    avg_image_size = total_image_size / len(image_sizes)
    avg_summary_size = total_summary_size / len(summary_sizes)
    avg_compression_pct = ((avg_image_size - avg_summary_size) / avg_image_size) * 100
    avg_abs_compression = avg_image_size - avg_summary_size

    print(f"[{llm}] Average Image Size: {avg_image_size:.0f} bytes")
    print(f"[{llm}] Average Text Size: {avg_summary_size:.0f} bytes")
    print(f"[{llm}] Average Compression: {avg_compression_pct:.2f}%")
    print(f"[{llm}] Average Absolute Compression: {avg_abs_compression:.0f} bytes")
    print(f"[{llm}] Total Absolute Compression: {total_abs_compression:.0f} bytes\n")

[gemma3:4b] Average Image Size: 47296 bytes
[gemma3:4b] Average Text Size: 1923 bytes
[gemma3:4b] Average Compression: 95.93%
[gemma3:4b] Average Absolute Compression: 45373 bytes
[gemma3:4b] Total Absolute Compression: 113433243 bytes

[llava:7b] Average Image Size: 47296 bytes
[llava:7b] Average Text Size: 2310 bytes
[llava:7b] Average Compression: 95.12%
[llava:7b] Average Absolute Compression: 44986 bytes
[llava:7b] Total Absolute Compression: 112465902 bytes

[llava-llama3:8b] Average Image Size: 47296 bytes
[llava-llama3:8b] Average Text Size: 996 bytes
[llava-llama3:8b] Average Compression: 97.89%
[llava-llama3:8b] Average Absolute Compression: 46300 bytes
[llava-llama3:8b] Total Absolute Compression: 115750563 bytes



# Comparison BOW (Image Summarization) Model vs Default Setup

In [26]:
def bow(llm, model, X_train_bow, y_train, X_test_bow, y_test):
    # Column with the compression text per LLM
    text_col = f"{llm}_summary"
    
    # Transform training and test text data
    X_train_text = vectorizer.fit_transform(X_train_bow[text_col])
    X_test_text = vectorizer.transform(X_test_bow[text_col])

    # Train model
    model.fit(X_train_text, y_train)
    
    # Make predictions
    y_test_pred = model.predict(X_test_text)
    
    # Calculate metrics
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    return test_mae, test_r2

In [27]:
def create_comparison(llm, model_results):
    # DF Structure
    comparison_data = {
        'Model': [],
        'MAE': [],
        'R2': []
    }
    
    for model_name, mae, r2 in model_results:
        comparison_data['Model'].append(model_name)
        comparison_data['MAE'].append(round(mae))
        comparison_data['R2'].append(round(r2, 3))
    
    # Make into df
    comparison_df = pd.DataFrame(comparison_data).set_index("Model")
    
    # Display df
    print(f"Comparison of Models for {llm}")
    display(comparison_df)
    print()
    return comparison_df

In [28]:
# List of ml models
ml_models = [
    ('BOW Logistic Regression', LogisticRegression(random_state=42)),
    ('BOW Random Forest', RandomForestClassifier(random_state=42)),
    ('BOW SVM', SVC(random_state=42)),
    ('BOW KNN', KNeighborsClassifier()),
    ('BOW Gradient Boosting', GradientBoostingClassifier(random_state=42))
]

# For each LLM
for llm in llms:
    # Save results for each ML model
    model_results = []
    
    # For each ML model
    for model_name, model in ml_models:
        # Create, train and evaluate bag of words model 
        mae, r2 = bow(llm, model, X_train_tab_compression, y_train, X_test_tab_compression, y_test)
        model_results.append((model_name, mae, r2)) # Save results for each ML model

    # Store default results
    model_results.append(('NN Base', nn_base_mae, nn_base_r2))
    model_results.append(('NN Resnet', nn_resnet_mae, nn_resnet_r2))
    model_results.append(('LR (tabular only)', lr_mae, lr_r2))
    
    # Create and display the comparison per LLM
    create_comparison(llm, model_results)

Comparison of Models for gemma3:4b


Unnamed: 0_level_0,MAE,R2
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
BOW Logistic Regression,298159,-0.129
BOW Random Forest,310983,-0.211
BOW SVM,282080,0.015
BOW KNN,386348,-0.801
BOW Gradient Boosting,314268,-0.181
NN Base,227783,0.33
NN Resnet,228431,0.34
LR (tabular only),223539,0.387



Comparison of Models for llava:7b


Unnamed: 0_level_0,MAE,R2
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
BOW Logistic Regression,293683,-0.033
BOW Random Forest,319841,-0.239
BOW SVM,284108,0.001
BOW KNN,407877,-0.993
BOW Gradient Boosting,331829,-0.343
NN Base,227783,0.33
NN Resnet,228431,0.34
LR (tabular only),223539,0.387





STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Comparison of Models for llava-llama3:8b


Unnamed: 0_level_0,MAE,R2
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
BOW Logistic Regression,307549,-0.168
BOW Random Forest,362486,-0.577
BOW SVM,283264,0.002
BOW KNN,385486,-0.804
BOW Gradient Boosting,336167,-0.439
NN Base,227783,0.33
NN Resnet,228431,0.34
LR (tabular only),223539,0.387





# Comparison NN (tabular, image and text) VS NN (tabular and image)

In [29]:
def nn_tab_img_text(input_size_tabular):
    # Image data branch
    img_input = Input(shape=(311, 415, 3), name='image_input')
    x = Conv2D(32, (3, 3), activation='relu')(img_input)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(128, (3, 3), activation='relu')(x)
    x = GlobalAveragePooling2D()(x)
    x = Dense(128, activation='relu')(x)

    # Tabular data branch
    tabular_input = Input(shape=(input_size_tabular,), name='tabular_input')
    y = Dense(64, activation='relu')(tabular_input)
    y = Dense(32, activation='relu')(y)

    # Textual BoW branch
    text_input = Input(shape=(bow_max_features,), name='text_input')
    t = Dense(256, activation='relu')(text_input)
    t = Dense(64, activation='relu')(t)

    # Combine all three branches
    combined = Concatenate()([x, y, t])
    z = Dense(64, activation='relu')(combined)
    output = Dense(1)(z)  # Regression output for price prediction

    # Define the model with all three inputs
    nn_model = Model(inputs=[img_input, tabular_input, text_input], outputs=output)

    # Compile the model
    nn_model.compile(optimizer='adam',
                     loss='mae',
                     metrics=['mae', 'R2Score'])

    # Display model summary debug
    # nn_model.summary()

    return nn_model

In [30]:
'''
I did not write this code, the code is from: https://www.tensorflow.org/tutorials/load_data/images
It helps us train the NN more dynamically, it loads images on the go, such that not all RAM is used up.
It does try to maximise RAM usage this is basically what the tf.data.AUTOTUNE does.
'''

# Loads an image and normalizes it from [0,1]
def process_example(image_path, tabular_features, text_features, label):
    # Load raw bytes and convert to RGB
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)

    # Normalize image to [0, 1] and convert to float32
    image = tf.image.convert_image_dtype(image, tf.float32)

    return (image, tabular_features, text_features), label


# Creates on the fly data sets to train/test the model, we need this to not exceed memory
def create_dataset(image_paths, tabular_data, text_data, labels, shuffle=True):
    # Convert to tensors
    image_paths = tf.convert_to_tensor(image_paths)
    tabular_data = tf.convert_to_tensor(tabular_data, dtype=tf.float32)
    text_data = tf.convert_to_tensor(text_data, dtype=tf.float32)
    labels = tf.convert_to_tensor(labels, dtype=tf.float32)

    # Build dataset
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, tabular_data, text_data, labels))
    dataset = dataset.map(lambda img, tab, txt, lbl: process_example(img, tab, txt, lbl), num_parallel_calls=tf.data.AUTOTUNE)
    
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(image_paths))
    
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    
    return dataset


def train_and_evaluate_nn_tab_img_text(nn, 
                                       X_train_img_paths, X_train_tab, X_train_text, y_train, 
                                       X_test_img_paths, X_test_tab, X_test_text, y_test, 
                                       verbose=1):

    # Dynamic dataset loading
    train_ds = create_dataset(X_train_img_paths, X_train_tab, X_train_text, y_train, shuffle=True) # Shuffle to break ordering
    test_ds = create_dataset(X_test_img_paths, X_test_tab, X_test_text, y_test, shuffle=False) # No shuffle, we arent learning, just predicting

    # Train and Test
    history = nn.fit(train_ds, epochs=epochs, verbose=verbose)
    test_loss, test_mae, r2 = nn.evaluate(test_ds, verbose=0)

    return history, test_loss, test_mae, r2

In [31]:
import logging

# Supress retracing warning
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # '0' = all messages, '3' = fatal only
tf.get_logger().setLevel(logging.ERROR)

In [32]:
# For each LLM
for llm in llms:
    model_results = []
    
    # NN with image, tabular features (4 base cols) and text (BoW)
    nn_tab_img_text_model = nn_tab_img_text(4) # Retracing
    
    # Column with the compression text per LLM
    text_col = f"{llm}_summary"
    
    # Transform training and test text data
    X_train_text = vectorizer.fit_transform(X_train[text_col]).toarray()
    X_test_text = vectorizer.transform(X_test[text_col]).toarray()

    # Create, train and evaluate NN
    print("Training NN (image, tabular, text)")
    _, _, mae, r2 = train_and_evaluate_nn_tab_img_text(nn_tab_img_text_model, 
                                                       X_train_img, X_train_tab, X_train_text, y_train,
                                                       X_test_img, X_test_tab, X_test_text, y_test)
    model_results.append(('NN (image, tabular, text)', mae, r2))

    # Store default results
    model_results.append(('NN Base (image, tabular)', nn_base_mae, nn_base_r2))
    model_results.append(('NN Resnet (image, tabular)', nn_resnet_mae, nn_resnet_r2))
    model_results.append(('LR (tabular)', lr_mae, lr_r2))
    
    # Create and display the comparison per LLM
    print()
    create_comparison(llm, model_results)
    
    # Try to clear NN from memory
    K.clear_session()
    gc.collect()

Training NN (image, tabular, text)
Epoch 1/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 1s/step - R2Score: -2.9077 - loss: 609352.5000 - mae: 609352.5000
Epoch 2/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 1s/step - R2Score: -0.1847 - loss: 283381.6875 - mae: 283381.6875
Epoch 3/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 2s/step - R2Score: -0.1098 - loss: 276760.7188 - mae: 276760.7188
Epoch 4/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 2s/step - R2Score: -0.0420 - loss: 259812.9062 - mae: 259812.9062
Epoch 5/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 2s/step - R2Score: 0.1323 - loss: 241556.6250 - mae: 241556.6250
Epoch 6/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 1s/step - R2Score: 0.2758 - loss: 208074.6250 - mae: 208074.6250
Epoch 7/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 1s/step - R2Score:

Unnamed: 0_level_0,MAE,R2
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
"NN (image, tabular, text)",235452,0.345
"NN Base (image, tabular)",227783,0.33
"NN Resnet (image, tabular)",228431,0.34
LR (tabular),223539,0.387



Training NN (image, tabular, text)
Epoch 1/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 2s/step - R2Score: -3.2735 - loss: 615984.1250 - mae: 615984.1250
Epoch 2/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 2s/step - R2Score: -0.1619 - loss: 278372.8438 - mae: 278372.8438
Epoch 3/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 2s/step - R2Score: -0.1536 - loss: 285162.0938 - mae: 285162.0938
Epoch 4/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 2s/step - R2Score: -0.1132 - loss: 269768.7812 - mae: 269768.7812
Epoch 5/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 2s/step - R2Score: -0.0320 - loss: 263212.5938 - mae: 263212.5938
Epoch 6/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 2s/step - R2Score: 0.0547 - loss: 252206.0469 - mae: 252206.0469
Epoch 7/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m211s[0m 2s/step - R2Scor

Unnamed: 0_level_0,MAE,R2
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
"NN (image, tabular, text)",247099,0.294
"NN Base (image, tabular)",227783,0.33
"NN Resnet (image, tabular)",228431,0.34
LR (tabular),223539,0.387



Training NN (image, tabular, text)
Epoch 1/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m222s[0m 2s/step - R2Score: -2.8036 - loss: 615537.3125 - mae: 615537.3125
Epoch 2/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 2s/step - R2Score: -0.1516 - loss: 285395.8125 - mae: 285395.8125
Epoch 3/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 2s/step - R2Score: -0.1553 - loss: 291642.2188 - mae: 291642.2188
Epoch 4/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 2s/step - R2Score: -0.1037 - loss: 281018.2500 - mae: 281018.2500
Epoch 5/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 2s/step - R2Score: -0.0280 - loss: 260381.0156 - mae: 260381.0156
Epoch 6/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 2s/step - R2Score: 0.0865 - loss: 247966.6250 - mae: 247966.6250
Epoch 7/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 2s/step - R2Scor

Unnamed: 0_level_0,MAE,R2
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
"NN (image, tabular, text)",249079,0.253
"NN Base (image, tabular)",227783,0.33
"NN Resnet (image, tabular)",228431,0.34
LR (tabular),223539,0.387



