# Imports

In [1]:
import os
import pandas as pd
import numpy as np
import time
from copy import deepcopy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from tqdm import tqdm

import ollama

In [2]:
# Force CUDA usage
os.environ["OLLAMA_BACKEND"] = "cuda"
os.environ["OLLAMA_NUM_THREADS"] = "16"

### Parameters

In [3]:
# List of models
ml_models = [
    ('Logistic Regression', LogisticRegression(random_state=42, max_iter=1000)), # 2 sec
    ('Random Forest', RandomForestClassifier(random_state=42)), # 2 min
    ('SVM', SVC(probability=True, random_state=42)), # 30 min
    ('KNN', KNeighborsClassifier()), # 30 sec
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)) # 30 sec
]

In [4]:
# LLMs
models = ['llama3.2:1b', 'llama3.2:3b', 'gemma3:1b', 'gemma3:4b', 'llama3.1', 'dolphin3', 'mistral', 'deepseek-llm']

In [5]:
# Vectorize (Bag of words representation)
vectorizer = CountVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2))

In [6]:
# Number of rows that LLM will generate and compare to
n_rows = 5000

In [7]:
# list of all letters to stop on (LLM should only return number 1 to 10
stop_chars = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")

# Load data

In [8]:
# Load the CSV file into a pandas DataFrame
# We can only make a fair comparison if we partition similar train and test set 
# train 4000 --> test 1000
# train llm features 4000 --> 1000 test 
df = pd.read_csv('IMB_preprocessed_2025_04_06.csv')[:n_rows] 

# Display the first 5 records
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. \nThe filming t...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


# LLM Feature Generation

### Qualitative loop

In [9]:
prompt = '''Analyze this movie review and provide a WHOLE NUMBER rating between 1 and 10 for the "{metric}".
Return ONLY a single digit, with no other text, symbols, or explanations.

Review:
{review}'''

In [10]:
# Review metrics
metrics = ['plot', 'characters', 'acting & performances', 'cinematography']

In [14]:
# Iterate through models
for index, model in enumerate(models):
    print("Processing Model: " + model + " (Model " + str(index + 1) + "/" + str(len(models)) + ")")

    # For each row
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Classifying rows"):
        # For each feature that we will generate 
        for metric in metrics:
            # Format the prompt based on the row (review) and the metric
            formatted_prompt = prompt.format(review=row['review'], metric=metric)

            raw_response = None
            attempts = 0
            success = False
            
            # Try up to 10 times
            while attempts < 10 and not success:
                try:
                    # Generate feature 
                    raw_response = ollama.generate(model=model, prompt=formatted_prompt, options={"stop": stop_chars})['response'] # Stop with letters
                    response = int(''.join(filter(str.isdigit, raw_response)))
    
                    # Assert validity response
                    assert 1 <= response <= 10
                    
                    # Store response
                    df.at[index, f"{model}_{metric}"] = response
                    success = True
                    
                except Exception as e:
                    attempts += 1
            
            # If all attempts failed, store 5
            if not success:
                df.at[index, f"{model}_{metric}"] = 5
                print(f"All attempts failed for model {model}, metric {metric}, row {index}. Storing default value 5.")
                print(raw_response)

Processing Model: dolphin3 (Model 1/3)


Classifying rows:  60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                      | 3024/5000 [3:43:32<1:52:06,  3.40s/it]

All attempts failed for model dolphin3, metric plot, row 3024. Storing default value 5.
I
All attempts failed for model dolphin3, metric characters, row 3024. Storing default value 5.
Do
All attempts failed for model dolphin3, metric acting & performances, row 3024. Storing default value 5.
T


Classifying rows:  60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                      | 3025/5000 [3:43:45<3:23:03,  6.17s/it]

All attempts failed for model dolphin3, metric cinematography, row 3024. Storing default value 5.
Do


Classifying rows:  73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 3654/5000 [4:29:25<1:50:56,  4.95s/it]

All attempts failed for model dolphin3, metric plot, row 3654. Storing default value 5.
B
All attempts failed for model dolphin3, metric characters, row 3654. Storing default value 5.
I
All attempts failed for model dolphin3, metric acting & performances, row 3654. Storing default value 5.
I


Classifying rows:  73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 3655/5000 [4:29:37<2:42:07,  7.23s/it]

All attempts failed for model dolphin3, metric cinematography, row 3654. Storing default value 5.
Th


Classifying rows: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [6:09:43<00:00,  4.44s/it]


Processing Model: mistral (Model 2/3)


Classifying rows:   3%|█████▉                                                                                                                                                                                                                       | 133/5000 [08:15<4:56:12,  3.65s/it]

All attempts failed for model mistral, metric plot, row 133. Storing default value 5.
0 (N
All attempts failed for model mistral, metric characters, row 133. Storing default value 5.
0
All attempts failed for model mistral, metric acting & performances, row 133. Storing default value 5.
0 (B


Classifying rows:   3%|█████▉                                                                                                                                                                                                                       | 134/5000 [08:22<6:05:21,  4.51s/it]

All attempts failed for model mistral, metric cinematography, row 133. Storing default value 5.
0 (T


Classifying rows:   4%|████████▎                                                                                                                                                                                                                    | 189/5000 [12:07<5:03:35,  3.79s/it]

All attempts failed for model mistral, metric characters, row 189. Storing default value 5.
0


Classifying rows:   4%|████████▍                                                                                                                                                                                                                    | 190/5000 [12:17<7:33:02,  5.65s/it]

All attempts failed for model mistral, metric cinematography, row 189. Storing default value 5.
0 (Th


Classifying rows:   5%|██████████▋                                                                                                                                                                                                                  | 242/5000 [15:07<3:29:00,  2.64s/it]

All attempts failed for model mistral, metric characters, row 242. Storing default value 5.
0 (T


Classifying rows:   7%|███████████████▋                                                                                                                                                                                                             | 355/5000 [22:18<6:16:21,  4.86s/it]

All attempts failed for model mistral, metric plot, row 355. Storing default value 5.
0
All attempts failed for model mistral, metric characters, row 355. Storing default value 5.
0
All attempts failed for model mistral, metric acting & performances, row 355. Storing default value 5.
0


Classifying rows:   8%|██████████████████▍                                                                                                                                                                                                          | 417/5000 [26:38<3:53:30,  3.06s/it]

All attempts failed for model mistral, metric cinematography, row 416. Storing default value 5.
0 (N


Classifying rows:  10%|███████████████████████                                                                                                                                                                                                      | 523/5000 [33:34<4:06:21,  3.30s/it]

All attempts failed for model mistral, metric characters, row 523. Storing default value 5.
0


Classifying rows:  14%|███████████████████████████████▊                                                                                                                                                                                             | 721/5000 [46:22<5:54:19,  4.97s/it]

All attempts failed for model mistral, metric plot, row 721. Storing default value 5.
0


Classifying rows:  16%|██████████████████████████████████▌                                                                                                                                                                                          | 783/5000 [49:35<2:39:49,  2.27s/it]

All attempts failed for model mistral, metric plot, row 783. Storing default value 5.
0
All attempts failed for model mistral, metric characters, row 783. Storing default value 5.
0
All attempts failed for model mistral, metric acting & performances, row 783. Storing default value 5.
0


Classifying rows:  21%|████████████████████████████████████████████▊                                                                                                                                                                             | 1027/5000 [1:03:44<4:15:41,  3.86s/it]

All attempts failed for model mistral, metric plot, row 1027. Storing default value 5.
0
All attempts failed for model mistral, metric characters, row 1027. Storing default value 5.
0 (B


Classifying rows:  21%|██████████████████████████████████████████████▋                                                                                                                                                                           | 1070/5000 [1:06:25<3:26:36,  3.15s/it]

All attempts failed for model mistral, metric characters, row 1070. Storing default value 5.
0 (T
All attempts failed for model mistral, metric acting & performances, row 1070. Storing default value 5.
0 (T


Classifying rows:  22%|████████████████████████████████████████████████                                                                                                                                                                          | 1102/5000 [1:08:17<4:39:12,  4.30s/it]

All attempts failed for model mistral, metric plot, row 1102. Storing default value 5.
0
All attempts failed for model mistral, metric characters, row 1102. Storing default value 5.
0
All attempts failed for model mistral, metric acting & performances, row 1102. Storing default value 5.
0


Classifying rows:  25%|██████████████████████████████████████████████████████▊                                                                                                                                                                   | 1258/5000 [1:16:46<3:26:26,  3.31s/it]

All attempts failed for model mistral, metric plot, row 1258. Storing default value 5.
4.5
All attempts failed for model mistral, metric characters, row 1258. Storing default value 5.
4.5
All attempts failed for model mistral, metric acting & performances, row 1258. Storing default value 5.
4.5


Classifying rows:  25%|██████████████████████████████████████████████████████▉                                                                                                                                                                   | 1259/5000 [1:17:00<6:53:12,  6.63s/it]

All attempts failed for model mistral, metric cinematography, row 1258. Storing default value 5.
4.5


Classifying rows:  29%|██████████████████████████████████████████████████████████████▎                                                                                                                                                           | 1429/5000 [1:25:58<3:01:51,  3.06s/it]

All attempts failed for model mistral, metric cinematography, row 1428. Storing default value 5.
0


Classifying rows:  33%|███████████████████████████████████████████████████████████████████████▏                                                                                                                                                  | 1634/5000 [1:39:52<2:54:01,  3.10s/it]

All attempts failed for model mistral, metric plot, row 1634. Storing default value 5.
7.5
All attempts failed for model mistral, metric acting & performances, row 1634. Storing default value 5.
7.5


Classifying rows:  41%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                 | 2029/5000 [2:03:47<3:26:27,  4.17s/it]

All attempts failed for model mistral, metric characters, row 2029. Storing default value 5.
0


Classifying rows:  44%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                         | 2206/5000 [2:14:31<2:42:51,  3.50s/it]

All attempts failed for model mistral, metric plot, row 2206. Storing default value 5.
0


Classifying rows:  48%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                 | 2405/5000 [2:26:57<2:01:27,  2.81s/it]

All attempts failed for model mistral, metric plot, row 2405. Storing default value 5.
0


Classifying rows:  49%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                               | 2439/5000 [2:29:09<3:42:41,  5.22s/it]

All attempts failed for model mistral, metric plot, row 2439. Storing default value 5.
11 (I


Classifying rows:  49%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                               | 2440/5000 [2:29:21<5:06:36,  7.19s/it]

All attempts failed for model mistral, metric cinematography, row 2439. Storing default value 5.
11


Classifying rows:  54%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                     | 2680/5000 [2:44:13<1:37:43,  2.53s/it]

All attempts failed for model mistral, metric characters, row 2680. Storing default value 5.
0 (T


Classifying rows:  55%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                  | 2750/5000 [2:48:23<2:00:02,  3.20s/it]

All attempts failed for model mistral, metric plot, row 2750. Storing default value 5.
0
All attempts failed for model mistral, metric characters, row 2750. Storing default value 5.
0 (z
All attempts failed for model mistral, metric acting & performances, row 2750. Storing default value 5.
0


Classifying rows:  57%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                              | 2827/5000 [2:53:39<2:09:09,  3.57s/it]

All attempts failed for model mistral, metric characters, row 2827. Storing default value 5.
0


Classifying rows:  57%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                              | 2828/5000 [2:53:48<3:06:53,  5.16s/it]

All attempts failed for model mistral, metric cinematography, row 2827. Storing default value 5.
0 (T


Classifying rows:  60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                       | 2991/5000 [3:05:00<2:11:24,  3.92s/it]

All attempts failed for model mistral, metric characters, row 2991. Storing default value 5.
0 (Th


Classifying rows:  60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                      | 3024/5000 [3:06:59<1:29:31,  2.72s/it]

All attempts failed for model mistral, metric plot, row 3024. Storing default value 5.
 Th
All attempts failed for model mistral, metric characters, row 3024. Storing default value 5.
 Titl
All attempts failed for model mistral, metric acting & performances, row 3024. Storing default value 5.
 Th


Classifying rows:  60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                      | 3025/5000 [3:07:19<4:19:33,  7.89s/it]

All attempts failed for model mistral, metric cinematography, row 3024. Storing default value 5.
 Titl


Classifying rows:  65%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                           | 3260/5000 [3:21:35<1:43:17,  3.56s/it]

All attempts failed for model mistral, metric characters, row 3260. Storing default value 5.
0
All attempts failed for model mistral, metric acting & performances, row 3260. Storing default value 5.
0


Classifying rows:  65%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                           | 3261/5000 [3:21:43<2:24:39,  4.99s/it]

All attempts failed for model mistral, metric cinematography, row 3260. Storing default value 5.
0 (Th


Classifying rows:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 3628/5000 [3:43:57<1:09:19,  3.03s/it]

All attempts failed for model mistral, metric cinematography, row 3627. Storing default value 5.
0 (Th


Classifying rows:  73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 3654/5000 [3:45:29<1:35:20,  4.25s/it]

All attempts failed for model mistral, metric plot, row 3654. Storing default value 5.
 Th
All attempts failed for model mistral, metric characters, row 3654. Storing default value 5.
 I
All attempts failed for model mistral, metric acting & performances, row 3654. Storing default value 5.
 I


Classifying rows:  73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 3655/5000 [3:45:38<2:09:12,  5.76s/it]

All attempts failed for model mistral, metric cinematography, row 3654. Storing default value 5.
 I


Classifying rows:  79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 3932/5000 [4:02:28<57:50,  3.25s/it]

All attempts failed for model mistral, metric plot, row 3932. Storing default value 5.
0
All attempts failed for model mistral, metric characters, row 3932. Storing default value 5.
0 (T
All attempts failed for model mistral, metric acting & performances, row 3932. Storing default value 5.
0 (T


Classifying rows:  79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                              | 3933/5000 [4:02:34<1:13:54,  4.16s/it]

All attempts failed for model mistral, metric cinematography, row 3932. Storing default value 5.
0 (T


Classifying rows:  81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                          | 4036/5000 [4:08:30<53:35,  3.34s/it]

All attempts failed for model mistral, metric characters, row 4036. Storing default value 5.
0 (T


Classifying rows:  82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                       | 4096/5000 [4:12:03<43:35,  2.89s/it]

All attempts failed for model mistral, metric plot, row 4096. Storing default value 5.
0
All attempts failed for model mistral, metric characters, row 4096. Storing default value 5.
0
All attempts failed for model mistral, metric acting & performances, row 4096. Storing default value 5.
0


Classifying rows:  84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 4223/5000 [4:20:18<1:16:57,  5.94s/it]

All attempts failed for model mistral, metric plot, row 4223. Storing default value 5.
0 (T


Classifying rows:  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                | 4258/5000 [4:22:03<43:15,  3.50s/it]

All attempts failed for model mistral, metric plot, row 4258. Storing default value 5.
0
All attempts failed for model mistral, metric characters, row 4258. Storing default value 5.
0
All attempts failed for model mistral, metric acting & performances, row 4258. Storing default value 5.
0


Classifying rows:  85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                | 4259/5000 [4:22:11<1:01:52,  5.01s/it]

All attempts failed for model mistral, metric cinematography, row 4258. Storing default value 5.
0


Classifying rows:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                         | 4422/5000 [4:32:13<43:04,  4.47s/it]

All attempts failed for model mistral, metric plot, row 4422. Storing default value 5.
0
All attempts failed for model mistral, metric characters, row 4422. Storing default value 5.
0
All attempts failed for model mistral, metric acting & performances, row 4422. Storing default value 5.
0


Classifying rows:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                         | 4423/5000 [4:32:21<51:17,  5.33s/it]

All attempts failed for model mistral, metric cinematography, row 4422. Storing default value 5.
0


Classifying rows:  89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                        | 4442/5000 [4:33:41<38:22,  4.13s/it]

All attempts failed for model mistral, metric plot, row 4442. Storing default value 5.
0
All attempts failed for model mistral, metric characters, row 4442. Storing default value 5.
0 (T
All attempts failed for model mistral, metric acting & performances, row 4442. Storing default value 5.
0


Classifying rows:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉              | 4679/5000 [4:49:06<23:54,  4.47s/it]

All attempts failed for model mistral, metric plot, row 4679. Storing default value 5.
0
All attempts failed for model mistral, metric characters, row 4679. Storing default value 5.
0
All attempts failed for model mistral, metric acting & performances, row 4679. Storing default value 5.
0


Classifying rows:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉              | 4680/5000 [4:49:12<26:57,  5.05s/it]

All attempts failed for model mistral, metric cinematography, row 4679. Storing default value 5.
0


Classifying rows:  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 4687/5000 [4:49:55<33:58,  6.51s/it]

All attempts failed for model mistral, metric cinematography, row 4686. Storing default value 5.
0 (


Classifying rows:  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████            | 4728/5000 [4:52:30<14:45,  3.25s/it]

All attempts failed for model mistral, metric plot, row 4728. Storing default value 5.
0
All attempts failed for model mistral, metric characters, row 4728. Storing default value 5.
0
All attempts failed for model mistral, metric acting & performances, row 4728. Storing default value 5.
0


Classifying rows:  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████            | 4729/5000 [4:52:35<17:59,  3.98s/it]

All attempts failed for model mistral, metric cinematography, row 4728. Storing default value 5.
0


Classifying rows:  95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 4771/5000 [4:55:21<21:28,  5.63s/it]

All attempts failed for model mistral, metric plot, row 4771. Storing default value 5.
0
All attempts failed for model mistral, metric characters, row 4771. Storing default value 5.
0
All attempts failed for model mistral, metric acting & performances, row 4771. Storing default value 5.
0


Classifying rows:  95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉          | 4772/5000 [4:55:28<22:25,  5.90s/it]

All attempts failed for model mistral, metric cinematography, row 4771. Storing default value 5.
0


Classifying rows:  97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 4866/5000 [5:01:40<13:43,  6.15s/it]

All attempts failed for model mistral, metric characters, row 4866. Storing default value 5.
0
All attempts failed for model mistral, metric acting & performances, row 4866. Storing default value 5.
0


Classifying rows:  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 4936/5000 [5:06:21<02:35,  2.43s/it]

All attempts failed for model mistral, metric characters, row 4936. Storing default value 5.
0


Classifying rows: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [5:10:06<00:00,  3.72s/it]


Processing Model: deepseek-llm (Model 3/3)


Classifying rows:   2%|███▋                                                                                                                                                                                                                          | 84/5000 [06:52<9:32:19,  6.99s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 83. Storing default value 5.
T


Classifying rows:   4%|█████████▌                                                                                                                                                                                                                   | 215/5000 [17:13<5:18:43,  4.00s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 214. Storing default value 5.
T


Classifying rows:   5%|███████████▌                                                                                                                                                                                                                | 264/5000 [21:10<12:48:51,  9.74s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 263. Storing default value 5.
Th


Classifying rows:   7%|████████████████                                                                                                                                                                                                            | 365/5000 [29:23<10:38:53,  8.27s/it]

All attempts failed for model deepseek-llm, metric plot, row 365. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric characters, row 365. Storing default value 5.
Th


Classifying rows:   7%|████████████████                                                                                                                                                                                                            | 366/5000 [29:43<15:28:47, 12.03s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 365. Storing default value 5.
Th


Classifying rows:   8%|██████████████████▍                                                                                                                                                                                                          | 417/5000 [33:54<5:23:31,  4.24s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 416. Storing default value 5.
T


Classifying rows:  10%|██████████████████████                                                                                                                                                                                                       | 499/5000 [40:48<5:23:24,  4.31s/it]

All attempts failed for model deepseek-llm, metric plot, row 499. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric characters, row 499. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric acting & performances, row 499. Storing default value 5.
Th


Classifying rows:  10%|██████████████████████                                                                                                                                                                                                      | 500/5000 [41:10<12:01:59,  9.63s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 499. Storing default value 5.
Th


Classifying rows:  10%|██████████████████████▋                                                                                                                                                                                                      | 514/5000 [42:01<4:51:04,  3.89s/it]

All attempts failed for model deepseek-llm, metric plot, row 514. Storing default value 5.
An


Classifying rows:  13%|█████████████████████████████▋                                                                                                                                                                                               | 672/5000 [54:52<4:13:20,  3.51s/it]

All attempts failed for model deepseek-llm, metric plot, row 672. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric characters, row 672. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric acting & performances, row 672. Storing default value 5.
Th


Classifying rows:  17%|█████████████████████████████████████▏                                                                                                                                                                                     | 848/5000 [1:08:15<4:02:11,  3.50s/it]

All attempts failed for model deepseek-llm, metric characters, row 848. Storing default value 5.
P


Classifying rows:  17%|█████████████████████████████████████▏                                                                                                                                                                                     | 849/5000 [1:08:24<5:55:02,  5.13s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 848. Storing default value 5.
P


Classifying rows:  17%|█████████████████████████████████████▋                                                                                                                                                                                     | 860/5000 [1:09:12<6:43:17,  5.84s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 859. Storing default value 5.
P


Classifying rows:  19%|█████████████████████████████████████████▎                                                                                                                                                                                 | 943/5000 [1:15:19<4:36:42,  4.09s/it]

All attempts failed for model deepseek-llm, metric acting & performances, row 943. Storing default value 5.
4.0


Classifying rows:  19%|█████████████████████████████████████████▎                                                                                                                                                                                 | 944/5000 [1:15:35<8:43:38,  7.75s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 943. Storing default value 5.
Th


Classifying rows:  19%|█████████████████████████████████████████▍                                                                                                                                                                                 | 946/5000 [1:15:47<7:25:09,  6.59s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 945. Storing default value 5.
T


Classifying rows:  20%|████████████████████████████████████████████▍                                                                                                                                                                             | 1018/5000 [1:20:59<5:46:33,  5.22s/it]

All attempts failed for model deepseek-llm, metric acting & performances, row 1018. Storing default value 5.
4/10


Classifying rows:  21%|██████████████████████████████████████████████                                                                                                                                                                            | 1057/5000 [1:24:27<4:26:12,  4.05s/it]

All attempts failed for model deepseek-llm, metric plot, row 1057. Storing default value 5.
An


Classifying rows:  22%|████████████████████████████████████████████████                                                                                                                                                                          | 1102/5000 [1:27:56<5:48:54,  5.37s/it]

All attempts failed for model deepseek-llm, metric acting & performances, row 1102. Storing default value 5.
4/10


Classifying rows:  22%|███████████████████████████████████████████████▊                                                                                                                                                                         | 1103/5000 [1:28:18<11:04:59, 10.24s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 1102. Storing default value 5.
4/10


Classifying rows:  24%|███████████████████████████████████████████████████▋                                                                                                                                                                      | 1186/5000 [1:34:03<4:57:24,  4.68s/it]

All attempts failed for model deepseek-llm, metric plot, row 1186. Storing default value 5.
P


Classifying rows:  25%|██████████████████████████████████████████████████████▌                                                                                                                                                                   | 1252/5000 [1:38:55<4:43:22,  4.54s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 1251. Storing default value 5.
T


Classifying rows:  25%|███████████████████████████████████████████████████████▏                                                                                                                                                                  | 1267/5000 [1:40:11<4:49:19,  4.65s/it]

All attempts failed for model deepseek-llm, metric acting & performances, row 1267. Storing default value 5.
4.2


Classifying rows:  27%|███████████████████████████████████████████████████████████▌                                                                                                                                                              | 1367/5000 [1:47:01<4:11:57,  4.16s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 1366. Storing default value 5.
T


Classifying rows:  28%|████████████████████████████████████████████████████████████▏                                                                                                                                                             | 1381/5000 [1:47:52<3:29:21,  3.47s/it]

All attempts failed for model deepseek-llm, metric characters, row 1381. Storing default value 5.
Th


Classifying rows:  28%|████████████████████████████████████████████████████████████▎                                                                                                                                                             | 1382/5000 [1:48:10<7:56:06,  7.90s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 1381. Storing default value 5.
Th


Classifying rows:  28%|██████████████████████████████████████████████████████████████                                                                                                                                                            | 1423/5000 [1:50:58<4:27:51,  4.49s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 1422. Storing default value 5.
T


Classifying rows:  29%|███████████████████████████████████████████████████████████████                                                                                                                                                           | 1447/5000 [1:52:57<8:44:00,  8.85s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 1446. Storing default value 5.
Th


Classifying rows:  30%|████████████████████████████████████████████████████████████████▍                                                                                                                                                         | 1478/5000 [1:55:47<4:56:14,  5.05s/it]

All attempts failed for model deepseek-llm, metric acting & performances, row 1478. Storing default value 5.
Th


Classifying rows:  31%|██████████████████████████████████████████████████████████████████▊                                                                                                                                                       | 1531/5000 [2:00:37<3:22:50,  3.51s/it]

All attempts failed for model deepseek-llm, metric plot, row 1531. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric characters, row 1531. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric acting & performances, row 1531. Storing default value 5.
Th


Classifying rows:  31%|██████████████████████████████████████████████████████████████████▍                                                                                                                                                      | 1532/5000 [2:01:06<10:44:17, 11.15s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 1531. Storing default value 5.
Th


Classifying rows:  31%|███████████████████████████████████████████████████████████████████▏                                                                                                                                                      | 1540/5000 [2:01:39<4:15:55,  4.44s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 1539. Storing default value 5.
T


Classifying rows:  34%|█████████████████████████████████████████████████████████████████████████▌                                                                                                                                                | 1687/5000 [2:13:15<6:43:47,  7.31s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 1686. Storing default value 5.
Th


Classifying rows:  35%|███████████████████████████████████████████████████████████████████████████▌                                                                                                                                              | 1732/5000 [2:16:51<3:26:50,  3.80s/it]

All attempts failed for model deepseek-llm, metric characters, row 1732. Storing default value 5.
T


Classifying rows:  35%|███████████████████████████████████████████████████████████████████████████▌                                                                                                                                              | 1733/5000 [2:16:57<4:03:32,  4.47s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 1732. Storing default value 5.
T


Classifying rows:  35%|███████████████████████████████████████████████████████████████████████████▉                                                                                                                                              | 1741/5000 [2:17:35<3:21:15,  3.71s/it]

All attempts failed for model deepseek-llm, metric plot, row 1741. Storing default value 5.
Th


Classifying rows:  40%|███████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                  | 2018/5000 [2:38:53<4:04:18,  4.92s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 2017. Storing default value 5.
P


Classifying rows:  40%|████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                 | 2023/5000 [2:39:36<7:54:42,  9.57s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 2022. Storing default value 5.
Th


Classifying rows:  41%|██████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                               | 2073/5000 [2:44:00<6:16:09,  7.71s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 2072. Storing default value 5.
Th


Classifying rows:  45%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                        | 2228/5000 [2:55:42<4:06:55,  5.34s/it]

All attempts failed for model deepseek-llm, metric plot, row 2228. Storing default value 5.
Th


Classifying rows:  45%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                        | 2229/5000 [2:55:56<6:14:20,  8.11s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 2228. Storing default value 5.
Th


Classifying rows:  45%|█████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                        | 2245/5000 [2:57:16<5:00:41,  6.55s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 2244. Storing default value 5.
Th


Classifying rows:  46%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                     | 2317/5000 [3:03:00<2:54:10,  3.89s/it]

All attempts failed for model deepseek-llm, metric plot, row 2317. Storing default value 5.
St
All attempts failed for model deepseek-llm, metric acting & performances, row 2317. Storing default value 5.
T


Classifying rows:  48%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                 | 2393/5000 [3:08:57<5:17:56,  7.32s/it]

All attempts failed for model deepseek-llm, metric acting & performances, row 2393. Storing default value 5.
Th


Classifying rows:  48%|████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                 | 2394/5000 [3:09:10<6:33:30,  9.06s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 2393. Storing default value 5.
Th


Classifying rows:  50%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                            | 2522/5000 [3:19:26<3:00:53,  4.38s/it]

All attempts failed for model deepseek-llm, metric plot, row 2522. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric acting & performances, row 2522. Storing default value 5.
Th


Classifying rows:  50%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                            | 2523/5000 [3:19:48<6:28:25,  9.41s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 2522. Storing default value 5.
Th


Classifying rows:  52%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                         | 2587/5000 [3:24:54<4:08:26,  6.18s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 2586. Storing default value 5.
T


Classifying rows:  52%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                       | 2622/5000 [3:27:44<4:04:48,  6.18s/it]

All attempts failed for model deepseek-llm, metric plot, row 2622. Storing default value 5.
4/5
All attempts failed for model deepseek-llm, metric characters, row 2622. Storing default value 5.
9/10
All attempts failed for model deepseek-llm, metric acting & performances, row 2622. Storing default value 5.
4/5


Classifying rows:  52%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                       | 2623/5000 [3:28:11<8:09:08, 12.35s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 2622. Storing default value 5.
4/5


Classifying rows:  56%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                | 2789/5000 [3:41:27<3:12:11,  5.22s/it]

All attempts failed for model deepseek-llm, metric plot, row 2789. Storing default value 5.
Th


Classifying rows:  56%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                | 2790/5000 [3:41:44<5:23:50,  8.79s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 2789. Storing default value 5.
Th


Classifying rows:  56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                               | 2822/5000 [3:44:23<2:41:16,  4.44s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 2821. Storing default value 5.
T


Classifying rows:  59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                         | 2949/5000 [3:55:31<2:02:26,  3.58s/it]

All attempts failed for model deepseek-llm, metric characters, row 2949. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric acting & performances, row 2949. Storing default value 5.
Th


Classifying rows:  60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                      | 3022/5000 [4:01:34<2:57:20,  5.38s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 3021. Storing default value 5.
P


Classifying rows:  60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                      | 3024/5000 [4:01:40<2:17:51,  4.19s/it]

All attempts failed for model deepseek-llm, metric plot, row 3024. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric characters, row 3024. Storing default value 5.
B
All attempts failed for model deepseek-llm, metric acting & performances, row 3024. Storing default value 5.
Th


Classifying rows:  60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                      | 3025/5000 [4:02:15<7:27:28, 13.59s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 3024. Storing default value 5.
Th


Classifying rows:  61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                    | 3060/5000 [4:05:21<3:05:52,  5.75s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 3059. Storing default value 5.
4.0


Classifying rows:  64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                               | 3181/5000 [4:14:10<2:19:32,  4.60s/it]

All attempts failed for model deepseek-llm, metric characters, row 3181. Storing default value 5.
T


Classifying rows:  64%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                             | 3216/5000 [4:17:00<1:58:08,  3.97s/it]

All attempts failed for model deepseek-llm, metric plot, row 3216. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric acting & performances, row 3216. Storing default value 5.
Th


Classifying rows:  64%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                             | 3217/5000 [4:17:20<4:17:19,  8.66s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 3216. Storing default value 5.
Th


Classifying rows:  65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                             | 3231/5000 [4:18:31<2:31:52,  5.15s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 3230. Storing default value 5.
T


Classifying rows:  65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                            | 3255/5000 [4:20:27<2:01:49,  4.19s/it]

All attempts failed for model deepseek-llm, metric plot, row 3255. Storing default value 5.
T


Classifying rows:  68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                    | 3421/5000 [4:33:37<2:36:22,  5.94s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 3420. Storing default value 5.
Th


Classifying rows:  69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                   | 3453/5000 [4:36:33<3:10:34,  7.39s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 3452. Storing default value 5.
4/10


Classifying rows:  69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                   | 3455/5000 [4:36:43<2:40:17,  6.22s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 3454. Storing default value 5.
P


Classifying rows:  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                  | 3488/5000 [4:39:18<1:38:07,  3.89s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 3487. Storing default value 5.
P


Classifying rows:  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 3497/5000 [4:40:04<2:26:56,  5.87s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 3496. Storing default value 5.
P


Classifying rows:  73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                          | 3651/5000 [4:51:35<2:21:46,  6.31s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 3650. Storing default value 5.
#movi


Classifying rows:  73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 3654/5000 [4:51:52<2:06:43,  5.65s/it]

All attempts failed for model deepseek-llm, metric characters, row 3654. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric acting & performances, row 3654. Storing default value 5.
Th


Classifying rows:  73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 3655/5000 [4:52:26<5:14:50, 14.05s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 3654. Storing default value 5.
Th


Classifying rows:  75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                      | 3754/5000 [4:59:44<1:50:23,  5.32s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 3753. Storing default value 5.
P


Classifying rows:  76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                    | 3789/5000 [5:02:33<2:16:40,  6.77s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 3788. Storing default value 5.
T


Classifying rows:  79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                              | 3939/5000 [5:14:24<1:14:38,  4.22s/it]

All attempts failed for model deepseek-llm, metric acting & performances, row 3939. Storing default value 5.
Th


Classifying rows:  79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 3940/5000 [5:14:45<2:44:07,  9.29s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 3939. Storing default value 5.
Th


Classifying rows:  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                           | 4007/5000 [5:19:39<2:06:02,  7.62s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 4006. Storing default value 5.
3/10


Classifying rows:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                         | 4059/5000 [5:23:36<1:17:33,  4.95s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 4058. Storing default value 5.
T


Classifying rows:  84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                  | 4212/5000 [5:35:51<1:49:29,  8.34s/it]

All attempts failed for model deepseek-llm, metric characters, row 4212. Storing default value 5.



Classifying rows:  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 4259/5000 [5:39:16<58:49,  4.76s/it]

All attempts failed for model deepseek-llm, metric plot, row 4259. Storing default value 5.
T


Classifying rows:  85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                               | 4274/5000 [5:40:43<1:12:08,  5.96s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 4273. Storing default value 5.
T


Classifying rows:  86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                               | 4297/5000 [5:42:18<52:44,  4.50s/it]

All attempts failed for model deepseek-llm, metric acting & performances, row 4297. Storing default value 5.
Th


Classifying rows:  87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 4353/5000 [5:46:41<44:08,  4.09s/it]

All attempts failed for model deepseek-llm, metric acting & performances, row 4353. Storing default value 5.
Th


Classifying rows:  87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                            | 4354/5000 [5:46:58<1:27:01,  8.08s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 4353. Storing default value 5.
Th


Classifying rows:  88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 4393/5000 [5:50:20<1:34:14,  9.32s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 4392. Storing default value 5.
Th


Classifying rows:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                         | 4422/5000 [5:52:23<51:24,  5.34s/it]

All attempts failed for model deepseek-llm, metric plot, row 4422. Storing default value 5.
3/10
All attempts failed for model deepseek-llm, metric characters, row 4422. Storing default value 5.
4/10
All attempts failed for model deepseek-llm, metric acting & performances, row 4422. Storing default value 5.
4/10


Classifying rows:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                         | 4423/5000 [5:52:42<1:31:05,  9.47s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 4422. Storing default value 5.
St


Classifying rows:  89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 4452/5000 [5:55:18<54:34,  5.98s/it]

All attempts failed for model deepseek-llm, metric plot, row 4452. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric acting & performances, row 4452. Storing default value 5.
4/10


Classifying rows:  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 4453/5000 [5:55:51<2:09:03, 14.16s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 4452. Storing default value 5.
Th


Classifying rows:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                      | 4482/5000 [5:57:58<30:14,  3.50s/it]

All attempts failed for model deepseek-llm, metric plot, row 4482. Storing default value 5.
P


Classifying rows:  91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 4565/5000 [6:04:41<42:02,  5.80s/it]

All attempts failed for model deepseek-llm, metric acting & performances, row 4565. Storing default value 5.
4/10


Classifying rows:  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 4566/5000 [6:04:58<1:05:46,  9.09s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 4565. Storing default value 5.
9/10


Classifying rows:  91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                   | 4568/5000 [6:05:05<45:02,  6.26s/it]

All attempts failed for model deepseek-llm, metric characters, row 4568. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric acting & performances, row 4568. Storing default value 5.
Th


Classifying rows:  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 4569/5000 [6:05:24<1:12:36, 10.11s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 4568. Storing default value 5.
Th


Classifying rows:  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 4686/5000 [6:15:12<40:19,  7.70s/it]

All attempts failed for model deepseek-llm, metric plot, row 4686. Storing default value 5.
P


Classifying rows:  95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 4761/5000 [6:21:03<22:43,  5.70s/it]

All attempts failed for model deepseek-llm, metric characters, row 4761. Storing default value 5.
Th


Classifying rows:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 4776/5000 [6:22:33<19:03,  5.10s/it]

All attempts failed for model deepseek-llm, metric plot, row 4776. Storing default value 5.
Th


Classifying rows:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 4777/5000 [6:22:47<28:41,  7.72s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 4776. Storing default value 5.
Th


Classifying rows:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 4810/5000 [6:25:41<22:04,  6.97s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 4809. Storing default value 5.
T


Classifying rows:  97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 4863/5000 [6:29:38<11:03,  4.85s/it]

All attempts failed for model deepseek-llm, metric plot, row 4863. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric characters, row 4863. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric acting & performances, row 4863. Storing default value 5.
Th


Classifying rows:  97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 4864/5000 [6:29:59<22:09,  9.77s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 4863. Storing default value 5.
Th


Classifying rows:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 4899/5000 [6:32:52<07:06,  4.22s/it]

All attempts failed for model deepseek-llm, metric plot, row 4899. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric characters, row 4899. Storing default value 5.
Th
All attempts failed for model deepseek-llm, metric acting & performances, row 4899. Storing default value 5.
Th


Classifying rows:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 4900/5000 [6:33:12<15:15,  9.15s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 4899. Storing default value 5.
Th


Classifying rows:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 4902/5000 [6:33:26<13:44,  8.42s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 4901. Storing default value 5.
B


Classifying rows:  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 4951/5000 [6:37:32<04:26,  5.45s/it]

All attempts failed for model deepseek-llm, metric plot, row 4951. Storing default value 5.
Th


Classifying rows: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 4986/5000 [6:40:14<01:14,  5.35s/it]

All attempts failed for model deepseek-llm, metric cinematography, row 4985. Storing default value 5.
T


Classifying rows: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [6:41:12<00:00,  4.81s/it]


In [15]:
models = ['llama3.2:1b', 'llama3.2:3b', 'gemma3:1b', 'gemma3:4b', 'llama3.1', 'dolphin3', 'mistral', 'deepseek-llm']

In [16]:
df.head()

Unnamed: 0,review,sentiment,llama3.2:1b_plot,llama3.2:1b_characters,llama3.2:1b_acting & performances,llama3.2:1b_cinematography,llama3.2:3b_plot,llama3.2:3b_characters,llama3.2:3b_acting & performances,llama3.2:3b_cinematography,...,dolphin3_acting & performances,dolphin3_cinematography,mistral_plot,mistral_characters,mistral_acting & performances,mistral_cinematography,deepseek-llm_plot,deepseek-llm_characters,deepseek-llm_acting & performances,deepseek-llm_cinematography
0,One of the other reviewers has mentioned that ...,1,4.0,1.0,9.0,5.0,9.0,8.0,8.0,8.0,...,9.0,8.0,8.0,8.0,8.0,8.0,7.0,9.0,9.0,7.0
1,A wonderful little production. \nThe filming t...,1,6.0,5.0,8.0,3.0,9.0,8.0,8.0,8.0,...,9.0,8.0,9.0,9.0,9.0,9.0,4.0,4.0,4.0,4.0
2,I thought this was a wonderful way to spend ti...,1,3.0,2.0,1.0,3.0,7.0,8.0,8.0,8.0,...,8.0,7.0,8.0,7.0,8.0,8.0,4.0,4.0,4.0,4.0
3,Basically there's a family where a little boy ...,0,2.0,2.0,2.0,2.0,5.0,7.0,6.0,4.0,...,5.0,6.0,5.0,6.0,6.0,6.0,7.0,4.0,6.0,4.0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,5.0,3.0,6.0,6.0,7.0,8.0,7.0,8.0,...,9.0,9.0,7.0,8.0,8.0,9.0,4.0,4.0,4.0,4.0


# Comparison

### Split the data
We make a train and test split, however the train is seperated in the LLM generated features and the review text

In [17]:
def experimental_setup(data, model):
    # We select the cols review, sentiment and the metric columns for the model (LLM)
    selected_cols = ['review', 'sentiment'] + [f"{model}_{metric}" for metric in metrics]
    df = data.copy()[selected_cols]
    
    # Split into X and y
    X = df.drop('sentiment', axis=1)
    y = df['sentiment']
    
    # Split data into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_text = X_train['review']
    X_train_tab = X_train.drop('review', axis=1)
    X_test_text = X_test['review']
    X_test_tab = X_test.drop('review', axis=1)
    
    # Transform text reviews to Bag of Words representation
    X_train_text = vectorizer.fit_transform(X_train_text)
    X_test_text = vectorizer.transform(X_test_text)
    
    # Print the shapes of the sets
    print(f"Train shapes - Text: {X_train_text.shape}, Tab: {X_train_tab.shape}, Label: {y_train.shape}")
    print(f"Test shapes  - Text: {X_test_text.shape}, Tab: {X_test_tab.shape}, Label: {y_test.shape}")
    print()

    return X_train_text, X_train_tab, y_train, X_test_text, X_test_tab, y_test

### Method to train model and get performance metrics

In [18]:
# Function to train and evaluate models with multiple metrics
def evaluate_models_with_metrics(models, X_train, y_train, X_test, y_test):
    results = []

    for name, model in models:      
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict on test set
        y_test_pred = model.predict(X_test)

        # Store results for the model
        model_results = {
            'Model': name,
            'Test Accuracy': accuracy_score(y_test, y_test_pred),
            'Test Precision': precision_score(y_test, y_test_pred),
            'Test Recall': recall_score(y_test, y_test_pred),
            'Test F1-Score': f1_score(y_test, y_test_pred)
        }
        
        results.append(model_results)

    # Convert results to a pandas DataFrame
    results_df = pd.DataFrame(results).set_index("Model")
    
    return results_df

## Textual vs LLM Generated Features - Classifcations

In [19]:
# For each LLM
for index, model in enumerate(models):
    print("Processing Model: " + model + " (Model " + str(index + 1) + "/" + str(len(models)) + ")")

    # Generate the data split
    X_train_text, X_train_tab, y_train, X_test_text, X_test_tab, y_test = experimental_setup(df, model)

    # Evaluate models with multiple metrics and print results
    textual_results = evaluate_models_with_metrics(ml_models, X_train_text, y_train, X_test_text, y_test)
    tabular_results = evaluate_models_with_metrics(ml_models, X_train_tab, y_train, X_test_tab, y_test) 
    
    # Compute absolute and percentage differences
    abs_diff = (textual_results - tabular_results).abs().round(3)
    percent_diff = ((abs_diff / tabular_results) * 100).round(2)
    
    # Display results
    print("Predictions on review text:")
    display(textual_results)
    print("Predictions on tabular features derived from text (by LLM):")
    display(tabular_results)
    print("Absolute Differences:")
    display(abs_diff)
    print("Percentage Differences:")
    display(percent_diff)
    print()

Processing Model: llama3.2:1b (Model 1/8)
Train shapes - Text: (4000, 10000), Tab: (4000, 4), Label: (4000,)
Test shapes  - Text: (1000, 10000), Tab: (1000, 4), Label: (1000,)

Predictions on review text:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.866,0.840081,0.882979,0.860996
Random Forest,0.834,0.833333,0.808511,0.820734
SVM,0.841,0.806706,0.870213,0.837257
KNN,0.586,0.560606,0.551064,0.555794
Gradient Boosting,0.786,0.742424,0.834043,0.785571


Predictions on tabular features derived from text (by LLM):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.674,0.660714,0.629787,0.64488
Random Forest,0.597,0.571734,0.568085,0.569904
SVM,0.679,0.662309,0.646809,0.654467
KNN,0.635,0.610994,0.614894,0.612937
Gradient Boosting,0.667,0.643606,0.653191,0.648363


Absolute Differences:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.192,0.179,0.253,0.216
Random Forest,0.237,0.262,0.24,0.251
SVM,0.162,0.144,0.223,0.183
KNN,0.049,0.05,0.064,0.057
Gradient Boosting,0.119,0.099,0.181,0.137


Percentage Differences:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,28.49,27.09,40.17,33.49
Random Forest,39.7,45.83,42.25,44.04
SVM,23.86,21.74,34.48,27.96
KNN,7.72,8.18,10.41,9.3
Gradient Boosting,17.84,15.38,27.71,21.13



Processing Model: llama3.2:3b (Model 2/8)
Train shapes - Text: (4000, 10000), Tab: (4000, 4), Label: (4000,)
Test shapes  - Text: (1000, 10000), Tab: (1000, 4), Label: (1000,)

Predictions on review text:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.866,0.840081,0.882979,0.860996
Random Forest,0.834,0.833333,0.808511,0.820734
SVM,0.841,0.806706,0.870213,0.837257
KNN,0.586,0.560606,0.551064,0.555794
Gradient Boosting,0.786,0.742424,0.834043,0.785571


Predictions on tabular features derived from text (by LLM):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.875,0.854209,0.885106,0.869383
Random Forest,0.851,0.829569,0.859574,0.844305
SVM,0.874,0.852459,0.885106,0.868476
KNN,0.857,0.845666,0.851064,0.848356
Gradient Boosting,0.873,0.852156,0.882979,0.867294


Absolute Differences:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.009,0.014,0.002,0.008
Random Forest,0.017,0.004,0.051,0.024
SVM,0.033,0.046,0.015,0.031
KNN,0.271,0.285,0.3,0.293
Gradient Boosting,0.087,0.11,0.049,0.082


Percentage Differences:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,1.03,1.64,0.23,0.92
Random Forest,2.0,0.48,5.93,2.84
SVM,3.78,5.4,1.69,3.57
KNN,31.62,33.7,35.25,34.54
Gradient Boosting,9.97,12.91,5.55,9.45



Processing Model: gemma3:1b (Model 3/8)
Train shapes - Text: (4000, 10000), Tab: (4000, 4), Label: (4000,)
Test shapes  - Text: (1000, 10000), Tab: (1000, 4), Label: (1000,)

Predictions on review text:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.866,0.840081,0.882979,0.860996
Random Forest,0.834,0.833333,0.808511,0.820734
SVM,0.841,0.806706,0.870213,0.837257
KNN,0.586,0.560606,0.551064,0.555794
Gradient Boosting,0.786,0.742424,0.834043,0.785571


Predictions on tabular features derived from text (by LLM):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.805,0.855297,0.704255,0.772462
Random Forest,0.826,0.877551,0.731915,0.798144
SVM,0.811,0.838554,0.740426,0.786441
KNN,0.827,0.875949,0.73617,0.8
Gradient Boosting,0.828,0.86165,0.755319,0.804989


Absolute Differences:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.061,0.015,0.179,0.089
Random Forest,0.008,0.044,0.077,0.023
SVM,0.03,0.032,0.13,0.051
KNN,0.241,0.315,0.185,0.244
Gradient Boosting,0.042,0.119,0.079,0.019


Percentage Differences:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,7.58,1.75,25.42,11.52
Random Forest,0.97,5.01,10.52,2.88
SVM,3.7,3.82,17.56,6.48
KNN,29.14,35.96,25.13,30.5
Gradient Boosting,5.07,13.81,10.46,2.36



Processing Model: gemma3:4b (Model 4/8)
Train shapes - Text: (4000, 10000), Tab: (4000, 4), Label: (4000,)
Test shapes  - Text: (1000, 10000), Tab: (1000, 4), Label: (1000,)

Predictions on review text:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.866,0.840081,0.882979,0.860996
Random Forest,0.834,0.833333,0.808511,0.820734
SVM,0.841,0.806706,0.870213,0.837257
KNN,0.586,0.560606,0.551064,0.555794
Gradient Boosting,0.786,0.742424,0.834043,0.785571


Predictions on tabular features derived from text (by LLM):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.938,0.943478,0.923404,0.933333
Random Forest,0.939,0.951435,0.917021,0.933911
SVM,0.939,0.949451,0.919149,0.934054
KNN,0.942,0.953744,0.921277,0.937229
Gradient Boosting,0.939,0.945534,0.923404,0.934338


Absolute Differences:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.072,0.103,0.04,0.072
Random Forest,0.105,0.118,0.109,0.113
SVM,0.098,0.143,0.049,0.097
KNN,0.356,0.393,0.37,0.381
Gradient Boosting,0.153,0.203,0.089,0.149


Percentage Differences:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,7.68,10.92,4.33,7.71
Random Forest,11.18,12.4,11.89,12.1
SVM,10.44,15.06,5.33,10.38
KNN,37.79,41.21,40.16,40.65
Gradient Boosting,16.29,21.47,9.64,15.95



Processing Model: llama3.1 (Model 5/8)
Train shapes - Text: (4000, 10000), Tab: (4000, 4), Label: (4000,)
Test shapes  - Text: (1000, 10000), Tab: (1000, 4), Label: (1000,)

Predictions on review text:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.866,0.840081,0.882979,0.860996
Random Forest,0.834,0.833333,0.808511,0.820734
SVM,0.841,0.806706,0.870213,0.837257
KNN,0.586,0.560606,0.551064,0.555794
Gradient Boosting,0.786,0.742424,0.834043,0.785571


Predictions on tabular features derived from text (by LLM):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.914,0.884,0.940426,0.91134
Random Forest,0.904,0.894515,0.902128,0.898305
SVM,0.907,0.890269,0.914894,0.902413
KNN,0.91,0.907725,0.9,0.903846
Gradient Boosting,0.91,0.894191,0.917021,0.905462


Absolute Differences:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.048,0.044,0.057,0.05
Random Forest,0.07,0.061,0.094,0.078
SVM,0.066,0.084,0.045,0.065
KNN,0.324,0.347,0.349,0.348
Gradient Boosting,0.124,0.152,0.083,0.12


Percentage Differences:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,5.25,4.98,6.06,5.49
Random Forest,7.74,6.82,10.42,8.68
SVM,7.28,9.44,4.92,7.2
KNN,35.6,38.23,38.78,38.5
Gradient Boosting,13.63,17.0,9.05,13.25



Processing Model: dolphin3 (Model 6/8)
Train shapes - Text: (4000, 10000), Tab: (4000, 4), Label: (4000,)
Test shapes  - Text: (1000, 10000), Tab: (1000, 4), Label: (1000,)

Predictions on review text:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.866,0.840081,0.882979,0.860996
Random Forest,0.834,0.833333,0.808511,0.820734
SVM,0.841,0.806706,0.870213,0.837257
KNN,0.586,0.560606,0.551064,0.555794
Gradient Boosting,0.786,0.742424,0.834043,0.785571


Predictions on tabular features derived from text (by LLM):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.912,0.894628,0.921277,0.907757
Random Forest,0.903,0.890985,0.904255,0.897571
SVM,0.911,0.897704,0.914894,0.906217
KNN,0.909,0.900634,0.906383,0.903499
Gradient Boosting,0.906,0.896624,0.904255,0.900424


Absolute Differences:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.046,0.055,0.038,0.047
Random Forest,0.069,0.058,0.096,0.077
SVM,0.07,0.091,0.045,0.069
KNN,0.323,0.34,0.355,0.348
Gradient Boosting,0.12,0.154,0.07,0.115


Percentage Differences:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,5.04,6.15,4.12,5.18
Random Forest,7.64,6.51,10.62,8.58
SVM,7.68,10.14,4.92,7.61
KNN,35.53,37.75,39.17,38.52
Gradient Boosting,13.25,17.18,7.74,12.77



Processing Model: mistral (Model 7/8)
Train shapes - Text: (4000, 10000), Tab: (4000, 4), Label: (4000,)
Test shapes  - Text: (1000, 10000), Tab: (1000, 4), Label: (1000,)

Predictions on review text:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.866,0.840081,0.882979,0.860996
Random Forest,0.834,0.833333,0.808511,0.820734
SVM,0.841,0.806706,0.870213,0.837257
KNN,0.586,0.560606,0.551064,0.555794
Gradient Boosting,0.786,0.742424,0.834043,0.785571


Predictions on tabular features derived from text (by LLM):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.953,0.941545,0.959574,0.950474
Random Forest,0.944,0.933054,0.948936,0.940928
SVM,0.954,0.952991,0.948936,0.950959
KNN,0.945,0.940552,0.942553,0.941552
Gradient Boosting,0.954,0.951064,0.951064,0.951064


Absolute Differences:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.087,0.101,0.077,0.089
Random Forest,0.11,0.1,0.14,0.12
SVM,0.113,0.146,0.079,0.114
KNN,0.359,0.38,0.391,0.386
Gradient Boosting,0.168,0.209,0.117,0.165


Percentage Differences:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,9.13,10.73,8.02,9.36
Random Forest,11.65,10.72,14.75,12.75
SVM,11.84,15.32,8.33,11.99
KNN,37.99,40.4,41.48,41.0
Gradient Boosting,17.61,21.98,12.3,17.35



Processing Model: deepseek-llm (Model 8/8)
Train shapes - Text: (4000, 10000), Tab: (4000, 4), Label: (4000,)
Test shapes  - Text: (1000, 10000), Tab: (1000, 4), Label: (1000,)

Predictions on review text:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.866,0.840081,0.882979,0.860996
Random Forest,0.834,0.833333,0.808511,0.820734
SVM,0.841,0.806706,0.870213,0.837257
KNN,0.586,0.560606,0.551064,0.555794
Gradient Boosting,0.786,0.742424,0.834043,0.785571


Predictions on tabular features derived from text (by LLM):


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.686,0.681395,0.623404,0.651111
Random Forest,0.838,0.776978,0.919149,0.842105
SVM,0.779,0.704433,0.912766,0.795181
KNN,0.745,0.804533,0.604255,0.690158
Gradient Boosting,0.837,0.771681,0.92766,0.842512


Absolute Differences:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.18,0.159,0.26,0.21
Random Forest,0.004,0.056,0.111,0.021
SVM,0.062,0.102,0.043,0.042
KNN,0.159,0.244,0.053,0.134
Gradient Boosting,0.051,0.029,0.094,0.057


Percentage Differences:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,26.24,23.33,41.71,32.25
Random Forest,0.48,7.21,12.08,2.49
SVM,7.96,14.48,4.71,5.28
KNN,21.34,30.33,8.77,19.42
Gradient Boosting,6.09,3.76,10.13,6.77





## Textual vs Ensemble - Classifcations

In [20]:
def evaluate_ensemble_model(ml_model_textual, ml_model_tabular): 
    assert ml_model_textual[0] == ml_model_tabular[0]
    model_name = ml_model_textual[0]
    
    # Extract model from tuple
    ml_model_tabular = ml_model_tabular[1]
    ml_model_textual = ml_model_textual[1]
    
    # Get prediction probabilities for both modalities       
    probas_tab_test = ml_model_tabular.predict_proba(X_test_tab)
    probas_txt_test = ml_model_textual.predict_proba(X_test_text)

    # Combine probabilities (average them)
    combined_proba_test = (probas_tab_test + probas_txt_test) / 2
    
    # Get final predictions
    y_pred_test = np.argmax(combined_proba_test, axis=1)

    # Store results
    ensemble_results = ({
        'Model': model_name,
        'Test Accuracy': accuracy_score(y_test, y_pred_test),
        'Test Precision': precision_score(y_test, y_pred_test),
        'Test Recall': recall_score(y_test, y_pred_test),
        'Test F1-Score': f1_score(y_test, y_pred_test)
    })

    return ensemble_results

In [21]:
# For each LLM
for index, model in enumerate(models):
    print("Processing Model: " + model + " (Model " + str(index + 1) + "/" + str(len(models)) + ")")

    # Generate the data split
    X_train_text, X_train_tab, y_train, X_test_text, X_test_tab, y_test = experimental_setup(df, model)

    # Evaluate models with multiple metrics and print results
    # This also has the effect of training the models in ml_models
    textual_results = evaluate_models_with_metrics(ml_models, X_train_text, y_train, X_test_text, y_test)
    textual_models = deepcopy(ml_models) # Save the list of trained models
        
    tabular_results = evaluate_models_with_metrics(ml_models, X_train_tab, y_train, X_test_tab, y_test)
    tabular_models = deepcopy(ml_models) # Save the list of trained models

    # We will store the results of the ensemble method
    ensemble_results_lst = []
    
    # Loop over the tabular and textual ml models to make ensemble models
    for ml_model_tabular, ml_model_textual in zip(tabular_models, textual_models):
        ensemble_results_lst.append(evaluate_ensemble_model(ml_model_textual, ml_model_tabular))

    ensemble_results = pd.DataFrame(ensemble_results_lst).set_index("Model")

    # Calculate percentage difference tables
    text_diff_pct = (100 * (ensemble_results - textual_results) / textual_results).round(2)
    tab_diff_pct = (100 * (ensemble_results - tabular_results) / tabular_results).round(2)
    
    # Display all results
    print("Ensemble Results:")
    display(ensemble_results)
    print("% Difference ensemble vs textual model:")
    display(text_diff_pct)
    print("% Difference ensemble vs tabular model:")
    display(tab_diff_pct)
    print()

Processing Model: llama3.2:1b (Model 1/8)
Train shapes - Text: (4000, 10000), Tab: (4000, 4), Label: (4000,)
Test shapes  - Text: (1000, 10000), Tab: (1000, 4), Label: (1000,)

Ensemble Results:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.877,0.860707,0.880851,0.870662
Random Forest,0.739,0.717256,0.734043,0.725552
SVM,0.856,0.82996,0.87234,0.850622
KNN,0.642,0.652174,0.510638,0.572792
Gradient Boosting,0.813,0.796646,0.808511,0.802534


% Difference ensemble vs textual model:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,1.27,2.46,-0.24,1.12
Random Forest,-11.39,-13.93,-9.21,-11.6
SVM,1.78,2.88,0.24,1.6
KNN,9.56,16.33,-7.34,3.06
Gradient Boosting,3.44,7.3,-3.06,2.16


% Difference ensemble vs tabular model:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,30.12,30.27,39.86,35.01
Random Forest,23.79,25.45,29.21,27.31
SVM,26.07,25.31,34.87,29.97
KNN,1.1,6.74,-16.96,-6.55
Gradient Boosting,21.89,23.78,23.78,23.78



Processing Model: llama3.2:3b (Model 2/8)
Train shapes - Text: (4000, 10000), Tab: (4000, 4), Label: (4000,)
Test shapes  - Text: (1000, 10000), Tab: (1000, 4), Label: (1000,)

Ensemble Results:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.913,0.903158,0.912766,0.907937
Random Forest,0.898,0.881743,0.904255,0.892857
SVM,0.911,0.902748,0.908511,0.90562
KNN,0.853,0.87471,0.802128,0.836848
Gradient Boosting,0.901,0.892178,0.897872,0.895016


% Difference ensemble vs textual model:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,5.43,7.51,3.37,5.45
Random Forest,7.67,5.81,11.84,8.79
SVM,8.32,11.91,4.4,8.17
KNN,45.56,56.03,45.56,50.57
Gradient Boosting,14.63,20.17,7.65,13.93


% Difference ensemble vs tabular model:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,4.34,5.73,3.13,4.43
Random Forest,5.52,6.29,5.2,5.75
SVM,4.23,5.9,2.64,4.28
KNN,-0.47,3.43,-5.75,-1.36
Gradient Boosting,3.21,4.7,1.69,3.2



Processing Model: gemma3:1b (Model 3/8)
Train shapes - Text: (4000, 10000), Tab: (4000, 4), Label: (4000,)
Test shapes  - Text: (1000, 10000), Tab: (1000, 4), Label: (1000,)

Ensemble Results:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.893,0.88535,0.887234,0.886291
Random Forest,0.879,0.864301,0.880851,0.872497
SVM,0.873,0.853608,0.880851,0.867016
KNN,0.801,0.867209,0.680851,0.762813
Gradient Boosting,0.868,0.831373,0.902128,0.865306


% Difference ensemble vs textual model:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,3.12,5.39,0.48,2.94
Random Forest,5.4,3.72,8.95,6.31
SVM,3.8,5.81,1.22,3.55
KNN,36.69,54.69,23.55,37.25
Gradient Boosting,10.43,11.98,8.16,10.15


% Difference ensemble vs tabular model:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,10.93,3.51,25.98,14.74
Random Forest,6.42,-1.51,20.35,9.32
SVM,7.64,1.8,18.97,10.25
KNN,-3.14,-1.0,-7.51,-4.65
Gradient Boosting,4.83,-3.51,19.44,7.49



Processing Model: gemma3:4b (Model 4/8)
Train shapes - Text: (4000, 10000), Tab: (4000, 4), Label: (4000,)
Test shapes  - Text: (1000, 10000), Tab: (1000, 4), Label: (1000,)

Ensemble Results:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.927,0.914405,0.931915,0.923077
Random Forest,0.945,0.950108,0.931915,0.940924
SVM,0.941,0.940043,0.934043,0.937033
KNN,0.922,0.947489,0.882979,0.914097
Gradient Boosting,0.946,0.946352,0.938298,0.942308


% Difference ensemble vs textual model:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,7.04,8.85,5.54,7.21
Random Forest,13.31,14.01,15.26,14.64
SVM,11.89,16.53,7.33,11.92
KNN,57.34,69.01,60.23,64.47
Gradient Boosting,20.36,27.47,12.5,19.95


% Difference ensemble vs tabular model:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-1.17,-3.08,0.92,-1.1
Random Forest,0.64,-0.14,1.62,0.75
SVM,0.21,-0.99,1.62,0.32
KNN,-2.12,-0.66,-4.16,-2.47
Gradient Boosting,0.75,0.09,1.61,0.85



Processing Model: llama3.1 (Model 5/8)
Train shapes - Text: (4000, 10000), Tab: (4000, 4), Label: (4000,)
Test shapes  - Text: (1000, 10000), Tab: (1000, 4), Label: (1000,)

Ensemble Results:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.925,0.912317,0.929787,0.920969
Random Forest,0.92,0.904564,0.92766,0.915966
SVM,0.926,0.9125,0.931915,0.922105
KNN,0.893,0.911565,0.855319,0.882547
Gradient Boosting,0.927,0.917895,0.92766,0.922751


% Difference ensemble vs textual model:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,6.81,8.6,5.3,6.97
Random Forest,10.31,8.55,14.74,11.6
SVM,10.11,13.11,7.09,10.13
KNN,52.39,62.6,55.21,58.79
Gradient Boosting,17.94,23.63,11.22,17.46


% Difference ensemble vs tabular model:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,1.2,3.2,-1.13,1.06
Random Forest,1.77,1.12,2.83,1.97
SVM,2.09,2.5,1.86,2.18
KNN,-1.87,0.42,-4.96,-2.36
Gradient Boosting,1.87,2.65,1.16,1.91



Processing Model: dolphin3 (Model 6/8)
Train shapes - Text: (4000, 10000), Tab: (4000, 4), Label: (4000,)
Test shapes  - Text: (1000, 10000), Tab: (1000, 4), Label: (1000,)

Ensemble Results:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.932,0.920502,0.93617,0.92827
Random Forest,0.913,0.903158,0.912766,0.907937
SVM,0.929,0.923567,0.925532,0.924548
KNN,0.893,0.915332,0.851064,0.882029
Gradient Boosting,0.928,0.923404,0.923404,0.923404


% Difference ensemble vs textual model:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,7.62,9.57,6.02,7.81
Random Forest,9.47,8.38,12.89,10.62
SVM,10.46,14.49,6.36,10.43
KNN,52.39,63.28,54.44,58.7
Gradient Boosting,18.07,24.38,10.71,17.55


% Difference ensemble vs tabular model:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,2.19,2.89,1.62,2.26
Random Forest,1.11,1.37,0.94,1.15
SVM,1.98,2.88,1.16,2.02
KNN,-1.76,1.63,-6.1,-2.38
Gradient Boosting,2.43,2.99,2.12,2.55



Processing Model: mistral (Model 7/8)
Train shapes - Text: (4000, 10000), Tab: (4000, 4), Label: (4000,)
Test shapes  - Text: (1000, 10000), Tab: (1000, 4), Label: (1000,)

Ensemble Results:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.93,0.909836,0.944681,0.926931
Random Forest,0.945,0.933194,0.951064,0.942044
SVM,0.949,0.948608,0.942553,0.945571
KNN,0.939,0.955457,0.912766,0.933624
Gradient Boosting,0.949,0.944798,0.946809,0.945802


% Difference ensemble vs textual model:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,7.39,8.3,6.99,7.66
Random Forest,13.31,11.98,17.63,14.78
SVM,12.84,17.59,8.31,12.94
KNN,60.24,70.43,65.64,67.98
Gradient Boosting,20.74,27.26,13.52,20.4


% Difference ensemble vs tabular model:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,-2.41,-3.37,-1.55,-2.48
Random Forest,0.11,0.01,0.22,0.12
SVM,-0.52,-0.46,-0.67,-0.57
KNN,-0.63,1.58,-3.16,-0.84
Gradient Boosting,-0.52,-0.66,-0.45,-0.55



Processing Model: deepseek-llm (Model 8/8)
Train shapes - Text: (4000, 10000), Tab: (4000, 4), Label: (4000,)
Test shapes  - Text: (1000, 10000), Tab: (1000, 4), Label: (1000,)

Ensemble Results:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.869,0.850932,0.874468,0.862539
Random Forest,0.868,0.816479,0.92766,0.868526
SVM,0.881,0.846154,0.912766,0.878199
KNN,0.777,0.845938,0.642553,0.730351
Gradient Boosting,0.887,0.83871,0.940426,0.88666


% Difference ensemble vs textual model:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.35,1.29,-0.96,0.18
Random Forest,4.08,-2.02,14.74,5.82
SVM,4.76,4.89,4.89,4.89
KNN,32.59,50.9,16.6,31.41
Gradient Boosting,12.85,12.97,12.76,12.87


% Difference ensemble vs tabular model:


Unnamed: 0_level_0,Test Accuracy,Test Precision,Test Recall,Test F1-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,26.68,24.88,40.27,32.47
Random Forest,3.58,5.08,0.93,3.14
SVM,13.09,20.12,0.0,10.44
KNN,4.3,5.15,6.34,5.82
Gradient Boosting,5.97,8.69,1.38,5.24



