In [8]:
import numpy as np
import matplotlib.pyplot as plt
from image_extractor import data_loader
import pandas as pd


# --- Configuration ---------------------------------------------------------
NUM_SAMPLES = 50000  # Change the amount of data seen by the model
PRICE_CAP = None     # forwarded to data_loader; presumably None means "no cap" -- TODO confirm
VERBOSE = False
# NOTE(review): idx_test is applied positionally to the frame returned by
# data_loader, so these settings presumably have to match the run that wrote
# test_predictions.npz -- confirm against the training notebook.

# --- Load saved test-set predictions -----------------------------------------
# np.load on an .npz returns a lazy NpzFile that keeps the archive open until
# it is closed. Indexing it materialises the array in memory, so both arrays
# stay usable after the `with` block has released the file handle.
with np.load("test_predictions.npz") as splits:
    idx_test = splits["idx_test"]
    test_predicted = splits["test_predicted"]

df = data_loader(NUM_SAMPLES, PRICE_CAP, verbose=VERBOSE)

# Not referenced by any cell visible here; kept in case a later cell uses it.
rng = np.random.default_rng(42)

# --- Align ground truth with predictions ------------------------------------
actual = df["price"].values[idx_test].astype(float)
pred = test_predicted.astype(float)

# Fail early with a readable message instead of a generic pandas
# length-mismatch error when the column is assigned below.
if len(actual) != len(pred):
    raise ValueError(
        f"test_predictions.npz holds {len(pred)} predictions but idx_test "
        f"selects {len(actual)} rows from the loaded data"
    )

# --- Per-row error metrics ----------------------------------------------------
analysis_df = df.iloc[idx_test].copy()

analysis_df["price_true"] = actual
analysis_df["price_pred"] = pred
# Signed error: positive means the model over-predicted.
analysis_df["error"] = analysis_df["price_pred"] - analysis_df["price_true"]
analysis_df["abs_error"] = np.abs(analysis_df["error"])
# Percentage error relative to the true price; rows whose true price is 0
# come out as inf/NaN rather than raising.
analysis_df["pct_error"] = analysis_df["abs_error"] / analysis_df["price_true"] * 100

# Largest absolute misses first.
worst_abs = analysis_df.sort_values("abs_error", ascending=False)

cols = [
    "price_true",
    "price_pred",
    "abs_error",
    "pct_error",
    "catalog_content",
    "image_path"
]

print(worst_abs[cols].head(20))



       price_true  price_pred   abs_error  pct_error  \
4146       613.58   58.076099  555.503901  90.534877   
44231      404.47   31.823341  372.646659  92.132089   
46320      439.98   74.917488  365.062512  82.972524   
42206      390.98   47.636250  343.343750  87.816193   
4859       369.81   65.858887  303.951113  82.191156   
19970      374.98   87.267372  287.712628  76.727460   
27063      306.00   19.405827  286.594173  93.658227   
12030      303.56   55.106022  248.453978  81.846745   
2059       270.69   23.655968  247.034032  91.260864   
48715      241.09   18.538574  222.551426  92.310517   
20457      276.31   60.778015  215.531985  78.003686   
29046      260.45   45.554585  214.895415  82.509278   
23186      230.89   29.767967  201.122033  87.107295   
7251       229.99   42.036373  187.953627  81.722521   
37452      228.46   42.355827  186.104173  81.460287   
17095      226.93   42.715065  184.214935  81.176986   
29860      185.11    4.846960  180.263040  97.38

In [9]:
# Split the test rows into five equal-frequency buckets by true price
# (quintiles), labelled from cheapest to most expensive.
analysis_df["price_bucket"] = pd.qcut(
    analysis_df["price_true"],
    q=5,
    labels=["Very Low", "Low", "Mid", "High", "Very High"]
)

# Mean absolute and percentage error per bucket.
# price_bucket is categorical, so `observed` is passed explicitly: relying on
# the implicit default emits a pandas FutureWarning. observed=False keeps the
# current behaviour (every category gets a row, even if it has no members).
analysis_df.groupby("price_bucket", observed=False)[["abs_error", "pct_error"]].mean()


  analysis_df.groupby("price_bucket")[["abs_error", "pct_error"]].mean()


Unnamed: 0_level_0,abs_error,pct_error
price_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1
Very Low,5.207667,183.998032
Low,4.098641,52.935027
Mid,4.489637,30.993928
High,10.082799,38.880959
Very High,38.464502,54.002917


In [12]:
# How many of the worst predictions to print in full.
TOP_N_WORST = 10

# worst_abs is already sorted by abs_error descending, so the first rows are
# the largest misses. Clamp to the number of available rows so a test split
# with fewer than TOP_N_WORST rows does not raise IndexError from .iloc.
for row in range(min(TOP_N_WORST, len(worst_abs))):

    item = worst_abs.iloc[row]

    print("TRUE PRICE:", item["price_true"])
    print("PRED PRICE:", item["price_pred"])
    print("ABS ERROR:", item["abs_error"])
    print("\nCATALOG CONTENT:\n")
    print(item["catalog_content"])


TRUE PRICE: 613.58
PRED PRICE: 58.07609939575195
ABS ERROR: 555.5039006042481

CATALOG CONTENT:

Item Name: Beverage/Topping Heated Condiment Dispenser from Benchmark #21011
Bullet Point 1: 5 Quart Capacity
Bullet Point 2: Clear Polycarbonate Bowl
Bullet Point 3: 1100 Watts
Bullet Point 4: Adjustable Thermostat
Bullet Point 5: Rotating Paddles
Product Description: Heated Beverage / Topping Condiment Dispenser from Benchmark #21011, great for beverages, toppings and sauces like hot chocolate, nacho cheese, drawn butter, au jus, broths, hot cider, teas, gravy, syrups, popcorn butter topping, cream sauces & more. This 5-quart capacity heated dispenser uses rotating paddles making it perfect for many beverages, toppings and sauces. It has an adjustable thermostat and 1100 watts of heating capacity. The easy-to-disassemble tap and the removable clear polycarbonate bowl are easily cleaned in a sink or dishwasher. The bowl can also be removed for refrigerated storage of unused product thereby