In [2]:
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('./../../data/all_keywords_merged.csv')

In [9]:
df.columns

Index(['keyword', 'source_file', 'asin', 'item_name', 'brand', 'image_count',
       'main_image_url', 'has_aplus', 'has_brand_story', 'review_count',
       'avg_rating', 'bsr_best', 'bsr_paths', 'units_per_month',
       'sales_velocity_daily', 'product_url', 'image_list'],
      dtype='object')

In [20]:
image_url_col = df['main_image_url']

In [21]:
image_url_col.iloc[0]

'https://m.media-amazon.com/images/I/31S4tOQj4SL.jpg'

# From Dylan: 
I worked on finding labels for the images since I think its going to be the strongest points. You guys can decide to use this notebook as a basis to further go deeper into the images. Do not alter this notebook as I may continue working on it. Create a new one adjacent to this one with your name.

I decided to try to get these:
- Clutter vs. Simplicity: segmentation models (detect background %), edge density, number of distinct color clusters.**DONE: ADDED TO DF**
- Presence of Text: many low-quality sellers add text like “BEST DEAL” or “50% OFF.” Best sellers often don’t use text overlays on the main image. Tesseract, EasyOCR, label is has_text. **DONE: no text in images**
- Composition & Focus: we will need AI for this and image detection. 1 object (main product) is usually better. Use object detection models (YOLO, Faster R-CNN) or just bounding-box heuristics. Might be unfeasible but worth trying.
- Image Quality Basics: of cours:  sharpness / focus,  brightness / exposure, contrast, saturation / color vibrance, noise / compression artifacts.
- Product Visibility: % of image covered by product. Cropped vs. fully visible. Overlapping/occluded? Maybe this is the ratio of product bounding box to total image.
- Color and Aesthetic Features: Too many competing colors implies cluttered. Harmonious palette may be professional. Extract color histograms / clustering (k-means?)

I decided to use ChatGPT for the image processing, specially color processing. If you're interested in learning more about it (because your career demands it), take your time to learn it properly.

In [22]:
import os, io, asyncio, random, hashlib
import aiohttp
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm
import cv2

from sklearn.cluster import KMeans


  from .autonotebook import tqdm as notebook_tqdm


In [23]:
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0 Safari/537.36"
)

def safe_name_from_url(url: str) -> str:
    h = hashlib.md5(url.encode("utf-8")).hexdigest()[:16]
    base = url.split("/")[-1].split("?")[0]
    ext = ".jpg" if "." not in base else f".{base.split('.')[-1]}"
    return f"{h}{ext}"

async def fetch_one(session, url, save_dir, sem):
    name = safe_name_from_url(url)
    out_path = os.path.join(save_dir, name)
    if os.path.exists(out_path):
        return out_path

    headers = {"User-Agent": USER_AGENT, "Accept": "image/avif,image/webp,image/*,*/*;q=0.8"}
    async with sem:
        for attempt in range(4):
            try:
                async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=20)) as r:
                    if r.status != 200:
                        await asyncio.sleep(0.7 * (attempt + 1))
                        continue
                    data = await r.read()
                    img = Image.open(io.BytesIO(data)).convert("RGB")
                    os.makedirs(save_dir, exist_ok=True)
                    img.save(out_path, format="JPEG", quality=95)
                    return out_path
            except Exception:
                await asyncio.sleep(0.7 * (attempt + 1))
        return None

async def download_all(urls, save_dir="images_amz", max_concurrency=8):
    sem = asyncio.Semaphore(max_concurrency)
    conn = aiohttp.TCPConnector(limit=0, ssl=False)
    async with aiohttp.ClientSession(connector=conn) as session:
        tasks = [fetch_one(session, u, save_dir, sem) for u in urls]
        results = []
        for f in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Downloading"):
            results.append(await f)
    return results


In [None]:
import nest_asyncio, asyncio
nest_asyncio.apply()

urls = df["main_image_url"].dropna().tolist()
paths = asyncio.run(download_all(urls, save_dir="images_amz", max_concurrency=8))

  self.__wakeup, context=self._context)
Downloading: 100%|██████████| 17363/17363 [13:02<00:00, 22.20it/s] 


ValueError: Length of values (17363) does not match length of index (17375)

In [28]:
import os, hashlib
import numpy as np
import pandas as pd

def expected_path_from_url(url, save_dir="images_amz"):
    if pd.isna(url):
        return np.nan
    base = url.split("/")[-1].split("?")[0]
    ext = ".jpg" if "." not in base else f".{base.split('.')[-1]}"
    name = hashlib.md5(url.encode("utf-8")).hexdigest()[:16] + ext
    path = os.path.join(save_dir, name)
    return path if os.path.exists(path) else np.nan

df["image_path"] = df["main_image_url"].apply(expected_path_from_url)


In [None]:
import numpy as np, cv2, pandas as pd
from PIL import Image
from sklearn.cluster import KMeans

def load_cv2(path):
    data = np.fromfile(path, dtype=np.uint8)
    return cv2.imdecode(data, cv2.IMREAD_COLOR) # BGR uint8

def edge_density(img_bgr, low=100, high=200):
    if img_bgr is None: return np.nan
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    h, w = gray.shape[:2]
    edges = cv2.Canny(gray, threshold1=low, threshold2=high)
    return float(np.count_nonzero(edges)) / float(h*w + 1e-9)

def background_fractions(img_bgr):
    if img_bgr is None:
        return dict(bg_white_pct=np.nan, bg_neutral_pct=np.nan)
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    H,S,V = cv2.split(hsv)
    Sf, Vf = S.astype(np.float32)/255.0, V.astype(np.float32)/255.0
    white_mask   = (Vf > 0.95) & (Sf < 0.10) # bright and low saturation
    neutral_mask = (Sf < 0.12) # low saturation = neutral bg
    return dict(
        bg_white_pct=float(np.mean(white_mask)),
        bg_neutral_pct=float(np.mean(neutral_mask)),
    )

def palette_complexity(img_bgr, K=5, sample_px=64*64, significant=0.05):
    if img_bgr is None:
        return dict(n_clusters_sig=np.nan, color_entropy=np.nan, largest_cluster_pct=np.nan)

    h, w = img_bgr.shape[:2]
    scale = max(1, int(np.sqrt((h*w)/sample_px)))
    small = cv2.resize(img_bgr, (w//scale, h//scale), interpolation=cv2.INTER_AREA)

    lab = cv2.cvtColor(small, cv2.COLOR_BGR2LAB).reshape(-1, 3).astype(np.float32)
    km = KMeans(n_clusters=K, n_init=5, random_state=42)
    labels = km.fit_predict(lab)
    counts = np.bincount(labels, minlength=K).astype(np.float32)
    weights = counts / counts.sum()

    n_clusters_sig = int(np.sum(weights >= significant))
    eps = 1e-12
    color_entropy = -np.sum(weights * np.log(weights + eps)) / np.log(K + eps)  # normalized [0,1]
    largest_cluster_pct = float(weights.max())

    return dict(
        n_clusters_sig=n_clusters_sig,
        color_entropy=float(color_entropy),
        largest_cluster_pct=largest_cluster_pct
    )

def clutter_features_for_path(path):
    try:
        img = load_cv2(path)
        ed = edge_density(img)
        bg = background_fractions(img)
        pal = palette_complexity(img, K=5)
        return dict(
            image_path=path,
            edge_density=ed,
            bg_white_pct=bg["bg_white_pct"],
            bg_neutral_pct=bg["bg_neutral_pct"],
            n_clusters_sig=pal["n_clusters_sig"],
            color_entropy=pal["color_entropy"],
            largest_cluster_pct=pal["largest_cluster_pct"],
        )
    except Exception:
        return dict(
            image_path=path,
            edge_density=np.nan,
            bg_white_pct=np.nan,
            bg_neutral_pct=np.nan,
            n_clusters_sig=np.nan,
            color_entropy=np.nan,
            largest_cluster_pct=np.nan,
        )


In [31]:
from tqdm.auto import tqdm

have = df["image_path"].notna()
feat_rows = [clutter_features_for_path(p) for p in tqdm(df.loc[have, "image_path"], desc="Clutter features")]
df_feats = pd.DataFrame(feat_rows)


Clutter features: 100%|██████████| 17334/17334 [10:50<00:00, 26.63it/s]


In [32]:
for col in ["edge_density","n_clusters_sig","color_entropy","bg_white_pct","bg_neutral_pct","largest_cluster_pct"]:
    m = df_feats[col].mean()
    s = df_feats[col].std(ddof=0) + 1e-9 #small number so we dont divide by 0 in some cases.
    df_feats[col+"_z"] = (df_feats[col] - m)/s #this is the zscore (stats)

df_feats["clutter_score"] = ( #higher = more clutter
    + 0.45 * df_feats["edge_density_z"]
    + 0.30 * df_feats["n_clusters_sig_z"]
    + 0.25 * df_feats["color_entropy_z"]
    - 0.30 * df_feats["bg_white_pct_z"]
    - 0.20 * df_feats["bg_neutral_pct_z"]
    - 0.15 * df_feats["largest_cluster_pct_z"]
)

df_model = df.merge(df_feats, on="image_path", how="left")


In [33]:
simple = df_model.sort_values("clutter_score", ascending=True).head(5)
busy   = df_model.sort_values("clutter_score", ascending=False).head(5)

cols = ["image_path","clutter_score","bg_white_pct","largest_cluster_pct","edge_density","n_clusters_sig","color_entropy"]
display(simple[cols])
display(busy[cols])

df_model.to_csv("df_with_clutter_features.csv", index=False)


Unnamed: 0,image_path,clutter_score,bg_white_pct,largest_cluster_pct,edge_density,n_clusters_sig,color_entropy
11381,images_amz/866237e84fde8a3c.jpg,-3.652532,0.961232,0.950769,0.001007,1.0,0.152908
10264,images_amz/eb58f783c164e724.jpg,-3.601807,0.95487,0.943848,0.001227,1.0,0.173692
9455,images_amz/0e6a40ff8d3a326e.jpg,-3.591841,0.955214,0.947998,0.002823,1.0,0.169502
9595,images_amz/45c033bc7162de66.jpg,-3.585157,0.955205,0.946777,0.002446,1.0,0.174915
9376,images_amz/96e730aaf104e216.jpg,-3.567192,0.954709,0.941895,0.002303,1.0,0.18553


Unnamed: 0,image_path,clutter_score,bg_white_pct,largest_cluster_pct,edge_density,n_clusters_sig,color_entropy
8158,images_amz/df1a7d5e92f416a2.jpg,4.772736,0.020349,0.248663,0.240882,5.0,0.966516
16981,images_amz/9ce546164ec235be.jpg,4.705153,0.132536,0.296201,0.254964,5.0,0.942215
6601,images_amz/1edd23725c057b44.jpg,4.67032,0.051152,0.335786,0.241129,5.0,0.951495
6208,images_amz/c9c9f4849d181e55.jpg,4.667717,0.109912,0.232692,0.241584,5.0,0.996853
17694,images_amz/b922bd22bca88a1c.jpg,4.622601,0.125244,0.301441,0.249596,5.0,0.955061


In [42]:
df_model.select_dtypes(np.number).corr()['bsr_best']

image_count             -0.063141
review_count                  NaN
avg_rating                    NaN
bsr_best                 1.000000
units_per_month               NaN
sales_velocity_daily          NaN
edge_density            -0.002961
bg_white_pct            -0.022585
bg_neutral_pct          -0.029610
n_clusters_sig           0.002489
color_entropy            0.000001
largest_cluster_pct     -0.004283
edge_density_z          -0.002961
n_clusters_sig_z         0.002489
color_entropy_z          0.000001
bg_white_pct_z          -0.022585
bg_neutral_pct_z        -0.029610
largest_cluster_pct_z   -0.004283
clutter_score            0.009661
Name: bsr_best, dtype: float64

I went thru the images. None have text.