In [1]:

from pathlib import Path
import pandas as pd
import numpy as np

# -------------------------------------------------------------------
# 1. Paths
# -------------------------------------------------------------------
# Notebook path: fashion_recommender/notebooks/
# Project root:  one level up
ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR / "data"

print("ROOT_DIR:", ROOT_DIR)
print("DATA_DIR:", DATA_DIR)

# -------------------------------------------------------------------
# 2. Load list_eval_partition.txt  (image_name + eval_status [+ optional item_id])
# -------------------------------------------------------------------
eval_path = DATA_DIR / "list_eval_partition.txt"

# File structure (as seen from your screenshot):
#   line 1: <number_of_images>
#   line 2: "image_name evaluation_status"  (no leading '#')
#   line 3+: actual data rows
#
# So we skip the first TWO lines.
eval_raw = pd.read_csv(
    eval_path,
    delim_whitespace=True,
    header=None,
    skiprows=2,    # <-- key fix
)

print("\nRaw eval_df shape:", eval_raw.shape)
display(eval_raw.head())

# Name columns depending on how many there are
if eval_raw.shape[1] == 2:
    # image_name, eval_status (train / val / test)
    eval_raw.columns = ["image_name", "eval_status"]
    has_item_id = False
elif eval_raw.shape[1] == 3:
    # image_name, item_id, eval_status
    eval_raw.columns = ["image_name", "item_id", "eval_status"]
    has_item_id = True
else:
    raise ValueError(
        f"Unexpected number of columns in list_eval_partition.txt: {eval_raw.shape[1]}"
    )

eval_df = eval_raw.copy()

print("\nCleaned eval_df:")
display(eval_df.head())

# -------------------------------------------------------------------
# 3. Load list_bbox_inshop.txt  (OPTIONAL, not used for products.csv)
# -------------------------------------------------------------------
bbox_path = DATA_DIR / "list_bbox_inshop.txt"

# File structure (from your screenshot):
#   line 1: <number_of_images>
#   line 2: "image_name clothes_type pose_type x_1 y_1 x_2 y_2"
#   line 3+: data rows
try:
    bbox_raw = pd.read_csv(
        bbox_path,
        delim_whitespace=True,
        header=None,
        skiprows=2,   # <-- skip count + header
    )

    print("\nRaw bbox_df shape:", bbox_raw.shape)
    # Give column names just for completeness; we won't merge them for now.
    bbox_raw.columns = [
        "image_name", "clothes_type", "pose_type", "x1", "y1", "x2", "y2"
    ]
    bbox_df = bbox_raw
    display(bbox_df.head())
except FileNotFoundError:
    print("\nlist_bbox_inshop.txt not found – skipping bbox loading.")
    bbox_df = None


ROOT_DIR: /Users/ganenthraravindran/Desktop/Fashion Recommender
DATA_DIR: /Users/ganenthraravindran/Desktop/Fashion Recommender/data

Raw eval_df shape: (289222, 2)


  eval_raw = pd.read_csv(


Unnamed: 0,0,1
0,img/Sheer_Pleated-Front_Blouse/img_00000001.jpg,train
1,img/Sheer_Pleated-Front_Blouse/img_00000002.jpg,train
2,img/Sheer_Pleated-Front_Blouse/img_00000003.jpg,val
3,img/Sheer_Pleated-Front_Blouse/img_00000004.jpg,train
4,img/Sheer_Pleated-Front_Blouse/img_00000005.jpg,test



Cleaned eval_df:


Unnamed: 0,image_name,eval_status
0,img/Sheer_Pleated-Front_Blouse/img_00000001.jpg,train
1,img/Sheer_Pleated-Front_Blouse/img_00000002.jpg,train
2,img/Sheer_Pleated-Front_Blouse/img_00000003.jpg,val
3,img/Sheer_Pleated-Front_Blouse/img_00000004.jpg,train
4,img/Sheer_Pleated-Front_Blouse/img_00000005.jpg,test


  bbox_raw = pd.read_csv(



Raw bbox_df shape: (52712, 7)


Unnamed: 0,image_name,clothes_type,pose_type,x1,y1,x2,y2
0,img/WOMEN/Blouses_Shirts/id_00000001/02_1_fron...,1,1,50,49,208,235
1,img/WOMEN/Blouses_Shirts/id_00000001/02_2_side...,1,2,119,48,136,234
2,img/WOMEN/Blouses_Shirts/id_00000001/02_3_back...,1,3,50,42,213,240
3,img/WOMEN/Blouses_Shirts/id_00000001/02_4_full...,1,4,82,30,162,129
4,img/WOMEN/Dresses/id_00000002/02_1_front.jpg,3,1,65,45,233,252


In [2]:
# -------------------------------------------------------------------
# 4. Load list_item_inshop.txt  (OPTIONAL, item-level info)
# -------------------------------------------------------------------
item_path = DATA_DIR / "list_item_inshop.txt"

# From your screenshot the file looks like:
#   line 1: <number_of_items>
#   line 2+: id_00000001, id_00000002, ...
# There is NO header row with column names.
try:
    item_raw = pd.read_csv(
        item_path,
        delim_whitespace=True,
        header=None,
        skiprows=1,    # skip count only
    )

    print("\nRaw item_df shape:", item_raw.shape)
    item_raw.columns = ["item_id"]
    item_df = item_raw
    display(item_df.head())
except FileNotFoundError:
    print("\nlist_item_inshop.txt not found – skipping item loading.")
    item_df = None

# -------------------------------------------------------------------
# 5. Build the master products table (image-level)
# -------------------------------------------------------------------
# For our recommender we only need one row per image with:
#   product_id, image_path, category, is_in_stock, eval_status, (optional item_id)

products = eval_df.copy()

# image_path: relative path from the data/ folder
products["image_path"] = products["image_name"]

# numeric product_id
products = products.reset_index(drop=True)
products["product_id"] = np.arange(1, len(products) + 1)

# derive category from folder structure: img/<category>/filename.jpg
products["category"] = (
    products["image_path"]
      .str.split("/")   # ["img", "Sheer_Pleated-Front_Blouse", "img_00000001.jpg"]
      .str[1]           # "Sheer_Pleated-Front_Blouse"
      .str.split("_")   # ["Sheer", "Pleated-Front", "Blouse"]
      .str[-1]          # "Blouse"
)

# everything starts in stock for now
products["is_in_stock"] = 1

# Decide which columns to keep and in what order
base_cols = [
    "product_id",
    "image_path",
    "category",
    "is_in_stock",
    "eval_status",
]

if has_item_id:
    base_cols.append("item_id")

products_out = products[base_cols]

print("\nFinal products_out preview:")
display(products_out.head())
print("products_out shape:", products_out.shape)

# -------------------------------------------------------------------
# 6. Save to data/products.csv
# -------------------------------------------------------------------
output_path = DATA_DIR / "products.csv"
products_out.to_csv(output_path, index=False)

print("\nSaved products.csv to:", output_path)

# Quick sanity check: reload and show a few rows
check_df = pd.read_csv(output_path)
print("\nReloaded products.csv:")
print(check_df.shape)
display(check_df.head(100))


Raw item_df shape: (7982, 1)


  item_raw = pd.read_csv(


Unnamed: 0,item_id
0,id_00000001
1,id_00000002
2,id_00000003
3,id_00000004
4,id_00000005



Final products_out preview:


Unnamed: 0,product_id,image_path,category,is_in_stock,eval_status
0,1,img/Sheer_Pleated-Front_Blouse/img_00000001.jpg,Blouse,1,train
1,2,img/Sheer_Pleated-Front_Blouse/img_00000002.jpg,Blouse,1,train
2,3,img/Sheer_Pleated-Front_Blouse/img_00000003.jpg,Blouse,1,val
3,4,img/Sheer_Pleated-Front_Blouse/img_00000004.jpg,Blouse,1,train
4,5,img/Sheer_Pleated-Front_Blouse/img_00000005.jpg,Blouse,1,test


products_out shape: (289222, 5)

Saved products.csv to: /Users/ganenthraravindran/Desktop/Fashion Recommender/data/products.csv

Reloaded products.csv:
(289222, 5)


Unnamed: 0,product_id,image_path,category,is_in_stock,eval_status
0,1,img/Sheer_Pleated-Front_Blouse/img_00000001.jpg,Blouse,1,train
1,2,img/Sheer_Pleated-Front_Blouse/img_00000002.jpg,Blouse,1,train
2,3,img/Sheer_Pleated-Front_Blouse/img_00000003.jpg,Blouse,1,val
3,4,img/Sheer_Pleated-Front_Blouse/img_00000004.jpg,Blouse,1,train
4,5,img/Sheer_Pleated-Front_Blouse/img_00000005.jpg,Blouse,1,test
...,...,...,...,...,...
95,96,img/Sheer_Pleated-Front_Blouse/img_00000096.jpg,Blouse,1,test
96,97,img/Sheer_Pleated-Front_Blouse/img_00000097.jpg,Blouse,1,train
97,98,img/Sheer_Pleated-Front_Blouse/img_00000098.jpg,Blouse,1,train
98,99,img/Sheer_Pleated-Front_Blouse/img_00000099.jpg,Blouse,1,val


In [1]:
from pathlib import Path
import pandas as pd

ROOT_DIR = Path.cwd().parent
DATA_DIR  = ROOT_DIR / "data"
CSV       = DATA_DIR / "products.csv"

df = pd.read_csv(CSV)
print("rows, cols:", df.shape)
print(df.head(3))

# Required schema
required_cols = ["product_id","image_path","category","is_in_stock","eval_status"]
missing = [c for c in required_cols if c not in df.columns]
assert not missing, f"Missing columns: {missing}"

# Basic checks
assert df["product_id"].is_unique, "product_id must be unique"
assert df["product_id"].dtype.kind in "iu", "product_id must be integer-like"
assert df["image_path"].notna().all(), "image_path has NaNs"
assert df["category"].notna().all(), "category has NaNs"
assert df["is_in_stock"].isin([0,1]).all(), "is_in_stock must be 0/1"
assert df["eval_status"].isin(["train","val","test","query","gallery"]).all(), "unexpected eval_status values"

# Category quality (we want garment words like 'Blouse', 'Dress', etc.)
sample = df["category"].value_counts().head(10)
print("\nTop categories:\n", sample)

# Spot-check that image files exist (sample 50)
missing_files = []
for p in df["image_path"].sample(min(50, len(df))):
    fp = DATA_DIR / p
    if not fp.exists():
        missing_files.append(str(fp))
if missing_files:
    print("\nWARNING: some image files are missing (showing up to 10):")
    print("\n".join(missing_files[:10]))
else:
    print("\nImage file existence check (sample): OK")

print("\nAll validation checks passed ✅")


rows, cols: (289222, 5)
   product_id                                       image_path category  \
0           1  img/Sheer_Pleated-Front_Blouse/img_00000001.jpg   Blouse   
1           2  img/Sheer_Pleated-Front_Blouse/img_00000002.jpg   Blouse   
2           3  img/Sheer_Pleated-Front_Blouse/img_00000003.jpg   Blouse   

   is_in_stock eval_status  
0            1       train  
1            1       train  
2            1         val  

Top categories:
 category
Dress       72158
Tee         36887
Blouse      24557
Shorts      19666
Tank        15429
Skirt       14773
Cardigan    13311
Sweater     13123
Jacket      10467
Top         10078
Name: count, dtype: int64

/Users/ganenthraravindran/Desktop/Fashion Recommender/data/img/Zippered_Sleeveless_Hoodie/img_00000046.jpg
/Users/ganenthraravindran/Desktop/Fashion Recommender/data/img/Paisley-Embroidered_A-Line_Dress/img_00000004.jpg
/Users/ganenthraravindran/Desktop/Fashion Recommender/data/img/Zippered_Single-Button_Blazer/img_00000008