In [2]:
# Cell 1: Imports & paths
from pathlib import Path
import pandas as pd

DATA_DIR = Path("data/hm")
ARTICLES_CSV = DATA_DIR / "articles.csv"
TRANS_CSV = DATA_DIR / "transactions_train.csv"
IMAGES_DIR = DATA_DIR / "images"

ARTICLES_CSV.exists(), TRANS_CSV.exists(), IMAGES_DIR.exists()


(True, True, True)

In [4]:
from pathlib import Path

DATA_DIR = Path("data/hm")
ARTICLES_CSV = DATA_DIR / "articles.csv"
TRANS_CSV = DATA_DIR / "transactions_train.csv"
IMAGES_DIR = DATA_DIR / "images"

print("Exists:", ARTICLES_CSV.exists(), TRANS_CSV.exists(), IMAGES_DIR.exists())
if TRANS_CSV.exists():
    print("transactions_train.csv size (GB):", round(TRANS_CSV.stat().st_size / 1e9, 2))


Exists: True True True
transactions_train.csv size (GB): 3.49


In [5]:
import pandas as pd

articles = pd.read_csv("data/hm/articles.csv", low_memory=False)
print("Articles shape:", articles.shape)
articles.head()


Articles shape: (105542, 25)


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [6]:
transactions_sample = pd.read_csv(
    "data/hm/transactions_train.csv",
    nrows=100_000,   # only first 100k rows
    parse_dates=["t_dat"],
    usecols=["t_dat", "article_id", "customer_id", "price", "sales_channel_id"]  # only needed cols
)

print("Transactions sample shape:", transactions_sample.shape)
transactions_sample.head()


Transactions sample shape: (100000, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [7]:
transactions_sample["t_dat"].min(), transactions_sample["t_dat"].max()


(Timestamp('2018-09-20 00:00:00'), Timestamp('2018-09-22 00:00:00'))

In [9]:
# Cell 3: Keep only useful columns to start
article_cols = [
    "article_id", "product_code", "prod_name", "product_type_name", 
    "product_group_name", "colour_group_name", "perceived_colour_value_name",
    "perceived_colour_master_name", "graphical_appearance_name", "detail_desc"
]
articles_small = articles[article_cols].copy()

# Reduce memory: optional sampling for faster prototyping
# transactions = transactions.sample(1_000_000, random_state=42)

# Merge
df = transactions_sample.merge(articles_small, on="article_id", how="left")
df.head(), df.shape


(       t_dat                                        customer_id  article_id  \
 0 2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   663713001   
 1 2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   541518023   
 2 2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   505221004   
 3 2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   685687003   
 4 2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   685687004   
 
       price  sales_channel_id  product_code                 prod_name  \
 0  0.050831                 2        663713  Atlanta Push Body Harlow   
 1  0.030492                 2        541518   Rae Push (Melbourne) 2p   
 2  0.015237                 2        505221               Inca Jumper   
 3  0.016932                 2        685687      W YODA KNIT OL OFFER   
 4  0.016932                 2        685687      W YODA KNIT OL OFFER   
 
   product_type_name  product_group_name colour_group_name  \
 0    Unde

In [10]:
# Cell 4: Image path helper
from pathlib import Path

def article_image_path(article_id, images_root: Path = IMAGES_DIR) -> Path:
    aid = str(int(article_id)).zfill(10)  # zero-pad to 10 digits
    folder = aid[:3]
    return images_root / folder / f"{aid}.jpg"

# Test:
p = article_image_path(df["article_id"].iloc[0])
p, p.exists()


(WindowsPath('data/hm/images/066/0663713001.jpg'), True)

In [11]:
# Cell 5: TensorFlow image pipeline
import tensorflow as tf

IMG_SIZE = (224, 224)

def load_and_preprocess_image(path: str | Path):
    img = tf.io.read_file(str(path))
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, IMG_SIZE)
    img = tf.cast(img, tf.float32) / 255.0
    return img
