In [3]:
import os
import re
import numpy as np
import pandas as pd

# ---- paths ----
in_path  = "data/hnm/articles.csv"
out_dir  = "data/hnm/processed"
os.makedirs(out_dir, exist_ok=True)

# ---- load ----
df = pd.read_csv(in_path)

# =========================
# 1) BASIC CLEANING
# =========================

# standardize column names (optional)
df.columns = [c.strip() for c in df.columns]

# drop exact duplicate rows
df = df.drop_duplicates()

# ensure article_id is string (keeps leading zeros if any)
if "article_id" in df.columns:
    df["article_id"] = df["article_id"].astype(str)

# trim whitespace in object columns
obj_cols = df.select_dtypes(include=["object"]).columns
for c in obj_cols:
    df[c] = df[c].astype(str).str.strip()

# handle known "nan" strings (sometimes appear after astype(str))
df = df.replace({"nan": np.nan, "None": np.nan, "": np.nan})

# fill missing for key categorical columns with "Unknown"
cat_like = [
    "product_type_name","product_group_name","graphical_appearance_name",
    "colour_group_name","perceived_colour_value_name","department_name",
    "index_name","index_group_name","section_name","garment_group_name"
]
for c in cat_like:
    if c in df.columns:
        df[c] = df[c].fillna("Unknown")

# detail_desc: fill missing with empty string for NLP-friendly features
if "detail_desc" in df.columns:
    df["detail_desc"] = df["detail_desc"].fillna("")

# =========================
# 2) FEATURE ENGINEERING
# =========================

# ---- (A) price features ----
# articles.csv usually has price in SEK? (or float). We'll add bins & log.
if "price" in df.columns:
    df["price"] = pd.to_numeric(df["price"], errors="coerce")
    df["price_missing"] = df["price"].isna().astype(int)
    df["price_filled"] = df["price"].fillna(df["price"].median())
    df["price_log1p"] = np.log1p(df["price_filled"])

    # quantile bins (robust)
    try:
        df["price_bin_q"] = pd.qcut(df["price_filled"], q=10, duplicates="drop")
        df["price_bin_q"] = df["price_bin_q"].astype(str)
    except ValueError:
        df["price_bin_q"] = "bin_unavailable"

# ---- (B) text features from product name + detail_desc ----
def clean_text(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

text_cols = []
if "prod_name" in df.columns: text_cols.append("prod_name")
if "detail_desc" in df.columns: text_cols.append("detail_desc")

if text_cols:
    df["text_all"] = (
        df[text_cols]
        .fillna("")
        .astype(str)
        .agg(" ".join, axis=1)
        .map(clean_text)
    )
    df["text_len"] = df["text_all"].str.len()
    df["text_word_count"] = df["text_all"].str.split().map(len)
    df["has_description"] = (df["detail_desc"].fillna("").str.len() > 0).astype(int) if "detail_desc" in df.columns else 0

# ---- (C) color normalization helpers ----
if "colour_group_name" in df.columns:
    df["colour_group_name_norm"] = df["colour_group_name"].str.lower()

# ---- (D) lightweight categorical encodings ----
# 1) frequency encoding (often better than one-hot for high-cardinality)
def freq_encode(series: pd.Series) -> pd.Series:
    freq = series.value_counts(dropna=False)
    return series.map(freq).astype(float)

for c in cat_like:
    if c in df.columns:
        df[f"{c}_freq"] = freq_encode(df[c])

# 2) optional: one-hot only for low-cardinality columns (safe)
low_card_cols = []
for c in cat_like:
    if c in df.columns and df[c].nunique() <= 30:
        low_card_cols.append(c)

df_onehot = pd.get_dummies(df[low_card_cols], prefix=low_card_cols) if low_card_cols else pd.DataFrame(index=df.index)

# ---- (E) build final feature table ----
keep_id_cols = [c for c in ["article_id", "product_code"] if c in df.columns]

numeric_feats = [c for c in df.columns if c.endswith(("_freq", "_log1p"))] + \
                [c for c in ["text_len","text_word_count","has_description","price_filled","price_missing"] if c in df.columns]

cat_feats = [c for c in ["price_bin_q","colour_group_name_norm"] if c in df.columns]

features = pd.concat(
    [
        df[keep_id_cols + numeric_feats + cat_feats].copy(),
        df_onehot
    ],
    axis=1
)

# =========================
# 3) SAVE OUTPUTS
# =========================
features_path = os.path.join(out_dir, "articles_features.csv")
cleaned_path  = os.path.join(out_dir, "articles_cleaned.csv")

features.to_csv(features_path, index=False)
df.to_csv(cleaned_path, index=False)

print("Saved:")
print(" - Cleaned:", cleaned_path)
print(" - Features:", features_path)
print("Feature shape:", features.shape)
print("Columns sample:", features.columns[:20].tolist())


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  obj_cols = df.select_dtypes(include=["object"]).columns


Saved:
 - Cleaned: data/hnm/processed/articles_cleaned.csv
 - Features: data/hnm/processed/articles_features.csv
Feature shape: (105542, 109)
Columns sample: ['article_id', 'product_code', 'product_type_name_freq', 'product_group_name_freq', 'graphical_appearance_name_freq', 'colour_group_name_freq', 'perceived_colour_value_name_freq', 'department_name_freq', 'index_name_freq', 'index_group_name_freq', 'section_name_freq', 'garment_group_name_freq', 'text_len', 'text_word_count', 'has_description', 'colour_group_name_norm', 'product_group_name_Accessories', 'product_group_name_Bags', 'product_group_name_Cosmetic', 'product_group_name_Fun']


In [4]:
import pandas as pd
import numpy as np
import os

in_path = "data/hnm/customers.csv"
out_path = "data/hnm/processed/customers_features.csv"
os.makedirs("data/hnm/processed", exist_ok=True)

df = pd.read_csv(in_path, dtype={"customer_id": str})

# ---- cleaning ----
df = df.drop_duplicates()
df.columns = [c.strip() for c in df.columns]

# age
if "age" in df.columns:
    df["age"] = pd.to_numeric(df["age"], errors="coerce")
    df["age_missing"] = df["age"].isna().astype(int)
    df["age"] = df["age"].fillna(df["age"].median())
    df["age_bin"] = pd.cut(df["age"], bins=[0,18,25,35,45,55,65,100], labels=False)

# categorical columns
cat_cols = ["fashion_news_frequency", "club_member_status", "postal_code"]
for c in cat_cols:
    if c in df.columns:
        df[c] = df[c].fillna("Unknown")

# frequency encoding
for c in cat_cols:
    if c in df.columns:
        df[f"{c}_freq"] = df[c].map(df[c].value_counts())

# keep final features
keep = ["customer_id", "age", "age_missing", "age_bin"] + \
       [c for c in df.columns if c.endswith("_freq")]

features = df[keep]
features.to_csv(out_path, index=False)

print("Saved:", out_path)


Saved: data/hnm/processed/customers_features.csv


In [5]:
import pandas as pd
import numpy as np
import os

in_path = "data/hnm/transactions_train.csv"
out_path = "data/hnm/processed/transactions_features.csv"
os.makedirs("data/hnm/processed", exist_ok=True)

df = pd.read_csv(in_path, dtype={"customer_id": str, "article_id": str})

# ---- cleaning ----
df = df.drop_duplicates()
df["t_dat"] = pd.to_datetime(df["t_dat"], errors="coerce")
df = df.dropna(subset=["t_dat"])

# ---- time features ----
df["year"] = df["t_dat"].dt.year
df["month"] = df["t_dat"].dt.month
df["day"] = df["t_dat"].dt.day
df["dayofweek"] = df["t_dat"].dt.dayofweek
df["weekofyear"] = df["t_dat"].dt.isocalendar().week.astype(int)

# ---- recency feature ----
max_date = df["t_dat"].max()
df["days_since_purchase"] = (max_date - df["t_dat"]).dt.days

# ---- interaction strength ----
# each row = 1 interaction
df["interaction"] = 1

df = df[
    [
        "customer_id",
        "article_id",
        "interaction",
        "days_since_purchase",
        "dayofweek",
        "weekofyear",
    ]
]

df.to_csv(out_path, index=False)
print("Saved:", out_path)


Saved: data/hnm/processed/transactions_features.csv
