In [2]:
import os
import pandas as pd

src = "data/hnm/transactions_train.csv"
out = "data/hnm/processed/transactions_sample.csv"
os.makedirs("data/hnm/processed", exist_ok=True)

# Take last ~2M rows by reading in chunks and keeping recent chunks
chunksize = 1_000_000
keep_chunks = []

for i, chunk in enumerate(pd.read_csv(src, chunksize=chunksize, dtype={"customer_id": str, "article_id": str})):
    keep_chunks.append(chunk)
    # keep only last 2 chunks (~2M rows) in memory
    if len(keep_chunks) > 2:
        keep_chunks.pop(0)
    print(f"Read chunk {i+1}")

sample = pd.concat(keep_chunks, ignore_index=True)
sample.to_csv(out, index=False)
print("Saved:", out, "shape:", sample.shape)


Read chunk 1
Read chunk 2
Read chunk 3
Read chunk 4
Read chunk 5
Read chunk 6
Read chunk 7
Read chunk 8
Read chunk 9
Read chunk 10
Read chunk 11
Read chunk 12
Read chunk 13
Read chunk 14
Read chunk 15
Read chunk 16
Read chunk 17
Read chunk 18
Read chunk 19
Read chunk 20
Read chunk 21
Read chunk 22
Read chunk 23
Read chunk 24
Read chunk 25
Read chunk 26
Read chunk 27
Read chunk 28
Read chunk 29
Read chunk 30
Read chunk 31
Read chunk 32
Saved: data/hnm/processed/transactions_sample.csv shape: (1788324, 5)


In [3]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import joblib

# -----------------------
# Paths (your processed outputs)
# -----------------------
items_path = "data/hnm/processed/articles_features.csv"
users_path = "data/hnm/processed/customers_features.csv"
tx_path    = "data/hnm/processed/transactions_sample.csv"  # use sample first

os.makedirs("data/hnm/models", exist_ok=True)

# -----------------------
# Load
# -----------------------
items = pd.read_csv(items_path, dtype={"article_id": str})
users = pd.read_csv(users_path, dtype={"customer_id": str})
tx    = pd.read_csv(tx_path, dtype={"customer_id": str, "article_id": str})

# Minimal clean
tx = tx.drop_duplicates(subset=["customer_id", "article_id"])
tx["label"] = 1

# -----------------------
# Negative sampling
# For each customer: sample N not-bought articles
# -----------------------
rng = np.random.default_rng(42)
all_articles = items["article_id"].unique()

cust_buys = tx.groupby("customer_id")["article_id"].apply(set).to_dict()

NEG_PER_POS = 2  # increase to 5 later if you want stronger training set

neg_rows = []
for cust, bought in cust_buys.items():
    n_pos = len(bought)
    n_neg = n_pos * NEG_PER_POS

    sampled = set()
    # fast sampling loop
    while len(sampled) < n_neg:
        pick = rng.choice(all_articles, size=min(2000, len(all_articles)), replace=False)
        for a in pick:
            if a not in bought:
                sampled.add(a)
            if len(sampled) >= n_neg:
                break

    for a in sampled:
        neg_rows.append((cust, a, 0))

neg = pd.DataFrame(neg_rows, columns=["customer_id", "article_id", "label"])

data = pd.concat([tx[["customer_id","article_id","label"]], neg], ignore_index=True)

# -----------------------
# Join engineered features (users + items)
# -----------------------
data = data.merge(items, on="article_id", how="left")
data = data.merge(users, on="customer_id", how="left")

# Drop rows where merges failed (rare if ids mismatch)
data = data.dropna(subset=["customer_id", "article_id"])

# -----------------------
# Train/Test split
# -----------------------
train_df, test_df = train_test_split(
    data, test_size=0.2, random_state=42, stratify=data["label"]
)

X_train = train_df.drop(columns=["label"])
y_train = train_df["label"].astype(int)
X_test  = test_df.drop(columns=["label"])
y_test  = test_df["label"].astype(int)

# -----------------------
# Preprocess for ML models
# -----------------------
cat_cols = [c for c in X_train.columns if X_train[c].dtype == "object"]
num_cols = [c for c in X_train.columns if c not in cat_cols]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

# -----------------------
# Model 1: Popularity baseline (ranking proxy)
# -----------------------
# popularity from positives
pop = tx["article_id"].value_counts()
pop_rank = {aid: (len(pop) - i) for i, aid in enumerate(pop.index)}  # higher=more popular

baseline_scores = X_test["article_id"].map(lambda a: pop_rank.get(a, 0)).astype(float)

print("\nModel 1) Popularity baseline")
print("AUC   :", roc_auc_score(y_test, baseline_scores))
print("PR-AUC:", average_precision_score(y_test, baseline_scores))

# -----------------------
# Helper: train & evaluate
# -----------------------
def train_eval_save(name, clf, filename):
    pipe = Pipeline([("prep", preprocess), ("clf", clf)])
    pipe.fit(X_train, y_train)

    proba = pipe.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, proba)
    pr = average_precision_score(y_test, proba)

    print(f"\n{name}")
    print("AUC   :", auc)
    print("PR-AUC:", pr)

    joblib.dump(pipe, f"data/hnm/models/{filename}")
    print("Saved model ->", f"data/hnm/models/{filename}")
    return pipe

# -----------------------
# Model 2: Logistic Regression
# -----------------------
m2 = train_eval_save(
    "Model 2) Logistic Regression",
    LogisticRegression(max_iter=300),
    "logreg.joblib"
)

# -----------------------
# Model 3: Random Forest
# -----------------------
m3 = train_eval_save(
    "Model 3) Random Forest",
    RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42),
    "rf.joblib"
)

# -----------------------
# Model 4: Gradient Boosting (strong baseline without extra installs)
# -----------------------
m4 = train_eval_save(
    "Model 4) Gradient Boosting",
    GradientBoostingClassifier(random_state=42),
    "gbdt.joblib"
)


KeyboardInterrupt: 