In [1]:
import os
import pandas as pd

src = "data/hnm/transactions_train.csv"
out = "data/hnm/processed/transactions_sample.csv"
os.makedirs("data/hnm/processed", exist_ok=True)

# Stream the CSV in 1M-row chunks and hold on to only the two most recently
# read ones, so just the tail of the file (at most ~2M rows) stays in memory.
chunksize = 1_000_000
keep_chunks = []

reader = pd.read_csv(
    src,
    chunksize=chunksize,
    dtype={"customer_id": str, "article_id": str},
)
for chunk_no, chunk in enumerate(reader, start=1):
    # Sliding window: add the newest chunk, retain the trailing two.
    keep_chunks = (keep_chunks + [chunk])[-2:]
    print(f"Read chunk {chunk_no}")

# Stitch the retained chunks together and persist them for the later cells.
sample = pd.concat(keep_chunks, ignore_index=True)
sample.to_csv(out, index=False)
print("Saved:", out, "shape:", sample.shape)


Read chunk 1
Read chunk 2
Read chunk 3
Read chunk 4
Read chunk 5
Read chunk 6
Read chunk 7
Read chunk 8
Read chunk 9
Read chunk 10
Read chunk 11
Read chunk 12
Read chunk 13
Read chunk 14
Read chunk 15
Read chunk 16
Read chunk 17
Read chunk 18
Read chunk 19
Read chunk 20
Read chunk 21
Read chunk 22
Read chunk 23
Read chunk 24
Read chunk 25
Read chunk 26
Read chunk 27
Read chunk 28
Read chunk 29
Read chunk 30
Read chunk 31
Read chunk 32
Saved: data/hnm/processed/transactions_sample.csv shape: (1788324, 5)


In [2]:
import numpy as np
import pandas as pd
import os
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [3]:
# Feature tables and the transaction sample. The id columns are forced to
# `str` (presumably so they are never parsed as numbers -- confirm).
items = pd.read_csv("data/hnm/processed/articles_features.csv", dtype={"article_id": str})
users = pd.read_csv("data/hnm/processed/customers_features.csv", dtype={"customer_id": str})
tx    = pd.read_csv("data/hnm/processed/transactions_sample.csv",
                    dtype={"customer_id": str, "article_id": str})

# One positive example per (customer, article) pair, however often it occurs.
tx = tx.drop_duplicates(["customer_id","article_id"])

# 8GB: start 150k; 16GB: start 300k–500k
POS_TARGET = 150_000

# Cap the request at the number of available pairs: DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without replacement).
# Every sampled row is a positive, hence label = 1.
tx = tx.sample(n=min(POS_TARGET, len(tx)), random_state=42).assign(label=1)

print("Positives:", len(tx))

Positives: 150000


In [None]:
rng = np.random.default_rng(42)
all_articles = items["article_id"].dropna().unique()

NEG_PER_POS = 1
MAX_NEG_PER_CUST = 20   # start 20; if stable you can raise to 30+

# Articles each customer bought, according to `tx`.
# NOTE(review): `tx` was down-sampled to POS_TARGET rows earlier, so a sampled
# "negative" may still be an article the customer bought in a transaction that
# is not part of the sample.
cust_pos = tx.groupby("customer_id")["article_id"].apply(lambda s: s.unique())

neg_rows = []
for cust, bought_arr in cust_pos.items():
    # NEG_PER_POS negatives per positive, capped per customer.
    n_neg = min(len(bought_arr) * NEG_PER_POS, MAX_NEG_PER_CUST)
    if n_neg <= 0 or len(all_articles) == 0:
        continue

    bought = set(bought_arr)

    # Oversample-and-reject instead of materialising the full
    # "catalogue minus purchases" pool for every customer (which re-sorted the
    # whole catalogue on each iteration). At most len(bought) of the distinct
    # draws can be rejected, so n_neg + len(bought) draws always leave n_neg
    # valid ones; if the catalogue is smaller than that, the draw covers the
    # whole catalogue and we simply get every non-bought article (fewer than
    # n_neg when the pool is that small, none when it is empty).
    n_draw = min(n_neg + len(bought), len(all_articles))
    candidates = rng.choice(all_articles, size=n_draw, replace=False)

    # The draw is returned in random order, so the first n_neg survivors are a
    # uniform sample without replacement from the non-bought articles.
    sampled = [a for a in candidates if a not in bought][:n_neg]

    neg_rows.extend((cust, a, 0) for a in sampled)

neg = pd.DataFrame(neg_rows, columns=["customer_id","article_id","label"])
data = pd.concat([tx[["customer_id","article_id","label"]], neg], ignore_index=True)

print("Pos:", len(tx), "Neg:", len(neg), "Total:", len(data))


In [None]:
# Attach article features, then customer features. Left joins keep every
# labelled (customer, article) pair even when no feature row matches.
data = (
    data
    .merge(items, how="left", on="article_id")
    .merge(users, how="left", on="customer_id")
)

# 80/20 split, stratified on the label so both parts keep the same class mix.
train_df, test_df = train_test_split(
    data,
    test_size=0.2,
    stratify=data["label"],
    random_state=42,
)

# Separate features from the integer target for each part.
X_train, X_test = (part.drop(columns=["label"]) for part in (train_df, test_df))
y_train, y_test = (part["label"].astype(int) for part in (train_df, test_df))


In [None]:
# Route each feature column by dtype: object columns are treated as
# categorical, everything else as numeric. Column order is preserved.
cat_cols, num_cols = [], []
for col, col_dtype in X_train.dtypes.items():
    (cat_cols if col_dtype == "object" else num_cols).append(col)

# Categories unseen at fit time are ignored (encoded as all zeros) rather
# than raising; numeric columns go through unchanged.
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
preprocess = ColumnTransformer(
    transformers=[
        ("cat", categorical_encoder, cat_cols),
        ("num", "passthrough", num_cols),
    ]
)


In [None]:
def train_one_model(model_name, model, pkl_name):
    """Fit `model` behind the shared preprocessing, score it, and pickle it.

    The pipeline is trained on the notebook-level `X_train` / `y_train`,
    scored on `X_test` / `y_test` (ROC AUC and average precision of the
    positive-class probability), and written to
    ``data/hnm/models/<pkl_name>``.

    Parameters
    ----------
    model_name : str
        Label used in the progress and metric printouts.
    model : estimator
        Classifier instance; it must support `fit` and `predict_proba`.
    pkl_name : str
        File name of the pickle created inside ``data/hnm/models``.

    Returns
    -------
    None
        Metrics are printed; the fitted pipeline is only persisted to disk.
    """
    # Function-scope import so this cell does not depend on the import cell.
    from sklearn.base import clone

    print(f"\nTraining {model_name}...")

    pipe = Pipeline([
        # Pipeline.fit fits its steps in place. Cloning gives each trained
        # pipeline its own transformer instead of every model sharing (and
        # refitting) the single global `preprocess` object.
        ("prep", clone(preprocess)),
        ("clf", model)
    ])

    pipe.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    proba = pipe.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, proba)
    pr  = average_precision_score(y_test, proba)

    print(f"{model_name} AUC    :", round(auc, 4))
    print(f"{model_name} PR-AUC :", round(pr, 4))

    # Persist preprocessing + classifier together as one object.
    os.makedirs("data/hnm/models", exist_ok=True)
    with open(f"data/hnm/models/{pkl_name}", "wb") as f:
        pickle.dump(pipe, f)

    print("Saved →", pkl_name)


Logistic Regression

In [None]:
# Linear baseline, limited to 300 solver iterations.
logreg_model = LogisticRegression(max_iter=300)
train_one_model(
    model_name="Logistic Regression",
    model=logreg_model,
    pkl_name="logreg.pkl",
)


Random Forest

In [None]:
# 200-tree forest, trained on all available cores, with a fixed seed.
rf_model = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
train_one_model(
    model_name="Random Forest",
    model=rf_model,
    pkl_name="rf.pkl",
)


Gradient Boosting

In [None]:
# Gradient-boosted trees with library defaults and a fixed seed.
gbdt_model = GradientBoostingClassifier(random_state=42)
train_one_model(
    model_name="Gradient Boosting",
    model=gbdt_model,
    pkl_name="gbdt.pkl",
)


AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# 200 boosting rounds with a learning rate of 0.05 and a fixed seed.
adaboost_model = AdaBoostClassifier(
    n_estimators=200,
    learning_rate=0.05,
    random_state=42,
)
train_one_model(
    model_name="AdaBoost",
    model=adaboost_model,
    pkl_name="adaboost.pkl",
)
