In [8]:
import os
import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [9]:
# Read the processed article, customer and transaction tables.
# The ID columns are forced to `str` so pandas does not parse them as numbers.
items = pd.read_csv(
    "data/hnm/processed/articles_features.csv",
    dtype={"article_id": str},
)
users = pd.read_csv(
    "data/hnm/processed/customers_features.csv",
    dtype={"customer_id": str},
)
tx = pd.read_csv(
    "data/hnm/processed/transactions_sample.csv",
    dtype={"customer_id": str, "article_id": str},
)

# Report (rows, columns) for each table as a quick sanity check.
print(
    f"Items shape: {items.shape}\n"
    f"Users shape: {users.shape}\n"
    f"Transactions shape: {tx.shape}"
)

Items shape: (105542, 109)
Users shape: (1371980, 7)
Transactions shape: (1788324, 5)


In [10]:
# Regression target: how many times each customer bought each article.
# (Other candidate targets: days_since_purchase, purchase_amount, etc.)
pair_keys = ["customer_id", "article_id"]

# One row per (customer, article) pair with the number of transactions in `tx`.
purchase_counts = (
    tx.groupby(pair_keys)
    .size()
    .reset_index(name="purchase_count")
)

# Attach article features, then customer features. Left joins keep every
# counted pair even when a feature row is missing for it.
data = (
    purchase_counts
    .merge(items, on="article_id", how="left")
    .merge(users, on="customer_id", how="left")
)

print("\nMerged data shape:", data.shape)
print("Target variable (purchase_count) stats:")
print(data["purchase_count"].describe())


Merged data shape: (1576572, 117)
Target variable (purchase_count) stats:
count    1.576572e+06
mean     1.134312e+00
std      4.681954e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      7.400000e+01
Name: purchase_count, dtype: float64


In [11]:
# Build the feature matrix / target vector and hold out 20% of rows for testing.
# The target and any ID columns present in `data` are removed from the features.
id_cols = ["customer_id", "article_id", "product_code"]  # ID columns to exclude
cols_to_drop = ["purchase_count"]
cols_to_drop.extend(col for col in id_cols if col in data.columns)

y = data["purchase_count"]
X = data.drop(columns=cols_to_drop)

print(f"Excluded columns: {cols_to_drop}")
print(f"Feature columns: {X.shape[1]}")

# Fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print(f"\nTrain set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

Excluded columns: ['purchase_count', 'customer_id', 'article_id', 'product_code']
Feature columns: 113

Train set: (1261257, 113)
Test set: (315315, 113)


In [None]:
# Build the preprocessing step shared by every model pipeline.
#
# NOTE: `train_regression_model` reads `preprocess` from the notebook namespace,
# so this cell must be (re-)executed before training; a stale `preprocess` left
# in the kernel from an earlier version of this cell will be used otherwise.

# Identify categorical and numerical columns by dtype.
cat_cols = X_train.select_dtypes(include=['object', 'string', 'category']).columns.tolist()
# Include both numeric AND boolean columns (booleans are scaled like 0/1 numbers)
num_cols = X_train.select_dtypes(include=['number', 'bool']).columns.tolist()

# Columns of any other dtype (e.g. datetimes) are handed to no transformer and
# are therefore dropped by the ColumnTransformer. Report them instead of losing
# them silently.
assigned_cols = set(cat_cols) | set(num_cols)
unassigned_cols = [col for col in X_train.columns if col not in assigned_cols]

print(f"Categorical columns: {len(cat_cols)}")
print(f"Numerical columns (including booleans): {len(num_cols)}")
print(f"Total: {len(cat_cols) + len(num_cols)}")
if unassigned_cols:
    print(f"Dropped (unsupported dtype): {unassigned_cols}")

# The feature table is built with left joins, so numeric features can contain
# NaN for pairs without a matching feature row. StandardScaler passes NaN
# through and NaN-intolerant regressors (e.g. LinearRegression) then fail at
# fit time, so fill missing numeric values with the training median first.
numeric_steps = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])

# Create preprocessing pipeline
preprocess = ColumnTransformer(
    transformers=[
        # Categories unseen during fit are encoded as all-zero rows.
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", numeric_steps, num_cols),
    ],
    remainder="drop",  # explicit: see `unassigned_cols` report above
)

In [13]:
def train_regression_model(model_name, model, pkl_name):
    """
    Fit, evaluate and persist one regression pipeline.

    The regressor is placed behind the notebook-level ``preprocess``
    transformer, fitted on ``X_train``/``y_train`` and scored on
    ``X_test``/``y_test``; all of these are read from the notebook namespace.

    Args:
        model_name: Label used in the printed report and the returned dict.
        model: sklearn regressor instance used as the final pipeline step.
        pkl_name: File name, inside ``data/hnm/models``, for the pickled pipeline.

    Returns:
        dict with keys 'model_name', 'rmse', 'mae' and 'r2'.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Training {model_name}...")
    print(banner)

    # Preprocessing + regressor, fitted as one unit on the training split.
    fitted = Pipeline(steps=[("prep", preprocess), ("reg", model)])
    fitted.fit(X_train, y_train)

    # Score on the held-out split.
    predictions = fitted.predict(X_test)
    scores = {
        'model_name': model_name,
        'rmse': np.sqrt(mean_squared_error(y_test, predictions)),
        'mae': mean_absolute_error(y_test, predictions),
        'r2': r2_score(y_test, predictions),
    }

    print(f"\n{model_name} Results:")
    print(f"  RMSE (Root Mean Squared Error): {scores['rmse']:.4f}")
    print(f"  MAE  (Mean Absolute Error):     {scores['mae']:.4f}")
    print(f"  R²   (R-squared):               {scores['r2']:.4f}")

    # Persist the whole fitted pipeline (preprocessing included).
    output_dir = "data/hnm/models"
    os.makedirs(output_dir, exist_ok=True)
    model_path = f"{output_dir}/{pkl_name}"
    with open(model_path, "wb") as handle:
        pickle.dump(fitted, handle)

    print(f"\n✓ Model saved → {model_path}")

    return scores

In [14]:
# Baseline: ordinary least squares on the preprocessed features.
results_lr = train_regression_model(
    model_name="Linear Regression",
    model=LinearRegression(),
    pkl_name="linear_regression.pkl",
)


Training Linear Regression...


ValueError: could not convert string to float: 'dark green'