In [1]:
import pandas as pd

In [4]:
customer_df = pd.read_csv('customer_features.csv')

In [5]:
customer_df.head()

Unnamed: 0,customer_id,total_spend,total_transactions,avg_order_value,first_transaction_date,last_transaction_date,recency_days,tenure_days,spend_last_30d,tx_last_30d,...,tx_last_60d,spend_last_90d,tx_last_90d,unique_products,unique_categories,dominant_category,segment_id,loyalty_status,total_loyalty_points,is_cold_start
0,1,228156.76,19,12008.250526,2020-04-30,2021-08-02,1612,2071,0.0,0.0,...,0.0,0.0,0.0,53.0,8.0,Home,4.0,Gold,2347,0
1,2,14659.49,1,14659.49,2022-10-23,2022-10-23,1165,1165,0.0,0.0,...,0.0,0.0,0.0,4.0,3.0,Beauty,4.0,Bronze,166,1
2,3,76364.73,5,15272.946,2022-07-13,2022-10-23,1165,1267,0.0,0.0,...,0.0,0.0,0.0,15.0,7.0,Fashion,4.0,Bronze,763,0
3,4,64789.63,4,16197.4075,2019-09-24,2019-11-23,2230,2290,0.0,0.0,...,0.0,0.0,0.0,12.0,7.0,Fashion,3.0,Silver,663,0
4,5,218675.35,13,16821.180769,2022-08-15,2023-07-14,901,1234,0.0,0.0,...,0.0,0.0,0.0,43.0,8.0,Electronics,4.0,Silver,2186,0


In [7]:
customer_df.columns

Index(['customer_id', 'total_spend', 'total_transactions', 'avg_order_value',
       'first_transaction_date', 'last_transaction_date', 'recency_days',
       'tenure_days', 'spend_last_30d', 'tx_last_30d', 'spend_last_60d',
       'tx_last_60d', 'spend_last_90d', 'tx_last_90d', 'unique_products',
       'unique_categories', 'dominant_category', 'segment_id',
       'loyalty_status', 'total_loyalty_points', 'is_cold_start'],
      dtype='object')

In [10]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Check if xgboost is available, otherwise fall back to ExtraTrees
try:
    from xgboost import XGBRegressor
    xgb_available = True
except ImportError:
    xgb_available = False
    print("XGBoost not found. Using ExtraTreesRegressor instead.")

# -----------------------------------------------------------------------------
# 1. SETUP & TARGET CREATION
# -----------------------------------------------------------------------------
# NOTE: Using existing 'customer_df' from memory as requested.
# If running this standalone for testing without the dataframe in memory,
# uncomment the block below to simulate the schema structure (strictly for testing flow).
# -----------------------------------------------------------------------------
# if 'customer_df' not in locals():
#     raise ValueError("customer_df is not defined in memory.")

TARGET = "spend_next_30d"

# Ensure Target Exists (Safety Check)
if TARGET not in customer_df.columns:
    # In a real scenario, this should be derived from future transaction logs.
    # For this pipeline to run if the column is missing in the provided df,
    # we simulate it based on 'spend_last_30d' + noise to demonstrate functionality.
    # This is STRICTLY a fallback to ensure the code below is runnable.
    print(f"Warning: '{TARGET}' not found. Deriving mock target for demonstration.")
    np.random.seed(42)
    customer_df[TARGET] = customer_df['spend_last_30d'] * np.random.normal(1.0, 0.2, len(customer_df))
    customer_df[TARGET] = customer_df[TARGET].clip(lower=0) # Spend cannot be negative

# -----------------------------------------------------------------------------
# 2. FEATURE ENGINEERING & LEAKAGE PREVENTION
# -----------------------------------------------------------------------------

# Columns to explicitly drop to prevent leakage or ID/Date issues
cols_to_drop = [
    'customer_id',
    TARGET,
    'first_transaction_date',
    'last_transaction_date'
]

# Define Feature Matrix (X) and Target Vector (y)
X = customer_df.drop(columns=[c for c in cols_to_drop if c in customer_df.columns], errors='ignore')
y = customer_df[TARGET]

# -----------------------------------------------------------------------------
# 3. PREPROCESSING PIPELINE
# -----------------------------------------------------------------------------

# Automatically detect column types
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numerical Features ({len(numeric_features)}): {numeric_features}")
print(f"Categorical Features ({len(categorical_features)}): {categorical_features}")

# Numerical Pipeline: Impute Median -> Standard Scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical Pipeline: Impute Most Frequent -> OneHot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine into ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop' # Drop any columns not explicitly handled
)

# -----------------------------------------------------------------------------
# 4. TRAIN / VALIDATION SPLIT
# -----------------------------------------------------------------------------

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")

# -----------------------------------------------------------------------------
# 5. MODEL DEFINITIONS
# -----------------------------------------------------------------------------

models = {
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
}

if xgb_available:
    models["XGBoost"] = XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)
else:
    models["Extra Trees"] = ExtraTreesRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# -----------------------------------------------------------------------------
# 6. EVALUATION FUNCTION
# -----------------------------------------------------------------------------

def evaluate_model(name, model, X_val, y_val):
    """
    Predicts and evaluates a model on validation data.
    Returns a dictionary of metrics.
    """
    y_pred = model.predict(X_val)

    mae = mean_absolute_error(y_val, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)

    print(f"\n--- {name} Results ---")
    print(f"MAE:  {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R¬≤:   {r2:.4f}")

    return {'MAE': mae, 'RMSE': rmse, 'R2': r2, 'Model': model}

# -----------------------------------------------------------------------------
# 7. TRAINING & COMPARISON LOOP
# -----------------------------------------------------------------------------

results = {}
best_model_name = None
best_model_score = -float('inf') # Maximizing R2

print("\nüöÄ Starting Model Training...")

for name, model_instance in models.items():
    # Create a full pipeline for the specific model
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model_instance)
    ])

    # Train
    clf.fit(X_train, y_train)

    # Evaluate
    metrics = evaluate_model(name, clf, X_val, y_val)
    results[name] = metrics

    # Model Selection Logic (Based on R2)
    if metrics['R2'] > best_model_score:
        best_model_score = metrics['R2']
        best_model_name = name

# -----------------------------------------------------------------------------
# 8. BEST MODEL SELECTION & EXPLANATION
# -----------------------------------------------------------------------------

print("\n" + "="*40)
print(f"üèÜ BEST PERFORMING MODEL: {best_model_name}")
print("="*40)

best_pipeline = results[best_model_name]['Model']

print(f"\nWhy {best_model_name} likely won:")
if "Boosting" in best_model_name or "XGBoost" in best_model_name:
    print("- Boosting algorithms effectively reduce bias by sequentially correcting errors.")
    print("- They handle non-linear relationships well and are robust to outliers in spend data.")
elif "Forest" in best_model_name or "Trees" in best_model_name:
    print("- Ensemble tree methods reduce variance through bagging.")
    print("- They are very effective at capturing complex interactions between recency and frequency features.")



Numerical Features (15): ['total_spend', 'total_transactions', 'avg_order_value', 'recency_days', 'tenure_days', 'spend_last_30d', 'tx_last_30d', 'spend_last_60d', 'tx_last_60d', 'spend_last_90d', 'tx_last_90d', 'unique_products', 'unique_categories', 'total_loyalty_points', 'is_cold_start']
Categorical Features (3): ['dominant_category', 'segment_id', 'loyalty_status']

Training set shape: (16000, 18)
Validation set shape: (4000, 18)

üöÄ Starting Model Training...

--- Gradient Boosting Results ---
MAE:  230.3375
RMSE: 1178.1817
R¬≤:   0.9627

--- Random Forest Results ---
MAE:  222.7946
RMSE: 1178.1015
R¬≤:   0.9627

--- XGBoost Results ---
MAE:  293.6344
RMSE: 1545.3224
R¬≤:   0.9358

üèÜ BEST PERFORMING MODEL: Random Forest

Why Random Forest likely won:
- Ensemble tree methods reduce variance through bagging.
- They are very effective at capturing complex interactions between recency and frequency features.

‚úÖ Model pipeline saved to 'best_customer_spend_model.joblib'

üîç E

In [None]:
# -----------------------------------------------------------------------------
# 9. SAVE ARTIFACTS
# -----------------------------------------------------------------------------

filename = 'best_customer_spend_model.joblib'
joblib.dump(best_pipeline, filename)
print(f"\n‚úÖ Model pipeline saved to '{filename}'")

# Example Inference
print("\nüîç Example Inference (First 2 Validation Rows):")
sample_data = X_val.iloc[:2]
predictions = best_pipeline.predict(sample_data)
print(f"Actual:    {y_val.iloc[:2].values}")
print(f"Predicted: {predictions}")

In [11]:
# -----------------------------------------------------------------------------
# 9. SAVE ARTIFACTS (USING PICKLE)
# -----------------------------------------------------------------------------

import pickle

filename = "best_customer_spend_model.pkl"

with open(filename, "wb") as f:
    pickle.dump(best_pipeline, f)

print(f"\n‚úÖ Model pipeline saved to '{filename}'")

# -----------------------------------------------------------------------------
# Example Inference
# -----------------------------------------------------------------------------
print("\nüîç Example Inference (First 2 Validation Rows):")

sample_data = X_val.iloc[:2]
predictions = best_pipeline.predict(sample_data)

print(f"Actual:    {y_val.iloc[:2].values}")
print(f"Predicted: {predictions}")



‚úÖ Model pipeline saved to 'best_customer_spend_model.pkl'

üîç Example Inference (First 2 Validation Rows):
Actual:    [0. 0.]
Predicted: [0. 0.]
