In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import datetime as dt

# Load dataset (adjust path as needed)
df = pd.read_excel("dataset/online_retail_II.xlsx",sheet_name="Year 2009-2010")

# Drop missing or invalid values
df.dropna(subset=['Customer ID', 'Invoice', 'Quantity', 'Price'], inplace=True)

# Compute Total Price
df['TotalPrice'] = df['Quantity'] * df['Price']

# Group by Customer to compute CLV and Frequency
customer_df = df.groupby('Customer ID').agg({
    'Invoice': 'nunique',         # Frequency
    'InvoiceDate': 'max',         # Last purchase date
    'TotalPrice': 'sum'           # CLV
}).reset_index()

customer_df.rename(columns={
    'Invoice': 'Frequency',
    'InvoiceDate': 'LastPurchase',
    'TotalPrice': 'CLV'
}, inplace=True)

# Add Recency and Tenure
latest_date = df['InvoiceDate'].max()
earliest_date = df['InvoiceDate'].min()
customer_df['Recency'] = (latest_date - customer_df['LastPurchase']).dt.days
customer_df['Tenure'] = (customer_df['LastPurchase'] - earliest_date).dt.days

# Feature engineering
customer_df['Tenure_Frequency'] = customer_df['Tenure'] * customer_df['Frequency']
customer_df['Freq_per_day'] = customer_df['Frequency'] / (customer_df['Tenure'] + 1)
customer_df['HighFreq'] = (customer_df['Frequency'] > customer_df['Frequency'].median()).astype(int)

# Drop NaNs just in case
customer_df.dropna(inplace=True)

# Feature set
X = customer_df[['Frequency', 'Tenure', 'Recency', 'Tenure_Frequency', 'Freq_per_day', 'HighFreq']]
y = customer_df['CLV']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint

# Define parameter distribution
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(5, 30),
    'min_samples_split': randint(2, 15)
}

# Randomized search with 20 iterations
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=20,                  # Number of random combinations to try
    cv=3,                       # 3-fold cross-validation
    scoring='r2',
    random_state=42,
    n_jobs=-1                   # Use all CPU cores
)

random_search.fit(X_train, y_train)

print("Best R² score from RandomizedSearchCV:", random_search.best_score_)
print("Best Parameters:", random_search.best_params_)


Best R² score from RandomizedSearchCV: -0.12214676352612637
Best Parameters: {'max_depth': 29, 'min_samples_split': 13, 'n_estimators': 113}


In [7]:
model = RandomForestRegressor(n_estimators=300, max_depth=10, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("R^2 Score:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

R^2 Score: 0.6461277105453227
RMSE: 5313.600451079329
