## Import Libraries

In [80]:
#! pip install matplotlib

In [81]:
#! pip install scikit-learn

In [82]:
# Import Libraries
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np # Import numpy for np.sqrt
import joblib # For saving/loading models

print("Libraries imported successfully.")

Libraries imported successfully.


## Load RFM Features

In [83]:
# Load RFM Features
# Define the path to your RFM features file in the 'data' folder.
# Since this notebook is in the 'models' folder, you need to go up one level (..)
# to the project root, then down into the 'data' folder.
data_folder = '../data'
rfm_file_name = 'rfm_features_enhanced.xlsx'
rfm_file_path = os.path.join(data_folder, rfm_file_name)

print(f"Loading RFM features from: {rfm_file_path}")

# Load the RFM DataFrame
rfm_df = pd.read_excel(rfm_file_path)

print(f"RFM features loaded. Shape: {rfm_df.shape}")
print("First 5 rows of RFM data:")
display(rfm_df.head())
print("\nRFM Data Info:")
rfm_df.info()
print("\nDescriptive statistics of RFM data:")
display(rfm_df.describe())

Loading RFM features from: ../data\rfm_features_enhanced.xlsx
RFM features loaded. Shape: (5878, 48)
First 5 rows of RFM data:


Unnamed: 0,Customer ID,Recency,Frequency,Monetary,AOV,Tenure,UniqueProducts,Country_Australia,Country_Austria,Country_Bahrain,...,Country_Singapore,Country_Spain,Country_Sweden,Country_Switzerland,Country_Thailand,Country_USA,Country_United Arab Emirates,Country_United Kingdom,Country_Unspecified,Country_West Indies
0,12346,326,12,77556.46,6463.038333,726,27,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,12347,2,8,4921.53,615.19125,404,126,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12348,75,5,2019.4,403.88,438,25,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,12349,19,4,4428.69,1107.1725,589,138,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,12350,310,1,334.4,334.4,310,17,0,0,0,...,0,0,0,0,0,0,0,0,0,0



RFM Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5878 entries, 0 to 5877
Data columns (total 48 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Customer ID                   5878 non-null   int64  
 1   Recency                       5878 non-null   int64  
 2   Frequency                     5878 non-null   int64  
 3   Monetary                      5878 non-null   float64
 4   AOV                           5878 non-null   float64
 5   Tenure                        5878 non-null   int64  
 6   UniqueProducts                5878 non-null   int64  
 7   Country_Australia             5878 non-null   int64  
 8   Country_Austria               5878 non-null   int64  
 9   Country_Bahrain               5878 non-null   int64  
 10  Country_Belgium               5878 non-null   int64  
 11  Country_Brazil                5878 non-null   int64  
 12  Country_Canada                5878 non-null   

Unnamed: 0,Customer ID,Recency,Frequency,Monetary,AOV,Tenure,UniqueProducts,Country_Australia,Country_Austria,Country_Bahrain,...,Country_Singapore,Country_Spain,Country_Sweden,Country_Switzerland,Country_Thailand,Country_USA,Country_United Arab Emirates,Country_United Kingdom,Country_Unspecified,Country_West Indies
count,5878.0,5878.0,5878.0,5878.0,5878.0,5878.0,5878.0,5878.0,5878.0,5878.0,...,5878.0,5878.0,5878.0,5878.0,5878.0,5878.0,5878.0,5878.0,5878.0,5878.0
mean,15315.313542,201.331916,6.289384,2955.904095,385.180841,474.711637,81.989112,0.002552,0.001871,0.00034,...,0.00017,0.006635,0.003232,0.003403,0.00017,0.001531,0.000681,0.910174,0.001021,0.00017
std,1715.572666,209.338707,13.009406,14440.852688,1214.286459,223.098342,116.484552,0.050456,0.043223,0.018444,...,0.013043,0.081191,0.056767,0.058237,0.013043,0.039103,0.02608,0.285957,0.031936,0.013043
min,12346.0,1.0,1.0,2.95,2.95,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,13833.25,26.0,1.0,342.28,176.6825,313.0,19.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,15314.5,96.0,3.0,867.74,279.242679,530.0,45.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,16797.75,380.0,7.0,2248.305,414.902458,668.0,103.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,18287.0,739.0,398.0,580987.04,84236.25,739.0,2550.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Define Target Variable (CLTV Proxy) and Features

In [84]:
#  Define Target Variable (CLTV Proxy) and Features

target = 'Monetary'

# IMPORTANT: Include all the new features along with Recency and Frequency
features = [
    'Recency',
    'Frequency',
    'AOV',            # New Feature
    'Tenure',         # New Feature
    'UniqueProducts'  # New Feature
]

# Dynamically add all one-hot encoded country columns
# We need to exclude 'CustomerID', 'Monetary', 'Recency', 'Frequency', 'AOV', 'Tenure', 'UniqueProducts'
# from the columns to get the country columns.
# It's safer to get all columns that start with 'Country_'
country_features = [col for col in rfm_df.columns if col.startswith('Country_')]
features.extend(country_features)

print(f"Features selected for modeling: {features}")

X = rfm_df[features]
y = rfm_df[target]

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print("\nFirst 5 rows of features (X):")
display(X.head())
print("\nFirst 5 rows of target (y):")
display(y.head())

Features selected for modeling: ['Recency', 'Frequency', 'AOV', 'Tenure', 'UniqueProducts', 'Country_Australia', 'Country_Austria', 'Country_Bahrain', 'Country_Belgium', 'Country_Brazil', 'Country_Canada', 'Country_Channel Islands', 'Country_Cyprus', 'Country_Czech Republic', 'Country_Denmark', 'Country_EIRE', 'Country_European Community', 'Country_Finland', 'Country_France', 'Country_Germany', 'Country_Greece', 'Country_Iceland', 'Country_Israel', 'Country_Italy', 'Country_Japan', 'Country_Korea', 'Country_Lebanon', 'Country_Lithuania', 'Country_Malta', 'Country_Netherlands', 'Country_Nigeria', 'Country_Norway', 'Country_Poland', 'Country_Portugal', 'Country_RSA', 'Country_Saudi Arabia', 'Country_Singapore', 'Country_Spain', 'Country_Sweden', 'Country_Switzerland', 'Country_Thailand', 'Country_USA', 'Country_United Arab Emirates', 'Country_United Kingdom', 'Country_Unspecified', 'Country_West Indies']
Features (X) shape: (5878, 46)
Target (y) shape: (5878,)

First 5 rows of features (

Unnamed: 0,Recency,Frequency,AOV,Tenure,UniqueProducts,Country_Australia,Country_Austria,Country_Bahrain,Country_Belgium,Country_Brazil,...,Country_Singapore,Country_Spain,Country_Sweden,Country_Switzerland,Country_Thailand,Country_USA,Country_United Arab Emirates,Country_United Kingdom,Country_Unspecified,Country_West Indies
0,326,12,6463.038333,726,27,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2,8,615.19125,404,126,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,75,5,403.88,438,25,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,19,4,1107.1725,589,138,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,310,1,334.4,310,17,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0



First 5 rows of target (y):


0    77556.46
1     4921.53
2     2019.40
3     4428.69
4      334.40
Name: Monetary, dtype: float64

## Split Data into Training and Testing Sets

In [85]:
# Split Data into Training and Testing Sets
# Split the data into training and testing sets (e.g., 80% train, 20% test)
# random_state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training features (X_train) shape: {X_train.shape}")
print(f"Testing features (X_test) shape: {X_test.shape}")
print(f"Training target (y_train) shape: {y_train.shape}")
print(f"Testing target (y_test) shape: {y_test.shape}")

Training features (X_train) shape: (4702, 46)
Testing features (X_test) shape: (1176, 46)
Training target (y_train) shape: (4702,)
Testing target (y_test) shape: (1176,)


In [86]:
# Cell 5: Standardize Features
# It's crucial to fit the scaler ONLY on the training data and then transform both
# training and testing data to prevent data leakage.
from sklearn.preprocessing import StandardScaler # This is the crucial line for StandardScaler
scaler = StandardScaler()

# Fit on training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features standardized successfully.")
print("\nShape of scaled training features:", X_train_scaled.shape)
print("Shape of scaled testing features:", X_test_scaled.shape)
print("First 5 rows of scaled training features (X_train_scaled):")
print(X_train_scaled[:5])

# You should also save this scaler if you plan to use the model for new, unseen data,
# as new data would also need to be scaled using the *same* scaler.
# For now, we'll proceed with saving the model, and we can discuss saving the scaler later.

Features standardized successfully.

Shape of scaled training features: (4702, 46)
Shape of scaled testing features: (1176, 46)
First 5 rows of scaled training features (X_train_scaled):
[[-0.52935108 -0.10007131 -0.08534577  0.0345183   0.08446623 -0.05265402
  -0.03574469 -0.02062842 -0.0701112  -0.02062842 -0.02917921 -0.05058296
  -0.04128321 -0.01458495 -0.03861279 -0.02526725  0.         -0.05058296
  -0.12817519 -0.12987884 -0.02917921 -0.01458495 -0.02526725 -0.05058296
  -0.04128321 -0.02062842  0.         -0.01458495 -0.02062842 -0.05843309
  -0.01458495 -0.05265402 -0.03262682 -0.05843309 -0.02062842 -0.01458495
  -0.01458495 -0.0801326  -0.05657156 -0.05657156 -0.01458495 -0.03861279
  -0.02526725  0.30989885 -0.03262682 -0.01458495]
 [-0.83111704 -0.32563951 -0.04928679 -1.8222295  -0.42862804 -0.05265402
  -0.03574469 -0.02062842 -0.0701112  -0.02062842 -0.02917921 -0.05058296
  -0.04128321 -0.01458495 -0.03861279 -0.02526725  0.         -0.05058296
  -0.12817519 -0.12987

In [87]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso,ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [88]:
##Create a Function to Evaluate Model
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [89]:
## Beginning Model Training
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Adaboost Regressor":AdaBoostRegressor()
   
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 9423.9862
- Mean Absolute Error: 1643.9446
- R2 Score: 0.4770
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 14380.1704
- Mean Absolute Error: 1811.8159
- R2 Score: 0.4305


Lasso
Model performance for Training set
- Root Mean Squared Error: 9424.8115
- Mean Absolute Error: 1642.0488
- R2 Score: 0.4770
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 14386.8231
- Mean Absolute Error: 1806.3772
- R2 Score: 0.4300


Ridge
Model performance for Training set
- Root Mean Squared Error: 9425.4367
- Mean Absolute Error: 1647.4989
- R2 Score: 0.4769
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 14420.1171
- Mean Absolute Error: 1815.4581
- R2 Score: 0.4274


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 9493.3033
- Mean Absolute Error: 1363.46

Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6995.3468
- Mean Absolute Error: 527.5595
- R2 Score: 0.8652


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 3081.0924
- Mean Absolute Error: 172.9234
- R2 Score: 0.9441
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6113.2286
- Mean Absolute Error: 424.7028
- R2 Score: 0.8971


Adaboost Regressor
Model performance for Training set
- Root Mean Squared Error: 9195.0968
- Mean Absolute Error: 8858.6929
- R2 Score: 0.5021
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 12578.2465
- Mean Absolute Error: 9378.0140
- R2 Score: 0.5643




In [90]:
#Initialize few parameter for Hyperparamter tuning
knn_params = {"n_neighbors": [2, 3, 10, 20, 40, 50]}
rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "auto", 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}

ada_params={
    "n_estimators":[50,60,70,80],
    "loss":['linear','square','exponential']
}

In [91]:
# Models list for Hyperparameter tuning
randomcv_models = [('KNN', KNeighborsRegressor(), knn_params),
                   ("RF", RandomForestRegressor(), rf_params),
                   ("Adaboost",AdaBoostRegressor(),ada_params)
                   
                   ]

In [92]:
##Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])



Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits


69 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
39 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\aman2\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\aman2\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1356, in wrapper
    estimator._validate_params()
  File "c:\Users\aman2\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 469, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\aman2\AppData\Local\Programs\Python\Python311\Lib

Fitting 3 folds for each of 12 candidates, totalling 36 fits
---------------- Best Params for KNN -------------------
{'n_neighbors': 20}
---------------- Best Params for RF -------------------
{'n_estimators': 1000, 'min_samples_split': 2, 'max_features': 8, 'max_depth': None}
---------------- Best Params for Adaboost -------------------
{'n_estimators': 50, 'loss': 'linear'}


In [94]:
## Retraining the models with best parameters
models = {
    "Random Forest Regressor": RandomForestRegressor(n_estimators=1000, min_samples_split=2, max_features=8, max_depth=None, 
                                                     n_jobs=-1),
     "K-Neighbors Regressor": KNeighborsRegressor(n_neighbors=20, n_jobs=-1),
     "Adaboost":AdaBoostRegressor(n_estimators=50,loss='linear')
    
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 3298.2882
- Mean Absolute Error: 324.2296
- R2 Score: 0.9359
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 11686.7347
- Mean Absolute Error: 1043.4457
- R2 Score: 0.6239


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 10683.1457
- Mean Absolute Error: 1597.3685
- R2 Score: 0.3280
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 16955.9572
- Mean Absolute Error: 1975.2039
- R2 Score: 0.2083


Adaboost
Model performance for Training set
- Root Mean Squared Error: 7796.1799
- Mean Absolute Error: 7358.8469
- R2 Score: 0.6421
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 10985.7202
- Mean Absolute Error: 7799.7328
- R2 Score: 0.6677


