# Use numeric prediction techniques to build a predictive model for the customers.xlsx dataset.
This dataset is provided on the course website and contains data about whether or not different consumers made a purchase in response to a test mailing of a certain catalog and, in case of a purchase, how much money each consumer spent. The data file has a brief description of all the attributes in a separate worksheet. 
- Note that this dataset has two possible outcome variables: *Purchase* (0/1 value: whether or not the purchase was made) and *Spending* (numeric value: amount spent).

In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
# Suppress every warning of category UserWarning for the rest of the session
# so that library notices do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=UserWarning)

## Data Preprocessing

In [3]:
# Load the 'All Data' worksheet of the workbook into a DataFrame and preview the first rows
df = pd.read_excel('customers.xlsx', sheet_name='All Data')
df.head()

Unnamed: 0,sequence_number,US,source_a,source_c,source_b,source_d,source_e,source_m,source_o,source_h,...,source_x,source_w,Freq,last_update_days_ago,1st_update_days_ago,Web order,Gender=male,Address_is_res,Purchase,Spending
0,1,1,0,0,1,0,0,0,0,0,...,0,0,2,3662,3662,1,0,1,1,127.87
1,2,1,0,0,0,0,1,0,0,0,...,0,0,0,2900,2900,1,1,0,0,0.0
2,3,1,0,0,0,0,0,0,0,0,...,0,0,2,3883,3914,0,0,0,1,127.48
3,4,1,0,1,0,0,0,0,0,0,...,0,0,1,829,829,0,1,0,0,0.0
4,5,1,0,1,0,0,0,0,0,0,...,0,0,1,869,869,0,0,0,0,0.0


In [4]:
# Count missing values per column (this only reports them; nothing is imputed or dropped)
df.isnull().sum()

sequence_number         0
US                      0
source_a                0
source_c                0
source_b                0
source_d                0
source_e                0
source_m                0
source_o                0
source_h                0
source_r                0
source_s                0
source_t                0
source_u                0
source_p                0
source_x                0
source_w                0
Freq                    0
last_update_days_ago    0
1st_update_days_ago     0
Web order               0
Gender=male             0
Address_is_res          0
Purchase                0
Spending                0
dtype: int64

In [5]:
# Summary statistics for the numeric columns
df.describe()

Unnamed: 0,sequence_number,US,source_a,source_c,source_b,source_d,source_e,source_m,source_o,source_h,...,source_x,source_w,Freq,last_update_days_ago,1st_update_days_ago,Web order,Gender=male,Address_is_res,Purchase,Spending
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1000.5,0.8245,0.1265,0.056,0.06,0.0415,0.151,0.0165,0.0335,0.0525,...,0.018,0.1375,1.417,2155.101,2435.6015,0.426,0.5245,0.221,0.5,102.560745
std,577.494589,0.380489,0.332495,0.229979,0.237546,0.199493,0.358138,0.12742,0.179983,0.223089,...,0.132984,0.344461,1.405738,1141.302846,1077.872233,0.494617,0.499524,0.415024,0.500125,186.749816
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,500.75,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1133.0,1671.25,0.0,0.0,0.0,0.0,0.0
50%,1000.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,2280.0,2721.0,0.0,1.0,0.0,0.5,1.855
75%,1500.25,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,3139.25,3353.0,1.0,1.0,0.0,1.0,152.5325
max,2000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,15.0,4188.0,4188.0,1.0,1.0,1.0,1.0,1500.06


In [8]:
# import packages
from sklearn.model_selection import RandomizedSearchCV, KFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasRegressor
from sklearn.metrics import make_scorer
from xgboost import XGBRegressor

### (a) Build numeric prediction models that predict Spending based on the other available customer information. 
- Use linear regression, k-NN, regression tree, SVM regression, and neural network models, as well as ensemble models.

In [9]:
# Separate the target (Spending) from the predictors. The row identifier and
# both outcome columns are excluded from the feature matrix.
y = df["Spending"]
non_feature_columns = ["sequence_number", "Purchase", "Spending"]
X = df.drop(columns=non_feature_columns)

# Rescale the features with Min-Max scaling.
# NOTE(review): the scaler is fitted on the full dataset before any
# cross-validation split, so test folds influence the scaling parameters.
scaler = MinMaxScaler()
X = scaler.fit(X).transform(X)

In [10]:
## Factory for the feed-forward neural network regressor
def create_neural_network(hidden_layer_sizes=(100,), alpha=0.0001):
    """Build and compile a fully connected Keras regression network.

    Parameters
    ----------
    hidden_layer_sizes : sequence of int
        Width of each hidden layer, in order. Must contain at least one entry.
    alpha : float
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    A compiled ``Sequential`` model with ReLU hidden layers and a single
    linear output unit, trained with Adam on mean squared error.

    The input width is read from the module-level ``X`` at call time.
    """
    network = Sequential()

    # The first hidden layer also declares the number of input features.
    first_width = hidden_layer_sizes[0]
    n_features = X.shape[1]
    network.add(Dense(units=first_width, activation="relu", input_dim=n_features))

    # Any remaining hidden layers.
    for width in hidden_layer_sizes[1:]:
        network.add(Dense(units=width, activation="relu"))

    # Single linear unit for the numeric prediction.
    network.add(Dense(units=1, activation="linear"))
    network.compile(optimizer="adam", loss="mean_squared_error")
    return network

In [11]:
## Define models and hyperparameter grids
# Each entry maps a display name to (estimator, search space). An empty search
# space means the estimator is evaluated with its default settings only.
models = {
    'Linear Regression': (LinearRegression(), {}),
    'k-NN': (KNeighborsRegressor(), {'n_neighbors': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]}),
    'Decision Tree': (DecisionTreeRegressor(), {'max_depth': [None, 5, 10, 15, 20]}),
    'SVM Regression': (SVR(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly']}),
    'Random Forest': (RandomForestRegressor(), {
        'n_estimators': [50, 100, 150, 200],
        'max_depth': [None, 5, 10, 15, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }),
    # scikeras deprecated `build_fn` in favour of `model`; use the supported name.
    'Neural Network': (KerasRegressor(model=create_neural_network, batch_size=32, epochs=100, verbose=0), {}),
    'XGBoost': (XGBRegressor(), {'n_estimators': [50, 100, 150, 200], 'learning_rate': [0.01, 0.1, 0.3]}),
}

# Filled by the nested cross-validation loop: model name -> fitted estimator
best_models = {}

In [12]:
# Custom scoring function based on the root mean squared error
def custom_rmse(y_true, y_pred):
    """Return the negated RMSE between ``y_true`` and ``y_pred``.

    The sign is flipped so that a larger score corresponds to a smaller error.
    """
    return -sqrt(mean_squared_error(y_true, y_pred))

# Wrap the function as a scorer object usable by the search utilities
rmse_scorer = make_scorer(custom_rmse)

In [13]:
## Nested cross-validation
# The outer folds estimate out-of-sample error; the inner folds are used by
# the randomized search to pick hyperparameters on each outer training set.
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=42)

for model_name, (model, param_grid) in models.items():
    random_search = RandomizedSearchCV(model, param_distributions=param_grid,
                                       n_iter=5, scoring=rmse_scorer, cv=inner_cv, random_state=42)
    rmse_scores = []

    for train_index, test_index in outer_cv.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        # KFold yields positional indices, so select from the Series by
        # position (.iloc) rather than by label.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Tune on the outer training fold, then score the refit best
        # estimator on the held-out outer test fold.
        random_search.fit(X_train, y_train)
        best_model = random_search.best_estimator_
        y_pred = best_model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = sqrt(mse)
        rmse_scores.append(rmse)

    avg_rmse = np.mean(rmse_scores)
    # Keeps the estimator selected on the last outer fold only.
    best_models[model_name] = best_model
    print(f"{model_name} Average RMSE: {avg_rmse:.2f}")

Linear Regression Average RMSE: 127.73


  File "C:\Users\Jessica\anaconda3\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


k-NN Average RMSE: 169.75
Decision Tree Average RMSE: 135.69
SVM Regression Average RMSE: 167.60
Random Forest Average RMSE: 127.49
Neural Network Average RMSE: 137.34
XGBoost Average RMSE: 134.98


### (b) As a variation on this exercise, create a separate “restricted” dataset (i.e., a subset of the original dataset), which includes only purchase records (i.e., where Purchase = 1). 
- Build numeric prediction models to predict Spending for this restricted dataset. 

In [15]:
df2 = pd.read_excel('customers.xlsx', sheet_name='All Data')
# Filter the dataset to create a restricted dataset with only purchase records
restricted_df = df2[df2['Purchase'] == 1]

# Split the data into features (X) and the target variable (y).
# Purchase is excluded as in part (a): it is an outcome variable, and after
# the filter above it is constant (always 1), so it carries no information.
X_restricted = restricted_df.drop(columns=["sequence_number", "Purchase", "Spending"])
y_restricted = restricted_df["Spending"]

# Normalize the features using Min-Max scaling
scaler = MinMaxScaler()
X_restricted = scaler.fit_transform(X_restricted)

In [16]:
# Factory for the feed-forward neural network regressor (restricted dataset)
def create_neural_network(hidden_layer_sizes=(100,), alpha=0.0001):
    """Build and compile a fully connected Keras regression network.

    Parameters
    ----------
    hidden_layer_sizes : sequence of int
        Width of each hidden layer, in order. Must contain at least one entry.
    alpha : float
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    A compiled ``Sequential`` model with ReLU hidden layers and a single
    linear output unit, trained with Adam on mean squared error.

    The input width is read from the module-level ``X_restricted`` at call time.
    """
    network = Sequential()

    # The first hidden layer also declares the number of input features.
    first_width = hidden_layer_sizes[0]
    n_features = X_restricted.shape[1]
    network.add(Dense(units=first_width, activation="relu", input_dim=n_features))

    # Any remaining hidden layers.
    for width in hidden_layer_sizes[1:]:
        network.add(Dense(units=width, activation="relu"))

    # Single linear unit for the numeric prediction.
    network.add(Dense(units=1, activation="linear"))
    network.compile(optimizer="adam", loss="mean_squared_error")
    return network

# Define models and hyperparameter grids (same as task (a))
# Each entry maps a display name to (estimator, search space). An empty search
# space means the estimator is evaluated with its default settings only.
models = {
    'Linear Regression': (LinearRegression(), {}),
    'k-NN': (KNeighborsRegressor(), {'n_neighbors': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]}),
    'Decision Tree': (DecisionTreeRegressor(), {'max_depth': [None, 5, 10, 15, 20]}),
    'SVM Regression': (SVR(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly']}),
    'Random Forest': (RandomForestRegressor(), {
        'n_estimators': [50, 100, 150, 200],
        'max_depth': [None, 5, 10, 15, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }),
    # scikeras deprecated `build_fn` in favour of `model`; use the supported name.
    'Neural Network': (KerasRegressor(model=create_neural_network, batch_size=32, epochs=100, verbose=0), {}),
    'XGBoost': (XGBRegressor(), {'n_estimators': [50, 100, 150, 200], 'learning_rate': [0.01, 0.1, 0.3]}),
}

# Filled by the nested cross-validation loop: model name -> fitted estimator
best_models_restricted = {}

In [17]:
# Nested cross-validation for the restricted dataset
# The outer folds estimate out-of-sample error; the inner folds are used by
# the randomized search to pick hyperparameters on each outer training set.
outer_cv_restricted = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv_restricted = KFold(n_splits=10, shuffle=True, random_state=42)

for model_name, (model, param_grid) in models.items():
    # Use the splitter defined for this section rather than the one left over
    # from part (a).
    random_search = RandomizedSearchCV(model, param_distributions=param_grid,
                                       n_iter=5, scoring=rmse_scorer,
                                       cv=inner_cv_restricted, random_state=42)
    rmse_scores = []

    for train_index, test_index in outer_cv_restricted.split(X_restricted, y_restricted):
        X_train, X_test = X_restricted[train_index], X_restricted[test_index]
        # The filtered Series keeps its original row labels, so positional
        # selection (.iloc) is required with the indices KFold yields.
        y_train, y_test = y_restricted.iloc[train_index], y_restricted.iloc[test_index]

        # Tune on the outer training fold, then score the refit best
        # estimator on the held-out outer test fold.
        random_search.fit(X_train, y_train)
        best_model = random_search.best_estimator_
        y_pred = best_model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = sqrt(mse)
        rmse_scores.append(rmse)

    avg_rmse = np.mean(rmse_scores)
    # Store in the restricted-dataset dict so part (a)'s results are not
    # overwritten. Keeps the estimator selected on the last outer fold only.
    best_models_restricted[model_name] = best_model
    print(f"{model_name} Average RMSE: {avg_rmse:.2f}")

Linear Regression Average RMSE: 165.25
k-NN Average RMSE: 208.50
Decision Tree Average RMSE: 185.14
SVM Regression Average RMSE: 206.79
Random Forest Average RMSE: 161.56
Neural Network Average RMSE: 199.48
XGBoost Average RMSE: 174.43
