## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression, LassoCV, LogisticRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score, classification_report, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

import joblib

import warnings
warnings.filterwarnings('ignore')

from scipy import stats

## Data Proprocessing

### Data Loading and removing columns

In [2]:
# Loading the dataset
data = pd.read_csv('supermarket_sales.csv')

print('Original Data:')
display(data)

# Dropping the 'Invoice ID' Column
data.drop(columns=['Invoice ID'], inplace=True)

# Dropping the 'City' Column
data.drop(columns=['City'], inplace=True)

print('After dropping Invoice ID and City:')
display(data)

X_train, X_test = train_test_split(data, test_size=0.2, random_state=43)

print('X_train:')
display(X_train)
print('X_test:')
display(X_test)

Original Data:


Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,80.2200,3/8/2019,10:29,Cash,76.40,4.761905,3.8200,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,489.0480,1/27/2019,20:33,Ewallet,465.76,4.761905,23.2880,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,233-67-5758,C,Naypyitaw,Normal,Male,Health and beauty,40.35,1,42.3675,1/29/2019,13:46,Ewallet,40.35,4.761905,2.0175,6.2
996,303-96-2227,B,Mandalay,Normal,Female,Home and lifestyle,97.38,10,1022.4900,3/2/2019,17:16,Ewallet,973.80,4.761905,48.6900,4.4
997,727-02-1313,A,Yangon,Member,Male,Food and beverages,31.84,1,33.4320,2/9/2019,13:22,Cash,31.84,4.761905,1.5920,7.7
998,347-56-2442,A,Yangon,Normal,Male,Home and lifestyle,65.82,1,69.1110,2/22/2019,15:33,Cash,65.82,4.761905,3.2910,4.1


After dropping Invoice ID and City:


Unnamed: 0,Branch,Customer type,Gender,Product line,Unit price,Quantity,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,A,Member,Female,Health and beauty,74.69,7,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,C,Normal,Female,Electronic accessories,15.28,5,80.2200,3/8/2019,10:29,Cash,76.40,4.761905,3.8200,9.6
2,A,Normal,Male,Home and lifestyle,46.33,7,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,A,Member,Male,Health and beauty,58.22,8,489.0480,1/27/2019,20:33,Ewallet,465.76,4.761905,23.2880,8.4
4,A,Normal,Male,Sports and travel,86.31,7,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,C,Normal,Male,Health and beauty,40.35,1,42.3675,1/29/2019,13:46,Ewallet,40.35,4.761905,2.0175,6.2
996,B,Normal,Female,Home and lifestyle,97.38,10,1022.4900,3/2/2019,17:16,Ewallet,973.80,4.761905,48.6900,4.4
997,A,Member,Male,Food and beverages,31.84,1,33.4320,2/9/2019,13:22,Cash,31.84,4.761905,1.5920,7.7
998,A,Normal,Male,Home and lifestyle,65.82,1,69.1110,2/22/2019,15:33,Cash,65.82,4.761905,3.2910,4.1


X_train:


Unnamed: 0,Branch,Customer type,Gender,Product line,Unit price,Quantity,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
147,C,Normal,Male,Health and beauty,66.14,4,277.7880,3/19/2019,12:46,Credit card,264.56,4.761905,13.2280,5.6
88,A,Normal,Male,Sports and travel,42.47,1,44.5935,1/2/2019,16:57,Cash,42.47,4.761905,2.1235,5.7
731,A,Normal,Male,Health and beauty,56.00,3,176.4000,2/28/2019,19:33,Ewallet,168.00,4.761905,8.4000,4.8
741,C,Normal,Male,Food and beverages,84.83,1,89.0715,1/14/2019,15:20,Ewallet,84.83,4.761905,4.2415,8.8
36,A,Member,Male,Sports and travel,62.62,5,328.7550,3/10/2019,19:15,Ewallet,313.10,4.761905,15.6550,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,C,Normal,Female,Fashion accessories,95.42,4,400.7640,2/2/2019,13:23,Ewallet,381.68,4.761905,19.0840,6.4
817,A,Normal,Male,Food and beverages,33.88,8,284.5920,1/19/2019,20:29,Ewallet,271.04,4.761905,13.5520,9.6
255,B,Member,Male,Fashion accessories,32.62,4,137.0040,1/29/2019,14:12,Cash,130.48,4.761905,6.5240,9.0
320,C,Normal,Female,Food and beverages,22.93,9,216.6885,2/26/2019,20:26,Cash,206.37,4.761905,10.3185,5.5


X_test:


Unnamed: 0,Branch,Customer type,Gender,Product line,Unit price,Quantity,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
858,B,Normal,Male,Health and beauty,57.59,6,362.8170,2/15/2019,13:51,Cash,345.54,4.761905,17.2770,5.1
986,B,Normal,Female,Health and beauty,14.76,2,30.9960,2/18/2019,14:42,Ewallet,29.52,4.761905,1.4760,4.3
183,C,Normal,Male,Health and beauty,34.31,8,288.2040,1/25/2019,15:00,Ewallet,274.48,4.761905,13.7240,5.7
502,C,Normal,Male,Home and lifestyle,69.40,2,145.7400,1/27/2019,19:48,Ewallet,138.80,4.761905,6.9400,9.0
710,A,Member,Male,Food and beverages,80.62,6,507.9060,2/28/2019,20:18,Cash,483.72,4.761905,24.1860,9.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27,A,Normal,Female,Fashion accessories,87.67,2,184.1070,3/10/2019,12:17,Credit card,175.34,4.761905,8.7670,7.7
244,B,Normal,Male,Home and lifestyle,93.87,8,788.5080,2/2/2019,18:42,Credit card,750.96,4.761905,37.5480,8.3
294,B,Normal,Male,Health and beauty,18.11,10,190.1550,3/13/2019,11:46,Ewallet,181.10,4.761905,9.0550,5.9
176,A,Member,Male,Food and beverages,22.17,8,186.2280,3/3/2019,17:01,Credit card,177.36,4.761905,8.8680,9.6


### Functions to process Date and Time Features

In [3]:
# Encoding 'Date' feature into day of the week
def extract_weekday(X):
    X['Date'] = pd.to_datetime(X['Date'])
    return X['Date'].dt.strftime("%A").values.reshape(-1, 1)

# Encoding 'Time' feature into Time of the Day categories
def categorize_time(X):
    categories = ['Morning', 'Afternoon', 'Evening', 'Night']
    bins = [0, 12, 17, 19, 21]
    X['Time'] = pd.to_datetime(X['Time'])
    time_categories = pd.cut(X['Time'].dt.hour, bins=bins, labels=categories)
    return time_categories.values.reshape(-1, 1)

### Building and Implementing Pipelines to Preprocess the data

In [4]:
# Defining the categorical and numerical feature columns
numerical_features = ['Unit price', 'Quantity', 'Total', 'cogs',
                      'gross margin percentage', 'gross income', 'Rating']
categorical_features = ['Branch', 'Product line', 'Payment']

# Defining the column transformer
preprocessor = ColumnTransformer(
    transformers=[('date', Pipeline([('weekday', FunctionTransformer(extract_weekday, validate=False)),
                                     ('encoder', OrdinalEncoder())]), ['Date']),
                  # Extracting Weekday from 'Date' and then Label encoding it
                  ('time', Pipeline([('time_cat', FunctionTransformer(categorize_time, validate=False)),
                                     ('encoder', OrdinalEncoder())]), ['Time']),
                  # Label Encoding 'Gender' and 'Customer type' features
                  ('gender', OrdinalEncoder(), ['Gender']),
                  ('customer', OrdinalEncoder(), ['Customer type']),
                  # Extracting Time of Day from 'Time' and then Label encoding it
                  ('num', StandardScaler(), numerical_features), # Standardizing numerical features
                  ('cat', OneHotEncoder(), categorical_features)]) # One-Hot Encoding categorical features

# Finishing the preprocessing pipeline
preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Applying preprocessing to the training data
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train)

onehotfeatures = list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))
# Converting the data back into a Dataframe
X_train_preprocessed = pd.DataFrame(X_train_preprocessed,
                                    columns=['Date','Time','Gender','Customer type'] + numerical_features + onehotfeatures)

# Saving the preprocesssing pipeline in a pickle file
joblib.dump(preprocessing_pipeline, 'Preprocessor.pkl');

# Displaying the preprocessed data
print('X_train_preprocessed:')
display(X_train_preprocessed)

X_train_preprocessed:


Unnamed: 0,Date,Time,Gender,Customer type,Unit price,Quantity,Total,cogs,gross margin percentage,gross income,...,Branch_C,Product line_Electronic accessories,Product line_Fashion accessories,Product line_Food and beverages,Product line_Health and beauty,Product line_Home and lifestyle,Product line_Sports and travel,Payment_Cash,Payment_Credit card,Payment_Ewallet
0,5.0,2.0,1.0,1.0,0.436005,-0.518907,-0.164198,-0.164198,0.0,-0.164198,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,6.0,0.0,1.0,1.0,-0.463935,-1.544754,-1.121929,-1.121929,0.0,-1.121929,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,4.0,1.0,1.0,1.0,0.050479,-0.860856,-0.580599,-0.580599,0.0,-0.580599,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,1.0,1.0,1.146604,-1.544754,-0.939257,-0.939257,0.0,-0.939257,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,3.0,1.0,1.0,0.0,0.302173,-0.176959,0.045123,0.045123,0.0,0.045123,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,2.0,0.0,0.0,1.0,1.549239,-0.518907,0.340864,0.340864,0.0,0.340864,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
796,2.0,3.0,1.0,1.0,-0.790530,0.848888,-0.136254,-0.136254,0.0,-0.136254,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
797,5.0,0.0,1.0,0.0,-0.838435,-0.518907,-0.742399,-0.742399,0.0,-0.742399,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
798,5.0,3.0,0.0,1.0,-1.206852,1.190837,-0.415134,-0.415134,0.0,-0.415134,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


## Feature Extraction for predicting gross income

In [5]:
# Splitting features and the target variables for predicting 'gross income'
X_train_gi = X_train_preprocessed.drop(columns=['gross income'])
y_train_gi = X_train_preprocessed['gross income']

## Implemeting Linear Regression to predict 'gross income'

In [8]:
# Defining Linear Regression Pipeline
lin_reg_pipeline_gi = Pipeline([('lin_reg', LinearRegression())])

# Fitting the Linear Regression model to the training data
lin_reg_pipeline_gi.fit(X_train_gi, y_train_gi)

# Saving the trained models as pickle files
joblib.dump(lin_reg_pipeline_gi, 'LinearRegressorforGrossIncome.pkl');

In [11]:
y_pred_lin_reg = lin_reg_pipeline_gi.predict(X_train_gi)

# Calculating the regression metrics
lin_reg_r2 = r2_score(y_train_gi, y_pred_lin_reg)
lin_reg_mae = mean_absolute_error(y_train_gi, y_pred_lin_reg)
lin_reg_mse = mean_squared_error(y_train_gi, y_pred_lin_reg)

# Print the regression metrics
print("For Linear Regression:")
print("R-squared (R2) Score:", lin_reg_r2)
print("Mean Absolute Error (MAE):", lin_reg_mae)
print("Mean Squared Error (MSE):", lin_reg_mse,'\n')

For Linear Regression:
R-squared (R2) Score: 1.0
Mean Absolute Error (MAE): 8.585605913279809e-16
Mean Squared Error (MSE): 1.0798938193890548e-30 



In [12]:
# Defining a function to calculate 95% confidence interval of r^2
def estimate_r2_CI(estimator, X, y, n_iter=1000):
    r2_scores = []
    for _ in range(n_iter):
        X_resampled, y_resampled = resample(X, y)
        y_pred = estimator.predict(X_resampled)
        r2_scores.append(r2_score(y_resampled, y_pred))
    return np.percentile(r2_scores, [2.5, 97.5])

confidence_interval = estimate_r2_CI(lin_reg_pipeline_gi, X_train_gi, y_train_gi)
print(f"95% Confidence Interval for Linear Regression 𝑟^2 score: {confidence_interval}")

95% Confidence Interval for Linear Regression 𝑟^2 score: [1. 1.]


## Implemeting Lasso Regression to predict 'gross income'

In [15]:
# Defining Lasso Regression with Cross-Validation Pipeline
lasso_pipeline_gi = Pipeline([('lasso', LassoCV(cv=5, alphas=np.linspace(0.1,5,10)))])  # Using 5-fold cross-validation

# Fitting the Linear Regression with Lasso model to the training data
lasso_pipeline_gi.fit(X_train_gi, y_train_gi)

joblib.dump(lasso_pipeline_gi, 'LassoforGrossIncome.pkl');

y_pred_lasso = lasso_pipeline_gi.predict(X_train_gi)
# Calculating R2 score and other metrics for Lasso regression
lasso_r2 = r2_score(y_train_gi, y_pred_lasso)
lasso_mae = mean_absolute_error(y_train_gi, y_pred_lasso)
lasso_mse = mean_squared_error(y_train_gi, y_pred_lasso)

print("For Lasso Regression:")
print("R-squared (R2) Score:", lasso_r2)
print("Mean Absolute Error (MAE):", lasso_mae)
print("Mean Squared Error (MSE):", lasso_mse)

confidence_interval = estimate_r2_CI(lasso_pipeline_gi, X_train_gi, y_train_gi)
print(f"95% Confidence Interval for Lasso 𝑟^2 score: {confidence_interval}")

For Lasso Regression:
R-squared (R2) Score: 0.99
Mean Absolute Error (MAE): 0.08163780909053812
Mean Squared Error (MSE): 0.010000000000000018
95% Confidence Interval for Lasso 𝑟^2 score: [0.98994155 0.98999999]


## Extracting features for Unit Price Prediction

In [18]:
# Splitting the features and target labels for predicting Unit Price
y_train_up = X_train_preprocessed['Unit price']
X_train_up = X_train_preprocessed.drop(columns=['Unit price'])

## Implementing Linear Regression to predict Unit price

In [19]:
# Linear Regression Pipeline for predicting Unit Price
lin_reg_pipeline_up = Pipeline([('lin_reg', LinearRegression())])

# Fitting the model to the training data
lin_reg_pipeline_up.fit(X_train_up, y_train_up)

# SAving the model as a pickle file
joblib.dump(lin_reg_pipeline_up, 'LinearRegressorforUnitPrice.pkl');

In [20]:
# Predicting for the train set
y_pred_lin_reg = lin_reg_pipeline_up.predict(X_train_up)

lin_reg_r2 = r2_score(y_train_up, y_pred_lin_reg)
lin_reg_mae = mean_absolute_error(y_train_up, y_pred_lin_reg)
lin_reg_mse = mean_squared_error(y_train_up, y_pred_lin_reg)

# Print the regression metrics
print("For Linear Regression:")
print("R-squared (R2) Score:", lin_reg_r2)
print("Mean Absolute Error (MAE):", lin_reg_mae)
print("Mean Squared Error (MSE):", lin_reg_mse,'\n')

For Linear Regression:
R-squared (R2) Score: 0.7784862691176561
Mean Absolute Error (MAE): 0.341372249782427
Mean Squared Error (MSE): 0.221513730882344 



In [21]:
# Calculating the 95% CI for Linear Regressor
confidence_interval = estimate_r2_CI(lin_reg_pipeline_up, X_train_up, y_train_up)
print(f"95% Confidence Interval for Linear Regression 𝑟^2 score: {confidence_interval}")

95% Confidence Interval for Linear Regression 𝑟^2 score: [0.75521218 0.80064739]


## Implementing Lasso Regression to predict Unit Price

In [23]:
# Lasso Regression with Cross-Validation
# Using KFold cross-validation strategy for more robust results
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Defining the lasso pipeline
lasso_pipeline_up = Pipeline([('lasso', LassoCV(cv=kf, alphas=np.linspace(0.2,5,10)))])

# Fitting the model to the training data
lasso_pipeline_up.fit(X_train_up, y_train_up)

# Saving as pickle file
joblib.dump(lasso_pipeline_up, 'LassoforUnitPrice.pkl');

y_pred_lasso = lasso_pipeline_up.predict(X_train_up)
lasso_r2 = r2_score(y_train_up, y_pred_lasso)
lasso_mae = mean_absolute_error(y_train_up, y_pred_lasso)
lasso_mse = mean_squared_error(y_train_up, y_pred_lasso)

print("For Lasso Regression:")
print("R-squared (R2) Score:", lasso_r2)
print("Mean Absolute Error (MAE):", lasso_mae)
print("Mean Squared Error (MSE):", lasso_mse)

confidence_interval = estimate_r2_CI(lasso_pipeline_up, X_train_up, y_train_up)
print(f"95% Confidence Interval for Lasso 𝑟^2 score: {confidence_interval}")

For Lasso Regression:
R-squared (R2) Score: 0.5048700694285799
Mean Absolute Error (MAE): 0.5739002521189034
Mean Squared Error (MSE): 0.49512993057142013
95% Confidence Interval for Lasso 𝑟^2 score: [0.47077485 0.53700979]


## Feature Extraction for Gender Classification for Branch C

In [26]:
# Filtering the data for branch 'C'
train_data_branch_C = X_train_preprocessed[X_train_preprocessed['Branch_C'] == 1]

# Splitting the data for interaction and training logistic regression to classify Gender
X_train_g = train_data_branch_C.drop(columns=['Gender'])
# Only including Product Line, Payment and gross income for interaction
X_train_g_int = train_data_branch_C[['Product line_Electronic accessories', 'Product line_Fashion accessories',
                                 'Product line_Food and beverages','Product line_Health and beauty',
                                 'Product line_Home and lifestyle', 'Product line_Sports and travel',
                                 'Payment_Cash','Payment_Ewallet', 'Payment_Credit card', 'gross income']]
y_train_g = train_data_branch_C['Gender']

## Implementing Logistic Regression to classify Gender

In [33]:
# Defining a custom R^2 scorer for logistic regression
def logistic_r2_estimator(y_true, y_pred_prob):
    return np.square(np.corrcoef(y_true, y_pred_prob)[0, 1])

r2_scorer = make_scorer(logistic_r2_estimator, greater_is_better=True, needs_proba=True)

# Creating a pipeline with Logistic Regression with interaction
pipeline = Pipeline([
    ("interaction", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ("logreg", LogisticRegression(max_iter=10000))
])

# Defining hyperparameters to tune
param_grid = {'logreg__C': np.linspace(-3, 3, 7)}

# Implementing GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, scoring=r2_scorer, cv=5, n_jobs=-1)

# FItting the model to the training data
grid_search.fit(X_train_g, y_train_g)

# Getting the best model and parameters
best_params = grid_search.best_params_
best_r2 = grid_search.best_score_

print(f"Best parameters: {best_params}\n")

# Saving as pickle file
joblib.dump(grid_search.best_estimator_, 'LogRegforGender.pkl');

r2_scores = cross_val_score(grid_search.best_estimator_, X_train_g, y_train_g, cv=5, scoring=r2_scorer)
print(classification_report(y_train_g, grid_search.best_estimator_.predict(X_train_g)))

confidence_level = 0.95
degrees_freedom = len(r2_scores) - 1
confidence_interval = stats.t.interval(confidence_level, degrees_freedom, loc=np.mean(r2_scores), scale=stats.sem(r2_scores))

print(f"R^2 (coefficient of determination): {best_r2:.4f}")
print(f"95% confidence interval for R^2: ({confidence_interval[0]:.4f}, {confidence_interval[1]:.4f})")

Best parameters: {'logreg__C': 1.0}

              precision    recall  f1-score   support

         0.0       0.76      0.76      0.76       136
         1.0       0.73      0.73      0.73       119

    accuracy                           0.75       255
   macro avg       0.75      0.75      0.75       255
weighted avg       0.75      0.75      0.75       255

R^2 (coefficient of determination): 0.0218
95% confidence interval for R^2: (-0.0054, 0.0489)


## Feature Extraction for classifying Customer type

In [30]:
# Filtering the data for branch 'C'
train_branch_c_data = X_train_preprocessed[X_train_preprocessed['Branch_C'] == 1]

# Define X and y for training
X_train_ct = train_branch_c_data.drop(columns=['Customer type'])
X_train_ct_int = train_branch_c_data[['Gender', 'Date', 'Time']]
y_train_ct = train_branch_c_data['Customer type']

## Implementing Logistic Regression for classifying Customer type

In [32]:
# Defining the pipeline for logistic regression with interaction
pipeline = Pipeline([
    ("interaction", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ("logreg", LogisticRegression(max_iter=10000))
])

# Defining the hyperparameters to search over
param_grid = {"logreg__C": [0.001, 0.01, 0.1, 1, 10, 100],
              "logreg__penalty": ["l1", "l2"],
              "logreg__solver": ["liblinear", "saga"]}

# Defining the GridSearchCV attribute
logreg_ct = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)

# Fitting the model to the training data
logreg_ct.fit(X_train_ct, y_train_ct)

# Saving as pickle file
joblib.dump(grid_search.best_estimator_, 'LogRegforCustomer.pkl')

# Printing the best parameters
print("Best hyperparameters:", logreg_ct.best_params_)

print(classification_report(logreg_ct.best_estimator_.predict(X_train_ct), y_train_ct))

# Calculating the confidence interval for R^2
logreg_ct_r2_scores = cross_val_score(logreg_ct.best_estimator_,
                                      X_train_ct, y_train_ct, cv=5, scoring=r2_scorer)

confidence_level = 0.95
degrees_freedom = len(logreg_ct_r2_scores) - 1
confidence_interval = stats.t.interval(confidence_level, degrees_freedom,
                                       loc=np.mean(logreg_ct_r2_scores), scale=stats.sem(logreg_ct_r2_scores))

print(f"R^2: {np.mean(logreg_ct_r2_scores):.4f}")
print(f"95% Confidence Interval for R^2: {confidence_interval}")

Best hyperparameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2', 'logreg__solver': 'liblinear'}
              precision    recall  f1-score   support

         0.0       0.73      0.69      0.71       141
         1.0       0.64      0.69      0.67       114

    accuracy                           0.69       255
   macro avg       0.69      0.69      0.69       255
weighted avg       0.69      0.69      0.69       255

R^2: 0.0200
95% Confidence Interval for R^2: (-0.013992775130078683, 0.05402439936244709)


## Feature Extraction for Day of Purchase Prediction

In [34]:
# Splitting the data into features and target for day of purchase Classification
y_train_day = X_train_preprocessed['Date']
X_train_day = X_train_preprocessed.drop(columns=['Date'])

## Implementing Logistic Regression for Day of Purchase Prediction

In [36]:
# Defining Pipeline for Logistic Regression
logreg_pipeline = Pipeline([("logreg", LogisticRegression(max_iter=10000))])

# Definig the hyperparameters grid for Logistic Regression Gridsearch
logreg_params = {"logreg__C": [0.001, 0.01, 0.1, 1, 10, 100],
                 "logreg__penalty": ["l1", "l2"],
                 "logreg__solver": ["liblinear", "saga"]}

logreg_day = GridSearchCV(logreg_pipeline, logreg_params, cv=5, n_jobs=-1, scoring='accuracy')
logreg_day.fit(X_train_day, y_train_day)

# Printing the best parameters for Logistic Regression
print("Logistic Regression - Best hyperparameters:", logreg_day.best_params_)

# Saving as pickle file
joblib.dump(logreg_day.best_estimator_, 'LogRegforDay.pkl')

Logistic Regression - Best hyperparameters: {'logreg__C': 0.01, 'logreg__penalty': 'l1', 'logreg__solver': 'liblinear'}


['LogRegforDay.pkl']

In [37]:
# For Logistic Regression
logreg_r2_scores = cross_val_score(logreg_day.best_estimator_,
                                   X_train_day, y_train_day, cv=5, scoring=r2_scorer)

confidence_level = 0.95
degrees_freedom = len(logreg_r2_scores) - 1
logreg_confidence_interval = stats.t.interval(confidence_level, degrees_freedom,
                                       loc=np.mean(logreg_r2_scores), scale=stats.sem(logreg_r2_scores))
logreg_confidence_interval = estimate_r2_CI(logreg_day.best_estimator_,X_train_day, y_train_day)

if np.mean(logreg_r2_scores is not float):
    print("Logistic Regression R^2: ", r2_score(y_train_day, logreg_day.best_estimator_.predict(X_train_day)))
else:
    print(f"Logistic Regression R^2: {np.mean(logreg_r2_scores):.4f}")
print(f"95% Confidence Interval for Logistic Regression R^2: {logreg_confidence_interval}\n")
print(classification_report(y_train_day, logreg_day.best_estimator_.predict(X_train_day)))


Logistic Regression R^2:  -0.2924713543608479
95% Confidence Interval for Logistic Regression R^2: [-0.3789698  -0.21917416]

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       113
         1.0       0.00      0.00      0.00        89
         2.0       0.17      1.00      0.29       136
         3.0       0.00      0.00      0.00       108
         4.0       0.00      0.00      0.00       109
         5.0       0.00      0.00      0.00       130
         6.0       0.00      0.00      0.00       115

    accuracy                           0.17       800
   macro avg       0.02      0.14      0.04       800
weighted avg       0.03      0.17      0.05       800



## Implementing Random Forest Classifier for Day Prediction

In [38]:
# Defining the pipeline for Random Forest Classifier
rf_pipeline = Pipeline([("rf", RandomForestClassifier())])

# Definig the hyperparameters grid for Random Forest Classifier Gridsearch
rf_params = {"rf__n_estimators": [10, 25, 50, 100, 150],
             "rf__max_depth": [None, 5, 10, 20, 30],
             "rf__min_samples_split": [1, 2, 5, 10, 15],
             "rf__min_samples_leaf": [1, 2, 4, 6]}

rf_day = GridSearchCV(rf_pipeline, rf_params, cv=5, n_jobs=-1, scoring='accuracy')
rf_day.fit(X_train_day, y_train_day)

# Priniting the best parameters for Random Forest Classifier
print("Random Forest Classifier - Best hyperparameters:", rf_day.best_params_)

# Saving as pickle file
joblib.dump(rf_day.best_estimator_, 'RFforDay.pkl')

Random Forest Classifier - Best hyperparameters: {'rf__max_depth': 5, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 15, 'rf__n_estimators': 10}


['RFforDay.pkl']

In [39]:
# For Random Forest Classifier
rf_r2_scores = cross_val_score(rf_day.best_estimator_,
                               X_train_day, y_train_day, cv=5, scoring=r2_scorer)

confidence_level = 0.95
degrees_freedom = len(rf_r2_scores) - 1
rf_confidence_interval = stats.t.interval(confidence_level, degrees_freedom,
                                          loc=np.mean(rf_r2_scores), scale=stats.sem(rf_r2_scores))
rf_confidence_interval = estimate_r2_CI(rf_day.best_estimator_,X_train_day, y_train_day)

if np.mean(rf_r2_scores is not float):
    print("Random Forest R^2: ", r2_score(y_train_day, rf_day.best_estimator_.predict(X_train_day)))
else:
    print(f"Random Forest R^2: {np.mean(rf_r2_scores):.4f}")
print(f"95% Confidence Interval for Random Forest Classifier R^2: {rf_confidence_interval}\n")
print(classification_report(y_train_day, rf_day.best_estimator_.predict(X_train_day)))

Random Forest R^2:  -0.3148805036061766
95% Confidence Interval for Random Forest Classifier R^2: [-0.45493212 -0.17067055]

              precision    recall  f1-score   support

         0.0       0.49      0.38      0.43       113
         1.0       0.59      0.18      0.28        89
         2.0       0.42      0.60      0.49       136
         3.0       0.40      0.31      0.35       108
         4.0       0.36      0.35      0.36       109
         5.0       0.36      0.58      0.45       130
         6.0       0.38      0.30      0.34       115

    accuracy                           0.40       800
   macro avg       0.43      0.39      0.38       800
weighted avg       0.42      0.40      0.39       800

