## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression, LassoCV, LogisticRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score, classification_report, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

import joblib

import warnings
warnings.filterwarnings('ignore')

from scipy import stats

## Data Preprocessing

### Data Loading and removing columns

In [2]:
# Loading the dataset
data = pd.read_csv('supermarket_sales.csv')

print('Original Data:')
display(data)

# Dropping the 'Invoice ID' Column
data.drop(columns=['Invoice ID'], inplace=True)

# Dropping the 'City' Column
data.drop(columns=['City'], inplace=True)

print('After dropping Invoice ID and City:')
display(data)

X_train, X_test = train_test_split(data, test_size=0.2, random_state=43)

print('X_test:')
display(X_test)

Original Data:


Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,80.2200,3/8/2019,10:29,Cash,76.40,4.761905,3.8200,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,489.0480,1/27/2019,20:33,Ewallet,465.76,4.761905,23.2880,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,233-67-5758,C,Naypyitaw,Normal,Male,Health and beauty,40.35,1,42.3675,1/29/2019,13:46,Ewallet,40.35,4.761905,2.0175,6.2
996,303-96-2227,B,Mandalay,Normal,Female,Home and lifestyle,97.38,10,1022.4900,3/2/2019,17:16,Ewallet,973.80,4.761905,48.6900,4.4
997,727-02-1313,A,Yangon,Member,Male,Food and beverages,31.84,1,33.4320,2/9/2019,13:22,Cash,31.84,4.761905,1.5920,7.7
998,347-56-2442,A,Yangon,Normal,Male,Home and lifestyle,65.82,1,69.1110,2/22/2019,15:33,Cash,65.82,4.761905,3.2910,4.1


After dropping Invoice ID and City:


Unnamed: 0,Branch,Customer type,Gender,Product line,Unit price,Quantity,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,A,Member,Female,Health and beauty,74.69,7,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,C,Normal,Female,Electronic accessories,15.28,5,80.2200,3/8/2019,10:29,Cash,76.40,4.761905,3.8200,9.6
2,A,Normal,Male,Home and lifestyle,46.33,7,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,A,Member,Male,Health and beauty,58.22,8,489.0480,1/27/2019,20:33,Ewallet,465.76,4.761905,23.2880,8.4
4,A,Normal,Male,Sports and travel,86.31,7,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,C,Normal,Male,Health and beauty,40.35,1,42.3675,1/29/2019,13:46,Ewallet,40.35,4.761905,2.0175,6.2
996,B,Normal,Female,Home and lifestyle,97.38,10,1022.4900,3/2/2019,17:16,Ewallet,973.80,4.761905,48.6900,4.4
997,A,Member,Male,Food and beverages,31.84,1,33.4320,2/9/2019,13:22,Cash,31.84,4.761905,1.5920,7.7
998,A,Normal,Male,Home and lifestyle,65.82,1,69.1110,2/22/2019,15:33,Cash,65.82,4.761905,3.2910,4.1


X_train:


Unnamed: 0,Branch,Customer type,Gender,Product line,Unit price,Quantity,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
147,C,Normal,Male,Health and beauty,66.14,4,277.7880,3/19/2019,12:46,Credit card,264.56,4.761905,13.2280,5.6
88,A,Normal,Male,Sports and travel,42.47,1,44.5935,1/2/2019,16:57,Cash,42.47,4.761905,2.1235,5.7
731,A,Normal,Male,Health and beauty,56.00,3,176.4000,2/28/2019,19:33,Ewallet,168.00,4.761905,8.4000,4.8
741,C,Normal,Male,Food and beverages,84.83,1,89.0715,1/14/2019,15:20,Ewallet,84.83,4.761905,4.2415,8.8
36,A,Member,Male,Sports and travel,62.62,5,328.7550,3/10/2019,19:15,Ewallet,313.10,4.761905,15.6550,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,C,Normal,Female,Fashion accessories,95.42,4,400.7640,2/2/2019,13:23,Ewallet,381.68,4.761905,19.0840,6.4
817,A,Normal,Male,Food and beverages,33.88,8,284.5920,1/19/2019,20:29,Ewallet,271.04,4.761905,13.5520,9.6
255,B,Member,Male,Fashion accessories,32.62,4,137.0040,1/29/2019,14:12,Cash,130.48,4.761905,6.5240,9.0
320,C,Normal,Female,Food and beverages,22.93,9,216.6885,2/26/2019,20:26,Cash,206.37,4.761905,10.3185,5.5


X_test:


Unnamed: 0,Branch,Customer type,Gender,Product line,Unit price,Quantity,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
858,B,Normal,Male,Health and beauty,57.59,6,362.8170,2/15/2019,13:51,Cash,345.54,4.761905,17.2770,5.1
986,B,Normal,Female,Health and beauty,14.76,2,30.9960,2/18/2019,14:42,Ewallet,29.52,4.761905,1.4760,4.3
183,C,Normal,Male,Health and beauty,34.31,8,288.2040,1/25/2019,15:00,Ewallet,274.48,4.761905,13.7240,5.7
502,C,Normal,Male,Home and lifestyle,69.40,2,145.7400,1/27/2019,19:48,Ewallet,138.80,4.761905,6.9400,9.0
710,A,Member,Male,Food and beverages,80.62,6,507.9060,2/28/2019,20:18,Cash,483.72,4.761905,24.1860,9.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27,A,Normal,Female,Fashion accessories,87.67,2,184.1070,3/10/2019,12:17,Credit card,175.34,4.761905,8.7670,7.7
244,B,Normal,Male,Home and lifestyle,93.87,8,788.5080,2/2/2019,18:42,Credit card,750.96,4.761905,37.5480,8.3
294,B,Normal,Male,Health and beauty,18.11,10,190.1550,3/13/2019,11:46,Ewallet,181.10,4.761905,9.0550,5.9
176,A,Member,Male,Food and beverages,22.17,8,186.2280,3/3/2019,17:01,Credit card,177.36,4.761905,8.8680,9.6


## Loading the preprocessor and processing the test data

In [5]:
# Encoding 'Date' feature into day of the week
def extract_weekday(X):
    X['Date'] = pd.to_datetime(X['Date'])
    return X['Date'].dt.strftime("%A").values.reshape(-1, 1)

# Encoding 'Time' feature into Time of the Day categories
def categorize_time(X):
    categories = ['Morning', 'Afternoon', 'Evening', 'Night']
    bins = [0, 12, 17, 19, 21]
    X['Time'] = pd.to_datetime(X['Time'])
    time_categories = pd.cut(X['Time'].dt.hour, bins=bins, labels=categories)
    return time_categories.values.reshape(-1, 1)

preprocessing_pipeline = joblib.load('Preprocessor.pkl')

In [10]:
# Defining the categorical and numerical feature columns
numerical_features = ['Unit price', 'Quantity', 'Total', 'cogs',
                      'gross margin percentage', 'gross income', 'Rating']
categorical_features = ['Branch', 'Product line', 'Payment']

# Applying the transformation for the test data
X_test_preprocessed = preprocessing_pipeline.transform(X_test)

onehotfeatures = list(preprocessing_pipeline.named_steps.preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))

X_test_preprocessed = pd.DataFrame(X_test_preprocessed,
                                   columns=['Date','Time','Gender','Customer type'] + numerical_features + onehotfeatures)

# Displaying the preprocessed data
print('X_test_preprocessed:')
display(X_test_preprocessed)

X_test_preprocessed:


Unnamed: 0,Date,Time,Gender,Customer type,Unit price,Quantity,Total,cogs,gross margin percentage,gross income,...,Branch_C,Product line_Electronic accessories,Product line_Fashion accessories,Product line_Food and beverages,Product line_Health and beauty,Product line_Home and lifestyle,Product line_Sports and travel,Payment_Cash,Payment_Credit card,Payment_Ewallet
0,0.0,0.0,1.0,1.0,0.110931,0.164990,0.185016,0.185016,0.0,0.185016,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,1.0,-1.517477,-1.202805,-1.177774,-1.177774,0.0,-1.177774,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,1.0,-0.774181,0.848888,-0.121420,-0.121420,0.0,-0.121420,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,3.0,1.0,1.0,1.0,0.559951,-1.202805,-0.706520,-0.706520,0.0,-0.706520,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,4.0,3.0,1.0,0.0,0.986538,0.164990,0.780897,0.780897,0.0,0.780897,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,3.0,2.0,0.0,1.0,1.254581,-1.202805,-0.548946,-0.548946,0.0,-0.548946,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
196,2.0,1.0,1.0,1.0,1.490307,0.848888,1.933329,1.933329,0.0,1.933329,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
197,6.0,2.0,1.0,1.0,-1.390109,1.532786,-0.524107,-0.524107,0.0,-0.524107,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
198,3.0,0.0,1.0,0.0,-1.235747,0.848888,-0.540235,-0.540235,0.0,-0.540235,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Feature Extraction for predicting gross income

In [12]:
# Splitting features and the target variables for predicting 'gross income'
X_test_gi = X_test_preprocessed.drop(columns=['gross income'])
y_test_gi = X_test_preprocessed['gross income']

## Implemeting Linear Regression to predict 'gross income'

In [13]:
lin_reg_pipeline_gi = joblib.load('LinearRegressorforGrossIncome.pkl')

# Predicting for the test set using the trained model
y_pred_lin_reg = lin_reg_pipeline_gi.predict(X_test_gi)

# Calculating the regression metrics
lin_reg_r2 = r2_score(y_test_gi, y_pred_lin_reg)
lin_reg_mae = mean_absolute_error(y_test_gi, y_pred_lin_reg)
lin_reg_mse = mean_squared_error(y_test_gi, y_pred_lin_reg)

# Print the regression metrics
print("For Linear Regression:")
print("R-squared (R2) Score:", lin_reg_r2)
print("Mean Absolute Error (MAE):", lin_reg_mae)
print("Mean Squared Error (MSE):", lin_reg_mse,'\n')

For Linear Regression:
R-squared (R2) Score: 1.0
Mean Absolute Error (MAE): 8.7659914049798e-16
Mean Squared Error (MSE): 1.1650170662299056e-30 



In [14]:
# Defining a function to calculate 95% confidence interval of r^2
def estimate_r2_CI(estimator, X, y, n_iter=1000):
    r2_scores = []
    for _ in range(n_iter):
        X_resampled, y_resampled = resample(X, y)
        y_pred = estimator.predict(X_resampled)
        r2_scores.append(r2_score(y_resampled, y_pred))
    return np.percentile(r2_scores, [2.5, 97.5])

confidence_interval = estimate_r2_CI(lin_reg_pipeline_gi, X_test_gi, y_test_gi)
print(f"95% Confidence Interval for Linear Regression 𝑟^2 score: {confidence_interval}")

95% Confidence Interval for Linear Regression 𝑟^2 score: [1. 1.]


## Implemeting Lasso Regression to predict 'gross income'

In [15]:
lasso_pipeline_gi = joblib.load('LassoforGrossIncome.pkl')

# Predicting for the test set using the trained model
y_pred_lasso = lasso_pipeline_gi.predict(X_test_gi)

# Calculating R2 score and other metrics for Lasso regression
lasso_r2 = r2_score(y_test_gi, y_pred_lasso)
lasso_mae = mean_absolute_error(y_test_gi, y_pred_lasso)
lasso_mse = mean_squared_error(y_test_gi, y_pred_lasso)

print("For Lasso Regression:")
print("R-squared (R2) Score:", lasso_r2)
print("Mean Absolute Error (MAE):", lasso_mae)
print("Mean Squared Error (MSE):", lasso_mse)

For Lasso Regression:
R-squared (R2) Score: 0.9898949434949342
Mean Absolute Error (MAE): 0.08640944184017513
Mean Squared Error (MSE): 0.010961798518186325


In [16]:
confidence_interval = estimate_r2_CI(lasso_pipeline_gi, X_test_gi, y_test_gi)
print(f"95% Confidence Interval for Lasso 𝑟^2 score: {confidence_interval}")

95% Confidence Interval for Lasso 𝑟^2 score: [0.98941933 0.98999933]


## Extracting features for Unit Price Prediction

In [18]:
# Splitting the features and target labels for predicting Unit Price
y_test_up = X_test_preprocessed['Unit price']
X_test_up = X_test_preprocessed.drop(columns=['Unit price'])

## Implementing Linear Regression to predict Unit price

In [19]:
lin_reg_pipeline_up = joblib.load('LinearRegressorforUnitPrice.pkl')

# Predicting for the test set
y_pred_lin_reg = lin_reg_pipeline_up.predict(X_test_up)

lin_reg_r2 = r2_score(y_test_up, y_pred_lin_reg)
lin_reg_mae = mean_absolute_error(y_test_up, y_pred_lin_reg)
lin_reg_mse = mean_squared_error(y_test_up, y_pred_lin_reg)

# Print the regression metrics
print("For Linear Regression:")
print("R-squared (R2) Score:", lin_reg_r2)
print("Mean Absolute Error (MAE):", lin_reg_mae)
print("Mean Squared Error (MSE):", lin_reg_mse,'\n')

For Linear Regression:
R-squared (R2) Score: 0.7927873218495987
Mean Absolute Error (MAE): 0.3545071895171906
Mean Squared Error (MSE): 0.21542405812873308 



In [20]:
# Calculating the 95% CI for Linear Regressor
confidence_interval = estimate_r2_CI(lin_reg_pipeline_up, X_test_up, y_test_up)
print(f"95% Confidence Interval for Linear Regression 𝑟^2 score: {confidence_interval}")

95% Confidence Interval for Linear Regression 𝑟^2 score: [0.74767577 0.83085323]


## Implementing Lasso Regression to predict Unit price

In [21]:
# Defining the lasso pipeline
lasso_pipeline_up = joblib.load('LassoforUnitPrice.pkl')

y_pred_lasso = lasso_pipeline_up.predict(X_test_up)
lasso_r2 = r2_score(y_test_up, y_pred_lasso)
lasso_mae = mean_absolute_error(y_test_up, y_pred_lasso)
lasso_mse = mean_squared_error(y_test_up, y_pred_lasso)

print("For Lasso Regression:")
print("R-squared (R2) Score:", lasso_r2)
print("Mean Absolute Error (MAE):", lasso_mae)
print("Mean Squared Error (MSE):", lasso_mse)

confidence_interval = estimate_r2_CI(lasso_pipeline_up, X_test_up, y_test_up)
print(f"95% Confidence Interval for Lasso 𝑟^2 score: {confidence_interval}")

For Lasso Regression:
R-squared (R2) Score: 0.4913334469799068
Mean Absolute Error (MAE): 0.6156649931201509
Mean Squared Error (MSE): 0.5288238830946773
95% Confidence Interval for Lasso 𝑟^2 score: [0.41017638 0.55521972]


## Feature Extraction for Gender Classification for Branch C

In [23]:
# Filtering the data for branch 'C'
test_data_branch_C = X_test_preprocessed[X_test_preprocessed['Branch_C'] == 1]

# Splitting the data for interaction and training logistic regression to classify Gender
X_test_g = test_data_branch_C.drop(columns=['Gender'])
# Only including Product Line, Payment and gross income for interaction
X_test_g_int = test_data_branch_C[['Product line_Electronic accessories', 'Product line_Fashion accessories',
                                 'Product line_Food and beverages','Product line_Health and beauty',
                                 'Product line_Home and lifestyle', 'Product line_Sports and travel',
                                 'Payment_Cash','Payment_Ewallet', 'Payment_Credit card', 'gross income']]
y_test_g = test_data_branch_C['Gender']

## Implementing Logistic Regression to classify Gender

In [28]:
# Defining a custom R^2 scorer for logistic regression
def logistic_r2_estimator(y_true, y_pred_prob):
    return np.square(np.corrcoef(y_true, y_pred_prob)[0, 1])

r2_scorer = make_scorer(logistic_r2_estimator, greater_is_better=True, needs_proba=True)

grid_search = joblib.load('LogRegforGender.pkl')

r2_scores = cross_val_score(grid_search, X_test_g, y_test_g, cv=5, scoring=r2_scorer)
print(classification_report(y_test_g, grid_search.predict(X_test_g)))

confidence_level = 0.95
degrees_freedom = len(r2_scores) - 1
confidence_interval = stats.t.interval(confidence_level, degrees_freedom, loc=np.mean(r2_scores), scale=stats.sem(r2_scores))

print(f"R^2 (coefficient of determination): {np.mean(r2_scores):.4f}")
print(f"95% confidence interval for R^2: ({confidence_interval[0]:.4f}, {confidence_interval[1]:.4f})")

              precision    recall  f1-score   support

         0.0       0.62      0.67      0.64        42
         1.0       0.50      0.45      0.47        31

    accuracy                           0.58        73
   macro avg       0.56      0.56      0.56        73
weighted avg       0.57      0.58      0.57        73

R^2 (coefficient of determination): 0.0850
95% confidence interval for R^2: (-0.0396, 0.2096)


## Feature Extraction for classifying Customer type

In [30]:
# Filtering the data for branch 'C'
test_branch_c_data = X_test_preprocessed[X_test_preprocessed['Branch_C'] == 1]

# Define X and y for training and test
X_test_ct = test_branch_c_data.drop(columns=['Customer type'])
X_test_ct_int = test_branch_c_data[['Gender', 'Date', 'Time']]
y_test_ct = test_branch_c_data['Customer type']

## Implementing Logistic Regression for classifying Customer type

In [32]:
logreg_ct = joblib.load('LogRegforCustomer.pkl')

print(classification_report(logreg_ct.predict(X_test_ct), y_test_ct))

# Calculating the confidence interval for R^2
logreg_ct_r2_scores = cross_val_score(logreg_ct,
                                      X_test_ct, y_test_ct, cv=5, scoring=r2_scorer)

confidence_level = 0.95
degrees_freedom = len(logreg_ct_r2_scores) - 1
confidence_interval = stats.t.interval(confidence_level, degrees_freedom,
                                       loc=np.mean(logreg_ct_r2_scores), scale=stats.sem(logreg_ct_r2_scores))

print(f"R^2: {np.mean(logreg_ct_r2_scores):.4f}")
print(f"95% Confidence Interval for R^2: {confidence_interval}")

              precision    recall  f1-score   support

         0.0       0.70      0.59      0.64        44
         1.0       0.50      0.62      0.55        29

    accuracy                           0.60        73
   macro avg       0.60      0.61      0.60        73
weighted avg       0.62      0.60      0.61        73

R^2: 0.0639
95% Confidence Interval for R^2: (0.0031367654066654926, 0.12469899110420188)


## Feature Extraction for Day of Purchase Prediction

In [33]:
# Splitting the data into features and target for day of purchase Classification

y_test_day = X_test_preprocessed['Date']
X_test_day = X_test_preprocessed.drop(columns=['Date'])


## Implementing Logistic Regression for Day of Purchase Prediction

In [36]:
# Defining Pipeline for Logistic Regression
logreg_day = joblib.load('LogRegforDay.pkl')

In [41]:
# For Logistic Regression
logreg_r2_scores = cross_val_score(logreg_day,
                                   X_test_day, y_test_day, cv=5, scoring=r2_scorer)

confidence_level = 0.95
degrees_freedom = len(logreg_r2_scores) - 1
logreg_confidence_interval = stats.t.interval(confidence_level, degrees_freedom,
                                       loc=np.mean(logreg_r2_scores), scale=stats.sem(logreg_r2_scores))
logreg_confidence_interval = estimate_r2_CI(logreg_day,X_test_day, y_test_day)

if np.mean(logreg_r2_scores is not float):
    print("Logistic Regression R^2: ", r2_score(y_test_day, logreg_day.predict(X_test_day)))
else:
    print(f"Logistic Regression R^2: {np.mean(logreg_r2_scores):.4f}")
print(f"95% Confidence Interval for Logistic Regression R^2: {logreg_confidence_interval}\n")
print(classification_report(y_test_day, logreg_day.predict(X_test_day)))


Logistic Regression R^2:  -0.22840739047952963
95% Confidence Interval for Logistic Regression R^2: [-0.38357331 -0.11651937]

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        26
         1.0       0.00      0.00      0.00        36
         2.0       0.14      1.00      0.25        28
         3.0       0.00      0.00      0.00        25
         4.0       0.00      0.00      0.00        29
         5.0       0.00      0.00      0.00        28
         6.0       0.00      0.00      0.00        28

    accuracy                           0.14       200
   macro avg       0.02      0.14      0.04       200
weighted avg       0.02      0.14      0.03       200



## Implementing Random Forest Classifier for Day Prediction

In [42]:
# Defining the pipeline for Random Forest Classifier
rf_day = joblib.load('RFforDay.pkl')

In [44]:
# For Random Forest Classifier
rf_r2_scores = cross_val_score(rf_day,
                               X_test_day, y_test_day, cv=5, scoring=r2_scorer)

confidence_level = 0.95
degrees_freedom = len(rf_r2_scores) - 1
rf_confidence_interval = stats.t.interval(confidence_level, degrees_freedom,
                                          loc=np.mean(rf_r2_scores), scale=stats.sem(rf_r2_scores))
rf_confidence_interval = estimate_r2_CI(rf_day,X_test_day, y_test_day)

if np.mean(rf_r2_scores is not float):
    print("Random Forest R^2: ", r2_score(y_test_day, rf_day.predict(X_test_day)))
else:
    print(f"Random Forest R^2: {np.mean(rf_r2_scores):.4f}")
print(f"95% Confidence Interval for Random Forest Classifier R^2: {rf_confidence_interval}\n")
print(classification_report(y_test_day, rf_day.predict(X_test_day)))

Random Forest R^2:  -0.651650711562181
95% Confidence Interval for Random Forest Classifier R^2: [-0.96699185 -0.39258289]

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        26
         1.0       0.12      0.03      0.05        36
         2.0       0.11      0.18      0.14        28
         3.0       0.12      0.12      0.12        25
         4.0       0.15      0.17      0.16        29
         5.0       0.20      0.43      0.27        28
         6.0       0.30      0.21      0.25        28

    accuracy                           0.16       200
   macro avg       0.14      0.16      0.14       200
weighted avg       0.15      0.16      0.14       200

