In [1]:
import pandas as pd
from pandas import DatetimeIndex as dt
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OrdinalEncoder, OneHotEncoder
from sklearn.metrics import classification_report, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso
from pandas.plotting import scatter_matrix
from scipy import stats
import joblib

In [2]:
# Load the raw transactions. The later cells derive their feature frames from `sales`.
sales = pd.read_csv('supermarket_sales.csv')

# Encode the 'Date' and 'Time' columns as small ordinal codes for the models.
# 'Date' becomes the pandas day-of-week code (Monday=0 ... Sunday=6).
sales['Date'] = pd.to_datetime(sales['Date']).dt.day_of_week

# Parse 'Time' once and bucket the hour of day into four ordinal bands.
# pd.cut intervals are right-closed: (0, 12] -> 1, (12, 17] -> 2, (17, 19] -> 3, (19, inf) -> 4.
# NOTE(review): hour 0 falls outside the first interval and would become NaN;
# confirm the data never contains purchases between 00:00 and 00:59.
purchase_hour = pd.to_datetime(sales['Time']).dt.hour
sales['Time'] = pd.cut(purchase_hour, bins=[0., 12.0, 17.0, 19.0, np.inf], labels=[1, 2, 3, 4])

# Frame used by the regression cells: the invoice identifier and the
# 'gross margin percentage' column are dropped, everything else is kept.
sales_income = sales.drop(labels=['Invoice ID', 'gross margin percentage'], axis = 1)

#### 2. Gross income

In [3]:
# Target is 'gross income' (kept as a one-column frame); every other column of
# sales_income is used as a feature.
t = sales_income[['gross income']]
X = sales_income.drop(columns=['gross income'])

# Band 'Total' into five right-closed bins so the split below can be stratified on it.
Total_cat = pd.cut(
    sales['Total'],
    bins=[0, 125, 250, 400, 600, np.inf],
    labels=[1, 2, 3, 4, 5],
)

# Seeded, stratified 80/20 split. Only the hold-out part is used in this cell.
X_train, X_test, t_train, t_test = train_test_split(
    X,
    t,
    test_size=0.2,
    shuffle=True,
    stratify=Total_cat,
    random_state=2,
)

# Estimator persisted earlier with joblib.
# NOTE(review): joblib.load unpickles the file; only load artifacts from a trusted source.
gross_income = joblib.load('models/LinearReg_tuned.pkl')

# y_test holds the model's predictions for the hold-out rows; t_test is the ground truth.
y_test = gross_income.predict(X_test)
r2 = r2_score(t_test, y_test)
print(f'R-squared (R2) Score: {r2:.2f}')

# NOTE(review): cross_val_score fits fresh clones of the estimator on folds of the
# hold-out split, so these scores do not describe the already-fitted model loaded
# above, and the "Training" wording in the label below does not match the data used.
cv_scores = cross_val_score(gross_income, X_test, t_test, scoring='r2', cv=2)

confidence = 0.95

# Student-t interval around the mean fold score. With two folds there is a single
# degree of freedom, so the interval is only a rough indication.
n_scores = len(cv_scores)
score_std_err = cv_scores.std(ddof=1) / np.sqrt(n_scores)
score_interval = stats.t.interval(
    confidence,
    n_scores - 1,
    loc=cv_scores.mean(),
    scale=score_std_err,
)
print('95% CI: for Training without Lasso regularization', score_interval)

R-squared (R2) Score: 1.00
95% CI: for Training without Lasso regularization (0.9999999292586668, 1.0000000228877985)


#### 3. Unit Price

In [4]:
# Target is 'Unit price' (kept as a one-column frame). The features are the
# remaining columns of sales_income with 'City' removed as well.
t = sales_income[['Unit price']]
X = sales_income.drop(columns=['Unit price', 'City'])

# Band 'Total' into four right-closed bins so the split below can be stratified on it.
Total_cat = pd.cut(
    sales['Total'],
    bins=[0, 125, 250, 470, np.inf],
    labels=[1, 2, 3, 4],
)

# Seeded, stratified 80/20 split. Only the hold-out part is used in this cell.
X_train, X_test, t_train, t_test = train_test_split(
    X,
    t,
    test_size=0.2,
    shuffle=True,
    stratify=Total_cat,
    random_state=2,
)

# Estimator persisted earlier with joblib.
# NOTE(review): joblib.load unpickles the file; only load artifacts from a trusted source.
unit_price = joblib.load('models/LinearReg_UnitPrice.pkl')

# y_test holds the model's predictions for the hold-out rows; t_test is the ground truth.
y_test = unit_price.predict(X_test)
r2 = r2_score(t_test, y_test)
print(f'R-squared (R2) Score: {r2:.2f}')

# NOTE(review): cross_val_score fits fresh clones of the estimator on folds of the
# hold-out split, so these scores do not describe the already-fitted model loaded above.
grid_scores = cross_val_score(unit_price, X_test, t_test, scoring='r2', cv=2)

confidence = 0.95

# Student-t interval around the mean fold score. With two folds there is a single
# degree of freedom, so the interval is very wide and can extend past R2 = 1.
n_scores = len(grid_scores)
score_std_err = grid_scores.std(ddof=1) / np.sqrt(n_scores)
score_interval = stats.t.interval(
    confidence,
    n_scores - 1,
    loc=grid_scores.mean(),
    scale=score_std_err,
)
print('95% CI: ', score_interval)

R-squared (R2) Score: 0.78
95% CI:  (0.1916718869451629, 1.2361275708821808)


#### 4. Gender

In [5]:
# Classification frame: the encoded sales data without the invoice identifier
# and the 'gross margin percentage' column.
sales_class = sales.drop(columns=['Invoice ID', 'gross margin percentage'])

# Encode the 'Gender' labels as floats (0.0, 1.0, ...) and flatten to a 1-D target vector.
cat_enc = OrdinalEncoder()
gender_codes = cat_enc.fit_transform(sales_class[['Gender']])
t = gender_codes.ravel()

# Every remaining column is a feature.
X = sales_class.drop(columns=['Gender'])

# Seeded 80/20 split, stratified on the target.
X_train, X_test, t_train, t_test = train_test_split(
    X,
    t,
    test_size=0.2,
    shuffle=True,
    stratify=t,
    random_state=2,
)

# Classifier persisted earlier with joblib.
# NOTE(review): joblib.load unpickles the file; only load artifacts from a trusted source.
gender = joblib.load('models/LogReg_Gender.pkl')

# y_test holds the predicted classes for the hold-out rows; t_test is the ground truth.
y_test = gender.predict(X_test)

gender_report = classification_report(t_test, y_test)
print('Test Set:\n', gender_report)

Test Set:
               precision    recall  f1-score   support

         0.0       0.43      0.38      0.40       100
         1.0       0.44      0.49      0.46       100

    accuracy                           0.43       200
   macro avg       0.43      0.43      0.43       200
weighted avg       0.43      0.43      0.43       200



#### 5. Customer Type

In [6]:
# Classification frame: the encoded sales data without the invoice identifier
# and the 'gross margin percentage' column.
sales_class = sales.drop(columns=['Invoice ID', 'gross margin percentage'])

# Encode the 'Customer type' labels as floats (0.0, 1.0, ...) and flatten to a 1-D target vector.
cat_enc = OrdinalEncoder()
customer_type_codes = cat_enc.fit_transform(sales_class[['Customer type']])
t = customer_type_codes.ravel()

# Every remaining column is a feature.
X = sales_class.drop(columns=['Customer type'])

# Seeded 80/20 split, stratified on the target.
X_train, X_test, t_train, t_test = train_test_split(
    X,
    t,
    test_size=0.2,
    shuffle=True,
    stratify=t,
    random_state=2,
)

# Classifier persisted earlier with joblib.
# NOTE(review): joblib.load unpickles the file; only load artifacts from a trusted source.
customer_type = joblib.load('models/LogReg_CustomerType.pkl')

# y_test holds the predicted classes for the hold-out rows; t_test is the ground truth.
y_test = customer_type.predict(X_test)

customer_type_report = classification_report(t_test, y_test)
print('Test Set:\n', customer_type_report)

Test Set:
               precision    recall  f1-score   support

         0.0       0.52      0.54      0.53       100
         1.0       0.53      0.51      0.52       100

    accuracy                           0.53       200
   macro avg       0.53      0.53      0.52       200
weighted avg       0.53      0.53      0.52       200



#### 6. Day of the Week

In [7]:
# Day-of-week task: 'Date' (already converted to a day-of-week code when the data
# was loaded) is the target; all other retained columns are features.
sales_class_day = sales.drop(columns=['Invoice ID', 'gross margin percentage'])
t = sales_class_day['Date']
X = sales_class_day.drop(columns=['Date'])

# Seeded 80/20 split, stratified on the target. The following model cells reuse
# X_test and t_test.
X_train, X_test, t_train, t_test = train_test_split(
    X,
    t,
    test_size=0.2,
    shuffle=True,
    stratify=t,
    random_state=2,
)

In [8]:
# Random Forest
# Scores the persisted day-of-week model on X_test / t_test, which must already
# exist from the day-of-week split cell.
# NOTE(review): joblib.load unpickles the file; only load artifacts from a trusted source.
day_ran = joblib.load('models/RandomForest_Day.pkl')

# y_test holds the predicted day codes for the hold-out rows; t_test is the ground truth.
y_test = day_ran.predict(X_test)

day_ran_report = classification_report(t_test, y_test)
print('Test Set:\n', day_ran_report)

Test Set:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        25
           1       0.16      0.41      0.23        32
           2       0.33      0.07      0.11        29
           3       0.11      0.04      0.06        27
           4       0.17      0.04      0.06        28
           5       0.13      0.36      0.20        33
           6       0.00      0.00      0.00        26

    accuracy                           0.14       200
   macro avg       0.13      0.13      0.09       200
weighted avg       0.13      0.14      0.10       200



In [10]:
# Logistic Regression
# Scores the persisted day-of-week model on X_test / t_test, which must already
# exist from the day-of-week split cell.
# NOTE(review): joblib.load unpickles the file; only load artifacts from a trusted source.
day_log = joblib.load('models/LogReg_Day.pkl')

# y_test holds the predicted day codes for the hold-out rows; t_test is the ground truth.
y_test = day_log.predict(X_test)

# The report is computed on the hold-out split (t_test vs. predictions for X_test),
# so it is labelled as the test set.
print('Test Set:\n', classification_report(t_test, y_test))

Training Set:
               precision    recall  f1-score   support

           0       0.09      0.08      0.08        25
           1       0.13      0.19      0.15        32
           2       0.22      0.24      0.23        29
           3       0.15      0.15      0.15        27
           4       0.11      0.04      0.05        28
           5       0.09      0.09      0.09        33
           6       0.11      0.12      0.11        26

    accuracy                           0.13       200
   macro avg       0.13      0.13      0.12       200
weighted avg       0.13      0.13      0.12       200

