In [1]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
# Read the training CSV from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='train_cleaned_df.csv')

# Bare expression on the last line so the notebook renders the frame
df

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,weekday,weekend,beginning_of_month,mid_month,end_of_month
0,1,5,2015-07-31,-0.132683,-0.168269,1,1,0,1,2,0,-0.538742,0,5,0,0,0,1
1,2,5,2015-07-31,0.075373,-0.017540,1,1,0,1,0,0,-0.629569,1,5,0,0,0,1
2,3,5,2015-07-31,0.659800,0.404499,1,1,0,1,0,0,1.129891,1,5,0,0,0,1
3,4,5,2015-07-31,2.135414,1.862258,1,1,0,1,2,2,-0.623082,0,5,0,0,0,1
4,5,5,2015-07-31,-0.247231,-0.159656,1,1,0,1,0,0,3.177404,0,5,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017204,1111,2,2013-01-01,-1.499723,-1.363330,0,0,1,1,0,0,-0.456997,1,2,0,1,0,0
1017205,1112,2,2013-01-01,-1.499723,-1.363330,0,0,1,1,2,2,-0.459592,0,2,0,1,0,0
1017206,1113,2,2013-01-01,-1.499723,-1.363330,0,0,1,1,0,2,0.497990,0,2,0,1,0,0
1017207,1114,2,2013-01-01,-1.499723,-1.363330,0,0,1,1,0,2,-0.590643,0,2,0,1,0,0


In [3]:
# Keep only the numeric-dtype columns; anything else in df is left behind
numeric_df = df.select_dtypes(include='number')

# Render the filtered frame as the cell output
numeric_df

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,weekday,weekend,beginning_of_month,mid_month,end_of_month
0,1,5,-0.132683,-0.168269,1,1,0,1,2,0,-0.538742,0,5,0,0,0,1
1,2,5,0.075373,-0.017540,1,1,0,1,0,0,-0.629569,1,5,0,0,0,1
2,3,5,0.659800,0.404499,1,1,0,1,0,0,1.129891,1,5,0,0,0,1
3,4,5,2.135414,1.862258,1,1,0,1,2,2,-0.623082,0,5,0,0,0,1
4,5,5,-0.247231,-0.159656,1,1,0,1,0,0,3.177404,0,5,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017204,1111,2,-1.499723,-1.363330,0,0,1,1,0,0,-0.456997,1,2,0,1,0,0
1017205,1112,2,-1.499723,-1.363330,0,0,1,1,2,2,-0.459592,0,2,0,1,0,0
1017206,1113,2,-1.499723,-1.363330,0,0,1,1,0,2,0.497990,0,2,0,1,0,0
1017207,1114,2,-1.499723,-1.363330,0,0,1,1,0,2,-0.590643,0,2,0,1,0,0


In [4]:
# 'weekday' repeats the information already carried by 'DayOfWeek', so drop it.
# errors='ignore' keeps this cell idempotent: it reassigns numeric_df from itself,
# so a second run would otherwise raise KeyError once the column is already gone.
numeric_df = numeric_df.drop(columns=['weekday'], errors='ignore')

In [5]:
# Summary statistics for every remaining column, rounded to two decimals
numeric_df.describe().round(decimals=2)

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,weekend,beginning_of_month,mid_month,end_of_month
count,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0
mean,558.43,4.0,0.0,0.0,0.83,0.38,0.05,0.18,1.21,0.94,0.0,0.5,0.28,0.03,0.3,0.03
std,321.91,2.0,1.0,1.0,0.38,0.49,0.28,0.38,1.37,0.99,1.0,0.5,0.45,0.18,0.46,0.18
min,1.0,1.0,-1.5,-1.36,0.0,0.0,0.0,0.0,0.0,0.0,-0.7,0.0,0.0,0.0,0.0,0.0
25%,280.0,2.0,-0.53,-0.49,1.0,0.0,0.0,0.0,0.0,0.0,-0.61,0.0,0.0,0.0,0.0,0.0
50%,558.0,4.0,-0.01,-0.05,1.0,0.0,0.0,0.0,0.0,0.0,-0.4,1.0,0.0,0.0,0.0,0.0
75%,838.0,6.0,0.54,0.44,1.0,1.0,0.0,0.0,3.0,2.0,0.19,1.0,1.0,0.0,1.0,0.0
max,1115.0,7.0,9.29,14.54,1.0,1.0,3.0,1.0,3.0,2.0,9.14,1.0,1.0,1.0,1.0,1.0


In [6]:
# Two regression targets are modelled together; every other numeric column is a feature
target_columns = ['Sales', 'Customers']
X = numeric_df.drop(columns=target_columns)
y = numeric_df[target_columns]

In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffle repeatable.
# NOTE(review): the source frame carries a 'Date' column, so a random split mixes
# earlier and later days across train and test -- confirm that is intended.
split_options = {'test_size': 0.2, 'random_state': 42}
splits = train_test_split(X, y, **split_options)
X_train, X_test, y_train, y_test = splits

### RandomForestRegressor

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Preprocessing + model chain: mean-impute any gaps, standardise the features,
# then fit a 100-tree random forest with a fixed seed
stepsRFR = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
]
pipelineRFR = Pipeline(steps=stepsRFR)

# Train on both targets at once, then predict the held-out rows
# (Pipeline.fit returns the fitted pipeline, so the two calls can be chained)
predictionsRFR = pipelineRFR.fit(X_train, y_train).predict(X_test)

# Column 0 is read as 'Sales' and column 1 as 'Customers'; this relies on y
# listing the targets in that order
predictionsRFR_sales, predictionsRFR_customers = predictionsRFR[:, 0], predictionsRFR[:, 1]

# Score each target separately with mean squared error
mse_RandomForest_sales = mean_squared_error(y_test['Sales'], predictionsRFR_sales)
mse_RandomForest_customers = mean_squared_error(y_test['Customers'], predictionsRFR_customers)

print("Custom Loss (MSE) from RandomForest for 'Sales':", mse_RandomForest_sales)
print("Custom Loss (MSE) from RandomForest for 'Customers':", mse_RandomForest_customers)


Custom Loss (MSE) from RandomForest for 'Sales': 0.08493276276006101
Custom Loss (MSE) from RandomForest for 'Customers': 0.04885194319888262


### DecisionTreeRegressor

In [9]:
from sklearn.tree import DecisionTreeRegressor

# Preprocessing + model chain: mean-impute any gaps, standardise the features,
# then fit a single decision tree with a fixed seed
stepsDTR = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('regressor', DecisionTreeRegressor(random_state=42)),
]
pipelineDTR = Pipeline(steps=stepsDTR)

# Train on both targets at once, then predict the held-out rows
# (Pipeline.fit returns the fitted pipeline, so the two calls can be chained)
predictionsDTR = pipelineDTR.fit(X_train, y_train).predict(X_test)

# Column 0 is read as 'Sales' and column 1 as 'Customers'; this relies on y
# listing the targets in that order
predictionsDTR_sales, predictionsDTR_customers = predictionsDTR[:, 0], predictionsDTR[:, 1]

# Score each target separately with mean squared error
mse_DecisionTree_sales = mean_squared_error(y_test['Sales'], predictionsDTR_sales)
mse_DecisionTree_customers = mean_squared_error(y_test['Customers'], predictionsDTR_customers)

print("Custom Loss (MSE) from DecisionTree for 'Sales':", mse_DecisionTree_sales)
print("Custom Loss (MSE) from DecisionTree for 'Customers':", mse_DecisionTree_customers)

Custom Loss (MSE) from DecisionTree for 'Sales': 0.08688374056283048
Custom Loss (MSE) from DecisionTree for 'Customers': 0.049957759476582486


### GradientBoostingRegressor

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

def _gbr_steps():
    """Return a fresh impute -> scale -> gradient-boosting step list."""
    return [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('regressor', GradientBoostingRegressor(random_state=42)),
    ]


# Unlike the multi-output cells, each target here gets its own single-target pipeline.

# --- 'Sales' ---
steps_salesGBR = _gbr_steps()
pipelineGBR_sales = Pipeline(steps=steps_salesGBR)

# Fit on the 'Sales' column only, then predict the held-out rows
predictionsGBR_sales = pipelineGBR_sales.fit(X_train, y_train['Sales']).predict(X_test)

# Mean squared error against the held-out 'Sales' values
mse_GBR_sales = mean_squared_error(y_test['Sales'], predictionsGBR_sales)
print("Custom Loss (MSE) from GradientBoosting for 'Sales':", mse_GBR_sales)

# --- 'Customers' ---
steps_customersGBR = _gbr_steps()
pipelineGBR_customers = Pipeline(steps=steps_customersGBR)

# Fit on the 'Customers' column only, then predict the held-out rows
predictionsGBR_customers = pipelineGBR_customers.fit(X_train, y_train['Customers']).predict(X_test)

# Mean squared error against the held-out 'Customers' values
mse_GBR_customers = mean_squared_error(y_test['Customers'], predictionsGBR_customers)
print("Custom Loss (MSE) from GradientBoosting for 'Customers':", mse_GBR_customers)

Custom Loss (MSE) from GradientBoosting for 'Sales': 0.366147095324158
Custom Loss (MSE) from GradientBoosting for 'Customers': 0.327634010583168


### KNeighborsRegressor

In [11]:
from sklearn.neighbors import KNeighborsRegressor

# Preprocessing + model chain: mean-impute any gaps, standardise the features,
# then fit a k-nearest-neighbours regressor with its default settings
stepsKNN = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('regressor', KNeighborsRegressor()),
]
pipelineKNN = Pipeline(steps=stepsKNN)

# Train on both targets at once, then predict the held-out rows
# (Pipeline.fit returns the fitted pipeline, so the two calls can be chained)
predictionsKNN = pipelineKNN.fit(X_train, y_train).predict(X_test)

# Column 0 is read as 'Sales' and column 1 as 'Customers'; this relies on y
# listing the targets in that order
predictionsKNN_sales, predictionsKNN_customers = predictionsKNN[:, 0], predictionsKNN[:, 1]

# Score each target separately with mean squared error
mse_KNN_sales = mean_squared_error(y_test['Sales'], predictionsKNN_sales)
mse_KNN_customers = mean_squared_error(y_test['Customers'], predictionsKNN_customers)

print("Custom Loss (MSE) from KNN for 'Sales':", mse_KNN_sales)
print("Custom Loss (MSE) from KNN for 'Customers':", mse_KNN_customers)

Custom Loss (MSE) from KNN for 'Sales': 0.1189891517115727
Custom Loss (MSE) from KNN for 'Customers': 0.07965446264985437


### LinearRegression

In [12]:
from sklearn.linear_model import LinearRegression

# Preprocessing + model chain: mean-impute any gaps, standardise the features,
# then fit an ordinary linear regression
stepsLR = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
]
pipelineLR = Pipeline(steps=stepsLR)

# Train on both targets at once, then predict the held-out rows
# (Pipeline.fit returns the fitted pipeline, so the two calls can be chained)
predictionsLR = pipelineLR.fit(X_train, y_train).predict(X_test)

# Column 0 is read as 'Sales' and column 1 as 'Customers'; this relies on y
# listing the targets in that order
predictionsLR_sales, predictionsLR_customers = predictionsLR[:, 0], predictionsLR[:, 1]

# Score each target separately with mean squared error
mse_Linear_sales = mean_squared_error(y_test['Sales'], predictionsLR_sales)
mse_Linear_customers = mean_squared_error(y_test['Customers'], predictionsLR_customers)

print("Custom Loss (MSE) from LinearRegression for 'Sales':", mse_Linear_sales)
print("Custom Loss (MSE) from LinearRegression for 'Customers':", mse_Linear_customers)

Custom Loss (MSE) from LinearRegression for 'Sales': 0.44507959537410585
Custom Loss (MSE) from LinearRegression for 'Customers': 0.5461814708014942


### Evaluation of the performance of the models

In [13]:
# Side-by-side recap of the test-set MSE values computed in the model cells above.
# Each entry pairs the exact label to print with the score it describes.
mse_report = [
    ("Custom Loss (MSE) from RandomForest for 'Sales':", mse_RandomForest_sales),
    ("Custom Loss (MSE) from RandomForest for 'Customer':", mse_RandomForest_customers),
    ("Custom Loss (MSE) from DecisionTree for 'Sales':", mse_DecisionTree_sales),
    ("Custom Loss (MSE) from DecisionTree for 'Customer':", mse_DecisionTree_customers),
    ("Custom Loss (MSE) from GradientBoosting for 'Sales':", mse_GBR_sales),
    ("Custom Loss (MSE) from GradientBoosting for 'Customer':", mse_GBR_customers),
    ("Custom Loss (MSE) from KNN for 'Sales':", mse_KNN_sales),
    ("Custom Loss (MSE) from KNN for 'Customer':", mse_KNN_customers),
    ("Custom Loss (MSE) from LinearRegression for 'Sales':", mse_Linear_sales),
    ("Custom Loss (MSE) from LinearRegression for 'Customer':", mse_Linear_customers),
]

# print() with two arguments separates them with a single space, one line per score
for label, score in mse_report:
    print(label, score)

Custom Loss (MSE) from RandomForest for 'Sales': 0.08493276276006101
Custom Loss (MSE) from RandomForest for 'Customer': 0.04885194319888262
Custom Loss (MSE) from DecisionTree for 'Sales': 0.08688374056283048
Custom Loss (MSE) from DecisionTree for 'Customer': 0.049957759476582486
Custom Loss (MSE) from GradientBoosting for 'Sales': 0.366147095324158
Custom Loss (MSE) from GradientBoosting for 'Customer': 0.327634010583168
Custom Loss (MSE) from KNN for 'Sales': 0.1189891517115727
Custom Loss (MSE) from KNN for 'Customer': 0.07965446264985437
Custom Loss (MSE) from LinearRegression for 'Sales': 0.44507959537410585
Custom Loss (MSE) from LinearRegression for 'Customer': 0.5461814708014942


### Serialize models

In [15]:
import pickle
from datetime import datetime

# Registry of every fitted pipeline, keyed by the stem used for its pickle filename
models = {
    'random_forest_regressor': pipelineRFR,
    'decision_tree_regressor': pipelineDTR,
    'gradient_boosting_regressor_sales': pipelineGBR_sales,
    'gradient_boosting_regressor_customers': pipelineGBR_customers,
    'kneighbors_regressor': pipelineKNN,
    'linear_regression': pipelineLR
}

# Take the timestamp once for the whole run. Reading the clock inside the loop
# stamped every file with a different time, so the artifacts written by a single
# run could not be matched to each other by filename.
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")

# Pickle each pipeline to '<name>_<timestamp>.pkl' in the working directory.
# NOTE: pickle files execute code when loaded -- only unpickle files you trust.
for name, model in models.items():
    filename = f"{name}_{timestamp}.pkl"
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model '{name}' saved as '{filename}'")
    

Model 'random_forest_regressor' saved as 'random_forest_regressor_2024-04-08-19-17-58-222220.pkl'
Model 'decision_tree_regressor' saved as 'decision_tree_regressor_2024-04-08-19-18-07-579422.pkl'
Model 'gradient_boosting_regressor_sales' saved as 'gradient_boosting_regressor_sales_2024-04-08-19-18-07-579422.pkl'
Model 'gradient_boosting_regressor_customers' saved as 'gradient_boosting_regressor_customers_2024-04-08-19-18-07-595053.pkl'
Model 'kneighbors_regressor' saved as 'kneighbors_regressor_2024-04-08-19-18-07-595053.pkl'
Model 'linear_regression' saved as 'linear_regression_2024-04-08-19-18-07-964493.pkl'
