In [54]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import numpy as np
import pandas as pd
import datetime
import holidays
from datetime import date

In [55]:
# Reading data
train = pd.read_csv("../data/train.csv")
features = pd.read_csv("../data/features.csv")
stores = pd.read_csv("../data/stores.csv")

In [56]:
# Merging datasets
train = pd.merge(train, features, on=[
                 'Store', 'Date', 'IsHoliday'], how='left')
train = pd.merge(train, stores, on=['Store'], how='left')

In [57]:
train.Date = pd.to_datetime(train.Date)
features.Date = pd.to_datetime(features.Date)

train.dtypes

Store                    int64
Dept                     int64
Date            datetime64[ns]
Weekly_Sales           float64
IsHoliday                 bool
Temperature            float64
Fuel_Price             float64
MarkDown1              float64
MarkDown2              float64
MarkDown3              float64
MarkDown4              float64
MarkDown5              float64
CPI                    float64
Unemployment           float64
Type                    object
Size                     int64
dtype: object

In [58]:
# Feature engineering
train['Year'] = train['Date'].dt.year
train['Month'] = train['Date'].dt.month
train['Week'] = train['Date'].dt.isocalendar().week
train['Day'] = train['Date'].dt.day
train['n_days'] = (train['Date'].dt.date -
                   train['Date'].dt.date.min()).apply(lambda x: x.days)

In [59]:
# Handling Markdown features
for i in range(1, 6):
    features["MarkDown"+str(i)] = features["MarkDown" +
                                           str(i)].apply(lambda x: 0 if x < 0 else x)
    features["MarkDown"+str(i)].fillna(value=0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  features["MarkDown"+str(i)].fillna(value=0, inplace=True)


In [60]:
# Creating HolidayType column
def create_Holiday_Type(df):
    def create_holiday_type_column(df, dates, holiday_type, name):
        df.loc[
            df['Date'].isin(dates),
            'HolidayType'
        ] = holiday_type

    df['HolidayType'] = -1

    holiday_list = [
        (['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'], 'Super_Bowl'),
        (['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'], 'Labor_Day'),
        (['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'], 'Thanksgiving'),
        (['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'], 'Christmas')
    ]

    for index in range(0, len(holiday_list)):
        holiday = holiday_list[index]
        create_holiday_type_column(df, holiday[0], index, holiday[1])

    for x in df:
        if df[x].dtypes == "int64":
            df[x] = df[x].astype(float)




In [61]:
create_Holiday_Type(train)

  df['Date'].isin(dates),


In [62]:
# Handling missing values
data = train[['Store', 'Size', 'Dept', 'Month', 'Type', 'Year', 'Week',
              'Day', 'n_days', 'IsHoliday', 'HolidayType', 'CPI']]

In [63]:
# Using LabelEncoder for 'Type' column
data['Type'] = LabelEncoder().fit_transform(data['Type'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Type'] = LabelEncoder().fit_transform(data['Type'])


In [64]:
data.describe

<bound method NDFrame.describe of         Store      Size  Dept  Month  Type  Year  Week  Day  n_days  \
0         1.0  151315.0   1.0      2     0  2010     5    5     0.0   
1         1.0  151315.0   1.0      2     0  2010     6   12     7.0   
2         1.0  151315.0   1.0      2     0  2010     7   19    14.0   
3         1.0  151315.0   1.0      2     0  2010     8   26    21.0   
4         1.0  151315.0   1.0      3     0  2010     9    5    28.0   
...       ...       ...   ...    ...   ...   ...   ...  ...     ...   
421565   45.0  118221.0  98.0      9     1  2012    39   28   966.0   
421566   45.0  118221.0  98.0     10     1  2012    40    5   973.0   
421567   45.0  118221.0  98.0     10     1  2012    41   12   980.0   
421568   45.0  118221.0  98.0     10     1  2012    42   19   987.0   
421569   45.0  118221.0  98.0     10     1  2012    43   26   994.0   

        IsHoliday  HolidayType         CPI  
0           False         -1.0  211.096358  
1            True      

In [65]:
# Splitting data
X = data
Y = train['Weekly_Sales']
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

In [66]:
# Feature scaling and encoding
numeric_features = ['Size', 'Dept', 'Month',
                    'Year', 'Week', 'Day', 'n_days', 'CPI']
categorical_features = ['Type']

In [67]:
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

In [68]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [69]:
# Model training (Random Forest)
rf_model = RandomForestRegressor()

In [70]:
# Creating a pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', rf_model)
])

In [71]:
# Hyperparameter tuning with GridSearchCV
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [20, 25, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

In [72]:
# grid_search = GridSearchCV(model, param_grid, cv=3,
#                           scoring='neg_mean_absolute_error')
# grid_search.fit(X_train, y_train)


In [73]:
# Best hyperparameters
# best_params = grid_search.best_params_
# print(f'Best Hyperparameters: {best_params}')

In [74]:
# Final model
# final_model = grid_search.best_estimator_

In [75]:
# Define the best hyperparameters
best_hyperparameters = {
    'n_estimators': 100,
    'max_depth': 27,
    'min_samples_split': 2,
    'min_samples_leaf': 1
}

In [76]:
# Create the final model with the best hyperparameters
final_model = RandomForestRegressor(**best_hyperparameters)

In [77]:
# Train the final model on your data
final_model.fit(X_train, y_train)

In [78]:
# Predictions
predictions = final_model.predict(X_test)

In [79]:
def WMAE(dataset, real, predicted):

    weights = dataset.IsHoliday.apply(lambda x: 5 if x else 1)
    return np.round(np.sum(weights*abs(real-predicted))/(np.sum(weights)), 2)

In [80]:
# Evaluate the model
wmae_score = WMAE(X_test, y_test, predictions)
mae_score = mean_absolute_error(y_test, predictions)

In [81]:
print(f'WMAE Score: {wmae_score}')
print(f'MAE Score: {mae_score}')

WMAE Score: 1446.55
MAE Score: 1266.1250588579633


In [82]:
# Access and print feature importances
feature_importances = final_model.feature_importances_
feature_names = X_train.columns

In [83]:
# Create a DataFrame to display feature importances
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print(importance_df)

        Feature  Importance
2          Dept    0.626547
1          Size    0.196497
0         Store    0.063435
6          Week    0.038368
11          CPI    0.030316
4          Type    0.014671
7           Day    0.010045
8        n_days    0.006668
10  HolidayType    0.006557
3         Month    0.005945
9     IsHoliday    0.000522
5          Year    0.000430


In [85]:
print(final_model.score(X, Y))

0.9036150046931428


In [91]:
def predict_weekly_sales(store, department, date, is_holiday):
    # Prepare input data for prediction
    input_data = pd.DataFrame({
        'Store': [store],
        'Dept': [department],
        'Date': [pd.to_datetime(date)],
        'IsHoliday': [is_holiday]
    })

    # Merge with features and stores data
    input_data = pd.merge(input_data, features, on=[
                          'Store', 'Date', 'IsHoliday'], how='left')
    input_data = pd.merge(input_data, stores, on=['Store'], how='left')

    # Feature engineering for the input data
    input_data['Year'] = input_data['Date'].dt.year
    input_data['Month'] = input_data['Date'].dt.month
    input_data['Week'] = input_data['Date'].dt.isocalendar().week
    input_data['Day'] = input_data['Date'].dt.day
    input_data['n_days'] = (input_data['Date'].dt.date -
                            train['Date'].dt.date.min()).apply(lambda x: x.days)

    # Handling Markdown features
    for i in range(1, 6):
        input_data["MarkDown"+str(i)] = input_data["MarkDown" +
                                                   str(i)].apply(lambda x: 0 if x < 0 else x)
        input_data["MarkDown"+str(i)].fillna(value=0, inplace=True)

    # Create HolidayType column
    create_Holiday_Type(input_data)

    # Extract relevant features for prediction
    input_features = input_data[['Store', 'Size', 'Dept', 'Month', 'Type', 'Year', 'Week', 'Day', 'n_days',
                                 'IsHoliday', 'HolidayType', 'CPI']]

    # Use LabelEncoder for 'Type' column
    input_features['Type'] = LabelEncoder(
    ).fit_transform(input_features['Type'])

    # Make predictions using the trained model
    predicted_sales = final_model.predict(input_features)
    # Get feature importances from the model
    feature_importances = final_model.feature_importances_

    # Create a dictionary with feature names and their importance values
    importance_dict = dict(zip(features.columns, feature_importances))
    
    return predicted_sales[0], importance_dict

In [92]:
# Example to test predict_weekly_sales
store_example = 1
department_example = 1
date_example = '2024-02-10'
is_holiday_example = False

predicted_sales_example, feature_importance_example = predict_weekly_sales(
    store_example, department_example, date_example, is_holiday_example)

print(f'Predicted Weekly Sales: {predicted_sales_example}')
print("Feature Importance:")
for feature, importance in feature_importance_example.items():
    print(f"{feature}: {importance}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  input_data["MarkDown"+str(i)].fillna(value=0, inplace=True)
  df['Date'].isin(dates),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_features['Type'] = LabelEncoder(


Predicted Weekly Sales: 45560.58370000001
Feature Importance:
Store: 0.06343482986859679
Date: 0.19649725686048164
Temperature: 0.626546708367122
Fuel_Price: 0.0059446593069684035
MarkDown1: 0.014670701245482292
MarkDown2: 0.000430280283391465
MarkDown3: 0.03836759912484354
MarkDown4: 0.010045024272271557
MarkDown5: 0.00666839055760696
CPI: 0.0005222976003708549
Unemployment: 0.006556536942266385
IsHoliday: 0.030315715570598048
