<a href="https://colab.research.google.com/github/DeeeTeeee/Store-Sales-Time-Series-Forecasting/blob/main/Sales_Prediction_nw.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
import pandas as pd
import numpy as np
from joblib import dump
import os
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import cross_val_score



In [11]:
# #Loading all datasets
holiday = pd.read_csv('/content/holidays_events.csv')
oil  = pd.read_csv('/content/oil.csv')
stores  = pd.read_csv('/content/stores.csv')
test  = pd.read_csv('/content/test.csv')
train  = pd.read_csv('/content/train.csv')
transaction  = pd.read_csv('/content/transactions.csv')
submission  = pd.read_csv('/content/sample_submission.csv')

In [19]:

merged_df = pd.merge(train, stores, on='store_nbr', how='left')
merged_df = pd.merge(merged_df, transaction, on=['store_nbr', 'date'], how='left')
merged_df = pd.merge(merged_df, oil, on=['date'], how='left')
data = pd.merge(merged_df, holiday, on=['date'], how='left')

# Rename column
data.rename(columns={'type_x': 'Store_type'}, inplace=True)
data.rename(columns={'type_y': 'Holiday_type'}, inplace=True)

# Drop the id column since it's not relevant for modeling
data = data.drop('id', axis=1)

# Convert the date column to a datetime object
data['date'] = pd.to_datetime(data['date'], errors='coerce')

# Set the date column as the index
data = data.set_index('date')


In [20]:
#Creating the Day, Month and Year column from the Date Column
data['year'] = data.index.year
data['month'] =data.index.month
data['day']=data.index.day

In [21]:
#Implement the new super grouping of product family on the actual family attribute.

data['family'] = data['family'].replace({
'AUTOMOTIVE': 'Others',
'BABY CARE': 'Personal Care',
'BEAUTY': 'Personal Care',
'BEVERAGES': 'Beverages',
'BOOKS': 'Others',
'BREAD/BAKERY': 'Food',
'CELEBRATION': 'Food',
'CLEANING': 'Others',
'DAIRY': 'Food',
'DELI': 'Food',
'EGGS': 'Food',
'FROZEN FOODS': 'Food',
'GROCERY I': 'Food',
'GROCERY II': 'Food',
'HARDWARE': 'Others',
'HOME AND KITCHEN I': 'Home and Kitchen',
'HOME AND KITCHEN II': 'Home and Kitchen',
'HOME APPLIANCES': 'Home and Kitchen',
'HOME CARE': 'Home and Kitchen',
'LADIESWEAR': 'Clothing',
'LAWN AND GARDEN': 'Others',
'LINGERIE': 'Clothing',
'LIQUOR,WINE,BEER': 'Beverages',
'MAGAZINES': 'Others',
'MEATS': 'Food',
'PERSONAL CARE': 'Personal Care',
'PET SUPPLIES': 'Others',
'PLAYERS AND ELECTRONICS': 'Others',
'POULTRY': 'Food',
'PREPARED FOODS': 'Food',
'PRODUCE': 'Food',
'SCHOOL AND OFFICE SUPPLIES': 'Others',
'SEAFOOD': 'Food'
})

data['Holiday_type'] = np.where(data['Holiday_type'].isin(['Holiday',
                                                                     'Additional', 'Event', 'Transfer', 'Bridge']),
                                                                                                                'Holiday', 'Workday')
data = data.drop(['locale', 'locale_name', 'description', 'state', 'transferred'], axis=1)

In [22]:
# Identify numeric and non-numeric columns
num_cols = data.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = data.select_dtypes(exclude=[np.number]).columns.tolist()

# Creating imputer variables
numerical_imputer = SimpleImputer(strategy = "mean")
categorical_imputer = SimpleImputer(strategy = "most_frequent")


# Define the column transformer
categorical_features = cat_cols
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', categories='auto', sparse=False))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ])

In [23]:
# resample numeric columns by mean and categorical columns by mode
resampled = data.resample('D').agg({**{col: 'mean' for col in num_cols}, **{col: (lambda x: x.mode()[0] if not x.mode().empty else np.nan) for col in cat_cols}}).reset_index()

resampled = resampled.drop('date', axis=1)


In [24]:
# Filling missing values in numerical features of training set
resampled[num_cols] = numerical_imputer.fit_transform(resampled[num_cols])

resampled[cat_cols] = categorical_imputer.fit_transform(resampled[cat_cols])

In [25]:
# Calculate the number of rows in the data
n_rows = resampled.shape[0]

In [26]:
# Calculate the split point
split_point = int(n_rows * 0.90)

In [27]:
# Select the first 85% of the rows as the training data
X_train = resampled.iloc[:split_point]
y_train = X_train['sales']
X_train = X_train.drop('sales', axis=1)

In [28]:
# Select the remaining 15% of the rows as the validation data
X_eval = resampled.iloc[split_point:]
y_eval = X_eval['sales']
X_eval = X_eval.drop('sales', axis=1)

In [29]:
 # remove 'sales' from num_cols
num_cols.remove('sales')

In [30]:
X_train_cat = X_train[cat_cols].copy()
X_train_num = X_train[num_cols].copy()


In [31]:
X_eval_cat = X_eval[cat_cols].copy()
X_eval_num = X_eval[num_cols].copy()


In [32]:
# Fitting the Imputer
X_train_cat_imputed = categorical_imputer.fit_transform(X_train_cat)
X_train_num_imputed = numerical_imputer.fit_transform(X_train_num)

X_eval_cat_imputed = categorical_imputer.fit_transform(X_eval_cat)
X_eval_num_imputed = numerical_imputer.fit_transform(X_eval_num)


encoder=OneHotEncoder(handle_unknown='ignore')

In [33]:
# encoding the xtrain categories and converting to a dataframe
X_train_cat_encoded = encoder.fit(X_train_cat_imputed)
X_train_cat_encoded = pd.DataFrame(encoder.transform(X_train_cat_imputed).toarray(),
                                   columns=encoder.get_feature_names_out(cat_cols))

In [34]:
# encoding the xeval categories and converting to a dataframe
X_eval_cat_encoded = encoder.fit(X_eval_cat_imputed)
X_eval_cat_encoded = pd.DataFrame(encoder.transform(X_eval_cat_imputed).toarray(),
                                   columns=encoder.get_feature_names_out(cat_cols))


In [35]:
scaler= StandardScaler()

X_train_num_scaled = scaler.fit_transform(X_train_num_imputed)
X_train_num_sc = pd.DataFrame(X_train_num_scaled, columns = num_cols)

X_eval_num_scaled = scaler.fit_transform(X_eval_num_imputed)
X_eval_num_sc = pd.DataFrame(X_eval_num_scaled, columns = num_cols)

X_train_df = pd.concat([X_train_num_sc,X_train_cat_encoded], axis =1)
X_eval_df = pd.concat([X_eval_num_sc,X_eval_cat_encoded], axis =1)


In [37]:
# create a dictionary of models to fit
models = {
    'Random Forest Regressor': RandomForestRegressor(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Gradient Boosting Regressor': GradientBoostingRegressor()
}

In [38]:
# iterate over the models and fit each one to the training data
for name, model in models.items():
    model.fit(X_train_df, y_train)

In [39]:
# evaluate each model using cross-validation
rmsle_scores = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train_df, y_train, cv=50, scoring='neg_mean_squared_log_error')
    rmsle_scores[name] = np.sqrt(-scores.mean())

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py", line 525, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



In [40]:
# print the RMSLE scores for each model
for name, score in rmsle_scores.items():
    print(f'{name}: {score}')

Random Forest Regressor: 0.23409874413079693
Decision Tree Regressor: 0.3461313766985483
Gradient Boosting Regressor: nan


In [41]:
# choose the model with the lowest RMSLE score
best_model_name = min(rmsle_scores, key=rmsle_scores.get)
best_model = models[best_model_name]
print(f'Best model: {best_model_name}')

Best model: Random Forest Regressor


In [43]:
# set the destination path to the "export" directory
destination = "."

# create a dictionary to store the objects and their filenames
models = {"numerical_imputer": numerical_imputer,
          "categorical_imputer": categorical_imputer,
          "encoder": encoder,
          "scaler": scaler,
          "Final_model": best_model}

# loop through the models and save them using joblib.dump()
for name, model in models.items():
    dump(model, os.path.join(destination, f"{name}.joblib"))

In [44]:
#!pip freeze > requirements.txt

In [46]:
!pipreqs . --force

INFO: Successfully saved requirements file in ./requirements.txt


In [48]:
data.to_csv('resampledCmplete.csv', index=False)