# **Data Preprocessing**

In [1]:
# Import necessary libraries
import pandas as pd
import logging
import os
import sys

In [2]:
# Make the sibling 'scripts' directory importable so that project modules living there
# can be imported from this notebook. The path is resolved against the current working
# directory, so the notebook is expected to be run from its own folder.
SCRIPTS_DIR = os.path.abspath(os.path.join('..', 'scripts'))
# Only append once: re-running this cell in the same kernel must not pile up duplicates
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

In [3]:
# Import the data preprocessor class from the scripts module
from data_preprocessing import DataPreprocessor

In [4]:
# Raise pandas' display limits so wide/long frames are shown with up to 200 columns and rows
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 200)

In [5]:
# Configure logging at INFO level with a "timestamp - level - message" layout,
# then create the logger that the following cells use
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

In [6]:
# Record at INFO level that the imports and logging configuration above completed
logger.log(logging.INFO, "Imported necessary libraries and configured logging.")

2025-01-11 17:07:26,363 - INFO - Imported necessary libraries and configured logging.


In [7]:
# Tell the reader on stdout (not through the logger) that environment setup has finished
setup_complete_message = "Environment setup complete. Ready to process data."
print(setup_complete_message)

Environment setup complete. Ready to process data.


In [8]:
# Announce the upcoming load/preprocess stage and remind the reader that the input data must exist
logger.info(
    "Proceeding to load and preprocess data. Ensure data file is available."
)

2025-01-11 17:07:26,380 - INFO - Proceeding to load and preprocess data. Ensure data file is available.


## **Preprocessing**

In [9]:
# Mark the beginning of the preprocessing stage in the log
logger.log(logging.INFO, "Starting the preprocessing script.")

2025-01-11 17:07:26,390 - INFO - Starting the preprocessing script.


In [10]:
# Load, preprocess and persist the train/test datasets.
# The guard passes when the notebook is executed directly (the recorded output below
# shows it ran); `train_df` and `test_df` are only defined when it does.
if __name__ == "__main__":
    train_path = '../data/train_cleaned.csv'  # Cleaned training data (input)
    test_path = '../data/test_cleaned.csv'  # Cleaned test data (input)
    # Raw test file; presumably supplies the test Ids — confirm in DataPreprocessor
    test_id = '../data/test.csv'
    # Create instance of the class
    preprocessor = DataPreprocessor(train_path, test_path, test_id)
    # Run the preprocessing pipeline; yields the processed train and test frames
    train_df, test_df = preprocessor.preprocess()
    # Log success only once preprocessing has actually finished, so the message
    # is never emitted for a run that then fails inside preprocess()
    logger.info("Preprocessed both the test and train datasets")
    # Save preprocessed data
    preprocessor.save_data()

2025-01-11 17:07:26,401 - INFO - Preprocessed both the test and train datasets


Cleaning data...
Extracting datetime features...
Performing feature engineering...
Encoding categorical data...
✅ Preprocessing complete.
Processed data saved to ../data/pre_processed/train_processed.csv and ../data/pre_processed/test_processed.csv.


In [11]:
# Preview the training frame restricted to columns that contain no missing values
train_df.dropna(axis="columns", how="any")

Unnamed: 0_level_0,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,CompetitionDistance_log,Sales_winsorized,Customers_winsorized,Weekday,IsWeekend,Month,DaysToHoliday,DaysAfterHoliday,IsBeginningOfMonth,IsMidMonth,IsEndOfMonth,IsHoliday
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2015-07-31,5,5263.0,1.0,1.0,0,1.0,2,0,1270.0,0,7.147559,5263.0,555.0,4,0,7,3069,-3069,0,0,1,1
2015-07-31,5,6064.0,1.0,1.0,0,1.0,0,0,570.0,1,6.347389,6064.0,625.0,4,0,7,3069,-3069,0,0,1,1
2015-07-31,5,8314.0,1.0,1.0,0,1.0,0,0,14130.0,1,9.556126,8314.0,821.0,4,0,7,3069,-3069,0,0,1,1
2015-07-31,5,12137.0,1.0,1.0,0,1.0,2,2,620.0,0,6.431331,12137.0,1362.0,4,0,7,3069,-3069,0,0,1,1
2015-07-31,5,4822.0,1.0,1.0,0,1.0,0,0,29910.0,0,10.305982,4822.0,559.0,4,0,7,3069,-3069,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-01-01,2,3375.0,1.0,0.0,1,1.0,1,0,150.0,0,5.017280,3375.0,566.0,1,0,1,4010,-4010,1,0,0,1
2013-01-01,2,10765.0,1.0,0.0,1,1.0,1,1,860.0,0,6.758095,10765.0,1362.0,1,0,1,4010,-4010,1,0,0,1
2013-01-01,2,5035.0,1.0,0.0,1,1.0,1,1,840.0,1,6.734592,5035.0,1248.0,1,0,1,4010,-4010,1,0,0,1
2013-01-01,2,4491.0,1.0,0.0,1,1.0,1,1,1430.0,0,7.266129,4491.0,1039.0,1,0,1,4010,-4010,1,0,0,1


In [12]:
# List every column label of the processed training frame (before any NaN-based dropping)
train_df.keys()

Index(['DayOfWeek', 'Sales', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance', 'Promo2',
       'CompetitionDistance_log', 'Sales_winsorized', 'Customers_winsorized',
       'Store', 'Weekday', 'IsWeekend', 'Month', 'DaysToHoliday',
       'DaysAfterHoliday', 'IsBeginningOfMonth', 'IsMidMonth', 'IsEndOfMonth',
       'IsHoliday', 'Promo_duration'],
      dtype='object')

In [13]:
# Count the missing values in each training column and print the per-column totals
train_null_counts = train_df.isna().sum()
print(train_null_counts)

DayOfWeek                       0
Sales                           0
Open                            0
Promo                           0
StateHoliday                    0
SchoolHoliday                   0
StoreType                       0
Assortment                      0
CompetitionDistance             0
Promo2                          0
CompetitionDistance_log         0
Sales_winsorized                0
Customers_winsorized            0
Store                      844392
Weekday                         0
IsWeekend                       0
Month                           0
DaysToHoliday                   0
DaysAfterHoliday                0
IsBeginningOfMonth              0
IsMidMonth                      0
IsEndOfMonth                    0
IsHoliday                       0
Promo_duration             844392
dtype: int64


In [14]:
# Retain only the training columns that have no missing values.
# NOTE(review): the null counts printed above show 'Store' and 'Promo_duration' missing on
# all 844392 rows, so both columns are removed here — confirm this is intended and not an
# upstream preprocessing issue.
cleaned_train_df = train_df.dropna(axis="columns")

In [15]:
# Show which training columns survived the NaN-based column drop
remaining_train_columns = cleaned_train_df.columns
print(remaining_train_columns)

Index(['DayOfWeek', 'Sales', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance', 'Promo2',
       'CompetitionDistance_log', 'Sales_winsorized', 'Customers_winsorized',
       'Weekday', 'IsWeekend', 'Month', 'DaysToHoliday', 'DaysAfterHoliday',
       'IsBeginningOfMonth', 'IsMidMonth', 'IsEndOfMonth', 'IsHoliday'],
      dtype='object')


In [16]:
# Preview the test frame restricted to columns that contain no missing values
test_df.dropna(axis="columns", how="any")

Unnamed: 0_level_0,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,Store,Weekday,IsWeekend,Month,DaysToHoliday,DaysAfterHoliday,IsBeginningOfMonth,IsMidMonth,IsEndOfMonth,IsHoliday,Promo_duration
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,4,1.0,1.0,0,0.0,2,0,1270.0,0,1.0,3,0,9,3021,-3021,0,1,0,0,1.0
2,4,1.0,1.0,0,0.0,0,0,14130.0,1,3.0,3,0,9,3021,-3021,0,1,0,0,1.0
3,4,1.0,1.0,0,0.0,0,2,24000.0,0,7.0,3,0,9,3021,-3021,0,1,0,0,1.0
4,4,1.0,1.0,0,0.0,0,0,7520.0,0,8.0,3,0,9,3021,-3021,0,1,0,0,1.0
5,4,1.0,1.0,0,0.0,0,2,2030.0,0,9.0,3,0,9,3021,-3021,0,1,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41084,6,1.0,0.0,0,0.0,0,0,1900.0,1,1111.0,5,1,8,3068,-3068,1,0,0,0,19.0
41085,6,1.0,0.0,0,0.0,2,2,1880.0,0,1112.0,5,1,8,3068,-3068,1,0,0,0,19.0
41086,6,1.0,0.0,0,0.0,0,2,9260.0,0,1113.0,5,1,8,3068,-3068,1,0,0,0,19.0
41087,6,1.0,0.0,0,0.0,0,2,870.0,0,1114.0,5,1,8,3068,-3068,1,0,0,0,19.0


In [17]:
# List every column label of the processed test frame (before any NaN-based dropping)
test_df.keys()

Index(['DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance', 'Promo2',
       'CompetitionDistance_log', 'Sales_winsorized', 'Customers_winsorized',
       'Store', 'Weekday', 'IsWeekend', 'Month', 'DaysToHoliday',
       'DaysAfterHoliday', 'IsBeginningOfMonth', 'IsMidMonth', 'IsEndOfMonth',
       'IsHoliday', 'Promo_duration'],
      dtype='object')

In [18]:
# Count the missing values in each test column and print the per-column totals
test_null_counts = test_df.isna().sum()
print(test_null_counts)

DayOfWeek                      0
Open                           0
Promo                          0
StateHoliday                   0
SchoolHoliday                  0
StoreType                      0
Assortment                     0
CompetitionDistance            0
Promo2                         0
CompetitionDistance_log    41088
Sales_winsorized           41088
Customers_winsorized       41088
Store                          0
Weekday                        0
IsWeekend                      0
Month                          0
DaysToHoliday                  0
DaysAfterHoliday               0
IsBeginningOfMonth             0
IsMidMonth                     0
IsEndOfMonth                   0
IsHoliday                      0
Promo_duration                 0
dtype: int64


In [19]:
# Retain only the test columns that have no missing values.
# NOTE(review): the null counts printed above show 'CompetitionDistance_log',
# 'Sales_winsorized' and 'Customers_winsorized' missing on all 41088 rows, so all three
# columns are removed here — confirm this is intended.
cleaned_test_df = test_df.dropna(axis="columns")

In [20]:
# Show which test columns survived the NaN-based column drop
remaining_test_columns = cleaned_test_df.columns
print(remaining_test_columns)

Index(['DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance', 'Promo2', 'Store',
       'Weekday', 'IsWeekend', 'Month', 'DaysToHoliday', 'DaysAfterHoliday',
       'IsBeginningOfMonth', 'IsMidMonth', 'IsEndOfMonth', 'IsHoliday',
       'Promo_duration'],
      dtype='object')


In [21]:
# Report (rows, columns) for the cleaned test and train frames, in that order
tuple(frame.shape for frame in (cleaned_test_df, cleaned_train_df))

((41088, 20), (844392, 22))