# **Data Loading & Setup**

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.mstats import winsorize
import logging
import os
import sys

In [2]:
# Make the sibling 'scripts' directory importable so local helper modules can be loaded
scripts_dir = os.path.join('..', 'scripts')
sys.path.append(os.path.abspath(scripts_dir))

In [3]:
# Raise pandas' display limits so up to 200 columns and 200 rows are rendered
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 200)

In [4]:
# Configure the root logger: INFO level, "timestamp - level - message" layout
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Logger used by the cells that follow
logger = logging.getLogger(__name__)
logger.info("Imported libraries and configured logging.")

2025-01-02 19:27:42,611 - INFO - Imported libraries and configured logging.


In [18]:
# Import the project's loader.
# NOTE(review): presumably resolved through the 'scripts' directory appended to
# sys.path earlier in the notebook — confirm if the import fails on a fresh kernel.
from load_data import load_data

# Define file paths and names
zip_file_path = '../data/rossmann-store-sales.zip'
extract_to_folder = '../data'
train = 'train.csv'
test = 'test.csv'
store = 'store.csv'

# Load the three datasets from the archive.
# The success messages live in the `else` branch so they are only emitted when
# every load actually succeeded. A missing file is logged and then re-raised:
# the following cells need train_data/test_data/store_data, so continuing would
# only turn the real error into a confusing NameError later on.
try:
    train_data = load_data(zip_file_path, train, extract_to_folder)
    test_data = load_data(zip_file_path, test, extract_to_folder)
    store_data = load_data(zip_file_path, store, extract_to_folder)
except FileNotFoundError as e:
    logger.error(e)
    raise
else:
    print("Data successfully loaded.")
    display(train_data.head(10))
    logger.info("Data loaded successfully.")

ValueError encountered while loading the CSV. Attempting to fix...
ValueError encountered while loading the CSV. Attempting to fix...
Data successfully loaded.


Unnamed: 0_level_0,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,5,2015-07-31,5263,555,1,1,0,1
2,5,2015-07-31,6064,625,1,1,0,1
3,5,2015-07-31,8314,821,1,1,0,1
4,5,2015-07-31,13995,1498,1,1,0,1
5,5,2015-07-31,4822,559,1,1,0,1
6,5,2015-07-31,5651,589,1,1,0,1
7,5,2015-07-31,15344,1414,1,1,0,1
8,5,2015-07-31,8492,833,1,1,0,1
9,5,2015-07-31,8565,687,1,1,0,1
10,5,2015-07-31,7185,681,1,1,0,1


2025-01-02 19:38:31,589 - INFO - Data loaded successfully.


**Display the first 10 rows of train_data**

In [10]:
# Preview the first ten rows of the raw training data
train_data.head(n=10)

Unnamed: 0_level_0,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,5,2015-07-31,5263,555,1,1,0,1
2,5,2015-07-31,6064,625,1,1,0,1
3,5,2015-07-31,8314,821,1,1,0,1
4,5,2015-07-31,13995,1498,1,1,0,1
5,5,2015-07-31,4822,559,1,1,0,1
6,5,2015-07-31,5651,589,1,1,0,1
7,5,2015-07-31,15344,1414,1,1,0,1
8,5,2015-07-31,8492,833,1,1,0,1
9,5,2015-07-31,8565,687,1,1,0,1
10,5,2015-07-31,7185,681,1,1,0,1


**Display the first 10 rows of test_data**

In [7]:
# Preview the first ten rows of the raw test data
test_data.head(n=10)

Unnamed: 0_level_0,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,4,2015-09-17,1.0,1,0,0
2,3,4,2015-09-17,1.0,1,0,0
3,7,4,2015-09-17,1.0,1,0,0
4,8,4,2015-09-17,1.0,1,0,0
5,9,4,2015-09-17,1.0,1,0,0
6,10,4,2015-09-17,1.0,1,0,0
7,11,4,2015-09-17,1.0,1,0,0
8,12,4,2015-09-17,1.0,1,0,0
9,13,4,2015-09-17,1.0,1,0,0
10,14,4,2015-09-17,1.0,1,0,0


**Display the first 10 rows of store_data**

In [9]:
# Preview the first ten rows of the store metadata
store_data.head(n=10)

Unnamed: 0_level_0,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,c,a,1270.0,9.0,2008.0,0,,0.0,
2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
4,c,c,620.0,9.0,2009.0,0,,0.0,
5,a,a,29910.0,4.0,2015.0,0,,0.0,
6,a,a,310.0,12.0,2013.0,0,,0.0,
7,a,c,24000.0,4.0,2013.0,0,,0.0,
8,a,a,7520.0,10.0,2014.0,0,,0.0,
9,a,c,2030.0,8.0,2000.0,0,,0.0,
10,a,a,3160.0,9.0,2009.0,0,,0.0,


Next, we merge the store dataset into the train and test datasets on the `Store` key, so that each record carries its store-level attributes and we have a richer dataset for analysis.

In [11]:
# Enrich the train and test frames with the per-store attributes in store_data.
logger.info("Starting the merging process: combining store data with train and test datasets.")

# Left-join on 'Store' so every train row is kept. validate='many_to_one' makes
# pandas raise a MergeError if a 'Store' key is duplicated in store_data, which
# would otherwise silently multiply rows in the merged result.
print("Merging store data with train dataset...")
_train_data = train_data.merge(store_data, on='Store', how='left', validate='many_to_one')
print("Train dataset successfully merged with store data.")

# Same left-join for the test rows.
# NOTE(review): the previews suggest test_data is indexed by 'Id' with 'Store' as
# a regular column, while store_data is indexed by 'Store'. Confirm whether the
# 'Id' index survives this merge; if it does not and it is needed downstream,
# reset the index before merging.
print("Merging store data with test dataset...")
_test_data = test_data.merge(store_data, on='Store', how='left', validate='many_to_one')
print("Test dataset successfully merged with store data.")


2025-01-02 19:34:16,873 - INFO - Starting the merging process: combining store data with train and test datasets.


Merging store data with train dataset...
Train dataset successfully merged with store data.
Merging store data with test dataset...
Test dataset successfully merged with store data.


In [13]:
# Preview the first ten rows of the merged training data
_train_data.head(n=10)

Unnamed: 0_level_0,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,0.0,
2,5,2015-07-31,6064,625,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
3,5,2015-07-31,8314,821,1,1,0,1,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
4,5,2015-07-31,13995,1498,1,1,0,1,c,c,620.0,9.0,2009.0,0,,0.0,
5,5,2015-07-31,4822,559,1,1,0,1,a,a,29910.0,4.0,2015.0,0,,0.0,
6,5,2015-07-31,5651,589,1,1,0,1,a,a,310.0,12.0,2013.0,0,,0.0,
7,5,2015-07-31,15344,1414,1,1,0,1,a,c,24000.0,4.0,2013.0,0,,0.0,
8,5,2015-07-31,8492,833,1,1,0,1,a,a,7520.0,10.0,2014.0,0,,0.0,
9,5,2015-07-31,8565,687,1,1,0,1,a,c,2030.0,8.0,2000.0,0,,0.0,
10,5,2015-07-31,7185,681,1,1,0,1,a,a,3160.0,9.0,2009.0,0,,0.0,


In [14]:
# Summarize the merged training frame: index range, column dtypes and non-null counts
_train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1017209 entries, 1 to 1115
Data columns (total 17 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   DayOfWeek                  1017209 non-null  int64  
 1   Date                       1017209 non-null  object 
 2   Sales                      1017209 non-null  int64  
 3   Customers                  1017209 non-null  int64  
 4   Open                       1017209 non-null  int64  
 5   Promo                      1017209 non-null  int64  
 6   StateHoliday               1017209 non-null  object 
 7   SchoolHoliday              1017209 non-null  int64  
 8   StoreType                  1017209 non-null  object 
 9   Assortment                 1017209 non-null  object 
 10  CompetitionDistance        1014567 non-null  float64
 11  CompetitionOpenSinceMonth  693861 non-null   float64
 12  CompetitionOpenSinceYear   693861 non-null   float64
 13  Promo2              

In [16]:
# Announce the start of the descriptive-statistics step
logger.info("Generating statistical summaries for numerical data in train and test datasets.")


def _summarize(frame, split_name):
    """Print a banner, compute and display `frame.describe()`, log completion, and return the summary."""
    print(f"Displaying statistical summary for the {split_name} dataset...")
    summary = frame.describe()
    display(summary)
    logger.info(f"Statistical summary for the {split_name} dataset generated successfully.")
    return summary


# Summaries are computed on the un-merged train and test frames, in that order
train_summary = _summarize(train_data, "train")
test_summary = _summarize(test_data, "test")


2025-01-02 19:37:02,531 - INFO - Generating statistical summaries for numerical data in train and test datasets.


Displaying statistical summary for the train dataset...


Unnamed: 0,DayOfWeek,Sales,Customers,Open,Promo,SchoolHoliday
count,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0
mean,3.998341,5773.819,633.1459,0.8301067,0.3815145,0.1786467
std,1.997391,3849.926,464.4117,0.3755392,0.4857586,0.3830564
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,3727.0,405.0,1.0,0.0,0.0
50%,4.0,5744.0,609.0,1.0,0.0,0.0
75%,6.0,7856.0,837.0,1.0,1.0,0.0
max,7.0,41551.0,7388.0,1.0,1.0,1.0


2025-01-02 19:37:02,705 - INFO - Statistical summary for the train dataset generated successfully.


Displaying statistical summary for the test dataset...


Unnamed: 0,Store,DayOfWeek,Open,Promo,SchoolHoliday
count,41088.0,41088.0,41077.0,41088.0,41088.0
mean,555.899533,3.979167,0.854322,0.395833,0.443487
std,320.274496,2.015481,0.352787,0.489035,0.496802
min,1.0,1.0,0.0,0.0,0.0
25%,279.75,2.0,1.0,0.0,0.0
50%,553.5,4.0,1.0,0.0,0.0
75%,832.25,6.0,1.0,1.0,1.0
max,1115.0,7.0,1.0,1.0,1.0


2025-01-02 19:37:02,716 - INFO - Statistical summary for the test dataset generated successfully.


In [17]:
# Report the dimensions of the merged frames
print("Checking the shape of the merged datasets...")

# Training frame: keep the tuple and unpack it for the readable message
train_shape = _train_data.shape
train_rows, train_cols = train_shape
print(f"Shape of the training dataset: {train_shape} (Rows: {train_rows}, Columns: {train_cols})")

# Testing frame: same report
test_shape = _test_data.shape
test_rows, test_cols = test_shape
print(f"Shape of the testing dataset: {test_shape} (Rows: {test_rows}, Columns: {test_cols})")


Checking the shape of the merged datasets...
Shape of the training dataset: (1017209, 17) (Rows: 1017209, Columns: 17)
Shape of the testing dataset: (41088, 16) (Rows: 41088, Columns: 16)
