In [20]:
# linear algebra and data processing
import numpy as np
import pandas as pd

# visualisations
import seaborn as sns
import matplotlib.pyplot as plt

# logging
import logging
# Configure the root logger once for the whole notebook: DEBUG threshold and a
# timestamp - logger name - level - message layout.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# A DEBUG root logger also surfaces the internal DEBUG records that matplotlib
# emits through the standard logging module. Raise only that library's
# threshold so the notebook's own messages are not buried in plotting noise.
logging.getLogger('matplotlib').setLevel(logging.WARNING)

<h3>Data</h3>

In [26]:
# Load the raw CSV files from the relative ../data directory.
# StateHoliday is read as `object` in BOTH the train and the test file so the
# column has the same dtype in each frame regardless of which values happen to
# occur in a given file (presumably the column mixes numeric and string codes
# -- TODO confirm against the data).
train_df = pd.read_csv('../data/train.csv', dtype={'StateHoliday': object})
test_df = pd.read_csv('../data/test.csv', dtype={'StateHoliday': object})
# Per-store attributes, keyed by the 'Store' column.
store_df = pd.read_csv('../data/store.csv')
# Sample submission file, loaded unchanged.
submission_df = pd.read_csv('../data/sample_submission.csv')

In [27]:
# Report the (rows, columns) shape of each raw frame at INFO level.
shape_reports = (
    ('train set shape: %s', train_df),
    ('test set shape: %s', test_df),
    ('store set shape: %s', store_df),
)
for message, frame in shape_reports:
    logging.info(message, frame.shape)

2022-05-25 01:28:03,521 - root - INFO - train set shape: (1017209, 9)
2022-05-25 01:28:03,523 - root - INFO - test set shape: (41088, 8)
2022-05-25 01:28:03,524 - root - INFO - store set shape: (1115, 10)


In [28]:
# Summarise the training frame: column names, non-null counts, dtypes and
# memory usage. The recorded output shows no missing values in any column.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   Store          1017209 non-null  int64 
 1   DayOfWeek      1017209 non-null  int64 
 2   Date           1017209 non-null  object
 3   Sales          1017209 non-null  int64 
 4   Customers      1017209 non-null  int64 
 5   Open           1017209 non-null  int64 
 6   Promo          1017209 non-null  int64 
 7   StateHoliday   1017209 non-null  object
 8   SchoolHoliday  1017209 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 69.8+ MB


In [29]:
# Summarise the test frame. In the recorded output 'Open' is the only column
# with missing values (41077 of 41088 non-null), which is also why it is
# float64 here.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41088 entries, 0 to 41087
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             41088 non-null  int64  
 1   Store          41088 non-null  int64  
 2   DayOfWeek      41088 non-null  int64  
 3   Date           41088 non-null  object 
 4   Open           41077 non-null  float64
 5   Promo          41088 non-null  int64  
 6   StateHoliday   41088 non-null  object 
 7   SchoolHoliday  41088 non-null  int64  
dtypes: float64(1), int64(5), object(2)
memory usage: 2.5+ MB


In [30]:
# Summarise the store frame. The recorded output shows missing values in
# CompetitionDistance, CompetitionOpenSinceMonth/Year, Promo2SinceWeek/Year
# and PromoInterval.
store_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   object 
 2   Assortment                 1115 non-null   object 
 3   CompetitionDistance        1112 non-null   float64
 4   CompetitionOpenSinceMonth  761 non-null    float64
 5   CompetitionOpenSinceYear   761 non-null    float64
 6   Promo2                     1115 non-null   int64  
 7   Promo2SinceWeek            571 non-null    float64
 8   Promo2SinceYear            571 non-null    float64
 9   PromoInterval              571 non-null    object 
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB


<h5>Merge the train and test sets with the store set</h5>

In [31]:
# Attach the per-store attributes to every train/test row, joining on the
# shared 'Store' key.
# - how='left' keeps every row of the left frame: a store id missing from
#   store_df yields NaN store columns instead of silently dropping the row
#   (presumably every test row needs a prediction -- TODO confirm).
# - validate='many_to_one' makes pandas raise a MergeError if 'Store' is not
#   unique in store_df, which would otherwise duplicate train/test rows.
full_train_df = pd.merge(left=train_df, right=store_df, how='left', on='Store', validate='many_to_one')
full_test_df = pd.merge(left=test_df, right=store_df, how='left', on='Store', validate='many_to_one')

In [32]:
# Report the post-merge shapes at INFO level: train first, then test.
for merged_frame in (full_train_df, full_test_df):
    logging.info(merged_frame.shape)

2022-05-25 01:34:14,284 - root - INFO - (1017209, 18)
2022-05-25 01:34:14,286 - root - INFO - (41088, 17)
