# Downloading the store sales dataset from Kaggle API

In [5]:
import os

# Set a custom path with your desired folder
custom_path = '/Users/bilalkhokhar/Desktop/asadrkhokar/data-science-portfolio'

# Make sure folder exists
os.makedirs(custom_path, exist_ok=True)

# Change working directory
os.chdir(custom_path)

# Now run kaggle download command here
!kaggle competitions download -c store-sales-time-series-forecasting

# Unzip here as well
!unzip -q store-sales-time-series-forecasting.zip -d store-sales-time-series-forecasting



Downloading store-sales-time-series-forecasting.zip to /Users/bilalkhokhar/Desktop/asadrkhokar/data-science-portfolio
  0%|                                               | 0.00/21.4M [00:00<?, ?B/s]
100%|██████████████████████████████████████| 21.4M/21.4M [00:00<00:00, 1.58GB/s]


# Loading libraries and data

In [7]:
# Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import numpy as np
import random

In [18]:
#Loading data as pandas dataframes
train_df = pd.read_csv('store-sales-time-series-forecasting/train.csv')
stores_df = pd.read_csv('store-sales-time-series-forecasting/stores.csv')
transactions_df = pd.read_csv('store-sales-time-series-forecasting/transactions.csv')
oil_df = pd.read_csv('store-sales-time-series-forecasting/oil.csv')
holidays_events_df = pd.read_csv('store-sales-time-series-forecasting/holidays_events.csv')
test_df = pd.read_csv('store-sales-time-series-forecasting/test.csv')

# Checking counts, data types and statistics for each dataframe and column

In [19]:

df_list = [train_df, test_df, stores_df, transactions_df, oil_df, holidays_events_df]
df_names = ['train_df', 'test_df', 'stores_df', 'transactions_df', 'oil_df', 'holidays_events_df']

for df, df_name in zip(df_list, df_names):
    print(f"\n{'-'*20} {df_name} {'-'*20}")
    df.info()
    print(df.describe()) 


-------------------- train_df --------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           int64  
 1   date         object 
 2   store_nbr    int64  
 3   family       object 
 4   sales        float64
 5   onpromotion  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 137.4+ MB
                 id     store_nbr         sales   onpromotion
count  3.000888e+06  3.000888e+06  3.000888e+06  3.000888e+06
mean   1.500444e+06  2.750000e+01  3.577757e+02  2.602770e+00
std    8.662819e+05  1.558579e+01  1.101998e+03  1.221888e+01
min    0.000000e+00  1.000000e+00  0.000000e+00  0.000000e+00
25%    7.502218e+05  1.400000e+01  0.000000e+00  0.000000e+00
50%    1.500444e+06  2.750000e+01  1.100000e+01  0.000000e+00
75%    2.250665e+06  4.100000e+01  1.958473e+02  0.000000e+00
max    3.000887e+06  5.400000e+01  1.247170e+05  7.410000e+02

-----

| Dataset              | Data cleaning needed?                                               |
| -------------------- | ------------------------------------------------------------------- |
| train\_df            | Convert `date` to datetime                                          |
| test\_df            | Convert `date` to datetime                                          |
| stores\_df           | Check categorical consistency                                       |
| transactions\_df     | Convert `date` to datetime                                          |
| oil\_df              | Convert `date` to datetime; handle missing in `dcoilwtico`          |
| holidays\_events\_df | Convert `date` to datetime; check duplicate dates (multiple events) |


# Data Cleaning and Preperation

In [20]:
# Convert date columns to datetime
train_df['date'] = pd.to_datetime(train_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])
transactions_df['date'] = pd.to_datetime(transactions_df['date'])
oil_df['date'] = pd.to_datetime(oil_df['date'])
holidays_events_df['date'] = pd.to_datetime(holidays_events_df['date'])

In [21]:
# Check missing values
print(oil_df.isnull().sum())

date           0
dcoilwtico    43
dtype: int64


In [12]:
# Forward fill missing oil prices (commonly used in time series when price data is missing)
oil_df['dcoilwtico'] = oil_df['dcoilwtico'].fillna(method='ffill')

  oil_df['dcoilwtico'] = oil_df['dcoilwtico'].fillna(method='ffill')


In [13]:
# If first value(s) are still NaN after ffill, can backward fill
oil_df['dcoilwtico'] = oil_df['dcoilwtico'].fillna(method='bfill')

  oil_df['dcoilwtico'] = oil_df['dcoilwtico'].fillna(method='bfill')


In [22]:
# Check duplicated rows
print("train_df duplicated rows:", train_df.duplicated().sum())
print("test_df duplicated rows:", test_df.duplicated().sum())
print("transactions_df duplicated rows:", transactions_df.duplicated().sum())
print("oil_df duplicated rows:", oil_df.duplicated().sum())
print("holidays_events_df duplicated rows:", holidays_events_df.duplicated().sum())
print("stores_df duplicated rows:", stores_df.duplicated().sum())

train_df duplicated rows: 0
test_df duplicated rows: 0
transactions_df duplicated rows: 0
oil_df duplicated rows: 0
holidays_events_df duplicated rows: 0
stores_df duplicated rows: 0


In [23]:
# Unique values in categorical columns
print("stores_df city:", stores_df['city'].unique())
print("stores_df state:", stores_df['state'].unique())
print("stores_df type:", stores_df['type'].unique())
print("holidays_events_df type:", holidays_events_df['type'].unique())
print("holidays_events_df locale:", holidays_events_df['locale'].unique())

stores_df city: ['Quito' 'Santo Domingo' 'Cayambe' 'Latacunga' 'Riobamba' 'Ibarra'
 'Guaranda' 'Puyo' 'Ambato' 'Guayaquil' 'Salinas' 'Daule' 'Babahoyo'
 'Quevedo' 'Playas' 'Libertad' 'Cuenca' 'Loja' 'Machala' 'Esmeraldas'
 'Manta' 'El Carmen']
stores_df state: ['Pichincha' 'Santo Domingo de los Tsachilas' 'Cotopaxi' 'Chimborazo'
 'Imbabura' 'Bolivar' 'Pastaza' 'Tungurahua' 'Guayas' 'Santa Elena'
 'Los Rios' 'Azuay' 'Loja' 'El Oro' 'Esmeraldas' 'Manabi']
stores_df type: ['D' 'B' 'C' 'E' 'A']
holidays_events_df type: ['Holiday' 'Transfer' 'Additional' 'Bridge' 'Work Day' 'Event']
holidays_events_df locale: ['Local' 'Regional' 'National']


In [26]:
# Negative sales?
print("Negative sales count (train_df):", (train_df['sales'] < 0).sum())

Negative sales count (train_df): 0


# Modelling data creation

In [29]:
# Copy train_df to start modelling dataframe
train_df = train_df.copy()

# Merge holidays/events info on 'date'
train_df = train_df.merge(
    holidays_events_df[['date', 'type', 'locale', 'locale_name', 'description', 'transferred']],
    on='date',
    how='left'
)

# Merge oil prices on 'date'
train_df = train_df.merge(
    oil_df[['date', 'dcoilwtico']],
    on='date',
    how='left'
)

# Merge transactions per store and date
train_df = train_df.merge(
    transactions_df,
    on=['date', 'store_nbr'],
    how='left'
)

# Merge store metadata on 'store_nbr'
train_df = train_df.merge(
    stores_df,
    on='store_nbr',
    how='left'
)

In [30]:
# Fill missing transactions with 0
train_df['transactions'] = train_df['transactions'].fillna(0)

# Fill missing oil prices forward then backward
train_df['dcoilwtico'] = train_df['dcoilwtico'].fillna(method='ffill').fillna(method='bfill')

  train_df['dcoilwtico'] = train_df['dcoilwtico'].fillna(method='ffill').fillna(method='bfill')


# Feature Engineering

In [32]:
# Convert categorical columns from holidays/events and stores to category dtype
categorical_cols = ['type', 'locale', 'locale_name', 'description', 'store_nbr', 'city', 'state', 'type_y'] 
for col in categorical_cols:
    if col in train_df.columns:
        train_df[col] = train_df[col].astype('category')

# Extract datetime features from 'date'
train_df['year'] = train_df['date'].dt.year
train_df['month'] = train_df['date'].dt.month
train_df['day'] = train_df['date'].dt.day
train_df['dayofweek'] = train_df['date'].dt.dayofweek
train_df['weekofyear'] = train_df['date'].dt.isocalendar().week.astype(int)
train_df['is_weekend'] = train_df['dayofweek'].isin([5,6]).astype(int)

# Create is_holiday flag from holidays_events_df
holiday_dates = holidays_events_df['date'].unique()
train_df['is_holiday'] = train_df['date'].isin(holiday_dates).astype(int)

# Label encode 'family' (main product category) 
from sklearn.preprocessing import LabelEncoder
le_family = LabelEncoder()
train_df['family_enc'] = le_family.fit_transform(train_df['family'])

# Final check of missing values
print("Missing values per column:")
print(train_df.isnull().sum())

# Display the head of final modelling dataframe
print(train_df.head())

Missing values per column:
id                    0
date                  0
store_nbr             0
family                0
sales                 0
onpromotion           0
type_x          2551824
locale          2551824
locale_name     2551824
description     2551824
transferred     2551824
dcoilwtico            0
transactions          0
city                  0
state                 0
type_y                0
cluster               0
year                  0
month                 0
day                   0
dayofweek             0
weekofyear            0
is_weekend            0
is_holiday            0
family_enc            0
dtype: int64
   id       date store_nbr      family  sales  onpromotion   type_x    locale  \
0   0 2013-01-01         1  AUTOMOTIVE    0.0            0  Holiday  National   
1   1 2013-01-01         1   BABY CARE    0.0            0  Holiday  National   
2   2 2013-01-01         1      BEAUTY    0.0            0  Holiday  National   
3   3 2013-01-01         1   BEVERAG

In [34]:
# Handle missing values in categorical columns (fill with 'Unknown')
cat_cols = ['type', 'locale', 'locale_name', 'description', 'city', 'state', 'type_y']

for col in cat_cols:
    if col in train_df.columns:
        if pd.api.types.is_categorical_dtype(train_df[col]):
            # Add 'Unknown' to categories before filling
            train_df[col] = train_df[col].cat.add_categories('Unknown')
        train_df[col] = train_df[col].fillna('Unknown')

# Create lag and rolling features for sales per 'id'
train_df = train_df.sort_values(['id', 'date'])
train_df['sales_lag_7'] = train_df.groupby('id')['sales'].shift(7)
train_df['sales_roll_mean_7'] = train_df.groupby('id')['sales'].transform(lambda x: x.shift(1).rolling(window=7).mean())

# Fill lag/rolling NaNs with 0 or some other strategy
train_df['sales_lag_7'] = train_df['sales_lag_7'].fillna(0)
train_df['sales_roll_mean_7'] = train_df['sales_roll_mean_7'].fillna(0)

  if pd.api.types.is_categorical_dtype(train_df[col]):


# Creating modelling test dataset

In [35]:
# Merge supplementary data same as train
test_df = test_df.merge(
    holidays_events_df[['date', 'type', 'locale', 'locale_name', 'description', 'transferred']],
    on='date',
    how='left'
)
test_df = test_df.merge(
    oil_df[['date', 'dcoilwtico']],
    on='date',
    how='left'
)
test_df = test_df.merge(
    transactions_df,
    on=['date', 'store_nbr'],
    how='left'
)
test_df = test_df.merge(
    stores_df,
    on='store_nbr',
    how='left'
)
test_df['transactions'] = test_df['transactions'].fillna(0)
test_df['dcoilwtico'] = test_df['dcoilwtico'].fillna(method='ffill').fillna(method='bfill')

# Categorical columns: fillna and convert to category
for col in cat_cols:
    if col in test_df.columns:
        if pd.api.types.is_categorical_dtype(test_df[col]):
            test_df[col] = test_df[col].cat.add_categories('Unknown')
        test_df[col] = test_df[col].fillna('Unknown')

# Encode 'family' using trained encoder
test_df['family_enc'] = le_family.transform(test_df['family'])

# Extract date features
test_df['year'] = test_df['date'].dt.year
test_df['month'] = test_df['date'].dt.month
test_df['day'] = test_df['date'].dt.day
test_df['dayofweek'] = test_df['date'].dt.dayofweek
test_df['weekofyear'] = test_df['date'].dt.isocalendar().week.astype(int)
test_df['is_weekend'] = test_df['dayofweek'].isin([5,6]).astype(int)
test_df['is_holiday'] = test_df['date'].isin(holiday_dates).astype(int)

  test_df['dcoilwtico'] = test_df['dcoilwtico'].fillna(method='ffill').fillna(method='bfill')
  if pd.api.types.is_categorical_dtype(test_df[col]):


# Full modularised and reproducible pipeline

In [36]:
def set_seed(seed=42):
    """Set seed for reproducibility."""
    np.random.seed(seed)
    random.seed(seed)

def load_data(base_path='store-sales-time-series-forecasting'):
    """Load all datasets."""
    train_df = pd.read_csv(f'{base_path}/train.csv')
    test_df = pd.read_csv(f'{base_path}/test.csv')
    stores_df = pd.read_csv(f'{base_path}/stores.csv')
    transactions_df = pd.read_csv(f'{base_path}/transactions.csv')
    oil_df = pd.read_csv(f'{base_path}/oil.csv')
    holidays_events_df = pd.read_csv(f'{base_path}/holidays_events.csv')
    return train_df, test_df, stores_df, transactions_df, oil_df, holidays_events_df

def preprocess_dates(*dfs):
    """Convert date columns to datetime."""
    for df in dfs:
        if 'date' in df.columns:
            df['date'] = pd.to_datetime(df['date'])

def fill_oil_prices(oil_df):
    """Fill missing oil prices forward then backward."""
    oil_df['dcoilwtico'] = oil_df['dcoilwtico'].fillna(method='ffill').fillna(method='bfill')
    return oil_df

def fill_missing_and_convert_cats(df, cat_cols):
    """Fill NA with 'Unknown' before converting to categorical dtype."""
    for col in cat_cols:
        if col in df.columns:
            df[col] = df[col].fillna('Unknown')
            df[col] = df[col].astype('category')
    return df

def extract_date_features(df):
    """Extract common datetime features."""
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek
    df['weekofyear'] = df['date'].dt.isocalendar().week.astype(int)
    df['is_weekend'] = df['dayofweek'].isin([5,6]).astype(int)
    return df

def add_holiday_flag(df, holidays_events_df):
    """Add is_holiday flag based on holidays_events_df."""
    holiday_dates = holidays_events_df['date'].unique()
    df['is_holiday'] = df['date'].isin(holiday_dates).astype(int)
    return df

def merge_external_data(df, holidays_events_df, oil_df, transactions_df, stores_df):
    """Merge all external data on date/store keys and rename columns for clarity."""
    
    # Rename 'type' in stores_df to 'store_type'
    stores_df = stores_df.rename(columns={'type': 'store_type'})
    
    # Rename 'type' in holidays_events_df to 'holiday_type' before merging
    holidays_events_df = holidays_events_df.rename(columns={'type': 'holiday_type'})
    
    df = df.merge(
        holidays_events_df[['date', 'holiday_type', 'locale', 'locale_name', 'description', 'transferred']],
        on='date', how='left'
    )
    df = df.merge(
        oil_df[['date', 'dcoilwtico']],
        on='date', how='left'
    )
    df = df.merge(
        transactions_df,
        on=['date', 'store_nbr'], how='left'
    )
    df = df.merge(
        stores_df,
        on='store_nbr', how='left'
    )
    return df

def fill_missing_and_convert_cats(df, cat_cols):
    """Fill NA with 'Unknown' before converting to categorical dtype."""
    for col in cat_cols:
        if col in df.columns:
            df[col] = df[col].fillna('Unknown')
            df[col] = df[col].astype('category')
    return df

def preprocess_train(train_df, holidays_events_df, oil_df, transactions_df, stores_df, seed=42):
    set_seed(seed)
    
    # Merge external info with renamed columns
    train_df = merge_external_data(train_df, holidays_events_df, oil_df, transactions_df, stores_df)
    
    # Fill missing values before category conversion
    train_df['transactions'] = train_df['transactions'].fillna(0)
    train_df['dcoilwtico'] = train_df['dcoilwtico'].fillna(method='ffill').fillna(method='bfill')
    
    cat_cols = ['holiday_type', 'locale', 'locale_name', 'description', 'city', 'state', 'store_type', 'store_nbr']
    train_df = fill_missing_and_convert_cats(train_df, cat_cols)
    
    # Date features & holiday flag
    train_df = extract_date_features(train_df)
    train_df = add_holiday_flag(train_df, holidays_events_df)
    
    # Label encode 'family'
    le_family = LabelEncoder()
    train_df['family_enc'] = le_family.fit_transform(train_df['family'])
    
    # Sort for lag features
    train_df = train_df.sort_values(['id', 'date'])
    train_df['sales_lag_7'] = train_df.groupby('id')['sales'].shift(7)
    train_df['sales_roll_mean_7'] = train_df.groupby('id')['sales'].transform(lambda x: x.shift(1).rolling(window=7).mean())
    
    # Fill lag/rolling NaNs
    train_df['sales_lag_7'] = train_df['sales_lag_7'].fillna(0)
    train_df['sales_roll_mean_7'] = train_df['sales_roll_mean_7'].fillna(0)
    
    return train_df, le_family

def preprocess_test(test_df, holidays_events_df, oil_df, transactions_df, stores_df, le_family, seed=42):
    set_seed(seed)
    
    # Merge external info with renamed columns
    test_df = merge_external_data(test_df, holidays_events_df, oil_df, transactions_df, stores_df)
    
    # Fill missing values before category conversion
    test_df['transactions'] = test_df['transactions'].fillna(0)
    test_df['dcoilwtico'] = test_df['dcoilwtico'].fillna(method='ffill').fillna(method='bfill')
    
    cat_cols = ['holiday_type', 'locale', 'locale_name', 'description', 'city', 'state', 'store_type', 'store_nbr']
    test_df = fill_missing_and_convert_cats(test_df, cat_cols)
    
    # Date features & holiday flag
    test_df = extract_date_features(test_df)
    test_df = add_holiday_flag(test_df, holidays_events_df)
    
    # Label encode 'family' using existing encoder
    test_df['family_enc'] = le_family.transform(test_df['family'])
    
    # No lag features for test (no sales column)
    return test_df

def main():
    # Load data
    train_df, test_df, stores_df, transactions_df, oil_df, holidays_events_df = load_data()
    
    # Convert dates
    preprocess_dates(train_df, test_df, transactions_df, oil_df, holidays_events_df)
    
    # Fill oil prices missing values
    oil_df = fill_oil_prices(oil_df)
    
    # Preprocess train and test
    train_df_processed, le_family = preprocess_train(train_df, holidays_events_df, oil_df, transactions_df, stores_df)
    test_df_processed = preprocess_test(test_df, holidays_events_df, oil_df, transactions_df, stores_df, le_family)
    
    print("\nProcessed train sample:")
    print(train_df_processed.head())
    
    print("\nProcessed test sample:")
    print(test_df_processed.head())
    
    # Check missing values in train
    print("\nMissing values in train_df after preprocessing:")
    print(train_df_processed.isnull().sum())

if __name__ == "__main__":
    main()


  oil_df['dcoilwtico'] = oil_df['dcoilwtico'].fillna(method='ffill').fillna(method='bfill')
  train_df['dcoilwtico'] = train_df['dcoilwtico'].fillna(method='ffill').fillna(method='bfill')
  test_df['dcoilwtico'] = test_df['dcoilwtico'].fillna(method='ffill').fillna(method='bfill')



Processed train sample:
   id       date store_nbr      family  sales  onpromotion holiday_type  \
0   0 2013-01-01         1  AUTOMOTIVE    0.0            0      Holiday   
1   1 2013-01-01         1   BABY CARE    0.0            0      Holiday   
2   2 2013-01-01         1      BEAUTY    0.0            0      Holiday   
3   3 2013-01-01         1   BEVERAGES    0.0            0      Holiday   
4   4 2013-01-01         1       BOOKS    0.0            0      Holiday   

     locale locale_name         description  ...  year  month  day dayofweek  \
0  National     Ecuador  Primer dia del ano  ...  2013      1    1         1   
1  National     Ecuador  Primer dia del ano  ...  2013      1    1         1   
2  National     Ecuador  Primer dia del ano  ...  2013      1    1         1   
3  National     Ecuador  Primer dia del ano  ...  2013      1    1         1   
4  National     Ecuador  Primer dia del ano  ...  2013      1    1         1   

  weekofyear is_weekend  is_holiday  family