In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### EDA Conclusion
1. *The dataset is a clean time-series dataset with daily sales records from 2013 to 2017, containing 913,000 rows and 4 columns.*
2. *The dataset includes the features date, store (10 unique stores), item (50 unique items), and sales, with no missing or duplicate values.*
3. *Sales exhibit a clear upward trend over the years, indicating increasing demand over time.*
4. *Strong yearly seasonality is observed, with repeating demand patterns across years, confirming that sales are non-stationary.*
5. *Most daily sales values fall within the range of 0–100 units, representing regular demand levels across stores and items.*
6. *A small number of days experience very high sales (approximately 100–180 units), creating a long right tail and a right-skewed sales distribution.*
7. *All stores contain outliers, indicating that these high-sales values represent genuine demand spikes rather than data errors; therefore, outliers should be retained, and store-specific modeling is likely to improve forecasting performance.*


## Feature Engineering

In [15]:
# Load the cleaned sales data; `date` is parsed to datetime at read time so the
# `.dt` accessor can be used for the calendar features below.
df = pd.read_csv(
    "../data/processed/cleaned_data.csv",
    parse_dates=["date"],
)

In [16]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-01,7,12,26
2,2013-01-01,7,46,27
3,2013-01-01,8,12,54
4,2013-01-01,9,12,35


## 1. Date-Based Features

In [17]:
# Confirm that `date` was parsed as a datetime column (needed for `.dt` below).
df.dtypes["date"]

dtype('<M8[ns]')

In [19]:
# Put every (store, item) series in ascending date order; the original row
# index is kept as-is.
df = df.sort_values(by=["store", "item", "date"], ascending=True)

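*Lag features are only correct when the rows of each store–item series are in chronological order, so the data is sorted by store, item, and date first.
Correct ordering also prevents data leakage from future rows (absolutely required for time series).*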

In [20]:
# Display the frame after sorting by store, item and date to eyeball the new row order.
df

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
699,2013-01-02,1,1,11
1062,2013-01-03,1,1,14
1861,2013-01-04,1,1,13
2291,2013-01-05,1,1,10
...,...,...,...,...
910872,2017-12-27,10,50,63
911260,2017-12-28,10,50,59
911707,2017-12-29,10,50,74
912007,2017-12-30,10,50,62


In [21]:
# Calendar year of each sale date, e.g. 2017-12-27 -> 2017.
df = df.assign(year=df["date"].dt.year)

In [23]:
# Month number of each sale date (1 = January ... 12 = December).
df = df.assign(month=df["date"].dt.month)

In [26]:
# ISO-8601 week number (1-53) of each sale date.
# `isocalendar()` returns nullable UInt32 columns; cast to a plain NumPy integer
# dtype so this feature behaves like the other `.dt`-derived columns in
# downstream NumPy / model code. The cast requires `date` to contain no NaT.
# NOTE: ISO weeks do not always line up with the calendar year (e.g. 2016-01-01
# belongs to ISO week 53 of 2015), so combine `week` with `year` with care.
df["week"] = df["date"].dt.isocalendar().week.astype("int32")

In [36]:
# Day of the month of each sale date, e.g. 2017-12-27 -> 27.
df = df.assign(day=df["date"].dt.day)

In [37]:
# Day of the week as an integer. pandas counts from zero:
# Monday=0, Tuesday=1, ..., Saturday=5, Sunday=6
# (e.g. 2013-01-01 was a Tuesday -> 1).
df["dayofweek"] = df["date"].dt.dayofweek

In [38]:
# Weekend indicator: 1 when `dayofweek` is Saturday (5) or Sunday (6), else 0.
df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)

In [39]:
# Display the frame with the added year, month, week, day, dayofweek and is_weekend columns.
df

Unnamed: 0,date,store,item,sales,year,month,week,day,dayofweek,is_weekend
0,2013-01-01,1,1,13,2013,1,1,1,1,0
699,2013-01-02,1,1,11,2013,1,1,2,2,0
1062,2013-01-03,1,1,14,2013,1,1,3,3,0
1861,2013-01-04,1,1,13,2013,1,1,4,4,0
2291,2013-01-05,1,1,10,2013,1,1,5,5,1
...,...,...,...,...,...,...,...,...,...,...
910872,2017-12-27,10,50,63,2017,12,52,27,2,0
911260,2017-12-28,10,50,59,2017,12,52,28,3,0
911707,2017-12-29,10,50,74,2017,12,52,29,4,0
912007,2017-12-30,10,50,62,2017,12,52,30,5,1
