In [1]:
%pip install matplotlib seaborn


Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

PROJECT_ROOT = Path(r"C:\Users\Arushi Sharma\Documents\retail_demand_forecasting")
DATA_DIR = PROJECT_ROOT / "data" / "rossmann-store-sales"


In [3]:
dtype_map = {
    "Store": "int32",
    "DayOfWeek": "int8",
    "Sales": "int32",
    "Customers": "int32",
    "Open": "int8",
    "Promo": "int8",
    "StateHoliday": "string",
    "SchoolHoliday": "int8",
    "StoreType": "category",
    "Assortment": "category",
    "Promo2": "int8",
    "PromoInterval": "string"
}

sales_df = pd.read_csv(DATA_DIR / "train.csv", parse_dates=["Date"], dtype=dtype_map, low_memory=False)
store_df = pd.read_csv(DATA_DIR / "store.csv")

df = sales_df.merge(store_df, on="Store", how="left")
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 18 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   Store                      1017209 non-null  int32         
 1   DayOfWeek                  1017209 non-null  int8          
 2   Date                       1017209 non-null  datetime64[ns]
 3   Sales                      1017209 non-null  int32         
 4   Customers                  1017209 non-null  int32         
 5   Open                       1017209 non-null  int8          
 6   Promo                      1017209 non-null  int8          
 7   StateHoliday               1017209 non-null  string        
 8   SchoolHoliday              1017209 non-null  int8          
 9   StoreType                  1017209 non-null  object        
 10  Assortment                 1017209 non-null  object        
 11  CompetitionDistance        1014567 no

In [4]:
df.loc[df["Open"] == 0, "Sales"] = df.loc[df["Open"] == 0, "Sales"].fillna(0)
df["CompetitionDistance"] = df["CompetitionDistance"].fillna(df["CompetitionDistance"].median())

df["Year"] = df["Date"].dt.year.astype("int16")
df["Month"] = df["Date"].dt.month.astype("int8")
df["Week"] = df["Date"].dt.isocalendar().week.astype("int16")
df["Quarter"] = df["Date"].dt.quarter.astype("int8")
df["IsWeekend"] = df["DayOfWeek"].isin([6,7]).astype("int8")

df.head()


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,...,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Week,Quarter,IsWeekend
0,1,5,2015-07-31,5263,555,1,1,0,1,c,...,2008.0,0,,,,2015,7,31,3,0
1,2,5,2015-07-31,6064,625,1,1,0,1,a,...,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,31,3,0
2,3,5,2015-07-31,8314,821,1,1,0,1,a,...,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",2015,7,31,3,0
3,4,5,2015-07-31,13995,1498,1,1,0,1,c,...,2009.0,0,,,,2015,7,31,3,0
4,5,5,2015-07-31,4822,559,1,1,0,1,a,...,2015.0,0,,,,2015,7,31,3,0


In [5]:
plt.figure(figsize=(12, 4))
df.groupby("Date")["Sales"].sum().plot(title="Total Sales Over Time")
plt.xlabel("Date")
plt.ylabel("Total Sales")
plt.tight_layout()
plt.savefig(PROJECT_ROOT / "outputs" / "plots" / "total_sales_over_time.png")
plt.close()


In [6]:
plt.figure(figsize=(10, 4))
sns.boxplot(x="DayOfWeek", y="Sales", data=df[df["Open"] == 1], showfliers=False)
plt.title("Sales by Day of Week (Open Stores)")
plt.tight_layout()
plt.savefig(PROJECT_ROOT / "outputs" / "plots" / "sales_by_weekday.png")
plt.close()


In [7]:
monthly = df[df["Open"] == 1].groupby(["Year", "Month"])["Sales"].sum().reset_index()
monthly["YearMonth"] = pd.to_datetime(monthly["Year"].astype(str) + "-" + monthly["Month"].astype(str) + "-01")

plt.figure(figsize=(12, 4))
plt.plot(monthly["YearMonth"], monthly["Sales"])
plt.title("Monthly Sales")
plt.xlabel("Year-Month")
plt.ylabel("Total Sales")
plt.tight_layout()
plt.savefig(PROJECT_ROOT / "outputs" / "plots" / "monthly_sales.png")
plt.close()


In [8]:
plt.figure(figsize=(8, 4))
sns.boxplot(x="Promo", y="Sales", data=df[df["Open"] == 1], showfliers=False)
plt.title("Sales with vs without Promo")
plt.tight_layout()
plt.savefig(PROJECT_ROOT / "outputs" / "plots" / "sales_vs_promo.png")
plt.close()


In [9]:
plt.figure(figsize=(8, 4))
sns.boxplot(x="StoreType", y="Sales", data=df[df["Open"] == 1], showfliers=False)
plt.title("Sales by StoreType")
plt.tight_layout()
plt.savefig(PROJECT_ROOT / "outputs" / "plots" / "sales_by_store_type.png")
plt.close()


In [10]:
!pip install --upgrade pandas pyarrow




In [11]:
# Save the cleaned DataFrame to a fast Parquet file
df.to_parquet(PROJECT_ROOT / "data" / "cleaned_rossmann.parquet", index=False)
