# Import Libraries

In [1]:
import pandas as pd
import warnings
import plotly.express as px
import plotly.graph_objects as go
import dash
from dash import dcc, html

# Suppress all FutureWarnings

In [2]:
warnings.simplefilter(action='ignore', category=FutureWarning)

# Loading Data

In [3]:
# Every table except `stores` is read with its "date" column parsed to datetime.
train, test, transactions, holidays, oil = (
    pd.read_csv(path, parse_dates=["date"])
    for path in ("train.csv", "test.csv", "transactions.csv", "holidays_events.csv", "oil.csv")
)
# Store metadata is read without date parsing.
stores = pd.read_csv("stores.csv")

# Merge Auxiliary Data into df

- **stores.csv** → Adds store information (city, state, type, cluster).

- **transactions.csv** → Adds daily transactions per store.

- **oil.csv** → Adds oil prices (missing values forward-filled).

- **holidays_events.csv** → Adds holiday/event type for each date.

In [4]:
# Left-join every auxiliary table onto the training rows so no sales row is lost.
df = train.merge(stores, on="store_nbr", how="left")
df = df.merge(transactions, on=["date", "store_nbr"], how="left")
# Forward-fill gaps in the oil price series. `Series.ffill()` replaces the
# deprecated `fillna(method="ffill")`. Note that `oil` itself is updated here.
oil["dcoilwtico"] = oil["dcoilwtico"].ffill()
df = df.merge(oil, on="date", how="left")
# Only the holiday `type` is brought in. Because the frame already has a `type`
# column at this point, pandas suffixes the pair as `type_x` / `type_y`.
# NOTE(review): `holidays` may hold several rows for one date, in which case
# this join repeats the matching sales rows — confirm against the data.
df = df.merge(holidays[["date","type"]], on="date", how="left")

## Remove Duplicates

In [5]:
# Record the row count, drop exact duplicate rows, and report how many went away.
initial_rows = len(df)
df = df.drop_duplicates()
removed_rows = initial_rows - len(df)
print(f"Removed {removed_rows} duplicate rows.")
df.shape

Removed 30294 duplicate rows.


(3024054, 13)

## Fill Missing values in type_y

In [6]:
# Keep existing holiday/event labels; rows where `type_y` is missing become "Work Day".
df['type_y'] = df['type_y'].where(df['type_y'].notna(), 'Work Day')

## Average Sales per day_type

In [7]:
# Mean sales for each `type_y` value, highest first (displayed as the cell output).
(
    df.groupby("type_y")["sales"]
    .mean()
    .reset_index()
    .sort_values("sales", ascending=False)
)

Unnamed: 0,type_y,sales
0,Additional,487.628925
4,Transfer,467.75311
1,Bridge,446.750854
2,Event,425.024432
3,Holiday,356.375983
5,Work Day,352.228764


## Handle Duplicate Dates in Holidays Events

In [8]:
# Rank of each holiday/event type; a larger number wins when one date has
# several entries. Listed from lowest to highest rank.
# NOTE(review): the ranking appears to follow the average-sales ordering shown
# in the table above — confirm that this is the intended rule.
holiday_priority = {
    'Work Day': 1,
    'Holiday': 2,
    'Event': 3,
    'Bridge': 4,
    'Transfer': 5,
    'Additional': 6,
}

In [9]:
# Attach each row's rank, then keep only the highest-ranked row for every date.
holidays = holidays.assign(priority=holidays["type"].map(holiday_priority))
best_row_per_date = holidays.groupby("date")["priority"].idxmax()
holidays = holidays.loc[best_row_per_date].reset_index(drop=True)

# Merge Auxiliary Data into Train


In [10]:
# Forward-fill gaps in the oil price series before joining it.
# `Series.ffill()` replaces the deprecated `fillna(method="ffill")`.
oil["dcoilwtico"] = oil["dcoilwtico"].ffill()

# Left-join store metadata, daily transactions, oil prices and the holiday type
# onto the training rows. `holidays` was reduced to one row per date above.
train = (
    train.merge(stores, on="store_nbr", how="left")
    .merge(transactions, on=["date", "store_nbr"], how="left")
    .merge(oil, on="date", how="left")
    .merge(holidays[["date","type"]], on="date", how="left")
)

# Merge Auxiliary Data into test


In [11]:
# Apply the same four left joins to the test rows, in the same order as for train.
test = (
    test.merge(stores, on="store_nbr", how="left")
    .merge(transactions, on=["date", "store_nbr"], how="left")
    .merge(oil, on="date", how="left")
    .merge(holidays[["date","type"]], on="date", how="left")
)

### Renaming Columns for Clarity

- After merging multiple datasets, some columns inherited generic names like `type_x` and `type_y`.  
- To make the dataset more interpretable, we renamed them:

    - **type_x** (coming from stores.csv) →  **store_type**

        Indicates the type of store (A, B, C, or D).

    - **type_y** (coming from holidays_events.csv) → **day_type**

        Classifies each date as either a special event (Holiday, Transfer, Additional, Event, etc.) or a regular Work Day (for dates without special events).

In [12]:
# One mapping shared by both frames: merge-suffixed names -> descriptive names.
clear_names = {
    'type_x': 'store_type',
    'type_y': 'day_type'
}
train = train.rename(columns=clear_names)
test = test.rename(columns=clear_names)

# Check Duplicates

In [13]:
# Print the number of fully duplicated rows: train first, then test.
for frame in (train, test):
    print(frame.duplicated().sum())

0
0


In [14]:
# Record the row count, drop exact duplicate rows, and report how many went away.
initial_rows = len(train)
train = train.drop_duplicates()
removed_rows = initial_rows - len(train)
print(f"Removed {removed_rows} duplicate rows.")
train.shape

Removed 0 duplicate rows.


(3000888, 13)

# Hidden Duplicates

## Handle missing values
- Logical imputation strategies:  
  - `transactions`: not imputed — the column is dropped.  
  - `dcoilwtico`: forward-fill, then back-fill any values still missing.  
  - `day_type`: fill NaN with `"Work Day"`.  

In [15]:
# Oil price: forward-fill, then back-fill whatever is still missing (e.g. leading
# rows with no earlier value). `.ffill()` / `.bfill()` replace the deprecated
# `fillna(method=...)` spelling.
# NOTE(review): a row-wise fill assumes the rows are in date order — confirm.
train['dcoilwtico'] = train['dcoilwtico'].ffill().bfill()
test['dcoilwtico'] = test['dcoilwtico'].ffill().bfill()

# Dates with no holiday/event entry are ordinary work days.
train['day_type'] = train['day_type'].fillna('Work Day')
test['day_type'] = test['day_type'].fillna('Work Day')

# `transactions` is dropped rather than imputed. Reassigning instead of using
# `inplace=True` keeps the step explicit about which frame it produces.
train = train.drop(columns="transactions")
test = test.drop(columns="transactions")

# Feature Engineering

In [16]:
# Derive the same calendar features on both frames from their `date` column.
for frame in (train, test):
    dates = frame["date"]
    frame["Year"] = dates.dt.year
    frame["Month"] = dates.dt.month
    frame["Day"] = dates.dt.day
    # Full weekday name, e.g. "Monday".
    frame["Week_day"] = dates.dt.strftime("%A")

# Figures

## 1) Global + top stores sales

In [17]:
# Total sales per day across all stores.
daily_global = train.groupby('date', as_index=False)['sales'].sum()
# Store numbers of the five stores with the largest cumulative sales.
top_stores = train.groupby('store_nbr')['sales'].sum().nlargest(5).index

fig = go.Figure()
fig.add_scatter(x=daily_global['date'], y=daily_global['sales'], mode='lines', name="Global")
# One extra line per top store, each showing that store's own daily total.
for store in top_stores:
    store_rows = train.loc[train['store_nbr'] == store]
    df_store = store_rows.groupby('date', as_index=False)['sales'].sum()
    fig.add_scatter(x=df_store['date'], y=df_store['sales'], mode='lines', name=f"Store {store}")
fig.update_layout(
    title="Daily Sales (Global + Top Stores)",
    xaxis_title="Date",
    yaxis_title="Sales",
)
sales_timeline_fig = fig


## 2) Family sales


In [18]:
# The twelve product families with the largest cumulative sales.
top_fam = train.groupby('family')['sales'].sum().nlargest(12).index

fig = go.Figure()
# One line per family: its daily sales summed over all stores.
for fam in top_fam:
    family_rows = train.loc[train['family'] == fam]
    df_f = family_rows.groupby('date', as_index=False)['sales'].sum()
    fig.add_scatter(x=df_f['date'], y=df_f['sales'], mode='lines', name=fam)
fig.update_layout(
    title="Daily Sales (Top Families)",
    xaxis_title="Date",
    yaxis_title="Sales",
)
family_sales_fig = fig

## 3) Monthly distribution

In [19]:
# Only `date` and `sales` are needed here, so select those two columns instead
# of copying the whole `train` frame just to hold one helper column.
monthly = train[['date', 'sales']].assign(
    # Timestamp of the first day of each row's calendar month.
    YearMonth=lambda frame: frame['date'].dt.to_period('M').dt.to_timestamp()
)
# Total sales for every (year, month) bucket.
month_agg = monthly.groupby(['YearMonth'], as_index=False)['sales'].sum()
# Month number 1-12, so the same calendar month from different years shares a box.
month_agg['month'] = month_agg['YearMonth'].dt.month
fig = px.box(month_agg, x='month', y='sales', points='outliers', title='Distribution of Monthly Sales by Month')
# Force a tick for every month rather than letting plotly pick a subset.
fig.update_xaxes(tickmode='array', tickvals=list(range(1,13)))
monthly_box_fig = fig

## 4) Weekday sales

In [20]:
# Monday-first ordering so groupby results and the x-axis follow the calendar week.
weekday_order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']

# Select only the columns this figure needs instead of copying all of `train`.
weekday = train[['date', 'sales', 'Week_day']].assign(
    Week_day=lambda frame: pd.Categorical(frame['Week_day'], categories=weekday_order, ordered=True),
    year=lambda frame: frame['date'].dt.year,
)
# `observed=False` spells out the behaviour the original relied on implicitly
# (every weekday category is kept); the implicit default is deprecated.
weekday_avg = weekday.groupby(['year','Week_day'], as_index=False, observed=False)['sales'].mean()
weekday_all = weekday.groupby(['Week_day'], as_index=False, observed=False)['sales'].mean()

fig = go.Figure()
# Bars: average over the whole period. Lines: one per year for comparison.
fig.add_bar(x=weekday_all['Week_day'], y=weekday_all['sales'], name="All Years")
for y in sorted(weekday_avg['year'].unique()):
    df_year = weekday_avg[weekday_avg['year']==y]
    fig.add_trace(go.Scatter(x=df_year['Week_day'], y=df_year['sales'], mode='lines+markers', name=str(y)))
fig.update_layout(title="Average Sales by Weekday", xaxis_title="Weekday", yaxis_title="Average Sales")
weekday_analysis_fig = fig


## 5) Oil correlation

In [21]:
# Per day: mean oil price and total sales, using only rows where all three columns are present.
oil_inputs = train[['date','dcoilwtico','sales']].dropna()
oil_daily = oil_inputs.groupby('date', as_index=False).agg({'dcoilwtico':'mean','sales':'sum'})

# Scatter of daily sales against oil price with an OLS trendline.
oil_scatter_fig = px.scatter(oil_daily, x='dcoilwtico', y='sales', trendline='ols', title='Oil Price vs Sales')

# Both series over time; oil price is drawn against a second y-axis on the right.
fig = go.Figure()
fig.add_scatter(x=oil_daily['date'], y=oil_daily['sales'], name='Sales')
fig.add_scatter(x=oil_daily['date'], y=oil_daily['dcoilwtico'], name='Oil Price', yaxis="y2")
fig.update_layout(
    title='Sales & Oil Price Over Time',
    yaxis={'title': 'Sales'},
    yaxis2={'title': 'Oil Price', 'overlaying': 'y', 'side': 'right'},
)
oil_timeseries_fig = fig

## 6) City & State


In [22]:
def _sales_ranked_by(column):
    """Return total sales per value of `column` in `train`, largest total first."""
    totals = train.groupby(column, as_index=False)['sales'].sum()
    return totals.sort_values('sales', ascending=False)


city_sum = _sales_ranked_by('city')
city_treemap_fig = px.treemap(city_sum, path=['city'], values='sales', title='Sales Treemap by City')

state_sum = _sales_ranked_by('state')
# Only the 15 best-selling states are charted.
state_bar_fig = px.bar(state_sum.head(15), x='state', y='sales', title='Top 15 States by Sales')

## 7) Store type

In [23]:
# Daily total sales for each store type.
ct = train.groupby(['date','store_type'], as_index=False)['sales'].sum()
fig = go.Figure()
# One line per store type, in order of first appearance.
for st in ct['store_type'].unique():
    # Named `type_sales` rather than `df` so the loop does not overwrite the
    # merged `df` frame built earlier in the notebook.
    type_sales = ct[ct['store_type'] == st]
    fig.add_trace(go.Scatter(x=type_sales['date'], y=type_sales['sales'], mode='lines', name=st))
fig.update_layout(title="Sales Over Time by Store Type", xaxis_title="Date", yaxis_title="Sales")
store_type_fig = fig

## 8) Monthly trends by year


In [24]:
# Daily totals first, tagged with their calendar year and month number.
cal = (
    train.groupby('date', as_index=False)['sales'].sum()
    .assign(
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
    )
)
# Then one total per (year, month).
monthly = cal.groupby(['year','month'], as_index=False)['sales'].sum()
# Abbreviated month label ("Jan", "Feb", ...) derived from the month number.
monthly['month_name'] = pd.to_datetime(monthly['month'], format='%m').dt.strftime('%b')

# Explicit category order so the x-axis runs Jan -> Dec.
month_order = ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]
monthly_trends_fig = px.line(
    monthly,
    x='month_name',
    y='sales',
    color='year',
    markers=True,
    category_orders={"month_name": month_order},
    title="Monthly Sales Trends by Year",
)

## 9) Time series metrics

In [25]:
# Daily mean of each metric across all rows for that date.
metric_columns = ['sales','onpromotion','dcoilwtico']
time_df = train[['date'] + metric_columns].groupby('date', as_index=False).mean()

fig = go.Figure()
# All metrics share one y-axis; each gets its own line named after its column.
for col in metric_columns:
    fig.add_scatter(x=time_df['date'], y=time_df[col], mode='lines', name=col)
fig.update_layout(
    title="Time Series Metrics",
    xaxis_title="Date",
    yaxis_title="Value",
)
time_series_metrics_fig = fig

## 10) KPIs

In [26]:
# Headline numbers computed over every row of `train['sales']`.
sales = train['sales']
total_sales = sales.sum()
median_sales = sales.median()
mean_sales = sales.mean()
max_sale = sales.max()

# (card title, value) in left-to-right display order.
kpi_cards = [
    ("Total Sales", total_sales),
    ("Median Sale", median_sales),
    ("Mean Sale", mean_sales),
    ("Max Sale", max_sale),
]

kpi_fig = go.Figure()
# One numeric indicator per card, placed in successive columns of a single-row grid.
for column, (label, value) in enumerate(kpi_cards):
    kpi_fig.add_trace(
        go.Indicator(mode="number", value=value, title={"text": label}, domain={'row': 0, 'column': column})
    )
kpi_fig.update_layout(grid={'rows': 1, 'columns': len(kpi_cards)}, title="Quick KPIs")

# Dash App

In [27]:
# Shared inline styles. Dash `style` dictionaries use camelCased CSS property
# names (`textAlign`, not `text-align`).
PAGE_STYLE = {'padding': '20px', 'fontFamily': 'Arial, sans-serif'}
TITLE_STYLE = {'textAlign': 'center', 'marginBottom': '30px'}
SECTION_STYLE = {'marginBottom': '30px'}
HALF_WIDTH_STYLE = {'width': '50%', 'display': 'inline-block'}


def _section(title, *children):
    """Return one dashboard section: an H2 heading followed by `children`."""
    return html.Div([html.H2(title), *children], style=SECTION_STYLE)


def _graph_pair(left_id, left_fig, right_id, right_fig):
    """Return a Div holding two graphs, each styled to half the container width."""
    return html.Div([
        dcc.Graph(id=left_id, figure=left_fig, style=HALF_WIDTH_STYLE),
        dcc.Graph(id=right_id, figure=right_fig, style=HALF_WIDTH_STYLE),
    ])


app = dash.Dash(__name__)

# Static layout: a page title followed by one section per figure built above.
# No callbacks are registered in this cell.
app.layout = html.Div([
    html.H1("Sales Analytics Dashboard", style=TITLE_STYLE),

    _section("Key Performance Indicators", dcc.Graph(id='kpi-chart', figure=kpi_fig)),
    _section("Sales Timeline Analysis", dcc.Graph(id='sales-timeline', figure=sales_timeline_fig)),
    _section("Product Family Sales", dcc.Graph(id='family-sales', figure=family_sales_fig)),
    _section("Monthly Sales Distribution", dcc.Graph(id='monthly-distribution', figure=monthly_box_fig)),
    _section("Weekday Sales Analysis", dcc.Graph(id='weekday-analysis', figure=weekday_analysis_fig)),
    _section(
        "Oil Price vs Sales Correlation",
        _graph_pair('oil-scatter', oil_scatter_fig, 'oil-timeseries', oil_timeseries_fig),
    ),
    _section(
        "Geographic Sales Analysis",
        _graph_pair('city-treemap', city_treemap_fig, 'state-bar', state_bar_fig),
    ),
    _section("Store Type Performance", dcc.Graph(id='store-type-timeline', figure=store_type_fig)),
    _section("Monthly Trends by Year", dcc.Graph(id='monthly-trends', figure=monthly_trends_fig)),
    _section("Time Series Metrics", dcc.Graph(id='time-series-metrics', figure=time_series_metrics_fig)),
], style=PAGE_STYLE)

if __name__ == "__main__":
    # debug=True turns on Dash's development tooling; intended for local use.
    app.run(debug=True)