In [1]:
import pandas as pd
import altair as alt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# NOTE(review): absolute, machine-specific path; the notebook only runs as-is on a machine with this folder layout.
file_path_df_035_join = r'C:\Users\sebas\OneDrive\Documenten\GitHub\Supermarketcasegroupproject\Group4B\data\interim\df_035_join.parquet'

# Read the parquet file into the working frame used by the rest of the notebook.
df_aggregated_sales = pd.read_parquet(file_path_df_035_join)

# Assemble a month-start timestamp from the `month` and `year` columns (day fixed to 1).
month_start_parts = df_aggregated_sales[['month', 'year']].assign(DAY=1)
df_aggregated_sales['date'] = pd.to_datetime(month_start_parts)


In [None]:
# Spot check: every row of store 1 for the month starting 2013-01-01.
is_store_1 = df_aggregated_sales['store_nbr'] == 1
is_january_2013 = df_aggregated_sales['date'] == pd.Timestamp('2013-01-01')
df_aggregated_sales1 = df_aggregated_sales[is_store_1 & is_january_2013]

# Show a short preview instead of dumping up to 1000 rows into the notebook output.
df_aggregated_sales1.head()



In [None]:
# Print column names, dtypes and non-null counts of the loaded frame.
df_aggregated_sales.info()

In [None]:
# Collapse the data to one row per store and month-start date:
# store attributes are taken from the first row of each group, unit sales are summed.
sales_by_store_and_month = df_aggregated_sales.groupby(['store_nbr', 'date', 'year'])
df_aggregated_sales_store = sales_by_store_and_month.agg(
    type=('type', 'first'),
    cluster=('cluster', 'first'),
    city=('city', 'first'),
    state=('state', 'first'),
    unit_sales=('unit_sales', 'sum'),
).reset_index()
df_aggregated_sales_store

## Step 2.1 - Total sales in units generated per store type

When looking into the data, we first looked at the number of stores per city. Here, we found that Quito in particular is overrepresented in terms of the number of supermarkets located there.

In [None]:
# Total unit sales per store type, expressed in thousands of units, largest type first.
df_aggregated_sales_type = (
    df_aggregated_sales.groupby('type')['unit_sales'].sum()
    .div(1000)
    .sort_values(ascending=False)
)

# Table with the totals plus each type's share of the overall total (in percent).
# The share is computed from the Series, so a single column is assigned
# rather than a one-column DataFrame being coerced into a column.
df_aggregated_sales_type2 = df_aggregated_sales_type.to_frame()
df_aggregated_sales_type2['Percentage'] = df_aggregated_sales_type / df_aggregated_sales_type.sum() * 100
df_aggregated_sales_type2 = df_aggregated_sales_type2.round(2)

df_aggregated_sales_type2


## Step 2.1 - Total sales per store by month over all time periods in the dataset - Some stores are relatively new and might be excluded for further analysis

The first step we take when combining the store data with the transaction data is to check whether unit sales data is available for each store. Although we have data for every store, some stores don't have data for all the dates in the dataset. Most likely these stores are new, which makes forecasting for them less convenient. As this is our first forecast for Corporacion Favorita, we would rather choose a more generalizable store (this has a higher business impact, as the likelihood of producing an accurate forecast is higher).

In [None]:
sns.set_theme(style="white")

# Small multiples: one line panel per store, three panels per row, no legend.
g = sns.relplot(
    data=df_aggregated_sales_store,
    kind="line",
    x="date",
    y="unit_sales",
    col="store_nbr",
    col_wrap=3,
    hue="store_nbr",
    palette="crest",
    legend=False,
    linewidth=4,
    zorder=5,
    height=2,
    aspect=1.5,
)

## Step 2.1 - Total sales per store by month over all time periods in the dataset - It seems not only new stores miss data but also older stores have missing data

The only way to really check whether stores have data for all the months in the dataset is to count the number of months of data available per store. Doing so, we find that 12 stores don't have data for the full time period.

Stores that are relatively new:

Started in 2013
Store 36 (per May)

Started in 2014
Store 53 (per May)

Started in 2015
Store 20 (per Feb)
Store 21 (per Jul)
Store 22 (per Oct)
Store 29 (per Mar)
Store 42 (per Aug)

Started in 2017
Store 52 (per Apr)

Stores that have missing data (might be closed for one or multiple months)

2014 - Store number 24
2015 - Store number 12
2016 - Store number 18 and 25



In [None]:
# Rows per store in the store/month table, i.e. how many months of data each store has,
# ordered from the fewest months to the most.
row_count_per_store = (
    df_aggregated_sales_store['store_nbr']
    .value_counts()
    .sort_values(ascending=True)
)

# Rich display of the Series as the cell output (replaces print() and the leftover type() check).
row_count_per_store

In [None]:
# Keep only the stores whose row count equals 56, the value this notebook treats as "data for every month".
full_history_month_count = 56
has_full_history = row_count_per_store.eq(full_history_month_count)
row_count_per_store56 = row_count_per_store.loc[has_full_history]
row_count_per_store56

In [None]:
sns.set_theme(style="white")

# Store whose monthly unit sales are plotted. The filter has always selected store 18,
# even though the original variable name mentioned store 25.
store_nbr_to_inspect = 18

df_store_inspected = df_aggregated_sales_store[df_aggregated_sales_store['store_nbr'] == store_nbr_to_inspect]

# Old name kept as an alias so any cell that still refers to it keeps working.
df_store_25 = df_store_inspected

# The title names the store so the figure can be read on its own.
df_store_inspected.plot.line(
    x='date',
    y='unit_sales',
    figsize=(10, 6),
    title=f'Monthly unit sales - store {store_nbr_to_inspect}',
);

In [None]:
sns.set_theme(style="white")

# Wide table for the heatmap: one row per store, one column per date, cells hold unit_sales.
pivot_df = df_aggregated_sales_store.pivot(
    index='store_nbr',
    columns='date',
    values='unit_sales',
)

# Large square canvas so every store row stays readable.
plt.figure(figsize=(15, 15))

# Draw the heatmap without cell annotations.
heatmapdatestoresales = sns.heatmap(pivot_df, cmap='Blues', annot=False, fmt='.1f')

# Axis labels, with the x ticks placed along the bottom edge.
heatmapdatestoresales.set_xlabel('Date')
heatmapdatestoresales.set_ylabel('Store Number')
heatmapdatestoresales.xaxis.tick_bottom()

# Render the figure.
plt.show()


From the heatmap we can observe at least two things. First, we can see the several stores that are missing data. Second, we can see that several stores have significantly higher sales in terms of units sold (store 3 and 44, 45, 46, 47, 48, 49). Now we can delve further into whether the missing data influences the types and clusters of stores by comparing the original data to a filtered dataset.

## Step 2.2 - For investigating the types and clusters further, we drop the stores that don't have data for the full timeline.

In that way, we don't have to worry about these stores interfering with the results of any insights on type and cluster. However, to see if the missing data actually has an effect on the clusters and types, we will first make a boxplot on both the original data (without filtering) and a filtered set.

Within the boxplot one can see the range of total unit_sales per cluster (a range determined by the unit sales per store within the cluster).

In [None]:
# First boxplot: drawn on the original (unfiltered) store/month data.
sns.set_theme(style="ticks", palette="rocket")

# Order the clusters on the x axis by their largest single unit_sales value, highest first.
order = (
    df_aggregated_sales_store.groupby(['cluster'])['unit_sales']
    .max()
    .sort_values(ascending=False)
    .index
)

# One box per cluster showing the spread of unit_sales, coloured by store type, with the mean marked.
ax = sns.boxplot(
    x="cluster", y="unit_sales",
    hue="type", palette='Set2',
    data=df_aggregated_sales_store,
    order=order,
    width=0.5,
    dodge=False,
    showmeans=True,
)
ax.set_title("Total unit sales and distribution per cluster and type - Unfiltered", fontsize=17)
plt.ylabel("Unit sales in millions", fontsize=13)
plt.xlabel("Cluster", fontsize=13)

# Call show(); the bare attribute `plt.show` only echoed the function object and rendered nothing.
plt.show()

In [None]:
# (rows, columns) of the store/month table before any stores are filtered out.
df_aggregated_sales_store.shape

In [None]:
# Inner-join the store/month table against the stores that have 56 rows, so only those stores remain.
# NOTE(review): astype() here casts the count values to the dtype of `store_nbr`;
# the join key on the right-hand side is the Series index, which is left untouched.
store_nbr_dtype = df_aggregated_sales_store['store_nbr'].dtype
full_history_counts = row_count_per_store56.astype(store_nbr_dtype)
df_aggregated_sales_store_filtered = pd.merge(
    df_aggregated_sales_store,
    full_history_counts,
    how='inner',
    left_on='store_nbr',
    right_index=True,
)

df_aggregated_sales_store_filtered.shape

In [None]:
# Preview the first rows of the filtered store/month table.
df_aggregated_sales_store_filtered.head()

In [None]:
# Second boxplot: drawn on the filtered data.
sns.set_theme(style="ticks", palette="rocket")

# Clusters ordered by their largest unit_sales value, highest first.
cluster_peak_sales = df_aggregated_sales_store_filtered.groupby(['cluster'])['unit_sales'].max()
order = cluster_peak_sales.sort_values(ascending=False).index

# One box per cluster showing the spread of unit_sales, coloured by store type, with the mean marked.
filtered_boxplot = sns.boxplot(
    data=df_aggregated_sales_store_filtered,
    x="cluster",
    y="unit_sales",
    hue="type",
    palette='Set2',
    order=order,
    width=0.5,
    dodge=False,
    showmeans=True,
)
filtered_boxplot.set_title("Total unit sales and distribution per cluster and type - Filtered", fontsize=17)
plt.ylabel("Unit sales in millions", fontsize=13)
plt.xlabel("Cluster", fontsize=13)

To do

1- Average sales per city?
2- Sales per store (with color for city?)
3- 

In [None]:
# Display the filtered store/month table.
df_aggregated_sales_store_filtered

In [None]:
# Roll the filtered store/month rows up to one row per store:
# location and type come from the first row of each store, unit sales are summed over all months.
store_totals = (
    df_aggregated_sales_store_filtered
    .groupby(['store_nbr'])
    .agg(
        city=('city', 'first'),
        state=('state', 'first'),
        type=('type', 'first'),
        unit_sales=('unit_sales', 'sum'),
    )
    .reset_index()
)

# Store numbers are cast to strings before the frame is used further.
df_aggregated_sales_store_filtered_sum = store_totals.astype({'store_nbr': str})

df_aggregated_sales_store_filtered_sum

In [18]:
# Order the per-store totals from the highest to the lowest unit_sales.
df_aggregated_sales_store_filtered_sum = (
    df_aggregated_sales_store_filtered_sum
    .sort_values('unit_sales', ascending=False)
)


In [None]:

# Square 10x10 canvas for the horizontal bar chart.
plt.figure(figsize=(10, 10))

# One horizontal bar per store, coloured by the city the store is in.
sns.barplot(
    data=df_aggregated_sales_store_filtered_sum,
    y='store_nbr',
    x='unit_sales',
    hue='city',
    palette='pastel',
    width=0.5,
)

# Title and axis labels.
plt.title('Unit Sales by Store Number')
plt.xlabel('Unit Sales')
plt.ylabel('Store Number')

# Render the figure.
plt.show()

In [None]:
# Wide figure; one marker per store placed at (city, store number),
# with both marker colour and marker size driven by unit_sales.
sns.set_theme(rc={'figure.figsize':(20.7,12.27)},style="white", palette=None)
sns.scatterplot(
    data=df_aggregated_sales_store_filtered_sum,
    x="city",
    y='store_nbr',
    hue='unit_sales',
    size='unit_sales',
    sizes=(20, 1000),
    legend="brief",
)


In [None]:

# The cumulative columns only make sense when stores run from largest to smallest.
# Sort inside this cell (stable, so an existing order is kept for ties) instead of relying on
# a separate cell having been run first, and build a fresh frame rather than mutating in place.
store_sales_ranked = (
    df_aggregated_sales_store_filtered_sum
    .sort_values(by='unit_sales', ascending=False, kind='stable')
    .reset_index(drop=True)
)
total_unit_sales = store_sales_ranked['unit_sales'].sum()

# Share of each store in the total unit sales, in percent.
store_sales_ranked['Percentage'] = (store_sales_ranked['unit_sales'] / total_unit_sales) * 100

# Running total of unit sales over the ranked stores.
store_sales_ranked['Cum_sum'] = store_sales_ranked['unit_sales'].cumsum()

# Running total expressed as a percentage of all unit sales, rounded to two decimals.
store_sales_ranked['Cumulative Percentage'] = (100 * store_sales_ranked['Cum_sum'] / total_unit_sales).round(2)

df_aggregated_sales_store_filtered_sum = store_sales_ranked

df_aggregated_sales_store_filtered_sum

## Step 2.3 - Find out if the stores that are filtered out have a big impact on unit sales, in which case alternative approaches should be found

Although we filtered out the stores in the previous steps, we have to find out whether filtering them out was the right decision. They might have a big impact on total sales, in which case excluding them might not have been the right choice. Therefore, in this step I will analyse the impact of the stores with missing data on the total unit sales.

Step 1 - Make an aggregated dataset of sales per store over the full timeline and mark the stores according to: 1) stores that have data available for all data points, 2) stores that miss a lot of data (started later), 3) stores that miss some data.

Step 2 - Plot the relative size of sales of the combination of missing data categories 2 and 3 in relation to the total.

Step 3 - For new stores, also look into the last year/months of sales to see if they are relatively big (they might be new but exceptionally big. Thus, being important for further exploration).

In [22]:
# TODO: measure the impact of new stores on total sales.
# TODO: decide which new stores might be interesting to look at.
# TODO: look at daily-level sales data per store to see if interesting insights can be found.
# TODO: oil price.

In [None]:
# Step 1 - Build per-store totals over the full timeline and flag whether each store has data for all months.
# NOTE(review): the plan describes three categories (complete, started later, some gaps);
# this cell only separates complete stores from all others.

# Name the per-store month count explicitly so the merged column does not depend on the
# default name pandas gives to a value_counts() result.
months_per_store = row_count_per_store.rename('monthcount')

# Attach the month count to every store/month row (join key: store_nbr against the Series index).
df_aggregated_sales_store_unfiltered = df_aggregated_sales_store.merge(
    months_per_store,
    left_on='store_nbr',
    right_index=True,
    how='inner',
)

# Aggregate from month level to one row per store (monthcount is constant within a store).
df_aggregated_sales_store_unfiltered_sum = df_aggregated_sales_store_unfiltered.groupby(['store_nbr', 'monthcount']).agg({
    'city': 'first',
    'state': 'first',
    'type': 'first',
    'unit_sales': 'sum'}).reset_index()
df_aggregated_sales_store_unfiltered_sum = df_aggregated_sales_store_unfiltered_sum.astype({'store_nbr': str})

# store_status: 0 for stores with 56 months of data, 1 for stores that miss data.
df_aggregated_sales_store_unfiltered_sum['store_status'] = np.where(df_aggregated_sales_store_unfiltered_sum['monthcount'] == 56, 0, 1)

# Constant column, used as a single shared category when charting.
df_aggregated_sales_store_unfiltered_sum['dummy'] = 1

df_aggregated_sales_store_unfiltered_sum

In [None]:
# Step 2 - Plot the share of total unit sales held by stores with complete data versus stores that miss data.

# One normalized stacked bar: segment widths are each store_status group's share of summed unit_sales.
# store_status is a 0/1 flag, so it is encoded as nominal (:N) to get a discrete legend
# instead of a continuous colour gradient.
alt.Chart(df_aggregated_sales_store_unfiltered_sum).mark_bar(size=40).encode(
    x=alt.X('sum(unit_sales)').stack("normalize"),
    y='dummy',
    color=alt.Color('store_status:N', title='Store status (0 = all months, 1 = missing months)'),
)



In [None]:
# Exact share of unit sales for both store_status categories.
# Select unit_sales explicitly: a positional argument to GroupBy.sum() is read as `numeric_only`,
# not as a column selection, so the earlier call summed every numeric column.
df_aggregated_sales_store_unfiltered_storestatus_sum = (
    df_aggregated_sales_store_unfiltered_sum
    .groupby('store_status', as_index=False)['unit_sales']
    .sum()
)

# Percentage of the overall unit sales per category.
status_total_unit_sales = df_aggregated_sales_store_unfiltered_storestatus_sum['unit_sales'].sum()
df_aggregated_sales_store_unfiltered_storestatus_sum['Percentage'] = (
    df_aggregated_sales_store_unfiltered_storestatus_sum['unit_sales'] / status_total_unit_sales
) * 100

df_aggregated_sales_store_unfiltered_storestatus_sum

In [None]:
# Does the picture differ when we look at one of the last months of data?

# Rows for the month starting 2017-07-01, reduced to the columns needed here.
# .loc plus .assign builds a new frame, so no column is set on a slice of the original table.
is_july_2017 = df_aggregated_sales_store['date'] == '2017-07-01'
df_aggregated_sales_store_july2017 = (
    df_aggregated_sales_store
    .loc[is_july_2017, ['store_nbr', 'unit_sales', 'date']]
    .assign(store_nbr=lambda frame: frame['store_nbr'].astype('str'))
)

# Attach the status flag per store. store_nbr is already a string on both sides, so no frame-wide
# cast is applied; store_status and dummy keep their numeric dtype.
df_aggregated_sales_store_july2017_status = df_aggregated_sales_store_july2017.merge(
    df_aggregated_sales_store_unfiltered_sum[['store_nbr', 'store_status', 'dummy']],
    on='store_nbr',
    how='inner',
)

df_aggregated_sales_store_july2017_status

In [None]:
# Unit sales of the selected month per store_status category.
# unit_sales is selected explicitly; a positional argument to GroupBy.sum() is read as
# `numeric_only`, not as the column to sum.
df_aggregated_sales_store_july2017_status_grouped = (
    df_aggregated_sales_store_july2017_status
    .groupby('store_status')[['unit_sales']]
    .sum()
)

# Share of each category in that month's total unit sales, in percent.
july2017_total_unit_sales = df_aggregated_sales_store_july2017_status_grouped['unit_sales'].sum()
df_aggregated_sales_store_july2017_status_grouped['Percentage'] = (
    df_aggregated_sales_store_july2017_status_grouped['unit_sales'] / july2017_total_unit_sales
) * 100

df_aggregated_sales_store_july2017_status_grouped