### Supermarket data science case study - Exploring first data


### Importing packages

In [None]:
import pandas as pd
import sys
import matplotlib.pyplot as plt
import altair as alt
import vegafusion as vf
import sklearn
import vega_datasets
from sklearn.pipeline import Pipeline, make_pipeline
import seaborn as sns
from matplotlib.patches import Patch
import numpy as np


### Functions

In [None]:
def f_concat(l_input):
    """Join text elements into one enumeration such as "a, b, and c".

    Parameters
    ----------
    l_input : sequence of str
        Text elements to join.

    Returns
    -------
    str
        An empty string for empty input, the single element itself for
        one-element input, and otherwise all elements separated by ", "
        with "and " placed in front of the last element.
    """
    n_len = len(l_input)

    # Empty input used to raise an IndexError; return an empty text instead.
    if n_len == 0:
        return ""

    if n_len == 1:
        return l_input[0]

    # All but the last element are comma separated, the last one gets "and ".
    return ", ".join(l_input[:-1]) + ", and " + l_input[-1]

In [None]:
def f_describe(df_input, n_top=10):
    """Show a quick overview of a DataFrame.

    Displays the first rows, summary statistics of the numerical columns,
    summary statistics of the textual (category/object/bool) columns and a
    list of the columns that contain missing values.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Data to describe.
    n_top : int, default 10
        Number of leading rows to display.

    Notes
    -----
    Relies on `display` being available in the namespace (as it is inside
    an IPython/Jupyter session) and on `f_concat` from this notebook.
    """
    print("First " + str(n_top) + " rows in de data:")
    display(df_input.head(n_top))

    df_numeric = df_input.select_dtypes(
        include=[
            "uint8",
            "uint16",
            "uint32",
            "uint64",
            "int8",
            "int16",
            "int32",
            "int64",
            "float16",
            "float32",
            "float64",
        ]
    )

    if len(df_numeric.columns):
        print("Numerical data:")
        display(df_numeric.describe())

    df_textual = df_input.select_dtypes(include=["category", "object", "bool"])

    if len(df_textual.columns):
        print("Textual data:")
        display(df_textual.describe())

    # Count the missing values once per column of the *input* frame (the
    # previous version accidentally read a global `df` here).
    n_rows = df_input.shape[0]
    se_missing = df_input.isna().sum()

    v_na = [
        f"{col} ({n_missing}, {round(100 * n_missing / n_rows, 1)}%)"
        for col, n_missing in se_missing.items()
        if n_missing > 0
    ]

    if len(v_na) > 0:
        print("Features and their number of missing values:")
        display(f_concat(v_na))

### Downcast and transform data
Update formatting of features to optimize memory and standardize column names.

In [None]:
def standardize_column_names(s):
    """Return the column name `s` with every space character removed."""
    pieces = s.split(" ")
    return "".join(pieces)


def optimize_memory(df):
    """Downcast numeric columns of `df` in place and return the same frame.

    Integer columns are passed through `pd.to_numeric(..., downcast="unsigned")`
    and float columns through `pd.to_numeric(..., downcast="float")`.
    """
    # NOTE: converting object columns to "category" was considered here but is
    # disabled; it is still an open question when that conversion is needed.

    # Source dtype selector -> downcast target, processed in this order.
    downcast_plan = {"integer": "unsigned", "float": "float"}

    for source_kind, target_kind in downcast_plan.items():
        print("Change: " + source_kind + " --> " + target_kind)
        selected = df.select_dtypes(include=source_kind).columns
        for name in selected:
            df[name] = pd.to_numeric(df[name], downcast=target_kind)

    return df


def month_year_to_int(df, i):
    """Cast the 'month' and 'year' columns to int for dataset index 0.

    For any other index `i` the frame is returned untouched.
    """
    if i != 0:
        return df

    print("Change: Month and Year to integer")
    target_types = {"month": int, "year": int}
    return df.astype(target_types)

### Transform date-related columns to datetime format.

In [None]:
# Convert datasets to time series
def transform_date_to_datetime(df, i):
    """Make sure `df` carries a datetime 'date' column and return `df`.

    For dataset index 0 the 'date' column is assembled from the 'year',
    'month' and 'day' columns (which are kept). For every other index an
    existing 'date' column is parsed and made timezone-naive; frames
    without a 'date' column pass through unchanged.
    """
    if i == 0:
        print("Change: Transformed 'year', 'month', 'day' columns to Datetime feature")
        date_parts = df[["year", "month", "day"]]
        df["date"] = pd.to_datetime(date_parts, unit="us")
        # NOTE: dropping 'day', 'month' and 'year' afterwards is disabled on
        # purpose; the columns stay in the frame.
        return df

    if "date" in df.columns:
        print("Change: Transformed 'date' column to Datetime Dtype")
        parsed = pd.to_datetime(df["date"])
        df["date"] = parsed.dt.tz_localize(None)

    return df

### Import data from local PATH
Import the data through a pipeline that downcasts the data and applies the transformations.

In [1]:
def f_get_data(
    i=0,
    c_path="/Users/Georgi/Documents/EAISI/EASI_4B_Supermarket/Group4B/data/raw/",
):
    """Load one of the raw parquet datasets and run it through the pipeline.

    Parameters
    ----------
    i : int, default 0
        Position of the dataset in the file tuple below
        (0 = "history-per-year", ..., 6 = "transactions").
    c_path : str, optional
        Directory that holds the raw ".parquet" files. Defaults to the
        original hard-coded location, so existing calls keep working, while
        other machines can pass their own folder.

    Returns
    -------
    pandas.DataFrame
        The data with spaces removed from column names, numeric columns
        downcast and the date-related transformations applied.
    """
    import os

    # Identify file.
    v_file = (
        "history-per-year",  # 0
        "history_aggregated",  # 1
        "holidays_events",  # 2
        "items",  # 3
        "oil",  # 4
        "stores",  # 5
        "transactions",  # 6
    )

    # os.path.join copes with a directory given with or without a trailing
    # separator.
    c_file = os.path.join(c_path, v_file[i] + ".parquet")

    # Load data and apply the transformation steps in order.
    df = (
        pd.read_parquet(c_file)
        .rename(columns=standardize_column_names)
        .pipe(optimize_memory)
        .pipe(month_year_to_int, i)
        .pipe(transform_date_to_datetime, i)
    )

    # Return data.
    return df

### Importing data: Here I import the daily data. The data contains daily sales data excluding holidays

In [4]:
# Load the three datasets used below. The numbers are positions in the file
# tuple of f_get_data: 0 = "history-per-year", 3 = "items", 5 = "stores".
df_salesdata, df_items, df_stores = (f_get_data(idx) for idx in (0, 3, 5))

NameError: name 'pd' is not defined

In [None]:
# Inspect dtypes and non-null counts. Only the items table is printed here;
# the other two calls are disabled.
# Note: DataFrame.info() prints its report itself and returns None, so the
# surrounding print() also emits a trailing "None".
#print(df_stores.info())
#print(df_salesdata.info())
print(df_items.info())

In [None]:
# Store numbers (and clusters) are identifiers, not quantities: keep them as text.
df_salesdata['store_nbr'] = df_salesdata['store_nbr'].astype(str)

# Columns of the sales data that are not used in the rest of the analysis.
unused_sales_columns = ['year', 'day', 'onpromotion', 'month']
df_salesdata = df_salesdata.drop(columns=unused_sales_columns)

for id_column in ('store_nbr', 'cluster'):
    df_stores[id_column] = df_stores[id_column].astype(str)

In [None]:
# Print the structure of every table after the dtype adjustments.
for frame in (df_stores, df_salesdata, df_items):
    print(frame.info())

In [None]:
# Aggregate the sales data to one row per store and date (summing over all items).
store_date_keys = ['store_nbr', 'date']
df_salesdatagrouped = (
    df_salesdata
    .groupby(store_date_keys)
    .agg({'unit_sales': 'sum'})
    .reset_index()
)

n_rows, n_cols = df_salesdatagrouped.shape
print(f' In df_salesdatagrouped zitten nu {n_rows} rijen en {n_cols} kolommen')
print(df_salesdatagrouped.info())

In [None]:
# Attach the store attributes to every store/date row; the inner join keeps
# only stores that occur in both tables.
df_salesandstoresdata = pd.merge(
    df_salesdatagrouped,
    df_stores,
    on='store_nbr',
    how='inner',
)

n_rows, n_cols = df_salesandstoresdata.shape
print(f' In df_salesandstoredata zitten nu {n_rows} rijen en {n_cols} kolommen')
print(df_salesandstoresdata.info())

## Step 3.0 - Filter out all stores that don't have all the data points, or at least mark them


In [None]:
# Count the number of rows per store. The frame was aggregated per store and
# date above, so this is presumably the number of dates with sales data for
# each store (verify that df_stores holds one row per store).
se_storedatecount = df_salesandstoresdata['store_nbr'].value_counts()


In [None]:
# First and last day covered by the sales data.
start_date, end_date = pd.to_datetime('2013-01-02'), pd.to_datetime('2017-08-15')

# One row per calendar day in that period, stored in a single 'date' column.
all_days = pd.date_range(start=start_date, end=end_date, freq='D')
date_range = pd.DataFrame(all_days, columns=['date'])

first_day = date_range["date"].min()
last_day = date_range["date"].max()
print(f'The date_range dataframe starts at {first_day} and ends at {last_day}')

In [None]:
# Take store 34 as an example and line its rows up against the full calendar.
is_store_34 = df_salesandstoresdata['store_nbr'] == '34'
df_salesandstoresdata34 = df_salesandstoresdata[is_store_34]

# The outer join adds a row (with empty sales) for every calendar day the store has no data for.
df_salesandstoresdata34missingdates = pd.merge(
    df_salesandstoresdata34,
    date_range,
    on='date',
    how='outer',
)

no_sales = df_salesandstoresdata34missingdates['unit_sales'].isnull()
empty_unit_sales = df_salesandstoresdata34missingdates[no_sales]
print(empty_unit_sales)
print('As we can see, stores that have all data seem to be closed on christmas day and on new years day')

In [None]:
# Goal: a dataframe that shows, per store, the dates for which data is missing.

# Step 1 - Cross join the stores (without their descriptive attributes) with the date range.
store_attributes = ['city', 'state', 'type', 'cluster']
df_storesreduced = df_stores.drop(columns=store_attributes)
df_storesanddates = pd.merge(df_storesreduced, date_range, how='cross')

n_store_dates = df_storesanddates.shape[0]
print(f' Now we onstructed a dataframe with all stores and all dates, it contains {n_store_dates} rows')
print(df_storesanddates.head(5))

# Step 2 - Outer join the sales data with all store/date combinations, so that
# every store has a row for every date (unit_sales is empty where data is missing).
store_date_keys = ['store_nbr', 'date']
df_salesandstoresdata_alldates = pd.merge(
    df_salesandstoresdata,
    df_storesanddates,
    on=store_date_keys,
    how='outer',
)

n_all_rows = df_salesandstoresdata_alldates.shape[0]
print(f' Now we onstructed a dataframe with all stores and all dates, it contains {n_all_rows} rows')
print(df_salesandstoresdata_alldates.head(5))

In [None]:
# Sanity check: rows that occur in the all-dates sales frame but not in the
# store/date grid. The merge indicator marks such rows as 'left_only'.
merged_with_indicator = df_salesandstoresdata_alldates.merge(
    df_storesanddates,
    on=['store_nbr', 'date'],
    how='outer',
    indicator=True,
)
only_in_sales = merged_with_indicator['_merge'] == 'left_only'
Difference_df_salesandstoresdata_alldates_df_storesanddates = merged_with_indicator.loc[only_in_sales]
Difference_df_salesandstoresdata_alldates_df_storesanddates

In [None]:
# Check the result for store 30 (a random store that misses some dates
# according to the earlier analysis): keep only its dates without sales.
is_store_30 = df_salesandstoresdata_alldates['store_nbr'] == '30'
has_no_sales = df_salesandstoresdata_alldates['unit_sales'].isnull()
df_salesandstoresdata_alldates30 = df_salesandstoresdata_alldates[is_store_30 & has_no_sales]
df_salesandstoresdata_alldates30.head(5)

In [None]:
# Stores with fewer than 1679 rows in se_storedatecount miss at least one date.
se_storedatecountmissing = se_storedatecount[se_storedatecount < 1679]
stores_with_gaps = se_storedatecountmissing.index

# From those stores, take the store/date rows without sales. Selecting rows
# and columns in one .loc call and using assign() avoids writing to a slice
# of another frame (SettingWithCopyWarning).
is_gap_store = df_salesandstoresdata_alldates['store_nbr'].isin(stores_with_gaps)
has_no_sales = df_salesandstoresdata_alldates['unit_sales'].isnull()
df_salesandstoresdata_alldatesnull = df_salesandstoresdata_alldates.loc[
    is_gap_store & has_no_sales, ['date', 'store_nbr', 'unit_sales']
]

# Dummy value of 1 marks a missing date; it makes plotting and counting easier.
df_salesandstoresdata_alldatesnull = df_salesandstoresdata_alldatesnull.assign(unit_sales=1)

# Put the markers back on the full store/date grid so that every date is present.
df_salesandstoresdata_alldatesnull = df_storesanddates.merge(
    df_salesandstoresdata_alldatesnull, on=['store_nbr', 'date'], how='left'
)

# Keep the grid rows of the stores that miss data only.
df_salesandstoresdata_alldatesnull = df_salesandstoresdata_alldatesnull[
    df_salesandstoresdata_alldatesnull['store_nbr'].isin(stores_with_gaps)
]

print(f"Stores {df_salesandstoresdata_alldatesnull['store_nbr'].unique()} are in the dataset with stores with <1679 datapoints and all dates, having imputed a value of 1 for all dates missing in the range")

In [None]:
# Inspect the missing-date markers of store 36.
is_store_36 = df_salesandstoresdata_alldatesnull['store_nbr'].eq('36')
df_salesandstoresdata_alldatesnull36 = df_salesandstoresdata_alldatesnull.loc[is_store_36]
df_salesandstoresdata_alldatesnull36

In [None]:
# Categorise the stores with missing dates as 'new_store',
# 'old_store missing >9 days' or 'missing <9 days'.

# Stores with fewer than 1670 rows miss more than 9 dates.
se_storedatecountmissingsome = se_storedatecount[se_storedatecount < 1670]

df_salesandstoresdata_alldatesnull1 = df_salesandstoresdata_alldatesnull.copy()

# A store counts as new when it carries the dummy marker (unit_sales == 1) on
# the first date of the range, i.e. it had no sales data on 2013-01-02.
is_first_day = df_salesandstoresdata_alldatesnull['date'] == '2013-01-02'
is_marked_missing = df_salesandstoresdata_alldatesnull['unit_sales'] == 1
new_store_nbrs = df_salesandstoresdata_alldatesnull.loc[
    is_first_day & is_marked_missing, 'store_nbr'
].unique()

# Step 1 - Label every row: new store or (for now) old store.
df_salesandstoresdata_alldatesnull1['missingdatacategory'] = np.where(
    df_salesandstoresdata_alldatesnull1['store_nbr'].isin(new_store_nbrs),
    'new_store',
    'old_store',
)

# Rows of stores that miss more than 9 days of data.
misses_many_days = df_salesandstoresdata_alldatesnull1['store_nbr'].isin(
    se_storedatecountmissingsome.index
)

# Step 2 - Among the stores missing > 9 days, the stores that are not new are
# old stores missing > 9 days. The .copy() makes the subset an independent
# frame, so the assignment below does not write to a slice of another frame.
df_salesandstoresdata_alldatesnull2 = df_salesandstoresdata_alldatesnull1[misses_many_days].copy()
df_salesandstoresdata_alldatesnull2['missingdatacategory'] = np.where(
    df_salesandstoresdata_alldatesnull2['missingdatacategory'] == 'new_store',
    'new_store',
    'old_store missing >9 days',
)

# Step 3 - All remaining stores miss fewer than 9 days of data. The mask and
# the frame it is applied to now belong to the same dataframe.
df_salesandstoresdata_alldatesnull3 = df_salesandstoresdata_alldatesnull1[~misses_many_days].copy()
df_salesandstoresdata_alldatesnull3['missingdatacategory'] = 'missing <9 days'

# Put the dataframes of step 2 and 3 together to get all rows back.
df_salesandstoresdata_alldatesnullfinal = pd.concat(
    [df_salesandstoresdata_alldatesnull2, df_salesandstoresdata_alldatesnull3]
)

print(df_salesandstoresdata_alldatesnull.shape)
print(df_salesandstoresdata_alldatesnull1.shape)
print(df_salesandstoresdata_alldatesnull2.shape)
print(df_salesandstoresdata_alldatesnull3.shape)

# One row per store and category; unit_sales counts the dummy markers, i.e.
# the number of missing dates.
df_salesandstoresdata_alldatesnullfinal = (
    df_salesandstoresdata_alldatesnullfinal
    .groupby(['store_nbr', 'missingdatacategory'])
    .agg({'unit_sales': 'count'})
    .reset_index()
)

# Flag column that makes it easier to plot or to join with other dataframes.
df_salesandstoresdata_alldatesnullfinal['missingdata'] = '1'
df_salesandstoresdata_alldatesnullfinal

## Step 4.0 - Determine the impact of stores

In [None]:
# Total unit sales per store over the whole period, combined with the
# missing-data flag and category of each store.
df_salesandstoresdatatotal = (
    df_salesandstoresdata
    .groupby(['store_nbr'])
    .agg({'unit_sales': 'sum'})
    .reset_index()
    # Left join: stores without missing data get empty flag/category values.
    .merge(df_salesandstoresdata_alldatesnullfinal, on='store_nbr', how='left')
    # Clean up after the merge: drop the count of missing dates and restore
    # the name of the sales column.
    .drop(columns=['unit_sales_y'])
    .rename(columns={'unit_sales_x': 'unit_sales'})
    # Stores that are not missing data get '0' for both columns.
    .assign(
        missingdata=lambda d: d['missingdata'].fillna('0'),
        missingdatacategory=lambda d: d['missingdatacategory'].fillna('0'),
    )
    # Highest sales first.
    .sort_values(by='unit_sales', ascending=False)
)

df_salesandstoresdatatotal

In [None]:
# Unit sales per missing-data flag and category, plus the share of each group
# in the overall total.
category_keys = ['missingdata', 'missingdatacategory']
df_salesandstoresdatatotalgroupedby = (
    df_salesandstoresdatatotal
    .groupby(category_keys)
    .agg({'unit_sales': 'sum'})
    .reset_index()
)

overall_unit_sales = df_salesandstoresdatatotalgroupedby['unit_sales'].sum()
df_salesandstoresdatatotalgroupedby['Percentage'] = (
    df_salesandstoresdatatotalgroupedby['unit_sales'] / overall_unit_sales * 100
)

df_salesandstoresdatatotalgroupedby

In [None]:
# Restrict the store/date sales data to July 2017.
sale_dates = df_salesandstoresdata['date'].dt
in_2017 = sale_dates.year == 2017
in_july = sale_dates.month == 7
df_salesandstoresdata_july_2017 = df_salesandstoresdata[in_2017 & in_july]

# Print the filtered DataFrame
print(df_salesandstoresdata_july_2017)

In [None]:
# Same steps as for the whole period, now for July 2017 only: total unit
# sales per store combined with the missing-data flag and category.
df_salesandstoresdata_july_2017_total = (
    df_salesandstoresdata_july_2017
    .groupby(['store_nbr'])
    .agg({'unit_sales': 'sum'})
    .reset_index()
    # Left join: stores without missing data get empty flag/category values.
    .merge(df_salesandstoresdata_alldatesnullfinal, on='store_nbr', how='left')
    # Clean up after the merge: drop the count of missing dates and restore
    # the name of the sales column.
    .drop(columns=['unit_sales_y'])
    .rename(columns={'unit_sales_x': 'unit_sales'})
    # Stores that are not missing data get '0' for both columns.
    .assign(
        missingdata=lambda d: d['missingdata'].fillna('0'),
        missingdatacategory=lambda d: d['missingdatacategory'].fillna('0'),
    )
    # Highest sales first.
    .sort_values(by='unit_sales', ascending=False)
)

df_salesandstoresdata_july_2017_total

In [None]:
# July 2017 unit sales per missing-data flag and category, plus the share of
# each group in the July total.
category_keys = ['missingdata', 'missingdatacategory']
df_salesandstoresdata_july_2017_totalgroupedby = (
    df_salesandstoresdata_july_2017_total
    .groupby(category_keys)
    .agg({'unit_sales': 'sum'})
    .reset_index()
)

july_unit_sales = df_salesandstoresdata_july_2017_totalgroupedby['unit_sales'].sum()
df_salesandstoresdata_july_2017_totalgroupedby['Percentage'] = (
    df_salesandstoresdata_july_2017_totalgroupedby['unit_sales'] / july_unit_sales * 100
)

df_salesandstoresdata_july_2017_totalgroupedby

In [None]:
df_salesandstoresdata_july_2017_totalcopy = df_salesandstoresdata_july_2017_total.copy()

# Share of the July 2017 unit sales per store type, largest type first.
df_salesandstoresdata_july_2017_total_type = (
    df_salesandstoresdata_july_2017_totalcopy
    .merge(df_stores, on='store_nbr', how='left')
    .groupby(['type'])
    .agg({'unit_sales': 'sum'})
    .reset_index()
    .sort_values(by='unit_sales', ascending=False)
)

type_unit_sales = df_salesandstoresdata_july_2017_total_type['unit_sales']
df_salesandstoresdata_july_2017_total_type['Percentage'] = type_unit_sales / type_unit_sales.sum() * 100

# Running total of the shares in the sorted order.
df_salesandstoresdata_july_2017_total_type['CumulativePercentage'] = (
    df_salesandstoresdata_july_2017_total_type['Percentage'].cumsum()
)

df_salesandstoresdata_july_2017_total_type

In [None]:
# Enrich the July 2017 totals per store with the store attributes
# (type, cluster, state and city).
df_salesandstoresdata_july_2017_total2 = pd.merge(
    df_salesandstoresdata_july_2017_totalcopy,
    df_stores,
    on='store_nbr',
    how='left',
)

# Repeat the overall total on every row; this makes the share per store a
# simple column division.
july_total_unit_sales = df_salesandstoresdata_july_2017_total2['unit_sales'].sum()
df_salesandstoresdata_july_2017_total2['Total unit sales'] = july_total_unit_sales

# Share of each store in the total unit sales.
df_salesandstoresdata_july_2017_total2['Percentage'] = (
    df_salesandstoresdata_july_2017_total2['unit_sales']
    / df_salesandstoresdata_july_2017_total2['Total unit sales']
    * 100
)

df_salesandstoresdata_july_2017_total2

In [None]:
df_salesandstoresdata_july_2017_total3 = df_salesandstoresdata_july_2017_total2.copy()

# STEP 1 - Keep the stores without missing data ('0') and the stores that
# miss fewer than 9 days.
usable_categories = ['0', 'missing <9 days']
is_usable = df_salesandstoresdata_july_2017_total3['missingdatacategory'].isin(usable_categories)
df_salesandstoresdata_july_2017_total4 = df_salesandstoresdata_july_2017_total3[is_usable]

# STEP 2 - Drop the rows where the cluster is 10 (treated here as a missing value).
not_cluster_10 = df_salesandstoresdata_july_2017_total4['cluster'] != '10'
df_salesandstoresdata_july_2017_total4 = df_salesandstoresdata_july_2017_total4[not_cluster_10]



In [None]:

# Display the store/date sales rows of July 2017.
df_salesandstoresdata_july_2017

In [None]:
# Store numbers of the stores that passed the filters, kept as a
# one-column dataframe.
df_stores_f = df_salesandstoresdata_july_2017_total4[['store_nbr']]

In [None]:
# July 2017 store/date rows of the selected stores only.
in_selected_stores = df_salesandstoresdata_july_2017['store_nbr'].isin(df_stores_f['store_nbr'])
filtered_df = df_salesandstoresdata_july_2017.loc[in_selected_stores]
print(filtered_df)


In [None]:
# Peek at the first rows of the item-level sales data.
df_salesdata.head()

In [None]:
# Keep only the item-level sales rows of the stores listed in df_stores_f.
selected_store_nbrs = df_stores_f['store_nbr']
from_selected_store = df_salesdata['store_nbr'].isin(selected_store_nbrs)
df_filtered_salesdata = df_salesdata.loc[from_selected_store]

# Display the filtered DataFrame
print(df_filtered_salesdata)

In [None]:
# From the sales of the selected stores, keep the rows dated in July 2017
# (both boundary dates included).
on_or_after_start = df_filtered_salesdata['date'] >= '2017-07-01'
on_or_before_end = df_filtered_salesdata['date'] <= '2017-07-31'
filtered_df_salesdata_july2017 = df_filtered_salesdata[on_or_after_start & on_or_before_end]

# Display the filtered DataFrame
filtered_df_salesdata_july2017

In [None]:
# Peek at the first five rows of the items table before merging it.
df_items.head(5)

In [None]:
# Attach the item attributes to the July 2017 sales rows; the inner join
# keeps only sales of items that occur in the items table.
merged_df = filtered_df_salesdata_july2017.merge(
    df_items,
    on="item_nbr",
    how="inner",
)

# Display the merged DataFrame
print(merged_df)

In [None]:
# Unit sales per product family and over all families together.
family_sales = merged_df.groupby('family')['unit_sales'].sum()
total_sales = family_sales.sum()

# Share of every family in the total, in percent.
family_sales_percentage = family_sales.div(total_sales).mul(100)

# Turn the result into a two-column table, largest share first.
family_sales_percentage_df = family_sales_percentage.reset_index()
family_sales_percentage_df.columns = ['family', 'percentage_of_total_sales']
family_sales_percentage_df = family_sales_percentage_df.sort_values(
    by='percentage_of_total_sales',
    ascending=False,
)

# Display the result
print(family_sales_percentage_df)

In [None]:
# Per product family: the number of distinct stores selling it and its share
# of the total unit sales.
sales_by_family = merged_df.groupby('family')

# Sales share per family, in percent.
total_sales_per_family = sales_by_family['unit_sales'].sum()
overall_total_sales = total_sales_per_family.sum()
sales_percentage_per_family = (total_sales_per_family / overall_total_sales) * 100

# Number of distinct stores per family.
distinct_stores_per_family = sales_by_family['store_nbr'].nunique()

# Convert both results to two-column tables.
sales_percentage_df = sales_percentage_per_family.reset_index()
sales_percentage_df.columns = ['family', 'sales_percentage']

distinct_stores_df = distinct_stores_per_family.reset_index()
distinct_stores_df.columns = ['family', 'distinct_stores_count']

# Combine them and sort by the sales share, largest first.
result_df = distinct_stores_df.merge(sales_percentage_df, on='family')
result_df = result_df.sort_values(by='sales_percentage', ascending=False)

# Display the result
print(result_df)

So, as we can see, we can remove baby care and books, as they are not sold in all stores. We lose 0.02. Now we want to look at the item level.

In [None]:


# Per item: the number of distinct stores selling it and its share of the
# total unit sales.
sales_by_item = merged_df.groupby('item_nbr')

# Sales share per item, in percent.
total_sales_per_item = sales_by_item['unit_sales'].sum()
overall_total_sales = total_sales_per_item.sum()
sales_percentage_per_item = (total_sales_per_item / overall_total_sales) * 100

# Number of distinct stores per item.
distinct_stores_per_item = sales_by_item['store_nbr'].nunique()

# Convert both results to two-column tables.
sales_percentage_df = sales_percentage_per_item.reset_index()
sales_percentage_df.columns = ['item_nbr', 'sales_percentage']

distinct_stores_df = distinct_stores_per_item.reset_index()
distinct_stores_df.columns = ['item_nbr', 'distinct_stores_count']

# Combine them and sort by the sales share, largest first.
result_df = distinct_stores_df.merge(sales_percentage_df, on='item_nbr')
result_df = result_df.sort_values(by='sales_percentage', ascending=False)

# Display the result
print(result_df)

result_df

In [None]:

# Build a balanced panel: one row for every store, item and day of July 2017,
# with unit_sales set to 0 where no sale was recorded.
merged_df['date'] = pd.to_datetime(merged_df['date'])

# All days of July 2017 (note: this reuses the name `date_range`).
date_range = pd.date_range(start='2017-07-01', end='2017-07-31')

# Stores and items that occur in the merged sales data.
stores = merged_df['store_nbr'].unique()
items = merged_df['item_nbr'].unique()

# Every combination of store, item and date as a three-column dataframe.
panel_keys = ['store_nbr', 'item_nbr', 'date']
panel_index = pd.MultiIndex.from_product([stores, items, date_range], names=panel_keys)
all_combinations = panel_index.to_frame(index=False)

# Left join keeps every combination; combinations without a sale get empty values.
df_full = all_combinations.merge(merged_df, on=panel_keys, how='left')

# No recorded sale means zero units sold.
df_full['unit_sales'] = df_full['unit_sales'].fillna(0)

# Make sure the index is a clean 0..n-1 range.
df_full = df_full.reset_index(drop=True)

# Display the balanced DataFrame
print(df_full)


In [None]:
# Per item: average number of days with sales (over the stores that sold it)
# and share of the total unit sales.
df_full['date'] = pd.to_datetime(df_full['date'])

# Step 1: Keep only the rows with actual sales.
has_sales = df_full['unit_sales'] > 0
df_positive_sales = df_full[has_sales]

# Step 2: Number of distinct days with sales per store and item.
daily_sales_counts = (
    df_positive_sales
    .groupby(['store_nbr', 'item_nbr'])['date']
    .nunique()
    .reset_index()
    .rename(columns={'date': 'days_with_sales'})
)

# Step 3: Average that number over the stores, per item.
average_daily_sales = (
    daily_sales_counts
    .groupby(['item_nbr'])['days_with_sales']
    .mean()
    .reset_index()
)

# Step 4: Total sales per item across all stores and dates.
total_sales_per_item = (
    df_full
    .groupby('item_nbr')['unit_sales']
    .sum()
    .reset_index()
    .rename(columns={'unit_sales': 'total_sales'})
)

# Step 5: Total sales of all items together.
grand_total_sales = total_sales_per_item['total_sales'].sum()

# Step 6: Share of every item in the total, in percent.
total_sales_per_item['sales_percentage'] = (total_sales_per_item['total_sales'] / grand_total_sales) * 100

# Step 7: Add the share to the average number of days with sales.
item_share = total_sales_per_item[['item_nbr', 'sales_percentage']]
average_daily_sales = average_daily_sales.merge(item_share, on='item_nbr')

# Step 8: Print or inspect the results
print(average_daily_sales)

In [None]:
import pandas as pd

# Recompute the per-item table (average days with sales and sales share) and
# print descriptive information about it.
df_full['date'] = pd.to_datetime(df_full['date'])

# Rows with actual sales only.
has_sales = df_full['unit_sales'] > 0
df_positive_sales = df_full[has_sales]

# Number of distinct days with sales per store and item.
daily_sales_counts = (
    df_positive_sales
    .groupby(['store_nbr', 'item_nbr'])['date']
    .nunique()
    .reset_index()
    .rename(columns={'date': 'days_with_sales'})
)

# Average number of days with sales per item.
average_daily_sales = (
    daily_sales_counts
    .groupby(['item_nbr'])['days_with_sales']
    .mean()
    .reset_index()
)

# Total sales per item and the share of every item in the grand total.
total_sales_per_item = (
    df_full
    .groupby('item_nbr')['unit_sales']
    .sum()
    .reset_index()
    .rename(columns={'unit_sales': 'total_sales'})
)
grand_total_sales = total_sales_per_item['total_sales'].sum()
total_sales_per_item['sales_percentage'] = (total_sales_per_item['total_sales'] / grand_total_sales) * 100

# Add the share to the average number of days with sales.
item_share = total_sales_per_item[['item_nbr', 'sales_percentage']]
average_daily_sales = average_daily_sales.merge(item_share, on='item_nbr')

# Summary statistics, first rows, last rows and structure of the table.
print("Basic Statistics for `average_daily_sales` DataFrame:")
print(average_daily_sales.describe())

print("\nFirst few rows of `average_daily_sales` DataFrame:")
print(average_daily_sales.head())

print("\nLast few rows of `average_daily_sales` DataFrame:")
print(average_daily_sales.tail())

print("\nInfo on `average_daily_sales` DataFrame:")
print(average_daily_sales.info())


In [None]:



import pandas as pd

# Share of items that are sold on fewer than 10 days on average.
# This cell used to contain the whole computation twice, so the same work ran
# and the same result printed twice; it now runs once.

# Ensure 'date' column is datetime type
df_full['date'] = pd.to_datetime(df_full['date'])

# Filter out rows where 'unit_sales' is zero or NaN
df_positive_sales = df_full[df_full['unit_sales'] > 0]

# Group by store and item, then count unique dates with sales > 0
daily_sales_counts = df_positive_sales.groupby(['store_nbr', 'item_nbr'])['date'].nunique().reset_index()
daily_sales_counts = daily_sales_counts.rename(columns={'date': 'days_with_sales'})

# Calculate the average number of days with sales > 0 per item
average_daily_sales = daily_sales_counts.groupby(['item_nbr'])['days_with_sales'].mean().reset_index()

# Calculate total sales for each item across all stores and dates
total_sales_per_item = df_full.groupby('item_nbr')['unit_sales'].sum().reset_index()
total_sales_per_item = total_sales_per_item.rename(columns={'unit_sales': 'total_sales'})

# Calculate total sales for all items
grand_total_sales = total_sales_per_item['total_sales'].sum()

# Calculate the percentage of sales for each item
total_sales_per_item['sales_percentage'] = (total_sales_per_item['total_sales'] / grand_total_sales) * 100

# Merge the percentage information with the average daily sales
average_daily_sales = average_daily_sales.merge(total_sales_per_item[['item_nbr', 'sales_percentage']], on='item_nbr')

# Filter items that are sold for fewer than 10 days on average
items_sold_fewer_than_10_days = average_daily_sales[average_daily_sales['days_with_sales'] < 10]

# Calculate the total number of items
total_items = average_daily_sales.shape[0]

# Calculate the number of items sold for fewer than 10 days
items_sold_fewer_than_10_days_count = items_sold_fewer_than_10_days.shape[0]

# Compute the percentage
percentage_sold_fewer_than_10_days = (items_sold_fewer_than_10_days_count / total_items) * 100

# Print the result
print(f"Total percentage of items sold for fewer than 10 days on average: {percentage_sold_fewer_than_10_days:.2f}%")




In [None]:
import pandas as pd

# Share of items that are sold at least once a week on average.
df_full['date'] = pd.to_datetime(df_full['date'])

# Rows with actual sales only.
has_sales = df_full['unit_sales'] > 0
df_positive_sales = df_full[has_sales]

# Number of distinct days with sales per store and item.
daily_sales_counts = (
    df_positive_sales
    .groupby(['store_nbr', 'item_nbr'])['date']
    .nunique()
    .reset_index()
    .rename(columns={'date': 'days_with_sales'})
)

# Average number of days with sales per item.
average_daily_sales = (
    daily_sales_counts
    .groupby(['item_nbr'])['days_with_sales']
    .mean()
    .reset_index()
)

# Total sales per item and the share of every item in the grand total.
total_sales_per_item = (
    df_full
    .groupby('item_nbr')['unit_sales']
    .sum()
    .reset_index()
    .rename(columns={'unit_sales': 'total_sales'})
)
grand_total_sales = total_sales_per_item['total_sales'].sum()
total_sales_per_item['sales_percentage'] = (total_sales_per_item['total_sales'] / grand_total_sales) * 100

# Add the share to the average number of days with sales.
item_share = total_sales_per_item[['item_nbr', 'sales_percentage']]
average_daily_sales = average_daily_sales.merge(item_share, on='item_nbr')

# Length of the period in days (first and last day included) and in weeks.
first_day, last_day = df_full['date'].min(), df_full['date'].max()
total_days = (last_day - first_day).days + 1
total_weeks = total_days / 7

# Days with sales per item and week. NOTE(review): the days are summed over
# all stores before dividing, so this is a rate across stores combined, not
# per store - confirm that this is the intended definition.
days_with_sales_per_item = daily_sales_counts.groupby('item_nbr')['days_with_sales'].sum()
item_sales_per_week = days_with_sales_per_item / total_weeks

# Items that reach at least one day with sales per week.
sold_weekly = item_sales_per_week >= 1
items_sold_at_least_once_a_week = item_sales_per_week[sold_weekly].index

# Number of items in total and number of items sold at least once a week.
total_items = average_daily_sales.shape[0]
items_sold_at_least_once_a_week_count = len(items_sold_at_least_once_a_week)

# Compute the percentage
percentage_sold_at_least_once_a_week = (items_sold_at_least_once_a_week_count / total_items) * 100

# Print the result
print(f"Total percentage of items sold at least once a week on average: {percentage_sold_at_least_once_a_week:.2f}%")
