### Supermarket data science case study - Exploring first data


In [None]:
#pip install --user pyarrow

### Importing packages

In [None]:
import pandas as pd
import sys
import matplotlib.pyplot as plt
import sklearn

from sklearn.pipeline import Pipeline, make_pipeline

### Functions

In [None]:
def f_concat(l_input):
    """Join text elements into one human-readable enumeration.

    Elements are separated by ", " and the last one is introduced by "and ",
    e.g. ``["a", "b", "c"]`` becomes ``"a, b, and c"``.

    Parameters
    ----------
    l_input : iterable of str
        Text elements to join (list, tuple, pandas Index, ...).

    Returns
    -------
    str
        The joined text; the element itself when there is exactly one
        element; an empty string when there are no elements.
    """
    # Materialize once so slicing and negative indexing behave the same for
    # every kind of input container.
    l_items = list(l_input)

    # Nothing to join: return an empty text instead of raising IndexError.
    if not l_items:
        return ""

    if len(l_items) == 1:
        return l_items[0]

    # All but the last element are comma-separated; the last gets "and ".
    return ", ".join(l_items[:-1]) + ", and " + l_items[-1]

In [None]:
def f_describe(df_input, n_top = 10):
    """Show a quick overview of a DataFrame.

    Displays the first rows, summary statistics of the numerical and textual
    columns (each only when such columns exist) and, when any column has
    missing values, a one-line listing of those columns with their missing
    count and percentage.

    Relies on ``display`` being available in the namespace, as it is in a
    Jupyter/IPython session.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Data to describe.
    n_top : int, default 10
        Number of leading rows to show.
    """
    print("First " + str(n_top) + " rows in de data:")
    display(df_input.head(n_top))

    df_numeric = df_input.select_dtypes(include = ['uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64'])

    if len(df_numeric.columns):
        print("Numerical data:")
        display(df_numeric.describe())

    df_textual = df_input.select_dtypes(include = ['category', 'object', 'bool'])

    if len(df_textual.columns):
        print("Textual data:")
        display(df_textual.describe())

    # Count missing values once per column, on the frame that was passed in
    # (not on whatever global `df` happens to exist in the notebook).
    s_na = df_input.isna().sum()
    n_rows = df_input.shape[0]

    v_na = [
        col + " (" + str(n_na) + ", " + str(round(100 * n_na / n_rows, 1)) + "%)"
        for col, n_na in s_na.items()
        if n_na > 0
    ]

    if len(v_na) > 0:
        print("Features and their number of missing values:")
        display(f_concat(v_na))

### Downcast and transform data
Update formatting of features to optimize memory and standardize column names.

In [None]:
def standardize_column_names(s):
    """Return the column name ``s`` with every space character removed."""
    # Splitting on a single space and gluing the pieces back together drops
    # each space, including leading, trailing and repeated ones.
    return "".join(s.split(" "))

def optimize_memory(df):
    """Downcast the numeric columns of ``df`` to smaller dtypes.

    Integer columns are downcast with ``downcast="unsigned"`` and float
    columns with ``downcast="float"``. The columns are reassigned on the
    frame that was passed in, and that same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose numeric columns are downcast.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    # Open question kept from the original author: WHEN is it needed to
    # transform Objects to Categorical? The conversion is disabled for now:
    # object_cols = df.select_dtypes(include="object").columns
    # if not object_cols.empty:
    #     print("Change: Objects to Categorical")
    #     df[object_cols] = df[object_cols].astype("category")

    # Column selector -> downcast target, processed in this order.
    downcast_plan = {"integer": "unsigned", "float": "float"}

    for source_kind, target_kind in downcast_plan.items():
        print("Change: " + source_kind + " --> " + target_kind)
        selected = df.select_dtypes(include=source_kind).columns
        for name in selected:
            df[name] = pd.to_numeric(df[name], downcast=target_kind)

    return df

def month_year_to_int(df, i):
    """Cast the ``month`` and ``year`` columns to ``int`` when ``i`` is 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Data; must contain ``month`` and ``year`` columns when ``i`` is 0.
    i : int
        Dataset index; only index 0 triggers the conversion.

    Returns
    -------
    pandas.DataFrame
        A converted copy when ``i`` is 0, otherwise ``df`` itself.
    """
    # Every dataset other than index 0 passes through untouched.
    if i != 0:
        return df

    print("Change: Month and Year to integer")
    return df.astype({"month": int, "year": int})

### Transform date-related columns to datetime format.

In [None]:
##HOW TO DOCUMENT? Findings regarding dataset, template or normalized structure? 
    #day/month/year, UTC, date = object Dtypes


# Convert datasets to time series
def transform_date_to_datetime(df, i):
    """Give ``df`` a datetime ``date`` column.

    For ``i == 0`` the ``date`` column is built from the ``year``, ``month``
    and ``day`` columns. For any other ``i`` an existing ``date`` column is
    converted with ``pd.to_datetime``; frames without a ``date`` column are
    left unchanged. The column is assigned on the frame that was passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to transform.
    i : int
        Dataset index; selects which of the two conversions applies.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    if i == 0:
        print("Change: Transformed 'year', 'month', 'day' columns to Datetime feature")
        date_parts = df[['year', 'month', 'day']]
        df['date'] = pd.to_datetime(date_parts)

        # To do (from the original author): the 'day', 'month' and 'year'
        # columns are not dropped while running the code.
        #print("Change: Dropped 'year', 'month', 'day' columns and transformed to Datetime feature")
        #df.drop(columns=['day', 'month', 'year'])

    elif 'date' in df.columns:
        # Remaining datasets (holiday, transactions and oil) already have a
        # 'date' column that only needs its dtype changed.
        print("Change: Transformed 'date' column to Datetime Dtype")
        df['date'] = pd.to_datetime(df['date'])

    return df

### Import data from local PATH
Import the data through a pipeline that downcasts it and applies the transformations.

In [None]:
def f_get_data(i=0, c_path='C:/Users/J.Heuvelmans/OneDrive - Brain Research Center/Documenten/EAISI/2024Supermarket/Code/data/raw/'):
    """Load one of the raw parquet datasets and run it through the pipeline.

    The pipeline removes spaces from column names, downcasts numeric columns,
    casts month/year to int (dataset 0 only) and converts the date
    information to datetime.

    Parameters
    ----------
    i : int, default 0
        Index of the dataset to load:
        0 history-per-year, 1 history_aggregated, 2 holidays_events,
        3 items, 4 oil, 5 stores, 6 transactions.
    c_path : str or path-like, optional
        Directory that holds the ``.parquet`` files, with or without a
        trailing separator. Defaults to the original author's local folder.

    Returns
    -------
    pandas.DataFrame
        The loaded and transformed data.

    Raises
    ------
    IndexError
        If ``i`` is outside the range of known datasets.
    """
    # Local import keeps this cell self-contained in the notebook.
    from pathlib import Path

    # Identify file.
    v_file = ("history-per-year",   # 0
              "history_aggregated", # 1
              "holidays_events",    # 2
              "items",              # 3
              "oil",                # 4
              "stores",             # 5
              "transactions")       # 6

    # Joining with pathlib works whether or not c_path ends with a separator.
    c_file = Path(c_path) / (v_file[i] + ".parquet")

    # Load data.
    df = (
         pd.read_parquet(c_file)
         .rename(columns = standardize_column_names)
         .pipe(optimize_memory)
         .pipe(month_year_to_int, i)
         .pipe(transform_date_to_datetime, i)
    )

    # Return data.
    return df

### Importing data

In [None]:
# Load dataset 0 ("history-per-year") through the import pipeline.
df = f_get_data(0)

In [None]:
# Show the first rows, summary statistics and missing-value counts.
f_describe(df)

In [None]:
# Print the summary produced by DataFrame.info().
df.info()

In [None]:
# Distribution of 'onpromotion': counts per value and number of missing values.
a = df['onpromotion'].value_counts()
b = df['onpromotion'].isna().sum()

print(a)
print(b)
print(a+b)

# Derive the percentages from the loaded data instead of hard-coded counts,
# so the numbers stay correct when the dataset changes.
n_rows = len(df)
perc_missing = b / n_rows * 100
perc_false = a.get(False, 0) / n_rows * 100
perc_true = a.get(True, 0) / n_rows * 100

print(perc_missing)
print(perc_false)
print(perc_true)

# Sanity check: the three shares should add up to 100.
total = perc_missing + perc_false + perc_true
print(total)

In [None]:
import altair as alt

# Rows with more than 10000 units sold.
# display() is needed because a notebook cell only echoes its last expression,
# so a bare `df_maxsales` in the middle of the cell shows nothing.
df_maxsales = df[df['unit_sales'] > 10000]
display(df_maxsales)

# All rows of item 850389.
df_850389 = df[df['item_nbr']==850389]
display(df_850389)

# Bar chart of the unit sales of item 850389 over time.
hist = alt.Chart(df_850389).mark_bar().encode(
    alt.X('date', title='date'),
    alt.Y('unit_sales', title='unit_sales')
).properties(
    title='Sales of 850389',
    width=600,
    height=400
)

hist.display()

Below is the calculation of total sales per store:

In [None]:
def calc_total_sales(df, store_nbrs=range(1, 55)):
    """Sum ``unit_sales`` per store and rank the stores by total sales.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with ``store_nbr`` and ``unit_sales`` columns.
    store_nbrs : iterable of int, default ``range(1, 55)``
        Store numbers to report. Stores without any rows in ``df`` get a
        total of 0; stores in ``df`` that are not listed are left out.

    Returns
    -------
    pandas.DataFrame
        One row per requested store with columns ``store_nbr`` and
        ``total_sales``, sorted by ``total_sales`` in descending order.
    """
    # One grouped aggregation instead of one full-frame filter per store.
    df_totals = (
        df.groupby('store_nbr')['unit_sales']
        .sum()
        # Restrict to the requested stores, in the requested order.
        .reindex(list(store_nbrs), fill_value=0)
        .rename_axis('store_nbr')
        .rename('total_sales')
        .reset_index()
    )

    return df_totals.sort_values(by='total_sales', ascending=False).reset_index(drop=True)

# Total unit sales per store, ranked from highest to lowest.
df_totals = calc_total_sales(df)

import altair as alt

# Bar chart with one bar per store; sort='-y' orders the bars by descending
# total sales.
hist = alt.Chart(df_totals).mark_bar().encode(
    alt.X('store_nbr:O', title='Store Number', sort='-y'),
    alt.Y('total_sales:Q', title='Total Sales')
).properties(
    title='Histogram of Total Sales',
    width=600,
    height=400
)

hist.display()


### Some Statistics:

In [None]:
# Report the number of observations and features, the feature names and the
# size reported by sys.getsizeof for the loaded data.
print(
    "The data\n",
    f"-> Contains:                {round(df.shape[0]/1e6, 1)} million observations and {df.shape[1]} features.\n",
    f"-> Contains:                {df.shape[0]} observations and {df.shape[1]} features.\n",
    f"-> Have feature names:      {f_concat(df.columns)}.\n",
    f"-> Has optimized size of    {round(sys.getsizeof(df)/1024/1024/1024, 2)} GB.",
    sep="\n",
)

In [None]:
# Scratch cell: alternative inspections are kept commented out; only a random
# sample of 20 rows of the currently loaded data is shown.
#df = f_get_data(0)
#f_describe(df)

#df.head()
#df.tail(10)
df.sample(20)
#df.info()
#df.describe()
#df.nunique

In [None]:
# Load dataset 1 ("history_aggregated"), print its structure and show a random
# sample of 20 rows.
df = f_get_data(1)
#f_describe(df)                             #To do: investigate transforming datetime64[us, UTC] 	2014-09-24 00:00:00+00:00 to normal datetime
df.info()
df.sample(20)

In [None]:
# Load dataset 2 ("holidays_events"), describe it, print its structure and
# show a random sample of 20 rows.
df = f_get_data(2)
f_describe(df)
df.info()
df.sample(20)

In [None]:
# Load dataset 3 ("items"), describe it and print its structure.
df = f_get_data(3)
f_describe(df)
df.info()

In [None]:
# Load dataset 4 ("oil"), describe it, print its structure and show a random
# sample of 20 rows.
df = f_get_data(4)
f_describe(df)
df.info()
df.sample(20)

In [None]:
# Load dataset 5 ("stores"), describe it and print its structure.
df = f_get_data(5)
f_describe(df)
df.info()

In [None]:
# Load dataset 6 ("transactions"), describe it, print its structure and show a
# random sample of 20 rows.
df = f_get_data(6)
f_describe(df)
df.info()
df.sample(20)