### Supermarket data science case study - Exploring first data


### Importing packages

In [1]:
import pandas as pd
import sys

# import matplotlib.pyplot as plt
# import altair as alt
# import vegafusion as vf
import sklearn

from sklearn.pipeline import Pipeline, make_pipeline

### Functions

In [2]:
def f_concat(l_input):

    # Initialize.
    dummy = ""
    n_len = len(l_input)

    if n_len == 1:
        return l_input[0]

    # Loop through text elements.
    for i in range(n_len - 1):
        dummy = dummy + l_input[i] + ", "

    # Append last element.
    dummy = dummy + "and " + l_input[n_len - 1]

    # Return result.
    return dummy

In [None]:
def f_describe(df_input, n_top=10):

    print("First " + str(n_top) + " rows in de data:")
    display(df_input.head(n_top))

    df_numeric = df_input.select_dtypes(
        include=[
            "uint8",
            "uint16",
            "uint32",
            "uint64",
            "int8",
            "int16",
            "int32",
            "int64",
            "float16",
            "float32",
            "float64",
        ]
    )

    if len(df_numeric.columns):
        print("Numerical data:")
        display(df_numeric.describe())

    df_textual = df_input.select_dtypes(include=["category", "object", "bool"])

    if len(df_textual.columns):
        print("Textual data:")
        display(df_textual.describe())

    v_na = [
        col
        + " ("
        + str(df[col].isna().sum())
        + ", "
        + str(round(100 * df[col].isna().sum() / df.shape[0], 1))
        + "%)"
        for col in df.columns
        if df[col].isna().sum() > 0
    ]

    if len(v_na) > 0:
        print("Features and their number of missing values:")
        display(f_concat(v_na))

In [3]:
def extract_datetime_features(df):
    """
    Extracting datetime features
    year, month, day of month, and weekday
    """
    df = df.copy()
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["day"] = df["date"].dt.day
    df["weekday"] = df["date"].dt.dayofweek

    # To-do: add weeknumber-year feature

    return df  ## Weekday range from 0 to 6 --> First weekday starts on Monday = 0 en end at Sunday=6


# df_dt_features = extract_datetime_features(df)

### Downcast and transform data
Update formatting of features to optimize memory and standardize column names.

In [4]:
def standardize_column_names(s):
    return s.replace(" ", "")


def optimize_memory(df):
    # Change: Objects to Categorical.                                               #WHEN needed to transform Objects to Categorical?
    object_cols = df.select_dtypes(include="object").columns

    if not object_cols.empty:
        print("Change: Objects to Categorical")
        df[object_cols] = df[object_cols].astype("category")

    # Change: Convert integers to smallest unsigned integer and floats to smallest.
    for old, new in [("integer", "unsigned"), ("float", "float")]:
        print("Change: " + old + " --> " + new)
        for col in df.select_dtypes(include=old).columns:
            df[col] = pd.to_numeric(df[col], downcast=new)

    return df


def month_year_to_int(df, i):
    # Change: Month and Year to integer
    if i == 0:
        print("Change: Month and Year to integer")
        df = df.astype({"month": int, "year": int})

    return df

### Transform date-related columns to datetime format.

In [5]:
# Convert datasets to time series
def transform_date_to_datetime(df, i):
    if i == 0:
        print("Change: Transformed 'year', 'month', 'day' columns to Datetime feature")
        df["date"] = pd.to_datetime(df[["year", "month", "day"]], unit="us")

        # print(
        #     "Change: Dropped 'year', 'month', 'day' columns and transformed to Datetime64[us] feature"
        # )
        # df.drop(columns=["day", "month", "year"], inplace=True)

    else:
        if "date" in df.columns:
            print("Change: Transformed 'date' column to Datetime Dtype")
            df["date"] = pd.to_datetime(df["date"]).dt.tz_localize(None)

    return df

### Import data from local PATH
Import data trough pipeline to downcast the data and transformations

In [6]:
def f_get_data(i=0):

    # Define path.
    c_path = "C:/Users/alexander/Documents/0. Data Science and AI for Experts/EAISI_4B_Supermarket/data/raw/"

    # Identify file.
    v_file = (
        "history-per-year",  # 0
        "history_aggregated",  # 1
        "holidays_events",  # 2
        "items",  # 3
        "oil",  # 4
        "stores",  # 5
        "transactions",  # 6
    )

    # Load data.
    df = (
        pd.read_parquet(c_path + v_file[i] + ".parquet")
        .rename(columns=standardize_column_names)
        .pipe(optimize_memory)
        .pipe(month_year_to_int, i)
        .pipe(transform_date_to_datetime, i)
    )

    # Return data.
    return df

### Importing data

In [7]:
# Sales History per year
df_0 = f_get_data(0)

Change: integer --> unsigned
Change: float --> float
Change: Month and Year to integer
Change: Transformed 'year', 'month', 'day' columns to Datetime feature


In [15]:
# Holidays
df_2 = f_get_data(2)

Change: Objects to Categorical
Change: integer --> unsigned
Change: float --> float
Change: Transformed 'date' column to Datetime Dtype


In [8]:
# Items
df_3 = f_get_data(3)

Change: Objects to Categorical
Change: integer --> unsigned
Change: float --> float


In [9]:
# Stores
df_5 = f_get_data(5)

Change: Objects to Categorical
Change: integer --> unsigned
Change: float --> float


### Aggregation of Sales per Month and join of df_0 History sales + df_3 Items + df_5 Stores Datasets

In [10]:
def onpromotion_month_count(df):

    if "onpromotion" in df.columns:

        df["onpromotion_month_count"] = df.groupby(
            ["item_nbr", "store_nbr", "day", "month", "year"]
        )["onpromotion"].transform("sum")

        print(
            "Change: 'onpromotion' column transformed to 'onpromotion_month_count' feature."
        )
    else:

        print("The DataFrame does not contain an 'onpromotion' column.")

    return df

In [11]:
df_0_agg = (
    onpromotion_month_count(df_0)  # Transformation to 'onpromotion_month_count' feature
    .drop(
        columns=["id", "date", "onpromotion"]
    )  # Drop unnecessary columns "id", "date", "onpromotion"
    .groupby(["month", "year", "store_nbr", "item_nbr"])
    .agg({"unit_sales": "sum", "onpromotion_month_count": "sum"})
    .reset_index()
)

Change: 'onpromotion' column transformed to 'onpromotion_count' feature.


In [12]:
# Merge of df_0_agg with df_3 and df_5

df_03_join = df_0_agg.merge(df_3, on="item_nbr", how="left")

df_035_join = df_03_join.merge(df_5, on="store_nbr", how="left")

In [13]:
print("The transformed and joined data\n")
print(
    f"-> Contains:                {round(df_035_join.shape[0]/1e6, 1)} million observations and {df_035_join.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_035_join.shape[0]} observations and {df_035_join.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_035_join.columns)}.\n")
print(
    f"-> Has optimized size of    {round(sys.getsizeof(df_035_join)/1024/1024/1024, 2)} GB."
)

The transformed and joined data

-> Contains:                5.8 million observations and 13 features.

-> Contains:                5813304 observations and 13 features.

-> Have feature names:      month, year, store_nbr, item_nbr, unit_sales, onpromotion_month_count, family, class, perishable, city, state, type, and cluster.

-> Has optimized size of    0.18 GB.


In [14]:
# Sample of final merged dataframe 'df_02_join'

df_035_join.sample(20)

Unnamed: 0,month,year,store_nbr,item_nbr,unit_sales,onpromotion_month_count,family,class,perishable,city,state,type,cluster
2096592,5,2014,46,114800,823.0,0,PERSONAL CARE,4126,0,Quito,Pichincha,A,14
2166620,5,2015,33,1412115,12.0,0,PERSONAL CARE,4139,0,Quevedo,Los Rios,C,3
3534774,7,2017,44,1579070,516.0,4,PRODUCE,2030,1,Quito,Pichincha,A,5
2606934,6,2014,45,866927,803.0,1,DELI,2632,1,Quito,Pichincha,A,11
3873487,8,2016,23,1981370,81.0,0,GROCERY I,1042,0,Ambato,Tungurahua,D,9
2404071,5,2017,24,890214,100.0,0,GROCERY I,1026,0,Guayaquil,Guayas,D,1
1104038,3,2014,43,1373081,16.0,0,HOME AND KITCHEN II,6328,0,Esmeraldas,Esmeraldas,E,10
1441837,3,2017,38,1047699,45.0,0,BEVERAGES,1122,0,Loja,Loja,D,4
879524,2,2017,18,454593,12.0,0,GROCERY I,1092,0,Quito,Pichincha,B,16
4622463,10,2014,32,938576,117.0,0,GROCERY I,1042,0,Guayaquil,Guayas,C,3


### df_0 History sales join with df_2 Holidays 

In [17]:
print(
    f"-> Contains:                {round(df_2.shape[0]/1e6, 1)} million observations and {df_2.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_2.shape[0]} observations and {df_2.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_2.columns)}.\n")
print(f"-> Has optimized size of    {round(sys.getsizeof(df_2)/1024/1024/1024, 2)} GB.")

df_2.sample(10)

-> Contains:                0.0 million observations and 6 features.

-> Contains:                350 observations and 6 features.

-> Have feature names:      date, type, locale, locale_name, description, and transferred.

-> Has optimized size of    0.0 GB.


Unnamed: 0,date,type,locale,locale_name,description,transferred
150,2014-12-21,Additional,National,Ecuador,Navidad-4,False
272,2016-09-28,Holiday,Local,Ibarra,Fundacion de Ibarra,False
122,2014-07-08,Event,National,Ecuador,Mundial de futbol Brasil: Semifinales,False
335,2017-11-07,Holiday,Regional,Santa Elena,Provincializacion Santa Elena,False
134,2014-10-07,Holiday,Local,Quevedo,Cantonizacion de Quevedo,False
79,2013-11-11,Holiday,Local,Latacunga,Independencia de Latacunga,False
340,2017-12-06,Holiday,Local,Quito,Fundacion de Quito,True
64,2013-07-24,Additional,Local,Guayaquil,Fundacion de Guayaquil-1,False
254,2016-05-16,Event,National,Ecuador,Terremoto Manabi+30,False
288,2016-12-08,Holiday,Local,Loja,Fundacion de Loja,False


In [18]:
# drop unnecessary columns 'description'
df_2_cleaned = df_2.drop(columns=["description"])

In [21]:
# drop unnecessary columns 'id' to save memory
df_0_cleaned = df_0.drop(
    columns=[
        "id"
        # , "onpromotion_month_count"
    ]
)

In [23]:
# Merge of df_0_cleaned with df_2

df_02_join = df_0_cleaned.merge(df_2, on="date", how="left")

In [24]:
print(
    f"-> Contains:                {round(df_02_join.shape[0]/1e6, 1)} million observations and {df_02_join.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_02_join.shape[0]} observations and {df_02_join.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_02_join.columns)}.\n")
print(
    f"-> Has optimized size of    {round(sys.getsizeof(df_02_join)/1024/1024/1024, 2)} GB."
)

-> Contains:                128.0 million observations and 13 features.

-> Contains:                127970257 observations and 13 features.

-> Have feature names:      store_nbr, item_nbr, unit_sales, onpromotion, day, year, month, date, type, locale, locale_name, description, and transferred.

-> Has optimized size of    7.71 GB.


In [25]:
# Sample of final merged dataframe 'df_02_join'

df_02_join.sample(20)

Unnamed: 0,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month,date,type,locale,locale_name,description,transferred
73945818,53,1060036,1.0,True,7,2016,11,2016-11-07,Holiday,Regional,Santa Elena,Provincializacion Santa Elena,False
111811071,2,2006310,3.0,False,20,2017,3,2017-03-20,,,,,
124948500,5,1718319,1.0,False,18,2017,7,2017-07-18,,,,,
28981150,11,1157562,2.0,False,9,2014,4,2014-04-09,,,,,
18298866,54,1346628,4.0,,28,2014,1,2014-01-28,,,,,
17517895,48,956011,4.0,,16,2014,1,2014-01-16,,,,,
17571079,41,1229023,1.0,,17,2014,1,2014-01-17,,,,,
106378718,26,1950703,1.0,False,27,2017,1,2017-01-27,,,,,
45033862,23,1060036,2.0,False,14,2015,11,2015-11-14,,,,,
15406859,51,165988,1.0,,5,2013,9,2013-09-05,,,,,
