### Supermarket data science case study - Exploring first data


### Importing packages

In [5]:
import pandas as pd
import sys

# import matplotlib.pyplot as plt
# import altair as alt
# import vegafusion as vf
import sklearn

from sklearn.pipeline import Pipeline, make_pipeline

### Functions

In [13]:
def f_concat(l_input):

    # Initialize.
    dummy = ""
    n_len = len(l_input)

    if n_len == 1:
        return l_input[0]

    # Loop through text elements.
    for i in range(n_len - 1):
        dummy = dummy + l_input[i] + ", "

    # Append last element.
    dummy = dummy + "and " + l_input[n_len - 1]

    # Return result.
    return dummy

In [None]:
def f_describe(df_input, n_top=10):

    print("First " + str(n_top) + " rows in de data:")
    display(df_input.head(n_top))

    df_numeric = df_input.select_dtypes(
        include=[
            "uint8",
            "uint16",
            "uint32",
            "uint64",
            "int8",
            "int16",
            "int32",
            "int64",
            "float16",
            "float32",
            "float64",
        ]
    )

    if len(df_numeric.columns):
        print("Numerical data:")
        display(df_numeric.describe())

    df_textual = df_input.select_dtypes(include=["category", "object", "bool"])

    if len(df_textual.columns):
        print("Textual data:")
        display(df_textual.describe())

    v_na = [
        col
        + " ("
        + str(df[col].isna().sum())
        + ", "
        + str(round(100 * df[col].isna().sum() / df.shape[0], 1))
        + "%)"
        for col in df.columns
        if df[col].isna().sum() > 0
    ]

    if len(v_na) > 0:
        print("Features and their number of missing values:")
        display(f_concat(v_na))

In [None]:
def extract_datetime_features(df):
    """
    Extracting datetime features
    year, month, day of month, and weekday
    """
    df = df.copy()
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["day"] = df["date"].dt.day
    df["weekday"] = df["date"].dt.dayofweek

    # To-do: add weeknumber-year feature

    return df  ## Weekday range from 0 to 6 --> First weekday starts on Monday = 0 en end at Sunday=6


# df_dt_features = extract_datetime_features(df)

### Downcast and transform data
Update formatting of features to optimize memory and standardize column names.

In [6]:
def standardize_column_names(s):
    return s.replace(" ", "")


def optimize_memory(df):
    # Change: Objects to Categorical.                                               #WHEN needed to transform Objects to Categorical?
    object_cols = df.select_dtypes(include="object").columns

    if not object_cols.empty:
        print("Change: Objects to Categorical")
        df[object_cols] = df[object_cols].astype("category")

    # Change: Convert integers to smallest unsigned integer and floats to smallest.
    for old, new in [("integer", "unsigned"), ("float", "float")]:
        print("Change: " + old + " --> " + new)
        for col in df.select_dtypes(include=old).columns:
            df[col] = pd.to_numeric(df[col], downcast=new)

    return df


def month_year_to_int(df, i):
    # Change: Month and Year to integer
    if i == 0:
        print("Change: Month and Year to integer")
        df = df.astype({"month": int, "year": int})

    return df

### Transform date-related columns to datetime format.

In [7]:
# Convert datasets to time series
def transform_date_to_datetime(df, i):
    if i == 0:
        print("Change: Transformed 'year', 'month', 'day' columns to Datetime feature")
        df["date"] = pd.to_datetime(df[["year", "month", "day"]], unit="us")

        # print(
        #     "Change: Dropped 'year', 'month', 'day' columns and transformed to Datetime64[us] feature"
        # )
        # df.drop(columns=["day", "month", "year"], inplace=True)

    else:
        if "date" in df.columns:
            print("Change: Transformed 'date' column to Datetime Dtype")
            df["date"] = pd.to_datetime(df["date"]).dt.tz_localize(None)

    return df

### Import data from local PATH
Import data trough pipeline to downcast the data and transformations

In [8]:
def f_get_data(i=0):

    # Define path.
    c_path = "C:/Users/alexander/Documents/0. Data Science and AI for Experts/EAISI_4B_Supermarket/data/raw/"

    # Identify file.
    v_file = (
        "history-per-year",  # 0
        "history_aggregated",  # 1
        "holidays_events",  # 2
        "items",  # 3
        "oil",  # 4
        "stores",  # 5
        "transactions",  # 6
    )

    # Load data.
    df = (
        pd.read_parquet(c_path + v_file[i] + ".parquet")
        .rename(columns=standardize_column_names)
        .pipe(optimize_memory)
        .pipe(month_year_to_int, i)
        .pipe(transform_date_to_datetime, i)
    )

    # Return data.
    return df

### Importing data

In [9]:
# Sales History per year
df_0 = f_get_data(0)

Change: integer --> unsigned
Change: float --> float
Change: Month and Year to integer
Change: Transformed 'year', 'month', 'day' columns to Datetime feature


In [10]:
# Holidays
df_2 = f_get_data(2)

Change: Objects to Categorical
Change: integer --> unsigned
Change: float --> float
Change: Transformed 'date' column to Datetime Dtype


In [None]:
# Items
df_3 = f_get_data(3)

In [None]:
# Stores
df_5 = f_get_data(5)

### Aggregation of Sales per Month and join of df_0 History sales + df_3 Items + df_5 Stores Datasets

In [None]:
def onpromotion_month_count(df):

    if "onpromotion" in df.columns:

        df["onpromotion_month_count"] = df.groupby(
            ["item_nbr", "store_nbr", "day", "month", "year"]
        )["onpromotion"].transform("sum")

        print(
            "Change: 'onpromotion' column transformed to 'onpromotion_count' feature."
        )
    else:

        print("The DataFrame does not contain an 'onpromotion' column.")

    return df

In [None]:
df_0_agg = (
    onpromotion_month_count(df_0)  # transformation to 'onpromotion_count' feature
    .drop(columns=["id", "date", "onpromotion"])  # drop unnecessary columns
    .groupby(["month", "year", "store_nbr", "item_nbr"])
    .agg({"unit_sales": "sum", "onpromotion_month_count": "sum"})
    .reset_index()
)

In [None]:
# Merge of df_0_agg with df_3 and df_5

df_03_join = df_0_agg.merge(df_3, on="item_nbr", how="left")

df_035_join = df_03_join.merge(df_5, on="store_nbr", how="left")

In [None]:
print("The transformed and joined data\n")
print(
    f"-> Contains:                {round(df_035_join.shape[0]/1e6, 1)} million observations and {df_035_join.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_035_join.shape[0]} observations and {df_035_join.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_035_join.columns)}.\n")
print(
    f"-> Has optimized size of    {round(sys.getsizeof(df_035_join)/1024/1024/1024, 2)} GB."
)

In [None]:
df_035_join.sample(20)

### df_0 History sales join with df_2 Holidays 

In [14]:
print(
    f"-> Contains:                {round(df_2.shape[0]/1e6, 1)} million observations and {df_2.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_2.shape[0]} observations and {df_2.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_2.columns)}.\n")
print(f"-> Has optimized size of    {round(sys.getsizeof(df_2)/1024/1024/1024, 2)} GB.")


df_2.sample(10)

-> Contains:                0.0 million observations and 6 features.

-> Contains:                350 observations and 6 features.

-> Have feature names:      date, type, locale, locale_name, description, and transferred.

-> Has optimized size of    0.0 GB.


Unnamed: 0,date,type,locale,locale_name,description,transferred
55,2013-05-12,Event,National,Ecuador,Dia de la Madre,False
348,2017-12-25,Holiday,National,Ecuador,Navidad,False
313,2017-05-26,Transfer,National,Ecuador,Traslado Batalla de Pichincha,False
257,2016-06-23,Holiday,Local,Guaranda,Cantonizacion de Guaranda,False
5,2012-05-12,Holiday,Local,Puyo,Cantonizacion del Puyo,False
56,2013-05-24,Holiday,National,Ecuador,Batalla de Pichincha,False
18,2012-10-07,Holiday,Local,Quevedo,Cantonizacion de Quevedo,False
318,2017-07-03,Holiday,Local,El Carmen,Cantonizacion de El Carmen,False
23,2012-11-06,Holiday,Regional,Santo Domingo de los Tsachilas,Provincializacion de Santo Domingo,False
13,2012-08-05,Holiday,Local,Esmeraldas,Fundacion de Esmeraldas,False


In [15]:
df_2_cleaned = df_2.drop(columns=["description"])  # drop unnecessary columns

df_2_cleaned.sample(10)

Unnamed: 0,date,type,locale,locale_name,transferred
171,2015-05-09,Additional,National,Ecuador,False
312,2017-05-24,Holiday,National,Ecuador,True
22,2012-11-03,Holiday,National,Ecuador,False
270,2016-08-15,Holiday,Local,Riobamba,False
338,2017-11-12,Holiday,Local,Ambato,False
164,2015-03-02,Holiday,Local,Manta,False
9,2012-06-25,Holiday,Local,Machala,False
291,2016-12-22,Holiday,Local,Salinas,False
135,2014-10-09,Holiday,National,Ecuador,True
56,2013-05-24,Holiday,National,Ecuador,False


In [None]:
# drop unnecessary in df_0 columns to save memory
df_0_cleaned = df_0.drop(
    columns=[
        "id"
        # , "onpromotion_month_count"
    ]
)

In [16]:
df_02_join = df_0_cleaned.merge(df_2, on="date", how="left")

In [18]:
print(
    f"-> Contains:                {round(df_02_join.shape[0]/1e6, 1)} million observations and {df_02_join.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_02_join.shape[0]} observations and {df_02_join.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_02_join.columns)}.\n")
print(
    f"-> Has optimized size of    {round(sys.getsizeof(df_02_join)/1024/1024/1024, 2)} GB."
)

-> Contains:                128.0 million observations and 13 features.

-> Contains:                127970257 observations and 13 features.

-> Have feature names:      store_nbr, item_nbr, unit_sales, onpromotion, day, year, month, date, type, locale, locale_name, description, and transferred.

-> Has optimized size of    7.71 GB.


In [19]:
df_02_join.sample(20)

Unnamed: 0,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month,date,type,locale,locale_name,description,transferred
55079673,17,1354382,6.0,False,2,2015,5,2015-05-02,,,,,
70266371,28,1970428,16.0,True,1,2016,10,2016-10-01,,,,,
49779077,4,634015,3.823,False,5,2015,2,2015-02-05,,,,,
108514867,41,2019084,2.0,False,16,2017,2,2017-02-16,,,,,
74291829,36,1239794,1.0,False,11,2016,11,2016-11-11,Holiday,Local,Latacunga,Independencia de Latacunga,False
51421136,11,368419,3.0,False,4,2015,3,2015-03-04,,,,,
79802518,12,374464,2.0,False,3,2016,2,2016-02-03,,,,,
88700963,27,1460808,2.0,False,4,2016,5,2016-05-04,Event,National,Ecuador,Terremoto Manabi+18,False
27158418,6,1093340,1.0,,10,2014,3,2014-03-10,,,,,
33058638,45,463598,8.0,False,26,2014,6,2014-06-26,,,,,
