### Supermarket data science case study - Exploring first data


### Importing packages

In [101]:
import pandas as pd
import sys

# import matplotlib.pyplot as plt
# import altair as alt
# import vegafusion as vf
import sklearn

from sklearn.pipeline import Pipeline, make_pipeline

### Functions

In [102]:
def f_concat(l_input):

    # Initialize.
    dummy = ""
    n_len = len(l_input)

    if n_len == 1:
        return l_input[0]

    # Loop through text elements.
    for i in range(n_len - 1):
        dummy = dummy + l_input[i] + ", "

    # Append last element.
    dummy = dummy + "and " + l_input[n_len - 1]

    # Return result.
    return dummy

In [103]:
def f_describe(df_input, n_top=10):

    print("First " + str(n_top) + " rows in de data:")
    display(df_input.head(n_top))

    df_numeric = df_input.select_dtypes(
        include=[
            "uint8",
            "uint16",
            "uint32",
            "uint64",
            "int8",
            "int16",
            "int32",
            "int64",
            "float16",
            "float32",
            "float64",
        ]
    )

    if len(df_numeric.columns):
        print("Numerical data:")
        display(df_numeric.describe())

    df_textual = df_input.select_dtypes(include=["category", "object", "bool"])

    if len(df_textual.columns):
        print("Textual data:")
        display(df_textual.describe())

    v_na = [
        col
        + " ("
        + str(df[col].isna().sum())
        + ", "
        + str(round(100 * df[col].isna().sum() / df.shape[0], 1))
        + "%)"
        for col in df.columns
        if df[col].isna().sum() > 0
    ]

    if len(v_na) > 0:
        print("Features and their number of missing values:")
        display(f_concat(v_na))

In [104]:
def extract_datetime_features(df):
    """
    Extracting datetime features
    year, month, day of month, and weekday
    """
    df = df.copy()
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["day"] = df["date"].dt.day
    df["weekday"] = df["date"].dt.dayofweek

    # To-do: add weeknumber-year feature

    return df  ## Weekday range from 0 to 6 --> First weekday starts on Monday = 0 en end at Sunday=6


# df_dt_features = extract_datetime_features(df)

### Downcast and transform data
Update formatting of features to optimize memory and standardize column names.

In [105]:
def standardize_column_names(s):
    return s.replace(" ", "")


def optimize_memory(df):
    # Change: Objects to Categorical.                                               #WHEN needed to transform Objects to Categorical?
    object_cols = df.select_dtypes(include="object").columns

    if not object_cols.empty:
        print("Change: Objects to Categorical")
        df[object_cols] = df[object_cols].astype("category")

    # Change: Convert integers to smallest unsigned integer and floats to smallest.
    for old, new in [("integer", "unsigned"), ("float", "float")]:
        print("Change: " + old + " --> " + new)
        for col in df.select_dtypes(include=old).columns:
            df[col] = pd.to_numeric(df[col], downcast=new)

    return df


def month_year_to_int(df, i):
    # Change: Month and Year to integer
    if i == 0:
        print("Change: Month and Year to integer")
        df = df.astype({"month": int, "year": int})

    return df

### Transform date-related columns to datetime format.

In [106]:
# Convert datasets to time series
def transform_date_to_datetime(df, i):
    if i == 0:
        print("Change: Transformed 'year', 'month', 'day' columns to Datetime feature")
        df["date"] = pd.to_datetime(df[["year", "month", "day"]], unit="us")

        # print(
        #     "Change: Dropped 'year', 'month', 'day' columns and transformed to Datetime64[us] feature"
        # )
        # df.drop(columns=["day", "month", "year"], inplace=True)

    else:
        if "date" in df.columns:
            print("Change: Transformed 'date' column to Datetime Dtype")
            df["date"] = pd.to_datetime(df["date"]).dt.tz_localize(None)

    return df

### Import data from local PATH
Import data trough pipeline to downcast the data and transformations

In [107]:
def f_get_data(i=0):

    # Define path.
    c_path = "C:/Users/sebas/OneDrive/Documenten/GitHub/Supermarketcasegroupproject/Group4B/data/raw/"

    

    # Identify file.
    v_file = (
        "history-per-year",  # 0
        "history_aggregated",  # 1
        "holidays_events",  # 2
        "items",  # 3
        "oil",  # 4
        "stores",  # 5
        "transactions",  # 6
    )

    # Load data.
    df = (
        pd.read_parquet(c_path + v_file[i] + ".parquet")
        .rename(columns=standardize_column_names)
        .pipe(optimize_memory)
        .pipe(month_year_to_int, i)
        .pipe(transform_date_to_datetime, i)
    )

    # Return data.
    return df

### Importing data

In [108]:
# Sales History per year
df_0 = f_get_data(0)

df_0.head()

Change: integer --> unsigned
Change: float --> float
Change: Month and Year to integer
Change: Transformed 'year', 'month', 'day' columns to Datetime feature


Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month,date
0,0,25,103665,7.0,,1,2013,1,2013-01-01
1,1,25,105574,1.0,,1,2013,1,2013-01-01
2,2,25,105575,2.0,,1,2013,1,2013-01-01
3,3,25,108079,1.0,,1,2013,1,2013-01-01
4,4,25,108701,1.0,,1,2013,1,2013-01-01


In [109]:
# Holidays
df_2 = f_get_data(2)

Change: Objects to Categorical
Change: integer --> unsigned
Change: float --> float
Change: Transformed 'date' column to Datetime Dtype


In [110]:
# Items
df_3 = f_get_data(3)

Change: Objects to Categorical
Change: integer --> unsigned
Change: float --> float


In [111]:
# Stores
df_5 = f_get_data(5)

Change: Objects to Categorical
Change: integer --> unsigned
Change: float --> float


### Aggregation of Sales per Month and join of df_0 History sales + df_3 Items + df_5 Stores Datasets

In [112]:
def onpromotion_month_count(df):

    if "onpromotion" in df.columns:

        df["onpromotion_month_count"] = df.groupby(
            ["item_nbr", "store_nbr", "day", "month", "year"]
        )["onpromotion"].transform("sum")

        print(
            "Change: 'onpromotion' column transformed to 'onpromotion_month_count' feature."
        )
    else:

        print("The DataFrame does not contain an 'onpromotion' column.")

    return df

In [113]:
df_0_agg = (
    onpromotion_month_count(df_0)  # Transformation to 'onpromotion_month_count' feature
    .drop(
        columns=["id", "date", "onpromotion"]
    )  # Drop unnecessary columns "id", "date", "onpromotion"
    .groupby(["month", "year", "store_nbr", "item_nbr"])
    .agg({"unit_sales": "sum", "onpromotion_month_count": "sum"})
    .reset_index()
)

Change: 'onpromotion' column transformed to 'onpromotion_month_count' feature.


In [114]:
# Merge of df_0_agg with df_3 and df_5

df_03_join = df_0_agg.merge(df_3, on="item_nbr", how="left")

df_035_join = df_03_join.merge(df_5, on="store_nbr", how="left")

In [115]:
print("The transformed and joined data\n")
print(
    f"-> Contains:                {round(df_035_join.shape[0]/1e6, 1)} million observations and {df_035_join.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_035_join.shape[0]} observations and {df_035_join.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_035_join.columns)}.\n")
print(
    f"-> Has optimized size of    {round(sys.getsizeof(df_035_join)/1024/1024/1024, 2)} GB."
)

The transformed and joined data

-> Contains:                5.8 million observations and 13 features.

-> Contains:                5813304 observations and 13 features.

-> Have feature names:      month, year, store_nbr, item_nbr, unit_sales, onpromotion_month_count, family, class, perishable, city, state, type, and cluster.

-> Has optimized size of    0.18 GB.


In [116]:
# Sample of final merged dataframe 'df_02_join'

df_035_join.sample(20)

#df_035_join.to_parquet('C:/Users/sebas/OneDrive/Documenten/GitHub/Supermarketcasegroupproject/Group4B/data/interim/df_035_join.parquet')

Unnamed: 0,month,year,store_nbr,item_nbr,unit_sales,onpromotion_month_count,family,class,perishable,city,state,type,cluster
5164744,11,2015,30,687549,9.0,0,GROCERY I,1062,0,Guayaquil,Guayas,C,3
929065,2,2017,37,409738,65.0,0,BREAD/BAKERY,2708,1,Cuenca,Azuay,D,2
277563,1,2016,24,1751174,1131.462036,4,PRODUCE,2008,1,Guayaquil,Guayas,D,1
946982,2,2017,43,573832,220.0,0,GROCERY I,1016,0,Esmeraldas,Esmeraldas,E,10
3073043,7,2013,51,958514,24.0,0,BEAUTY,4252,0,Guayaquil,Guayas,A,17
1570177,4,2014,18,472314,37.0,0,GROCERY I,1030,0,Quito,Pichincha,B,16
5233895,11,2016,4,405304,32.0,0,GROCERY I,1032,0,Quito,Pichincha,D,9
5173241,11,2015,34,1104599,49.0,0,PERSONAL CARE,4114,0,Guayaquil,Guayas,B,6
2535505,6,2013,39,759657,40.0,0,LINGERIE,7016,0,Cuenca,Azuay,B,6
5769378,12,2016,39,1660260,128.0,3,PRODUCE,2028,1,Cuenca,Azuay,B,6


### df_0 History sales join with df_2 Holidays 

In [None]:
print(
    f"-> Contains:                {round(df_2.shape[0]/1e6, 1)} million observations and {df_2.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_2.shape[0]} observations and {df_2.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_2.columns)}.\n")
print(f"-> Has optimized size of    {round(sys.getsizeof(df_2)/1024/1024/1024, 2)} GB.")

df_2.sample(10)

-> Contains:                0.0 million observations and 6 features.

-> Contains:                350 observations and 6 features.

-> Have feature names:      date, type, locale, locale_name, description, and transferred.

-> Has optimized size of    0.0 GB.


Unnamed: 0,date,type,locale,locale_name,description,transferred
245,2016-05-08,Event,National,Ecuador,Dia de la Madre,False
283,2016-11-12,Work Day,National,Ecuador,Recupero Puente Dia de Difuntos,False
337,2017-11-11,Holiday,Local,Latacunga,Independencia de Latacunga,False
170,2015-05-01,Holiday,National,Ecuador,Dia del Trabajo,False
41,2013-01-01,Holiday,National,Ecuador,Primer dia del ano,False
181,2015-07-23,Holiday,Local,Cayambe,Cantonizacion de Cayambe,False
58,2013-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False
60,2013-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
157,2014-12-26,Additional,National,Ecuador,Navidad+1,False
223,2016-04-20,Event,National,Ecuador,Terremoto Manabi+4,False


In [None]:
# drop unnecessary columns 'description'
df_2_cleaned = df_2.drop(columns=["description"])

In [None]:
# drop unnecessary columns 'id' to save memory
df_0_cleaned = df_0.drop(
    columns=[
        "id"
        # , "onpromotion_month_count"
    ]
)

In [None]:
# Merge of df_0_cleaned with df_2

df_02_join = df_0_cleaned.merge(df_2, on="date", how="left")

KeyboardInterrupt: 

In [None]:
print(
    f"-> Contains:                {round(df_02_join.shape[0]/1e6, 1)} million observations and {df_02_join.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_02_join.shape[0]} observations and {df_02_join.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_02_join.columns)}.\n")
print(
    f"-> Has optimized size of    {round(sys.getsizeof(df_02_join)/1024/1024/1024, 2)} GB."
)

-> Contains:                128.0 million observations and 14 features.

-> Contains:                127970257 observations and 14 features.

-> Have feature names:      store_nbr, item_nbr, unit_sales, onpromotion, day, year, month, date, onpromotion_month_count, type, locale, locale_name, description, and transferred.

-> Has optimized size of    8.7 GB.


In [None]:
# Sample of final merged dataframe 'df_02_join'

df_02_join.sample(20)

Unnamed: 0,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month,date,onpromotion_month_count,type,locale,locale_name,description,transferred
77639586,45,1400334,1.0,False,13,2016,12,2016-12-13,0,,,,,
105274147,36,2054101,6.0,False,16,2017,1,2017-01-16,0,,,,,
20950138,1,913966,4.0,False,5,2014,11,2014-11-05,0,,,,,
76911104,42,639586,2.0,False,6,2016,12,2016-12-06,0,Holiday,Local,Quito,Fundacion de Quito,False
49343909,4,949297,4.0,False,30,2015,12,2015-12-30,0,,,,,
45128215,21,954796,3.0,False,15,2015,11,2015-11-15,0,,,,,
83215238,15,165594,9.0,False,10,2016,3,2016-03-10,0,,,,,
43347484,54,807495,17.0,False,26,2015,10,2015-10-26,0,,,,,
46368881,51,1229643,1.0,False,28,2015,11,2015-11-28,0,,,,,
3429759,8,916885,10.0,,17,2013,11,2013-11-17,0,,,,,
