### Supermarket data science case study - Exploring first data


### Importing packages

In [3]:
import pandas as pd
import sys
import matplotlib.pyplot as plt
import altair as alt
import vegafusion as vf
import sklearn

from sklearn.pipeline import Pipeline, make_pipeline

### Functions

In [4]:
def f_concat(l_input):

    # Initialize.
    dummy = ""
    n_len = len(l_input)

    if n_len == 1:
        return l_input[0]

    # Loop through text elements.
    for i in range(n_len - 1):
        dummy = dummy + l_input[i] + ", "

    # Append last element.
    dummy = dummy + "and " + l_input[n_len - 1]

    # Return result.
    return dummy

In [5]:
def f_describe(df_input, n_top=10):

    print("First " + str(n_top) + " rows in de data:")
    display(df_input.head(n_top))

    df_numeric = df_input.select_dtypes(
        include=[
            "uint8",
            "uint16",
            "uint32",
            "uint64",
            "int8",
            "int16",
            "int32",
            "int64",
            "float16",
            "float32",
            "float64",
        ]
    )

    if len(df_numeric.columns):
        print("Numerical data:")
        display(df_numeric.describe())

    df_textual = df_input.select_dtypes(include=["category", "object", "bool"])

    if len(df_textual.columns):
        print("Textual data:")
        display(df_textual.describe())

    v_na = [
        col
        + " ("
        + str(df[col].isna().sum())
        + ", "
        + str(round(100 * df[col].isna().sum() / df.shape[0], 1))
        + "%)"
        for col in df.columns
        if df[col].isna().sum() > 0
    ]

    if len(v_na) > 0:
        print("Features and their number of missing values:")
        display(f_concat(v_na))

In [7]:
def extract_datetime_features(df):
    """
    Extracting datetime features
    year, month, day of month, and day of week
    """
    df = df.copy()
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["day"] = df["date"].dt.day
    df["weekday"] = df["date"].dt.dayofweek
    # df["week"] = df["date"].dt.week

    return df


# df_dt_features = extract_datetime_features(df)

### Downcast and transform data
Update formatting of features to optimize memory and standardize column names.

In [8]:
def standardize_column_names(s):
    return s.replace(" ", "")


def optimize_memory(df):
    # Change: Objects to Categorical.                                               #WHEN needed to transform Objects to Categorical?
    # object_cols = df.select_dtypes(include="object").columns

    # if not object_cols.empty:
    #     print("Change: Objects to Categorical")
    #     df[object_cols] = df[object_cols].astype("category")

    # Change: Convert integers to smallest unsigned integer and floats to smallest.
    for old, new in [("integer", "unsigned"), ("float", "float")]:
        print("Change: " + old + " --> " + new)
        for col in df.select_dtypes(include=old).columns:
            df[col] = pd.to_numeric(df[col], downcast=new)

    return df


def month_year_to_int(df, i):
    # Change: Month and Year to integer
    if i == 0:
        print("Change: Month and Year to integer")
        df = df.astype({"month": int, "year": int})

    return df


# To-do: build pipeline for rounding digits #,0000000 for UnitSales

# To-do: build pipeline for N/A for onpromotion

### Transform date-related columns to datetime format.

In [9]:
# Convert datasets to time series
def transform_date_to_datetime(df, i):
    if i == 0:
        print("Change: Transformed 'year', 'month', 'day' columns to Datetime feature")
        df["date"] = pd.to_datetime(df[["year", "month", "day"]], unit="us")

        # print(
        #     "Change: Dropped 'year', 'month', 'day' columns and transformed to Datetime64[us] feature"
        # )
        # df.drop(columns=["day", "month", "year"], inplace=True)

    else:
        if "date" in df.columns:
            print("Change: Transformed 'date' column to Datetime Dtype")
            df["date"] = pd.to_datetime(df["date"]).dt.tz_localize(None)

    return df

### Import data from local PATH
Import data trough pipeline to downcast the data and transformations

In [11]:
def f_get_data(i=0):

    # Define path.
    c_path = "C:/Users/alexander/Documents/0. Data Science and AI for Experts/EAISI_4B_Supermarket/data/raw/"

    # Identify file.
    v_file = (
        "history-per-year",  # 0
        "history_aggregated",  # 1
        "holidays_events",  # 2
        "items",  # 3
        "oil",  # 4
        "stores",  # 5
        "transactions",  # 6
    )

    # Load data.
    df = (
        pd.read_parquet(c_path + v_file[i] + ".parquet")
        .rename(columns=standardize_column_names)
        .pipe(optimize_memory)
        .pipe(month_year_to_int, i)
        .pipe(transform_date_to_datetime, i)
    )

    # Return data.
    return df

### Importing data

In [None]:
df = f_get_data(0)
# f_describe(df)
# df.info()

### Some Statistics:

In [46]:
print("The data\n")
print(
    f"-> Contains:                {round(df_0.shape[0]/1e6, 1)} million observations and {df_0.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_0.shape[0]} observations and {df_0.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_0.columns)}.\n")
print(f"-> Has optimized size of    {round(sys.getsizeof(df)/1024/1024/1024, 2)} GB.")

The data

-> Contains:                125.5 million observations and 9 features.

-> Contains:                125497040 observations and 9 features.

-> Have feature names:      id, store_nbr, item_nbr, unit_sales, onpromotion, day, year, month, and date.



NameError: name 'df' is not defined

In [None]:
# df = f_get_data(0)
# f_describe(df)

df["year"] = df["date"].dt.isocalendar().year
df["week"] = df["date"].dt.isocalendar().week

df_0_unit_sales_weeksum = (
    df.groupby(["item_nbr", "year", "week"]).agg({"unit_sales": "sum"}).reset_index()
)

df_0_unit_sales_weeksum["first_day_of_week"] = pd.to_datetime(
    df_0_unit_sales_weeksum["year"].astype(str)
    + "-"
    + df_0_unit_sales_weeksum["week"].astype(str)
    + "-1",
    format="%G-%V-%u",
)

filtered_df = df_0_unit_sales_weeksum[df_0_unit_sales_weeksum["year"] == 2013]

# df_0_unit_sales_sum = df.groupby("week").sum().reset_index()
filtered_df.head()
# df.tail(10)
# df.sample(20)
# df.info()
# df.describe()
# df.nunique

In [None]:
df_aggregated = f_get_data(1)

# df_aggregated.sort_values(by='date')
df_aggregated["year"] = df_aggregated["date"].dt.isocalendar().year
filtered_df_1 = df_aggregated[df_aggregated["date"] == 2016]
filtered_df_1.head()
# f_describe(df)
# df.info()
# df.sample(20)

In [None]:
df_1_test = f_get_data(1)
df_1_test.tail()

In [None]:
print(
    f"-> [Normal     #0] Contains:                {round(df.shape[0]/1e6, 1)} million observations and {df.shape[1]} features."
)
print(
    f"-> [Aggregated #1] Contains:                {round(df_aggregated.shape[0]/1e6, 1)} million observations and {df_aggregated.shape[1]} features.\n"
)

print(
    f"-> [Normal     #0] Contains:                {df.shape[0]} observations and {df.shape[1]} features."
)
print(
    f"-> [Aggregated #1] Contains:                {df_aggregated.shape[0]} observations and {df_aggregated.shape[1]} features.\n"
)

print(f"-> [Normal     #0] Have feature names:      {f_concat(df.columns)}.")
print(
    f"-> [Aggregated #1] Have feature names:      {f_concat(df_aggregated.columns)}.\n"
)

print(
    f"-> [Normal     #0] Has optimized size of    {round(sys.getsizeof(df)/1024/1024/1024, 2)} GB."
)
print(
    f"-> [Aggregated #1] Has optimized size of    {round(sys.getsizeof(df_aggregated)/1024/1024/1024, 2)} GB."
)

In [None]:
df = f_get_data(6)
f_describe(df)
df.info()
df.sample(20)

In [None]:
df = f_get_data(6)

In [None]:
# alt.data_transformers.enable("default")
# alt.data_transformers.disable_max_rows()


aggregated_df = df.groupby("date").sum().reset_index()

print(
    f"-> Contains:                {df.shape[0]} observations and {df.shape[1]} features.\n"
)

print(
    f"-> Contains:                {aggregated_df.shape[0]} observations and {aggregated_df.shape[1]} features.\n"
)

aggregated_df2 = df.groupby(["date", "store_nbr"]).sum().reset_index()
print(
    f"-> Contains:                {aggregated_df2.shape[0]} observations and {aggregated_df2.shape[1]} features.\n"
)


print(1682 / 5)
# aggregated_df.head(20)

print(df.dtypes)
print(aggregated_df.dtypes)

In [None]:
# Enable the Vegafusion transformer and disable the maximum rows limit
# alt.data_transformers.enable("vegafusion")
df_6 = f_get_data(6)

alt.data_transformers.disable_max_rows()

aggregated_df = df_6.groupby("date").sum().reset_index()

chart = (
    alt.Chart(aggregated_df)
    .mark_circle()
    .encode(x="date:T", y="transactions:Q")  # , color="store_nbr:N") #Dtype = uint64
    .properties(width=1200, height=480)
    .interactive()
)

chart  # .display()

In [22]:
# Enable the Vegafusion transformer and disable the maximum rows limit
# vf.enable()
# alt.data_transformers.enable("vegafusion")
# alt.data_transformers.disable_max_rows()

# Enable Vegafusion and set a higher row limit
# vf.enable(row_limit=100000)
# alt.data_transformers.enable("vegafusion")

filtered_df = df_0[df_0["date"].dt.year == 2014]

df["store_nbr"] = df["store_nbr"].astype("category")

chart = (
    alt.Chart(filtered_df)
    .mark_circle()
    .encode(x="date:T", y="transactions:Q", color="store_nbr:N")  # Dtype = uint64
    .properties(width=1200, height=480)
    .interactive()
)


chart.display()

NameError: name 'df' is not defined

## Sale by the date "scatter plot"

In [None]:
# df_0 = f_get_data(0)

df_0.info()

In [11]:
df_0 = f_get_data(0)
df_1 = f_get_data(1)
df_ = f_get_data(1)

Change: integer --> unsigned
Change: float --> float
Change: Month and Year to integer
Change: Transformed 'year', 'month', 'day' columns to Datetime feature
Change: integer --> unsigned
Change: float --> float
Change: Transformed 'date' column to Datetime Dtype


In [33]:
unique_item_count_df_0 = df_0["item_nbr"].nunique()
print(f"Unique item numbers in df_0: {unique_item_count_df_0}")

unique_item_count_df_1 = df_1["item_nbr"].nunique()
print(f"Unique item numbers in df_1: {unique_item_count_df_1}")

Unique item numbers in df_0: 54
Unique item numbers in df_1: 4036


In [29]:
print(df_0["item_nbr"].dtype)
print(df_1["item_nbr"].dtype)

uint32
uint32


In [34]:
df_1.sample(10)

Unnamed: 0,onpromotion,item_nbr,week,date,unit_sales_sum,count,family,class,perishable
450093,False,2046528,201649,2015-12-04,213.0,62,LAWN AND GARDEN,6922,0
739171,False,123601,201510,2015-03-05,3045.0,313,GROCERY I,1072,0
399500,False,1920046,201541,2015-10-08,821.0,235,GROCERY I,1016,0
513392,True,583925,201720,2016-05-22,177.509,24,MEATS,2302,1
708090,True,1960591,201711,2016-03-20,253.0,27,PRODUCE,2074,1
341886,False,1473403,201627,2015-07-03,6523.233,284,PRODUCE,2010,1
404229,False,1937083,201546,2015-11-12,116.0,40,BEVERAGES,1136,0
440970,False,2011329,201611,2015-03-13,2745.0,324,GROCERY I,1083,0
768459,False,220435,201727,2016-07-10,5000.0,168,GROCERY I,1080,0
901007,False,586911,201652,2015-12-25,439.0,205,PERSONAL CARE,4114,0


In [12]:
df_dt_features = extract_datetime_features(df_0)

## Looks like dayofweek start 1 on tuesday instead of monday
## Weekday from 0 to 6? First weekday starts on 0

# df_dt_features.sample(10)

In [13]:
df_dt_features = extract_datetime_features(df_0)
df_0_cleaned = df_dt_features.drop(columns=["id"])

df_1_cleaned = df_1.drop(columns=["week", "date", "unit_sales_sum", "count"])


# df_0['item_nbr'] = df_0['item_nbr'].astype('category')


# df_1_cleaned['item_nbr'] = df_1_cleaned['item_nbr'].astype('category')

df_0 = (
    df_0.groupby(["weekday", "store_nbr", "item_nbr"])
    .agg({"unit_sales": "mean"})
    .reset_index()
)


df_0_join_item_nbr = df_0.merge(df_1_cleaned, on="item_nbr", how="left")


df_0_join_item_nbr.head(10)

KeyError: 'weekday'

In [21]:
# alt.data_transformers.disable_max_rows()

# data = pd.DataFrame({"date": df["date"], "unit_sales ": df["unit_sales"]})

# #chart = (                        ##To many rows, aggregation needed first
#     alt.Chart(data)
#     .mark_point(size=10)
#     .encode(
#         x="date:T",
#         y="unit_sales:Q",
#         tooltip=["date", "unit_sales "],
#     )
#     .properties(width=800, height=400, title="Unit_sales by Date")
#     .interactive()
# )

# chart.show()

IndentationError: unexpected indent (1729989079.py, line 6)

In [None]:
df.info()

In [19]:
agg = df_0.groupby(["month", "store_nbr"]).agg({"unit_sales": "mean"}).reset_index()

# Set the row limit to a higher value
alt.data_transformers.disable_max_rows()

# agg['month'] = agg['month'].astype(str)

# Create the line plot
line_chart = (
    alt.Chart(agg)
    .mark_line()
    .encode(
        x="month:O",
        y="unit_sales:Q",
        color="store_nbr:N",
        tooltip=["month", "store_nbr", "unit_sales"],
    )
    .properties(title="Average Sales by Date and Store number", width=1200, height=600)
    .interactive()
)

# Display the plot
line_chart.show()

In [24]:
# Set the row limit to a higher value
alt.data_transformers.disable_max_rows()

# Create the stacked area plot
stacked_area_chart = (
    alt.Chart(agg)
    .mark_area()
    .encode(
        x="date:T",
        y=alt.Y("unit_sales:Q", stack="zero"),
        color="store_nbr:N",
        tooltip=["date:T", "store_nbr:N", "unit_sales:Q"],
    )
    .properties(title="Average Sales by Date and Store Number", width=1200, height=600)
    .interactive()
)

# Display the plot
stacked_area_chart.show()

In [28]:
agg2 = df_0.groupby(["date", "store_nbr"]).agg({"unit_sales": "mean"}).reset_index()

# Set the row limit to a higher value
alt.data_transformers.disable_max_rows()

# Create the line plot
line_chart = (
    alt.Chart(agg2)
    .mark_area()
    .encode(
        x="date:T",
        y="unit_sales:Q",
        color="store_nbr:N",
        tooltip=["date", "store_nbr", "unit_sales"],
    )
    .properties(title="Average Sales by Date and Store number", width=1200, height=600)
    .interactive()
)

# Display the plot
line_chart.show()

In [20]:
df_3 = f_get_data(3)

df_3.sample(10)

# df_1.sample(10)

# df_0.sample(10)

Change: integer --> unsigned
Change: float --> float


Unnamed: 0,item_nbr,family,class,perishable
3935,2058758,BEVERAGES,1136,0
3812,2035576,"LIQUOR,WINE,BEER",1318,0
742,586967,PERSONAL CARE,4122,0
2326,1412204,FROZEN FOODS,2246,0
2341,1418844,GROCERY I,1004,0
255,269287,CLEANING,3014,0
536,463903,CLEANING,3034,0
3164,1920863,GROCERY I,1006,0
1962,1239897,POULTRY,2420,1
2011,1255777,BEVERAGES,1124,0


In [41]:
df_1.sample(10)

Unnamed: 0,onpromotion,item_nbr,week,date,unit_sales_sum,count,family,class,perishable
770122,False,223434,201653,2016-01-01,3033.0,297,GROCERY I,1032,0
676534,True,1584575,201537,2015-09-10,1837.0,36,PRODUCE,2016,1
870937,False,517903,201541,2015-10-08,795.0,164,DELI,2632,1
723435,True,2042947,201721,2016-05-29,78.0,3,BEVERAGES,1148,0
737042,False,119191,201502,2015-01-08,754.0,123,CLEANING,3044,0
684393,True,1693657,201626,2015-06-26,862.0,26,PRODUCE,2018,1
283716,False,1402017,201728,2016-07-17,224.0,122,CLEANING,3034,0
494752,True,456870,201613,2015-03-27,1.0,1,GROCERY I,1010,0
438153,False,2010916,201712,2016-03-27,2723.0,345,GROCERY I,1032,0
201665,False,1209721,201403,2014-01-15,356.0,121,GROCERY I,1042,0


In [22]:
df_0.sample(10)

Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month,date
41226408,58851854,22,991331,10.0,False,10,2015,10,2015-10-10
9338708,4895240,1,939663,5.0,,29,2013,4,2013-04-29
6048353,1604885,15,108701,1.0,,10,2013,2,2013-02-10
45330501,62955947,3,1345352,12.0,False,24,2015,11,2015-11-24
112199508,112199508,13,843462,3.0,False,12,2017,4,2017-04-12
31384498,24863645,26,1159726,2.0,False,4,2014,6,2014-06-04
12318959,7875491,54,1047679,7.0,,4,2013,7,2013-07-04
2141806,12809036,1,329397,4.0,,21,2013,10,2013-10-21
87104550,77941044,47,307740,8.0,False,1,2016,5,2016-05-01
37818601,31297748,48,795612,1.0,False,19,2014,9,2014-09-19


In [12]:
df_0 = f_get_data(0)

Change: integer --> unsigned
Change: float --> float
Change: Month and Year to integer
Change: Transformed 'year', 'month', 'day' columns to Datetime feature


In [25]:
df_0.tail(10)

Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month,date
125497030,125497030,54,2086882,1.0,False,15,2017,8,2017-08-15
125497031,125497031,54,2087409,3.0,False,15,2017,8,2017-08-15
125497032,125497032,54,2087978,8.0,False,15,2017,8,2017-08-15
125497033,125497033,54,2088922,7.0,False,15,2017,8,2017-08-15
125497034,125497034,54,2089036,4.0,False,15,2017,8,2017-08-15
125497035,125497035,54,2089339,4.0,False,15,2017,8,2017-08-15
125497036,125497036,54,2106464,1.0,True,15,2017,8,2017-08-15
125497037,125497037,54,2110456,192.0,False,15,2017,8,2017-08-15
125497038,125497038,54,2113914,198.0,True,15,2017,8,2017-08-15
125497039,125497039,54,2116416,2.0,False,15,2017,8,2017-08-15


In [31]:
# def onpromotion_month_count(df):
#     if ['onpromotion'] == True:
#         [onpromotion_month_count] = 1

#     else:
#         0

#     return df


def onpromotion_month_count(df):

    if "onpromotion" in df.columns:

        df["onpromotion_month_count"] = df.groupby(
            ["item_nbr", "store_nbr", "day", "month", "year"]
        )["onpromotion"].transform("sum")

        print("Change: Onpromotion column transformed to 'onpromotion_count' feature.")
    else:

        print("The DataFrame does not contain an 'onpromotion' column.")

    return df

In [32]:
df_0_promo_count = onpromotion_month_count(df_0)

Change: Onpromotion column transformed to 'onpromotion_count' feature.


Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month,date,onpromotion_month_count
125497030,125497030,54,2086882,1.0,False,15,2017,8,2017-08-15,0
125497031,125497031,54,2087409,3.0,False,15,2017,8,2017-08-15,0
125497032,125497032,54,2087978,8.0,False,15,2017,8,2017-08-15,0
125497033,125497033,54,2088922,7.0,False,15,2017,8,2017-08-15,0
125497034,125497034,54,2089036,4.0,False,15,2017,8,2017-08-15,0
125497035,125497035,54,2089339,4.0,False,15,2017,8,2017-08-15,0
125497036,125497036,54,2106464,1.0,True,15,2017,8,2017-08-15,1
125497037,125497037,54,2110456,192.0,False,15,2017,8,2017-08-15,0
125497038,125497038,54,2113914,198.0,True,15,2017,8,2017-08-15,1
125497039,125497039,54,2116416,2.0,False,15,2017,8,2017-08-15,0


In [34]:
df_0_promo_count.sample(20)

Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month,date,onpromotion_month_count
57623128,49141408,38,1463787,21.0,False,16,2015,6,2015-06-16,0
8524603,4081135,50,368140,3.0,,9,2013,4,2013-04-09,0
89956598,80793092,33,414421,3.0,False,31,2016,5,2016-05-31,0
48733442,66358888,51,1968452,57.0,False,30,2015,12,2015-12-30,0
100157678,90994172,45,1660271,22.0,True,14,2016,9,2016-09-14,1
29285414,22764561,9,521818,7.0,False,23,2014,4,2014-04-23,0
17008665,17008665,28,308916,1.0,,12,2014,1,2014-01-12,0
112439436,112439436,36,463598,2.0,True,14,2017,4,2017-04-14,1
78632197,69468691,49,1463765,5.0,False,2,2016,2,2016-02-02,0
12451695,8008227,46,987308,41.0,,7,2013,7,2013-07-07,0


In [None]:
# df_1_cleaned['item_nbr'] = df_1_cleaned['item_nbr'].astype('category')

# df_0_promo_count

df_0_cleaned = (
    df_0_promo_count.drop(columns=["id", "date", "onpromotion"])
    .groupby(["month", "year", "store_nbr", "item_nbr"])  # when true +1, else 0
    .agg({"unit_sales": "sum", "onpromotion_month_count": "sum"})
    .reset_index()
)

# df_0_cleaned.sample(10)

df_03_join = df_0_cleaned.merge(df_3, on="item_nbr", how="left")

In [39]:
df_03_join.sample(20)

Unnamed: 0,month,year,store_nbr,item_nbr,unit_sales,onpromotion_month_count,family,class,perishable
753394,2,2016,22,1328907,230.0,0,PERSONAL CARE,4114,0
5700673,12,2016,13,841607,8.0,0,CLEANING,3024,0
5422539,12,2013,44,1040170,223.0,0,FROZEN FOODS,2222,0
3007771,6,2017,52,1151128,61.0,0,GROCERY I,1038,0
3980476,8,2017,9,1473475,111.264,3,PRODUCE,2014,1
5318515,11,2016,37,1463784,50.0,0,BEVERAGES,1114,0
2668267,6,2015,25,759697,88.0,0,BEVERAGES,1124,0
5359135,11,2016,50,1463786,192.0,0,BEVERAGES,1116,0
720614,2,2016,7,1393047,114.0,0,GROCERY I,1040,0
2126618,5,2015,8,1367438,73.0,0,HOME AND KITCHEN II,6330,0


In [43]:
df_5 = f_get_data(5)
# df_5.sample(20)  # join on store_nmr

df_035_join = df_03_join.merge(df_5, on="store_nbr", how="left")

df_035_join.sample(20)

Change: integer --> unsigned
Change: float --> float


Unnamed: 0,month,year,store_nbr,item_nbr,unit_sales,onpromotion_month_count,family,class,perishable,city,state,type,cluster
1490485,4,2013,1,364606,1650.0,0,GROCERY I,1014,0,Quito,Pichincha,D,13
1269061,3,2016,26,305227,115.0,0,GROCERY I,1013,0,Guayaquil,Guayas,D,10
5465120,12,2014,14,1696013,306.075989,5,PRODUCE,2022,1,Riobamba,Chimborazo,C,7
3289090,7,2016,4,1117663,138.0,0,GROCERY I,1034,0,Quito,Pichincha,D,9
4060260,8,2017,41,2013621,179.0,0,GROCERY I,1010,0,Machala,El Oro,D,4
2824273,6,2016,40,121964,7.0,0,AUTOMOTIVE,6810,0,Machala,El Oro,C,3
3462322,7,2017,17,2060910,487.0,18,BEVERAGES,1126,0,Quito,Pichincha,C,12
1814911,4,2016,48,1084881,531.0,4,GROCERY I,1040,0,Quito,Pichincha,A,14
1699928,4,2016,2,799461,56.0,0,GROCERY I,1060,0,Quito,Pichincha,D,13
461572,1,2017,42,208699,55.0,0,GROCERY I,1034,0,Cuenca,Azuay,D,2


In [18]:
print("The data\n")
print(
    f"-> Contains:                {round(df_0.shape[0]/1e6, 1)} million observations and {df_0.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_0.shape[0]} observations and {df_0.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_0.columns)}.\n")
print(f"-> Has optimized size of    {round(sys.getsizeof(df_0)/1024/1024/1024, 2)} GB.")

The data

-> Contains:                125.5 million observations and 9 features.

-> Contains:                125497040 observations and 9 features.

-> Have feature names:      id, store_nbr, item_nbr, unit_sales, onpromotion, day, year, month, and date.

-> Has optimized size of    3.74 GB.


In [19]:
print("The data\n")
print(
    f"-> Contains:                {round(df_0_cleaned.shape[0]/1e6, 1)} million observations and {df_0_cleaned.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_0_cleaned.shape[0]} observations and {df_0_cleaned.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_0_cleaned.columns)}.\n")
print(
    f"-> Has optimized size of    {round(sys.getsizeof(df_0_cleaned)/1024/1024/1024, 2)} GB."
)

The data

-> Contains:                5.8 million observations and 5 features.

-> Contains:                5813304 observations and 5 features.

-> Have feature names:      month, year, store_nbr, item_nbr, and unit_sales.

-> Has optimized size of    0.09 GB.


In [42]:
print("The data\n")
print(
    f"-> Contains:                {round(df_03_join.shape[0]/1e6, 1)} million observations and {df_03_join.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_03_join.shape[0]} observations and {df_03_join.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_03_join.columns)}.\n")
print(
    f"-> Has optimized size of    {round(sys.getsizeof(df_03_join)/1024/1024/1024, 2)} GB."
)

The data

-> Contains:                5.8 million observations and 9 features.

-> Contains:                5813304 observations and 9 features.

-> Have feature names:      month, year, store_nbr, item_nbr, unit_sales, onpromotion_month_count, family, class, and perishable.

-> Has optimized size of    0.47 GB.


In [45]:
print("The data\n")
print(
    f"-> Contains:                {round(df_035_join.shape[0]/1e6, 1)} million observations and {df_035_join.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_035_join.shape[0]} observations and {df_035_join.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_035_join.columns)}.\n")
print(
    f"-> Has optimized size of    {round(sys.getsizeof(df_035_join)/1024/1024/1024, 2)} GB."
)

The data

-> Contains:                5.8 million observations and 13 features.

-> Contains:                5813304 observations and 13 features.

-> Have feature names:      month, year, store_nbr, item_nbr, unit_sales, onpromotion_month_count, family, class, perishable, city, state, type, and cluster.

-> Has optimized size of    1.36 GB.


In [48]:
df_1 = f_get_data(1)

print(
    f"-> Contains:                {df_1.shape[0]} observations and {df_1.shape[1]} features.\n"
)
df_1.sample(10)

Change: integer --> unsigned
Change: float --> float
Change: Transformed 'date' column to Datetime Dtype
-> Contains:                948220 observations and 9 features.



Unnamed: 0,onpromotion,item_nbr,week,date,unit_sales_sum,count,family,class,perishable
49904,False,865147,201609,2015-02-27,1036.0,277,PERSONAL CARE,4114,0
329686,False,1464081,201723,2016-06-12,1512.0,338,BEVERAGES,1190,0
365570,False,1584379,201619,2015-05-08,241.0,123,LADIESWEAR,7780,0
943382,False,750855,201732,2016-08-14,528.0,188,GROCERY I,1062,0
75582,False,926958,201731,2016-08-07,1286.0,305,GROCERY I,1026,0
623622,True,1260238,201712,2016-03-27,8.0,1,CLEANING,3034,0
431739,False,2005844,201653,2016-01-01,265.0,73,HOME AND KITCHEN II,6350,0
733031,False,115693,201610,2015-03-06,930.0,257,GROCERY I,1040,0
874755,False,527757,201617,2015-04-24,692.0,176,GROCERY I,1032,0
276814,False,1386506,201411,2014-03-12,168.0,109,HOME AND KITCHEN I,6206,0
