# EyeOn Supermarket data science case study - Bird's eye view of the data

In [39]:
#pip install --user pyarrow

### Importing packages

In [40]:
import pandas as pd
import sys
import matplotlib.pyplot as plt

### Functions

In [41]:
def f_concat(l_input):

    # Initialize.
    dummy = ""
    n_len = len(l_input)
    
    if n_len == 1:
        return l_input[0]

    # Loop through text elements.
    for i in range(n_len-1):
        dummy = dummy + l_input[i] + ", "

    # Append last element.
    dummy = dummy + "and " + l_input[n_len-1]

    # Return result.
    return dummy

In [42]:
def f_info(df_input, c_feature, n_top =  10):

    # Testing!
    #df_input  = df_historyPerYear
    #c_feature = "year"
    #n_top     = 3

    # Do not calculate the frequency table in case the feature has unique values.
    if(df_input[c_feature].is_unique):
        print("Feature '" + c_feature + "' is unique.")
        return


    # Load package.
    from collections import Counter

    # Bereken frequenties.
    c = Counter(df_input[c_feature])

    # Converteer naar data frame.
    df_output         = pd.DataFrame(list(c.items()))

    # Hernoem kolomnamen.
    df_output.columns = ["level", "n"]

    # Bereken percentage.
    df_output["perc"] = round(100 * df_output["n"] / df_input.shape[0], 1).astype(str) + "%"

    # Sorteer data frame op frequentie.
    df_output         = df_output.sort_values(by = "n", ascending = False)

    # Reset index en verwijder index kolom die daardoor ontstaat.
    df_output         = df_output.reset_index().drop(columns=['index'])

    # Display tabel zonder index.
    # https://stackoverflow.com/questions/61363712/how-to-print-a-pandas-io-formats-style-styler-object
    if(df_output.shape[0] <= n_top):
        c.message = "we tonen alle " + str(df_output.shape[0]) + " levels:"
        
    else:
        c.message = "we tonen de Top-" + str(n_top) + " van de " + str(df_output.shape[0]) + " levels:"

        
    # Print header
    print("Frequentietabel voor '" + c_feature + "', " + c.message + "\n")
        
    print(f"'None': {c[None]} ({round(100 * c[None] / df_input.shape[0], 1)}%)")

    print(f"'NA':   {df_input[c_feature].isna().sum()} ({round(100 * df_input[c_feature].isna().sum() / df_input.shape[0], 1)}%)")
            
    display(df_output.head(n_top).style.hide_index())
    
    #plt.hist(df["year"], bins=10, orientation='horizontal') #plt.show()

    # Plot frequency if count is smaller than 20.
    v_data_to_plot = df[c_feature].value_counts(sort = True)[0:n_top]
    
    # https://stackoverflow.com/questions/36367986/how-to-make-inline-plots-in-jupyter-notebook-larger
    plt.rcParams['figure.figsize'] = [15, 5]
    
    fig, ax = plt.subplots()
    v_data_to_plot.plot(kind='bar', ax=ax)  
    plt.show()
    

In [43]:
#df['transactions'].value_counts(sort = True)[0:10]

In [110]:
f_info(df, "month", 15)

Frequentietabel voor 'month', we tonen alle 12 levels:

'None': 0 (0.0%)
'NA':   0 (0.0%)


AttributeError: 'Styler' object has no attribute 'hide_index'

In [90]:
def f_describe(df_input, n_top = 10):
    
    print("First " + str(n_top) + " rows in de data:")
    display(df_input.head(n_top))
     
    
    df_numeric = df_input.select_dtypes(include = ['uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64'])

    if len(df_numeric.columns):
        print("Numerical data:")
        display(df_numeric.describe())

        
    df_textual = df_input.select_dtypes(include = ['category', 'object', 'bool'])

    if len(df_textual.columns):
        print("Textual data:")
        display(df_textual.describe())
        
        
    v_na = [col + " (" + str(df[col].isna().sum()) + ", " + str(round(100 * df[col].isna().sum() / df.shape[0], 1)) + "%)" for col in df.columns if df[col].isna().sum() > 0]

    if len(v_na) > 0:
        print("Features and their number of missing values:")
        display(f_concat(v_na))


In [86]:
def f_get_data(i=0):

    #c_path = "drive/MyDrive/Colab Notebooks/2021 01 - Execute/EyeOn/Data/"
    #c_path = "/Users/home/OneDrive - IWD/Clients/PE/2021 01 - Execute - EyeOn/Data/"

    # Define path.
    c_path = 'C:/Users/alexander/Documents/0. Data Science and AI for Experts/2024 04 12 - Group 4b Project - Supermarket-case/application-project-supermarket/data/'

    # Identify file.
    v_file = ("history-per-year",   # 0
              "history_aggregated", # 1
              "holidays_events",    # 2
              "items",              # 3
              "oil",                # 4
              "stores",             # 5
              "transactions")       # 6

    # Load data.
    df     = pd.read_parquet(c_path + v_file[i] + ".parquet")
    
    # Return data.
    return df

In [111]:
f_describe(df)

First 10 rows in de data:


Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month
0,0,25,103665,7.0,,1,2013,1
1,1,25,105574,1.0,,1,2013,1
2,2,25,105575,2.0,,1,2013,1
3,3,25,108079,1.0,,1,2013,1
4,4,25,108701,1.0,,1,2013,1
5,5,25,108786,3.0,,1,2013,1
6,6,25,108797,1.0,,1,2013,1
7,7,25,108952,1.0,,1,2013,1
8,8,25,111397,13.0,,1,2013,1
9,9,25,114790,3.0,,1,2013,1


Numerical data:


Unnamed: 0,store_nbr,unit_sales,day,year,month
count,125497000.0,125497000.0,125497000.0,125497000.0,125497000.0
mean,27.46458,8.554879,15.60188,2015.223,6.334971
std,16.33051,23.60515,8.816411,1.29914,3.392866
min,1.0,-15372.0,1.0,2013.0,1.0
25%,12.0,2.0,8.0,2014.0,3.0
50%,28.0,4.0,15.0,2015.0,6.0
75%,43.0,9.0,23.0,2016.0,9.0
max,54.0,89440.0,31.0,2017.0,12.0


Textual data:


Unnamed: 0,onpromotion
count,103839389
unique,2
top,False
freq,96028767


Features and their number of missing values:


'onpromotion (21657651, 17.3%)'

In [48]:
#f_describe(df)

### Importing data

In [102]:
df = f_get_data(0)

### Downcast data as needed

In [103]:
# A few basic statistics on df_historyPerYear.
print("The data:\n")
print(f"-> has size of {round(sys.getsizeof(df)/1024/1024/1024, 3)} GB.")
print("")

df.info()

The data:

-> has size of 3.506 GB.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125497040 entries, 0 to 125497039
Data columns (total 8 columns):
 #   Column       Dtype   
---  ------       -----   
 0   id           int64   
 1   store_nbr    uint8   
 2   item_nbr     int64   
 3   unit_sales   float64 
 4   onpromotion  boolean 
 5   day          uint8   
 6   year         category
 7   month        category
dtypes: boolean(1), category(2), float64(1), int64(2), uint8(2)
memory usage: 3.5 GB


In [106]:
# Update formatting of features
i=0

if(i == 0):
    print("Change: Month and Year to integer")
    df = df.astype({"month": int, "year": int})

# objects to categorical - Not applicable here because there are no 'object' features
# df[df.select_dtypes(include='object').columns] = df.select_dtypes(include='object').astype('category')

# convert integers to smallest unsigned integer and floats to smallest
for old, new in [('integer', 'unsigned'), ('float', 'float')]:
    
    print("change: " + old + " --> " + new)
    
    for col in df.select_dtypes(include=old).columns:
        
        df[col] = pd.to_numeric(df[col], downcast=new)

Change: Month and Year to integer
change: integer --> unsigned
change: float --> float


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125497040 entries, 0 to 125497039
Data columns (total 8 columns):
 #   Column       Dtype   
---  ------       -----   
 0   id           int64   
 1   store_nbr    uint8   
 2   item_nbr     int64   
 3   unit_sales   float64 
 4   onpromotion  boolean 
 5   day          uint8   
 6   year         category
 7   month        category
dtypes: boolean(1), category(2), float64(1), int64(2), uint8(2)
memory usage: 3.5 GB


### Some Statistics:

In [107]:
print("The data:\n")
print(f"-> Contains:            {round(df.shape[0]/1e6, 1)} million observations and {df.shape[1]} features.\n")
print(f"-> Contains:            {df.shape[0]} observations and {df.shape[1]} features.\n")
print(f"-> Have feature names:  {f_concat(df.columns)}.\n")
print(f"-> Has size of          {round(sys.getsizeof(df)/1024/1024/1024, 2)} GB.")

The data:

-> Contains:            125.5 million observations and 8 features.

-> Contains:            125497040 observations and 8 features.

-> Have feature names:  id, store_nbr, item_nbr, unit_sales, onpromotion, day, year, and month.

-> Has size of          2.22 GB.


In [108]:
f_describe(df)

First 10 rows in de data:


Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month
0,0,25,103665,7.0,,1,2013,1
1,1,25,105574,1.0,,1,2013,1
2,2,25,105575,2.0,,1,2013,1
3,3,25,108079,1.0,,1,2013,1
4,4,25,108701,1.0,,1,2013,1
5,5,25,108786,3.0,,1,2013,1
6,6,25,108797,1.0,,1,2013,1
7,7,25,108952,1.0,,1,2013,1
8,8,25,111397,13.0,,1,2013,1
9,9,25,114790,3.0,,1,2013,1


Numerical data:


Unnamed: 0,store_nbr,unit_sales,day,year,month
count,125497000.0,125497000.0,125497000.0,125497000.0,125497000.0
mean,27.46458,8.554879,15.60188,2015.223,6.334971
std,16.33051,23.60515,8.816411,1.29914,3.392866
min,1.0,-15372.0,1.0,2013.0,1.0
25%,12.0,2.0,8.0,2014.0,3.0
50%,28.0,4.0,15.0,2015.0,6.0
75%,43.0,9.0,23.0,2016.0,9.0
max,54.0,89440.0,31.0,2017.0,12.0


Textual data:


Unnamed: 0,onpromotion
count,103839389
unique,2
top,False
freq,96028767


Features and their number of missing values:


'onpromotion (21657651, 17.3%)'

In [109]:
f_info(df, "date")

KeyError: 'date'

In [None]:
# item_nbr: 764438 // count: 180 // week: 201323 // unit_sales_sum: 1925 // onpromotion: false //
# family: grocery I // class: 1072

# df[(df['item_nbr'] == 103665)]

In [58]:
df = f_get_data(0)

In [57]:
# history_per_year:
df[(df['item_nbr'] == 103665) & (df['day'].isin([6,7,8,9,10,11,12])) & (df['month'] == 1) & (df['year'] == 2014)].shape[0]
# --> 205 rows

205

In [None]:
# history_per_year:
df[(df['item_nbr'] == 103665) & (df['day'].isin([6,7,8,9,10,11,12])) & (df['month'] == 1) & (df['year'] == 2014)].unit_sales.sum()
# --> 952.0

952.0

In [82]:
df = f_get_data(1)

In [None]:
# history_aggregated
df[(df['item_nbr'] == 103665) & (df['week'] == 201402)]
# --> onpromotion = False; count = 202; unit_sales_sum = 966.0

Unnamed: 0,onpromotion,item_nbr,week,date,unit_sales_sum,count,family,class,perishable
726055,False,103665,201402,2014-01-08 00:00:00+00:00,966.0,202,BREAD/BAKERY,2712,1


In [85]:
# transactions:
# store_nbr: 25 // date: 2013-01-01 // transactions: 770

# history_per_year:
# df[(df['store_nbr'] == 25) & (df['day'] == 1) & (df['month'] == 1) & (df['year'] == 2013)].shape[0] --> 578

# Number of rows in history_per_year does not correspond with transactions. This is because shopping baskets
# (transactions) have different compositions per shopper.

KeyError: 'store_nbr'

In [None]:
# transactions
#df[(df['description'] == 'Independencia de Guayaquil')]

In [None]:
#df[(df['type'] == 'Transfer')]

In [None]:
#df[(df['type'] == 'Bridge')]

In [None]:
#df[(df['type'] == 'Work Day')]