# EyeOn Supermarket data science case study

In [13]:
#pip install --user pyarrow

# Requirement already satisfied: pyarrow in /usr/local/lib/python3.6/dist-packages (0.14.1)
# Requirement already satisfied: numpy>=1.14 in /usr/local/lib/python3.6/dist-packages (from pyarrow) (1.18.5)
# Requirement already satisfied: six>=1.0.0 in /usr/local/lib/python3.6/dist-packages (from pyarrow) (1.15.0)

### Importing packages

In [14]:
import pandas as pd
import sys
import matplotlib.pyplot as plt

### Functions

In [15]:
def f_concat(l_input):
    """Join text elements into one enumeration: "a, b, and c".

    Parameters
    ----------
    l_input : iterable of str
        The elements to join, e.g. a list of strings or the column index of
        a data frame.

    Returns
    -------
    str
        An empty string for empty input, the element itself for a single
        element, and otherwise all elements separated by ", " with "and "
        in front of the last one. Two elements therefore give "a, and b".
    """

    # Materialise once so lists, tuples and pandas Index objects behave alike.
    l_items = list(l_input)

    # Nothing to enumerate; indexing the last element would raise here.
    if not l_items:
        return ""

    if len(l_items) == 1:
        return l_items[0]

    # All but the last element, then the last one introduced by ", and ".
    return ", ".join(l_items[:-1]) + ", and " + l_items[-1]

In [16]:
def f_info(df_input, c_feature, n_top =  10):
    """Print a frequency table for one feature and plot its most frequent levels.

    Relies on `pd`, `plt` and IPython's `display` being available in the
    notebook namespace.

    Parameters
    ----------
    df_input : pandas.DataFrame
        The data to summarise.
    c_feature : str
        Name of the column in `df_input` to tabulate.
    n_top : int, default 10
        Maximum number of levels shown in the table and in the bar chart.

    Returns
    -------
    None
        Output is printed, displayed and plotted. When every value of the
        feature is unique only a one-line notice is printed.
    """

    # Do not calculate the frequency table in case the feature has unique values.
    if df_input[c_feature].is_unique:
        print("Feature '" + c_feature + "' is unique.")
        return

    # Load package.
    from collections import Counter

    n_rows = df_input.shape[0]

    # Calculate the frequencies.
    c = Counter(df_input[c_feature])

    # Convert to a data frame with one row per level and name the columns.
    df_output         = pd.DataFrame(list(c.items()))
    df_output.columns = ["level", "n"]

    # Calculate the percentage of all rows per level.
    df_output["perc"] = round(100 * df_output["n"] / n_rows, 1).astype(str) + "%"

    # Sort by frequency and renumber the rows.
    df_output         = df_output.sort_values(by = "n", ascending = False)
    df_output         = df_output.reset_index(drop = True)

    # Build the header text. It is kept in a local variable instead of being
    # attached to the Counter object as an ad-hoc attribute.
    n_levels = df_output.shape[0]

    if n_levels <= n_top:
        c_message = "we tonen alle " + str(n_levels) + " levels:"
    else:
        c_message = "we tonen de Top-" + str(n_top) + " van de " + str(n_levels) + " levels:"

    # Print header.
    print("Frequentietabel voor '" + c_feature + "', " + c_message + "\n")

    n_none = c[None]
    print(f"'None': {n_none} ({round(100 * n_none / n_rows, 1)}%)")

    n_na = df_input[c_feature].isna().sum()
    print(f"'NA':   {n_na} ({round(100 * n_na / n_rows, 1)}%)")

    # Display the table without its index.
    # https://stackoverflow.com/questions/61363712/how-to-print-a-pandas-io-formats-style-styler-object
    # Styler.hide_index() no longer exists in pandas 2.x; prefer Styler.hide()
    # where it is available and fall back for older versions.
    o_styler = df_output.head(n_top).style
    if hasattr(o_styler, "hide"):
        o_styler = o_styler.hide(axis = "index")
    else:
        o_styler = o_styler.hide_index()
    display(o_styler)

    # Plot the frequencies of the n_top most frequent levels. The counts come
    # from df_input (the function argument), not from a notebook-level `df`.
    v_data_to_plot = df_input[c_feature].value_counts(sort = True).iloc[:n_top]

    # https://stackoverflow.com/questions/36367986/how-to-make-inline-plots-in-jupyter-notebook-larger
    # Note: this changes the figure size for every later plot in the session.
    plt.rcParams['figure.figsize'] = [15, 5]

    fig, ax = plt.subplots()
    v_data_to_plot.plot(kind='bar', ax=ax)
    plt.show()
    

In [17]:
#df['transactions'].value_counts(sort = True)[0:10]

In [18]:
#f_info(df, "month", 15)

In [19]:
def f_describe(df_input, n_top = 10):
    """Display a quick overview of a data frame.

    Shows the first rows, summary statistics for the numerical columns,
    summary statistics for the category/object/bool columns and, when there
    are any, the columns that contain missing values together with their
    count and percentage.

    Relies on IPython's `display` and on `f_concat` being available in the
    notebook namespace.

    Parameters
    ----------
    df_input : pandas.DataFrame
        The data to describe.
    n_top : int, default 10
        Number of leading rows to show.

    Returns
    -------
    None
    """

    print("First " + str(n_top) + " rows in de data:")
    display(df_input.head(n_top))

    df_numeric = df_input.select_dtypes(include = ['uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64'])

    if len(df_numeric.columns):
        print("Numerical data:")
        display(df_numeric.describe())

    df_textual = df_input.select_dtypes(include = ['category', 'object', 'bool'])

    if len(df_textual.columns):
        print("Textual data:")
        display(df_textual.describe())

    # Missing values per column, counted once on the frame that was passed in
    # (not on a notebook-level `df`).
    n_rows    = df_input.shape[0]
    v_missing = df_input.isna().sum()

    v_na = [
        col + " (" + str(n_missing) + ", " + str(round(100 * n_missing / n_rows, 1)) + "%)"
        for col, n_missing in v_missing.items()
        if n_missing > 0
    ]

    if len(v_na) > 0:
        print("Features and their number of missing values:")
        display(f_concat(v_na))


In [22]:
def f_get_data(i=0, c_path=None):
    """Load one of the case-study parquet files into a data frame.

    Parameters
    ----------
    i : int, default 0
        Position of the file in the list below:
        0 history-per-year, 1 history_aggregated, 2 holidays_events,
        3 items, 4 oil, 5 stores, 6 transactions.
    c_path : str, optional
        Folder that holds the parquet files. When omitted, the original
        local folder is used, so existing calls keep working. Pass e.g.
        "drive/MyDrive/Colab Notebooks/2021 01 - Execute/EyeOn/Data/" when
        running from a mounted Google Drive.

    Returns
    -------
    pandas.DataFrame
        The content of the selected parquet file.

    Raises
    ------
    IndexError
        If `i` does not refer to one of the files listed below.
    """

    import os

    #c_path = "drive/MyDrive/Colab Notebooks/2021 01 - Execute/EyeOn/Data/"

    # Define path; fall back to the original local folder.
    if c_path is None:
        c_path = "/Users/home/OneDrive - IWD/Clients/PE/2021 01 - Execute - EyeOn/Data/"

    # Identify file.
    v_file = ("history-per-year",   # 0
              "history_aggregated", # 1
              "holidays_events",    # 2
              "items",              # 3
              "oil",                # 4
              "stores",             # 5
              "transactions")       # 6

    # Load data. os.path.join also copes with a folder given without a
    # trailing separator.
    df     = pd.read_parquet(os.path.join(c_path, v_file[i] + ".parquet"))

    # Return data.
    return df

In [23]:
#f_describe(df)

In [24]:
#f_describe(df)

### Importing data
Importing the parquet files took a bit of fiddling. While googling this topic, I came across a few URLs, which I include for reference ([unable-to-read-a-parquet-file](https://stackoverflow.com/questions/55147424/unable-to-read-a-parquet-file), [google-colab-dealing-with-files](https://neptune.ai/blog/google-colab-dealing-with-files)). In the end, though, I simply downloaded the data to the local disk of my computer and updated the path to the data. Alternatively, you can connect your Google Drive to your Colab session and update the path accordingly; see the example path above, 'drive/MyDrive/Colab Notebooks/..'.

In [25]:
# Load dataset 0 ("history-per-year" in f_get_data's file list) into the working frame.
# NOTE(review): the stored outputs of the next cells list columns such as `week` and
# `unit_sales_sum`, which match the history_aggregated output further down; those
# cells were probably last run after loading dataset 1. Confirm on a fresh run.
df = f_get_data(0)

### Downcast data as needed

In [346]:
# Size of the working frame in GB, followed by its column overview.
n_size_gb = sys.getsizeof(df) / 1024 / 1024 / 1024

print("The data:\n", f"-> has size of {round(n_size_gb, 1)} GB.", "", sep="\n")

df.info()

The data:

-> has size of 0.1 GB.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 948220 entries, 0 to 948219
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype              
---  ------          --------------   -----              
 0   onpromotion     948220 non-null  bool               
 1   item_nbr        948220 non-null  int64              
 2   week            948220 non-null  int64              
 3   date            948220 non-null  datetime64[ns, UTC]
 4   unit_sales_sum  948220 non-null  float64            
 5   count           948220 non-null  int64              
 6   family          948220 non-null  category           
 7   class           948220 non-null  int64              
 8   perishable      948220 non-null  int64              
dtypes: bool(1), category(1), datetime64[ns, UTC](1), float64(1), int64(5)
memory usage: 52.5 MB


In [347]:
# Update formatting of features.
# Only some of the data sets carry 'month' and 'year' columns, so the cast is
# keyed on the columns being present. The previous check on a notebook-level
# `i` fails with a NameError on a fresh kernel because `i` is never defined.
if {"month", "year"}.issubset(df.columns):
    print("Change: Month and year to integer")
    df = df.astype({"month": int, "year": int})

# objects to categorical - Not applicable here because there are no 'object' features
# df[df.select_dtypes(include='object').columns] = df.select_dtypes(include='object').astype('category')

# Convert integers to the smallest unsigned integer type and floats to the
# smallest float type that can hold the values.
for old, new in [('integer', 'unsigned'), ('float', 'float')]:

    print("change: " + old + " --> " + new)

    for col in df.select_dtypes(include=old).columns:

        df[col] = pd.to_numeric(df[col], downcast=new)

change: integer --> unsigned
change: float --> float


In [348]:
# Show dtypes and memory usage again, now that the downcast cell has run.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 948220 entries, 0 to 948219
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype              
---  ------          --------------   -----              
 0   onpromotion     948220 non-null  bool               
 1   item_nbr        948220 non-null  uint32             
 2   week            948220 non-null  uint32             
 3   date            948220 non-null  datetime64[ns, UTC]
 4   unit_sales_sum  948220 non-null  float32            
 5   count           948220 non-null  uint16             
 6   family          948220 non-null  category           
 7   class           948220 non-null  uint16             
 8   perishable      948220 non-null  uint8              
dtypes: bool(1), category(1), datetime64[ns, UTC](1), float32(1), uint16(2), uint32(2), uint8(1)
memory usage: 24.4 MB


### Some Statistics:

In [349]:
# Headline numbers for the working frame: row and column counts, column names
# and in-memory size.
n_rows, n_cols = df.shape
n_size_gb      = sys.getsizeof(df) / 1024 / 1024 / 1024

v_lines = [
    "The data:\n",
    f"-> contain {round(n_rows/1e6, 1)} million observations and {n_cols} features.\n",
    f"-> contain {n_rows} observations and {n_cols} features.\n",
    f"-> have feature names: {f_concat(df.columns)}.\n",
    f"-> has size of {round(n_size_gb, 1)} GB.",
]

for c_line in v_lines:
    print(c_line)

The data:

-> contain 0.9 million observations and 9 features.

-> contain 948220 observations and 9 features.

-> have feature names: onpromotion, item_nbr, week, date, unit_sales_sum, count, family, class, and perishable.

-> has size of 0.0 GB.


In [350]:
# Show the first rows, the summary statistics and (if any) the columns with
# missing values for the working frame.
f_describe(df)

First 10 rows in de data:


Unnamed: 0,onpromotion,item_nbr,week,date,unit_sales_sum,count,family,class,perishable
0,False,764438,201323,2013-06-04 00:00:00+00:00,1925.0,180,GROCERY I,1072,0
1,False,764438,201324,2013-06-11 00:00:00+00:00,1516.0,164,GROCERY I,1072,0
2,False,764438,201325,2013-06-18 00:00:00+00:00,1656.0,176,GROCERY I,1072,0
3,False,764438,201326,2013-06-25 00:00:00+00:00,1610.0,177,GROCERY I,1072,0
4,False,764438,201327,2013-07-02 00:00:00+00:00,1987.0,182,GROCERY I,1072,0
5,False,764438,201328,2013-07-09 00:00:00+00:00,1588.0,169,GROCERY I,1072,0
6,False,764438,201329,2013-07-16 00:00:00+00:00,1688.0,171,GROCERY I,1072,0
7,False,764438,201330,2013-07-23 00:00:00+00:00,1572.0,165,GROCERY I,1072,0
8,False,764438,201331,2013-07-30 00:00:00+00:00,1755.0,176,GROCERY I,1072,0
9,False,764438,201332,2013-08-06 00:00:00+00:00,1780.0,178,GROCERY I,1072,0


Numerical data:


Unnamed: 0,item_nbr,week,unit_sales_sum,count,class,perishable
count,948220.0,948220.0,948220.0,948220.0,948220.0,948220.0
mean,1033259.0,201558.380135,1131.572754,132.35013,2152.235304,0.320138
std,524408.6,121.625703,2019.575439,103.404191,1337.917574,0.46653
min,96995.0,201301.0,-9757.0,1.0,1002.0,0.0
25%,584125.0,201447.0,137.0,27.0,1072.0,0.0
50%,1047756.0,201552.0,627.0,131.0,2022.0,0.0
75%,1456935.0,201644.0,1290.0,213.0,2718.0,1.0
max,2127114.0,201733.0,120981.757812,378.0,7780.0,1.0


Textual data:


Unnamed: 0,onpromotion,family
count,948220,948220
unique,2,33
top,False,GROCERY I
freq,676666,292938


In [351]:
#f_info(df, "date")

In [None]:
# item_nbr: 764438 // count: 180 // week: 201323 // unit_sales_sum: 1925 // onpromotion: false //
# family: grocery I // class: 1072

# df[(df['item_nbr'] == 103665)]

In [386]:
# Reload dataset 0 ("history-per-year" in f_get_data's file list); the checks
# below filter on its item_nbr, day, month and year columns.
df = f_get_data(0)

In [388]:
# history_per_year: number of rows for item 103665 on days 6 to 12 of January 2014.
df[
    (df['year'] == 2014)
    & (df['month'] == 1)
    & (df['day'].isin(range(6, 13)))
    & (df['item_nbr'] == 103665)
].shape[0]
# --> 205 rows

205

In [379]:
# history_per_year: total unit sales for item 103665 on days 6 to 12 of January 2014.
df[
    (df['year'] == 2014)
    & (df['month'] == 1)
    & (df['day'].isin(range(6, 13)))
    & (df['item_nbr'] == 103665)
].unit_sales.sum()
# --> 952.0

952.0

In [383]:
# Switch the working frame to dataset 1 ("history_aggregated" in f_get_data's file list).
df = f_get_data(1)

In [384]:
# history_aggregated: the row(s) for item 103665 in week 201402.
df.loc[(df['week'] == 201402) & (df['item_nbr'] == 103665)]
# --> onpromotion = False; count = 202; unit_sales_sum = 966.0

Unnamed: 0,onpromotion,item_nbr,week,date,unit_sales_sum,count,family,class,perishable
726055,False,103665,201402,2014-01-08 00:00:00+00:00,966.0,202,BREAD/BAKERY,2712,1


In [254]:
# transactions:
# store_nbr: 25 // date: 2013-01-01 // transactions: 770

# history_per_year:
# df[(df['store_nbr'] == 25) & (df['day'] == 1) & (df['month'] == 1) & (df['year'] == 2013)].shape[0] --> 578

# Number of rows in history_per_year does not correspond with transactions. This is because shopping baskets
# (transactions) have different compositions per shopper.

In [282]:
# transactions
#df[(df['description'] == 'Independencia de Guayaquil')]

In [281]:
#df[(df['type'] == 'Transfer')]

In [279]:
#df[(df['type'] == 'Bridge')]

In [280]:
#df[(df['type'] == 'Work Day')]