### Supermarket data science case study - Exploring the first data

In [12]:
#pip install --user pyarrow

### Importing packages

In [13]:
import pandas as pd
import sys
import matplotlib.pyplot as plt
import sklearn

from sklearn.pipeline import Pipeline, make_pipeline

### Functions

In [14]:
def f_concat(l_input):
    """Join text elements into a single enumeration string.

    Parameters
    ----------
    l_input : indexable sequence of str
        The elements to enumerate. Must support ``len()`` and integer
        indexing (for example a list, a tuple or a pandas Index).

    Returns
    -------
    str or element
        * ``""`` when ``l_input`` is empty.
        * The sole element itself when there is exactly one.
        * Otherwise every element separated by ``", "`` with ``"and "``
          placed before the last one, e.g. ``"a, b, and c"``.
    """
    n_len = len(l_input)

    # Nothing to enumerate; previously this case raised an IndexError.
    if n_len == 0:
        return ""

    # A single element is returned unchanged.
    if n_len == 1:
        return l_input[0]

    # All elements but the last, comma separated (index access only, so any
    # indexable sequence works, not just sliceable ones).
    head = ", ".join(l_input[i] for i in range(n_len - 1))

    # Append the last element, introduced by "and".
    return head + ", and " + l_input[n_len - 1]

In [15]:
def f_describe(df_input, n_top = 10):
    """Display a quick overview of a DataFrame.

    Shows, in order:
      1. the first ``n_top`` rows;
      2. ``describe()`` of the columns whose dtype is one of the listed
         integer/float dtypes (only if there are any);
      3. ``describe()`` of the category/object/bool columns (only if there
         are any);
      4. the columns containing missing values, each with its count and
         percentage of rows (only if there are any).

    Parameters
    ----------
    df_input : pandas.DataFrame
        The frame to summarise. Only this frame is inspected; the function
        does not read any global DataFrame.
    n_top : int, default 10
        Number of leading rows to display.

    Returns
    -------
    None
        Everything is emitted through ``print`` and ``display``.
    """
    print("First " + str(n_top) + " rows in de data:")
    display(df_input.head(n_top))

    # Numerical overview.
    df_numeric = df_input.select_dtypes(include = ['uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64'])

    if len(df_numeric.columns):
        print("Numerical data:")
        display(df_numeric.describe())

    # Textual / categorical overview.
    df_textual = df_input.select_dtypes(include = ['category', 'object', 'bool'])

    if len(df_textual.columns):
        print("Textual data:")
        display(df_textual.describe())

    # Missing values: count once per column on the frame that was passed in
    # (not on a global ``df``), then format "<col> (<count>, <pct>%)".
    n_rows = df_input.shape[0]
    na_counts = df_input.isna().sum()
    v_na = [
        f"{col} ({int(n_na)}, {round(100 * int(n_na) / n_rows, 1)}%)"
        for col, n_na in na_counts.items()
        if n_na > 0
    ]

    if len(v_na) > 0:
        print("Features and their number of missing values:")
        display(f_concat(v_na))

### Downcast data
Update formatting of features to optimize memory and standardize column names.

In [16]:
def standardize_column_names(s):
    """Return the column name ``s`` with every space character removed."""
    # Cut the name at each space and glue the pieces back together.
    pieces = s.split(" ")
    return "".join(pieces)

def optimize_memory(df):
    """Reduce a DataFrame's memory footprint by converting column dtypes.

    * Columns selected by ``select_dtypes(include="object")`` are converted
      to ``category``.
    * Columns selected by ``select_dtypes(include="integer")`` are downcast
      to the smallest unsigned integer type; when a column holds negative
      values (where the unsigned downcast leaves the dtype untouched) it is
      downcast to the smallest signed integer type instead.
    * Columns selected by ``select_dtypes(include="float")`` are downcast
      with ``pd.to_numeric(..., downcast="float")``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. NOTE: it is modified in place (no copy is made).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the function can be used with
        ``DataFrame.pipe``.
    """
    # objects to categorical.
    object_cols = df.select_dtypes(include="object").columns
    if len(object_cols):
        df[object_cols] = df[object_cols].astype("category")

    # integers to the smallest integer type that can hold the values.
    for col in df.select_dtypes(include="integer").columns:
        downcasted = pd.to_numeric(df[col], downcast="unsigned")
        # Still a signed dtype means the unsigned downcast did not apply
        # (negative values present), so fall back to a signed downcast.
        if downcasted.dtype.kind == "i":
            downcasted = pd.to_numeric(downcasted, downcast="integer")
        df[col] = downcasted

    # floats to the smallest float type.
    for col in df.select_dtypes(include="float").columns:
        df[col] = pd.to_numeric(df[col], downcast="float")

    return df

In [17]:
# # Update formatting of features
# i=0

# if(i == 0):
#     print("Change: Month and Year to integer")
#     df = df.astype({"month": int, "year": int})

# # objects to categorical - Not applicable here because there are no 'object' features
# # df[df.select_dtypes(include='object').columns] = df.select_dtypes(include='object').astype('category')

# # convert integers to smallest unsigned integer and floats to smallest
# for old, new in [('integer', 'unsigned'), ('float', 'float')]:
    
#     print("change: " + old + " --> " + new)
    
#     for col in df.select_dtypes(include=old).columns:
        
#         df[col] = pd.to_numeric(df[col], downcast=new)

### Import data from local PATH
Import through a pipeline to downcast the data

In [18]:
def f_get_data(i=0, c_path=None):
    """Load one of the raw parquet files and return it as an optimised DataFrame.

    The file is read once, its column names are passed through
    ``standardize_column_names`` and the frame is piped through
    ``optimize_memory``.

    Parameters
    ----------
    i : int, default 0
        Index of the file to load:
        0 = "history-per-year", 1 = "history_aggregated",
        2 = "holidays_events", 3 = "items", 4 = "oil", 5 = "stores",
        6 = "transactions". An index outside the tuple raises ``IndexError``.
    c_path : str, optional
        Directory containing the ``.parquet`` files. When omitted, the
        original hard-coded local directory is used, so existing calls
        behave as before.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with standardised column names and downcast dtypes.
    """
    import os

    # Define path (default kept for backward compatibility).
    if c_path is None:
        c_path = 'C:/Users/alexander/Documents/0. Data Science and AI for Experts/EAISI_4B_Supermarket/data/raw/'

    # Identify file.
    v_file = ("history-per-year",   # 0
              "history_aggregated", # 1
              "holidays_events",    # 2
              "items",              # 3
              "oil",                # 4
              "stores",             # 5
              "transactions")       # 6

    # Load data once (the earlier version read the same file twice and
    # discarded the first result).
    df = (pd.read_parquet(os.path.join(c_path, v_file[i] + ".parquet"))
         .rename(columns = standardize_column_names)
         .pipe(optimize_memory))

    # Return data.
    return df

### Importing data

In [19]:
# Load file 0 ("history-per-year" in f_get_data's file tuple).
df = f_get_data(0)


In [20]:
# Show the head, summary statistics and missing-value counts of the loaded frame.
f_describe(df)

First 10 rows in de data:


Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month
0,0,25,103665,7.0,,1,2013,1
1,1,25,105574,1.0,,1,2013,1
2,2,25,105575,2.0,,1,2013,1
3,3,25,108079,1.0,,1,2013,1
4,4,25,108701,1.0,,1,2013,1
5,5,25,108786,3.0,,1,2013,1
6,6,25,108797,1.0,,1,2013,1
7,7,25,108952,1.0,,1,2013,1
8,8,25,111397,13.0,,1,2013,1
9,9,25,114790,3.0,,1,2013,1


Numerical data:


Unnamed: 0,store_nbr,unit_sales,day
count,125497000.0,125497000.0,125497000.0
mean,27.46458,8.554879,15.60188
std,16.33051,23.60515,8.816411
min,1.0,-15372.0,1.0
25%,12.0,2.0,8.0
50%,28.0,4.0,15.0
75%,43.0,9.0,23.0
max,54.0,89440.0,31.0


Textual data:


Unnamed: 0,onpromotion,year,month
count,103839389,125497040,125497040
unique,2,5,12
top,False,2016,7
freq,96028767,35229871,12292189


Features and their number of missing values:


'onpromotion (21657651, 17.3%)'

In [21]:
# A few basic statistics on the currently loaded frame (file 0, "history-per-year").
# sys.getsizeof reports bytes; dividing by 1024 three times expresses it in GB.
print("The data:\n")
print(f"-> has size of {round(sys.getsizeof(df)/1024/1024/1024, 3)} GB.")
print("")

# Column dtypes and memory usage.
df.info()

The data:

-> has size of 2.104 GB.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125497040 entries, 0 to 125497039
Data columns (total 8 columns):
 #   Column       Dtype   
---  ------       -----   
 0   id           uint32  
 1   store_nbr    uint8   
 2   item_nbr     uint32  
 3   unit_sales   float32 
 4   onpromotion  boolean 
 5   day          uint8   
 6   year         category
 7   month        category
dtypes: boolean(1), category(2), float32(1), uint32(2), uint8(2)
memory usage: 2.1 GB


In [22]:
# Column dtypes and memory usage of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125497040 entries, 0 to 125497039
Data columns (total 8 columns):
 #   Column       Dtype   
---  ------       -----   
 0   id           uint32  
 1   store_nbr    uint8   
 2   item_nbr     uint32  
 3   unit_sales   float32 
 4   onpromotion  boolean 
 5   day          uint8   
 6   year         category
 7   month        category
dtypes: boolean(1), category(2), float32(1), uint32(2), uint8(2)
memory usage: 2.1 GB


### Some Statistics:

In [23]:
# Report the frame's shape (in millions and as an exact count), its column
# names as one enumeration string, and its in-memory size in GB.
print("The data:\n")
print(f"-> Contains:                {round(df.shape[0]/1e6, 1)} million observations and {df.shape[1]} features.\n")
print(f"-> Contains:                {df.shape[0]} observations and {df.shape[1]} features.\n")
print(f"-> Have feature names:      {f_concat(df.columns)}.\n")
print(f"-> Has optimized size of    {round(sys.getsizeof(df)/1024/1024/1024, 2)} GB.")

The data:

-> Contains:                125.5 million observations and 8 features.

-> Contains:                125497040 observations and 8 features.

-> Have feature names:      id, store_nbr, item_nbr, unit_sales, onpromotion, day, year, and month.

-> Has optimized size of    2.1 GB.


In [None]:
# Load file 0 ("history-per-year") and summarise it.
df = f_get_data(0)
f_describe(df)

# Alternative ad-hoc inspection calls, kept for reference:
#df.head()
#df.tail(10)
#df.sample(20)
#df.info()
#df.describe()
#df.nunique

In [26]:
# Load file 1 ("history_aggregated") and summarise it.
df = f_get_data(1)
f_describe(df)

First 10 rows in de data:


Unnamed: 0,onpromotion,item_nbr,week,date,unit_sales_sum,count,family,class,perishable
0,False,764438,201323,2013-06-04 00:00:00+00:00,1925.0,180,GROCERY I,1072,0
1,False,764438,201324,2013-06-11 00:00:00+00:00,1516.0,164,GROCERY I,1072,0
2,False,764438,201325,2013-06-18 00:00:00+00:00,1656.0,176,GROCERY I,1072,0
3,False,764438,201326,2013-06-25 00:00:00+00:00,1610.0,177,GROCERY I,1072,0
4,False,764438,201327,2013-07-02 00:00:00+00:00,1987.0,182,GROCERY I,1072,0
5,False,764438,201328,2013-07-09 00:00:00+00:00,1588.0,169,GROCERY I,1072,0
6,False,764438,201329,2013-07-16 00:00:00+00:00,1688.0,171,GROCERY I,1072,0
7,False,764438,201330,2013-07-23 00:00:00+00:00,1572.0,165,GROCERY I,1072,0
8,False,764438,201331,2013-07-30 00:00:00+00:00,1755.0,176,GROCERY I,1072,0
9,False,764438,201332,2013-08-06 00:00:00+00:00,1780.0,178,GROCERY I,1072,0


Numerical data:


Unnamed: 0,unit_sales_sum,count,class,perishable
count,948220.0,948220.0,948220.0,948220.0
mean,1132.237528,132.35013,2152.235304,0.320138
std,2018.911355,103.404191,1337.917574,0.46653
min,-9757.0,1.0,1002.0,0.0
25%,137.0,27.0,1072.0,0.0
50%,627.0,131.0,2022.0,0.0
75%,1290.0,213.0,2718.0,1.0
max,120981.76,378.0,7780.0,1.0


Textual data:


Unnamed: 0,onpromotion,family
count,948220,948220
unique,2,33
top,False,GROCERY I
freq,676666,292938


In [27]:
# Load file 2 ("holidays_events") and summarise it.
df = f_get_data(2)
f_describe(df)

First 10 rows in de data:


Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False
5,2012-05-12,Holiday,Local,Puyo,Cantonizacion del Puyo,False
6,2012-06-23,Holiday,Local,Guaranda,Cantonizacion de Guaranda,False
7,2012-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False
8,2012-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
9,2012-06-25,Holiday,Local,Machala,Fundacion de Machala,False


Textual data:


Unnamed: 0,date,type,locale,locale_name,description,transferred
count,350,350,350,350,350,350
unique,312,6,3,24,103,2
top,2014-06-25,Holiday,National,Ecuador,Carnaval,False
freq,4,221,174,174,10,338


In [28]:
# Load file 3 ("items") and summarise it.
df = f_get_data(3)
f_describe(df)

First 10 rows in de data:


Unnamed: 0,item_nbr,family,class,perishable
0,96995,GROCERY I,1093,0
1,99197,GROCERY I,1067,0
2,103501,CLEANING,3008,0
3,103520,GROCERY I,1028,0
4,103665,BREAD/BAKERY,2712,1
5,105574,GROCERY I,1045,0
6,105575,GROCERY I,1045,0
7,105576,GROCERY I,1045,0
8,105577,GROCERY I,1045,0
9,105693,GROCERY I,1034,0


Numerical data:


Unnamed: 0,class,perishable
count,4100.0,4100.0
mean,2169.65,0.240488
std,1484.9109,0.427432
min,1002.0,0.0
25%,1068.0,0.0
50%,2004.0,0.0
75%,2990.5,0.0
max,7780.0,1.0


Textual data:


Unnamed: 0,family
count,4100
unique,33
top,GROCERY I
freq,1334


In [29]:
# Load file 4 ("oil") and summarise it.
df = f_get_data(4)
f_describe(df)

First 10 rows in de data:


Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.139999
2,2013-01-03,92.970001
3,2013-01-04,93.120003
4,2013-01-07,93.199997
5,2013-01-08,93.209999
6,2013-01-09,93.080002
7,2013-01-10,93.809998
8,2013-01-11,93.599998
9,2013-01-14,94.269997


Numerical data:


Unnamed: 0,dcoilwtico
count,1175.0
mean,67.714363
std,25.630476
min,26.190001
25%,46.404999
50%,53.189999
75%,95.660004
max,110.620003


Textual data:


Unnamed: 0,date
count,1218
unique,1218
top,2013-01-01
freq,1


Features and their number of missing values:


'dcoilwtico (43, 3.5%)'

In [30]:
# Load file 5 ("stores") and summarise it.
df = f_get_data(5)
f_describe(df)

First 10 rows in de data:


Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4
5,6,Quito,Pichincha,D,13
6,7,Quito,Pichincha,D,8
7,8,Quito,Pichincha,D,8
8,9,Quito,Pichincha,B,6
9,10,Quito,Pichincha,C,15


Numerical data:


Unnamed: 0,store_nbr,cluster
count,54.0,54.0
mean,27.5,8.481481
std,15.732133,4.693395
min,1.0,1.0
25%,14.25,4.0
50%,27.5,8.5
75%,40.75,13.0
max,54.0,17.0


Textual data:


Unnamed: 0,city,state,type
count,54,54,54
unique,22,16,5
top,Quito,Pichincha,D
freq,18,19,18


In [31]:
# Load file 6 ("transactions") and summarise it.
df = f_get_data(6)
f_describe(df)

First 10 rows in de data:


Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903
6,2013-01-02,6,2143
7,2013-01-02,7,1874
8,2013-01-02,8,3250
9,2013-01-02,9,2940


Numerical data:


Unnamed: 0,store_nbr,transactions
count,83488.0,83488.0
mean,26.939237,1694.602158
std,15.608204,963.286644
min,1.0,5.0
25%,13.0,1046.0
50%,27.0,1393.0
75%,40.0,2079.0
max,54.0,8359.0


Textual data:


Unnamed: 0,date
count,83488
unique,1682
top,2017-08-15
freq,54
