# Data Exploration
## sales.csv

In [1]:
# Import pandas library
import pandas as pd

# Load the raw sales CSV from the project repository.
# The two promo columns listed in `dtype` are explicitly read as strings.
sales = pd.read_csv(
    "https://media.githubusercontent.com/media/E-man85/projectII/main/raw_data/sales.csv",
    dtype={'promo_bin_2': str, 'promo_discount_type_2': str},
)
# Dimensions as (rows, columns)
sales.shape

(8886058, 14)

## Overview

In [2]:
# Global information: index range, column names, dtypes and memory usage
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8886058 entries, 0 to 8886057
Data columns (total 14 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Unnamed: 0             int64  
 1   store_id               object 
 2   product_id             object 
 3   date                   object 
 4   sales                  float64
 5   revenue                float64
 6   stock                  float64
 7   price                  float64
 8   promo_type_1           object 
 9   promo_bin_1            object 
 10  promo_type_2           object 
 11  promo_bin_2            object 
 12  promo_discount_2       float64
 13  promo_discount_type_2  object 
dtypes: float64(5), int64(1), object(8)
memory usage: 949.1+ MB


In [3]:
# Duplicate rows: True if any row repeats an earlier row across ALL columns
# NOTE(review): 'Unnamed: 0' appears to be a per-row running ID (its uniqueness is
# checked in the next section), so a whole-row comparison can never flag repeated
# business records — consider excluding that column from this check.
sales.duplicated().any()

False

### Column Unnamed: 0

In [4]:
# Profile of column 'Unnamed: 0', printed one result per line
print(
    sales['Unnamed: 0'].isna().any(),        # any missing values?
    sales['Unnamed: 0'].duplicated().any(),  # any value repeated within the column?
    sales['Unnamed: 0'].nunique(),           # number of distinct values
    sales['Unnamed: 0'].describe(),          # summary statistics
    sep="\n",
)

False
False
8886058
count    8.886058e+06
mean     4.443030e+06
std      2.565184e+06
min      1.000000e+00
25%      2.221515e+06
50%      4.443030e+06
75%      6.664544e+06
max      8.886058e+06
Name: Unnamed: 0, dtype: float64


In [5]:
# Check that no records are missing, based on the numeric sequence in column 'Unnamed: 0'
import numpy as np
# Expected IDs 1..n_rows, created directly as a NumPy array: with more than 8 million
# values, building an intermediate Python list is needlessly slow and memory-hungry
list_id = np.arange(1, sales.shape[0] + 1)
# Actual IDs from the first column, taken as an array without a round trip through a list
values_sales_id = sales.iloc[:, 0].to_numpy()
# np.setxor1d() returns the sorted values found in exactly one of the two arrays;
# an empty result means the column holds exactly the IDs 1..n_rows
np.setxor1d(list_id, values_sales_id)

array([], dtype=int32)

### Column store_id

In [6]:
# Profile of column 'store_id', printed one result per line
print(
    sales['store_id'].isna().any(),        # any missing values?
    sales['store_id'].duplicated().any(),  # any value repeated within the column?
    sales['store_id'].nunique(),           # number of distinct values
    sales['store_id'].describe(),          # summary statistics
    sep="\n",
)

False
True
63
count     8886058
unique         63
top         S0038
freq       334082
Name: store_id, dtype: object


### Column product_id

In [7]:
# Profile of column 'product_id', printed one result per line
print(
    sales['product_id'].isna().any(),        # any missing values?
    sales['product_id'].duplicated().any(),  # any value repeated within the column?
    sales['product_id'].nunique(),           # number of distinct values
    sales['product_id'].describe(),          # summary statistics
    sep="\n",
)

False
True
615
count     8886058
unique        615
top         P0664
freq        59051
Name: product_id, dtype: object


### Column date

In [8]:
# Profile of column 'date', printed one result per line
print(
    sales['date'].isna().any(),        # any missing values?
    sales['date'].duplicated().any(),  # any value repeated within the column?
    sales['date'].nunique(),           # number of distinct values
    sales['date'].describe(),          # summary statistics
    sep="\n",
)

False
True
1033
count        8886058
unique          1033
top       2019-08-10
freq           10090
Name: date, dtype: object


### Column sales

In [9]:
# Profile of column 'sales', printed one result per line
print(
    sales['sales'].isna().any(),        # any missing values?
    sales['sales'].duplicated().any(),  # any value repeated within the column?
    sales['sales'].nunique(),           # number of distinct values
    sales['sales'].describe(),          # summary statistics
    sep="\n",
)

True
True
5435
count    8.583762e+06
mean     4.734080e-01
std      2.129059e+01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      4.330100e+04
Name: sales, dtype: float64


### Column revenue

In [10]:
# Profile of column 'revenue', printed one result per line
print(
    sales['revenue'].isna().any(),        # any missing values?
    sales['revenue'].duplicated().any(),  # any value repeated within the column?
    sales['revenue'].nunique(),           # number of distinct values
    sales['revenue'].describe(),          # summary statistics
    sep="\n",
)

True
True
12155
count    8.583762e+06
mean     2.285173e+00
std      5.406806e+01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      8.419796e+04
Name: revenue, dtype: float64


### Column stock

In [11]:
# Profile of column 'stock', printed one result per line
print(
    sales['stock'].isna().any(),        # any missing values?
    sales['stock'].duplicated().any(),  # any value repeated within the column?
    sales['stock'].nunique(),           # number of distinct values
    sales['stock'].describe(),          # summary statistics
    sep="\n",
)

True
True
9039
count    8.583762e+06
mean     1.600575e+01
std      3.751692e+01
min      0.000000e+00
25%      4.000000e+00
50%      8.000000e+00
75%      1.700000e+01
max      4.655000e+03
Name: stock, dtype: float64


### Column price

In [12]:
# Profile of column 'price', printed one result per line
print(
    sales['price'].isna().any(),        # any missing values?
    sales['price'].duplicated().any(),  # any value repeated within the column?
    sales['price'].nunique(),           # number of distinct values
    sales['price'].describe(),          # summary statistics
    sep="\n",
)

True
True
606
count    8.794677e+06
mean     1.575377e+01
std      3.277869e+01
min      1.000000e-02
25%      3.450000e+00
50%      8.000000e+00
75%      1.695000e+01
max      1.599000e+03
Name: price, dtype: float64


### Column promo_type_1

In [13]:
# Profile of column 'promo_type_1', printed one result per line
print(
    sales['promo_type_1'].isna().any(),        # any missing values?
    sales['promo_type_1'].duplicated().any(),  # any value repeated within the column?
    sales['promo_type_1'].nunique(),           # number of distinct values
    sales['promo_type_1'].describe(),          # summary statistics
    sep="\n",
)

False
True
17
count     8886058
unique         17
top          PR14
freq      7653515
Name: promo_type_1, dtype: object


### Column promo_type_2

In [14]:
# Profile of column 'promo_type_2', printed one result per line
print(
    sales['promo_type_2'].isna().any(),        # any missing values?
    sales['promo_type_2'].duplicated().any(),  # any value repeated within the column?
    sales['promo_type_2'].nunique(),           # number of distinct values
    sales['promo_type_2'].describe(),          # summary statistics
    sep="\n",
)

False
True
4
count     8886058
unique          4
top          PR03
freq      8873337
Name: promo_type_2, dtype: object


### Column promo_bin_1

In [15]:
# Profile of column 'promo_bin_1', printed one result per line
print(
    sales['promo_bin_1'].isna().any(),        # any missing values?
    sales['promo_bin_1'].duplicated().any(),  # any value repeated within the column?
    sales['promo_bin_1'].nunique(),           # number of distinct values
    sales['promo_bin_1'].describe(),          # summary statistics
    sep="\n",
)

True
True
5
count     1232543
unique          5
top       verylow
freq       514398
Name: promo_bin_1, dtype: object


### Column promo_bin_2

In [16]:
# Profile of column 'promo_bin_2', printed one result per line
print(
    sales['promo_bin_2'].isna().any(),        # any missing values?
    sales['promo_bin_2'].duplicated().any(),  # any value repeated within the column?
    sales['promo_bin_2'].nunique(),           # number of distinct values
    sales['promo_bin_2'].describe(),          # summary statistics
    sep="\n",
)

True
True
3
count       12721
unique          3
top       verylow
freq         6441
Name: promo_bin_2, dtype: object


### Column promo_discount_2

In [17]:
# Profile of column 'promo_discount_2', printed one result per line
print(
    sales['promo_discount_2'].isna().any(),        # any missing values?
    sales['promo_discount_2'].duplicated().any(),  # any value repeated within the column?
    sales['promo_discount_2'].nunique(),           # number of distinct values
    sales['promo_discount_2'].describe(),          # summary statistics
    sep="\n",
)

True
True
6
count    12721.000000
mean        30.110605
std         11.850900
min         16.000000
25%         20.000000
50%         20.000000
75%         35.000000
max         50.000000
Name: promo_discount_2, dtype: float64


### Column promo_discount_type_2

In [18]:
# Profile of column 'promo_discount_type_2', printed one result per line
print(
    sales['promo_discount_type_2'].isna().any(),        # any missing values?
    sales['promo_discount_type_2'].duplicated().any(),  # any value repeated within the column?
    sales['promo_discount_type_2'].nunique(),           # number of distinct values
    sales['promo_discount_type_2'].describe(),          # summary statistics
    sep="\n",
)

True
True
4
count     12721
unique        4
top        PR01
freq       3762
Name: promo_discount_type_2, dtype: object


In [19]:
# First 10 rows of the dataset
sales.head(10)

Unnamed: 0.1,Unnamed: 0,store_id,product_id,date,sales,revenue,stock,price,promo_type_1,promo_bin_1,promo_type_2,promo_bin_2,promo_discount_2,promo_discount_type_2
0,1,S0002,P0001,2017-01-02,0.0,0.0,8.0,6.25,PR14,,PR03,,,
1,2,S0002,P0005,2017-01-02,0.0,0.0,11.0,33.9,PR14,,PR03,,,
2,3,S0002,P0011,2017-01-02,0.0,0.0,9.0,49.9,PR14,,PR03,,,
3,4,S0002,P0015,2017-01-02,1.0,2.41,19.0,2.6,PR14,,PR03,,,
4,5,S0002,P0017,2017-01-02,0.0,0.0,12.0,1.49,PR14,,PR03,,,
5,6,S0002,P0018,2017-01-02,1.0,1.81,37.0,1.95,PR14,,PR03,,,
6,7,S0002,P0024,2017-01-02,0.0,0.0,36.0,1.95,PR14,,PR03,,,
7,8,S0002,P0035,2017-01-02,2.0,4.54,15.0,2.45,PR14,,PR03,,,
8,9,S0002,P0046,2017-01-02,0.0,0.0,11.0,34.5,PR14,,PR03,,,
9,10,S0002,P0051,2017-01-02,7.0,4.54,132.0,0.7,PR14,,PR03,,,


In [20]:
# Last 10 rows of the dataset
sales.tail(10)

Unnamed: 0.1,Unnamed: 0,store_id,product_id,date,sales,revenue,stock,price,promo_type_1,promo_bin_1,promo_type_2,promo_bin_2,promo_discount_2,promo_discount_type_2
8886048,8886049,S0143,P0639,2019-10-31,,,,9.75,PR14,,PR03,,,
8886049,8886050,S0143,P0642,2019-10-31,,,,4.0,PR14,,PR03,,,
8886050,8886051,S0143,P0658,2019-10-31,,,,41.5,PR14,,PR03,,,
8886051,8886052,S0143,P0663,2019-10-31,,,,6.75,PR10,verylow,PR03,,,
8886052,8886053,S0143,P0664,2019-10-31,,,,1.75,PR14,,PR03,,,
8886053,8886054,S0143,P0676,2019-10-31,,,,19.9,PR03,verylow,PR03,,,
8886054,8886055,S0143,P0680,2019-10-31,,,,139.9,PR14,,PR03,,,
8886055,8886056,S0143,P0694,2019-10-31,,,,7.5,PR14,,PR03,,,
8886056,8886057,S0143,P0718,2019-10-31,,,,23.75,PR14,,PR03,,,
8886057,8886058,S0143,P0747,2019-10-31,,,,21.9,PR14,,PR03,,,
