In [1]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
dataset_path = "../../data/raw/" 

### ***Database table relationships (from kaggle)***
<img src="https://i.imgur.com/HRhd2Y0.png" alt="Database table relationships" style="height: 500px; width:900px;"/>

In [3]:
files = os.listdir(dataset_path)
print(f'The dataset contains {len(files)} files:')
for file in files:
    print(f'    * {file}')

The dataset contains 9 files:
    * olist_customers_dataset.csv
    * olist_geolocation_dataset.csv
    * olist_orders_dataset.csv
    * olist_order_items_dataset.csv
    * olist_order_payments_dataset.csv
    * olist_order_reviews_dataset.csv
    * olist_products_dataset.csv
    * olist_sellers_dataset.csv
    * product_category_name_translation.csv


In [4]:
def tables_info(df):
    # csv_file_name = 'olist_customers_dataset.csv'
    # csv_file_path = os.path.join(dataset_path, csv_file_name)
    # df = pd.read_csv(csv_file_path)
    # print(df.head(5))
    # print(df.columns.shape[0])    
    info_df = pd.DataFrame(data=[], index=df.columns)            
    #info_df.loc[:, 'table_name'] = csv_file_name.split('.')[0]
    #info_df.loc[:, 'entities'] = info_df['table_name'].str.split('_dataset').str[0].str.split('olist_').str[-1]
    info_df.loc[:, 'column_name'] = df.columns
    for column in df.columns:           
        info_df.loc[column, 'dtype'] = df[column].dtypes   
        info_df.loc[column,'rows_count'] = df[column].shape[0]
        info_df.loc[column,'non_null_rows_count'] =df[column].notnull().sum()
        info_df.loc[column,'null_row_count'] =df[column].isna().sum()
        info_df.loc[column,'unique_value_count'] = df[column].unique().shape[0]

    info_df.reset_index(drop=True, inplace=True)        
    return info_df

In [5]:
def data_subtype(df, column):
    # column = df.columns[4]
    # print(column)
    df = df[df[column].notnull()].reset_index(drop=True) #Remove possible not null values
    # dtype
    # print(df[column].dtype)
    if df[column].dtype == 'object':    
        #Date/time or str?
        try: #Date/time: Up to now only datetime  
            x = pd.to_datetime(df[column], infer_datetime_format=True)                   
            data_subtype = x.dtype

        except: #if it fails -> str
            if df[column].str.len().unique().shape[0] == 1: #All the elements have the same length if == 1
                data_subtype = f"CHAR({df[column].str.len().unique()[0]})"
            elif df[column].map(lambda x: len(x)).max() > 255: #Variable length higher than 255
                data_subtype = 'TEXT()' 
            else:
                # data_type = f"VARCHAR({df[column].map(lambda x: len(x)).max()})"   
                data_subtype = f"VARCHAR(255)"       
    elif df[column].dtype == 'float64':
        # print(df[column].dtype)
        data_subtype = df[column].dtype
    else: #int64
        # print(df[column].dtype)
        data_subtype = df[column].dtype
    # print(data_subtype)
    example = df[column][random.randint(0,df[column].shape[0])]
    return [data_subtype, example]

In [6]:
def tables_info2(df):
    # csv_file_name = 'olist_customers_dataset.csv'
    # csv_file_path = os.path.join(dataset_path, csv_file_name)
    # df = pd.read_csv(csv_file_path)
    # print(df.head(5))
    # print(df.columns.shape[0])    
    info_df = pd.DataFrame(data=[], index=df.columns)            
    #info_df.loc[:, 'table_name'] = csv_file_name.split('.')[0]
    #info_df.loc[:, 'entities'] = info_df['table_name'].str.split('_dataset').str[0].str.split('olist_').str[-1]
    info_df.loc[:, 'column_name'] = df.columns
    for column in df.columns:           
        info_df.loc[column, 'dtype'] = df[column].dtypes
        data_subtype_example = data_subtype(df, column)
        info_df.loc[column, 'subtype'] = data_subtype_example[0]
        info_df.loc[column, 'example'] = data_subtype_example[1]
        info_df.loc[column,'unique_value_count'] = df[column].unique().shape[0]
        info_df.loc[column,'is_not_null'] =~df[column].isnull().values.any()
        info_df.loc[column, 'is_unique'] = df[column].is_unique        
        info_df.loc[column, 'is_primary_key'] = (column.find('_id') != -1) & info_df.loc[column,'is_not_null'] & info_df.loc[column, 'is_unique']
        info_df.loc[column, 'is_foreign_key'] = (column.find('_id') != -1) & ~info_df.loc[column, 'is_primary_key']
        info_df.loc[column,'is_id'] = (column.find('_id') != -1)       

    info_df.reset_index(drop=True, inplace=True)        
    return info_df

In [7]:
csv_file_name = 'olist_order_payments_dataset.csv'
csv_file_path = os.path.join(dataset_path, csv_file_name)
df = pd.read_csv(csv_file_path)

In [8]:
info_df = tables_info(df)
info_df

Unnamed: 0,column_name,dtype,rows_count,non_null_rows_count,null_row_count,unique_value_count
0,order_id,object,103886.0,103886.0,0.0,99440.0
1,payment_sequential,int64,103886.0,103886.0,0.0,29.0
2,payment_type,object,103886.0,103886.0,0.0,5.0
3,payment_installments,int64,103886.0,103886.0,0.0,24.0
4,payment_value,float64,103886.0,103886.0,0.0,29077.0


In [9]:
info_df = tables_info2(df)
info_df

Unnamed: 0,column_name,dtype,subtype,example,unique_value_count,is_not_null,is_unique,is_primary_key,is_foreign_key,is_id
0,order_id,object,CHAR(32),e77f09ad0e02176a15af6981ee2d8ea0,99440.0,True,False,False,True,True
1,payment_sequential,int64,int64,1,29.0,True,False,False,False,False
2,payment_type,object,VARCHAR(255),credit_card,5.0,True,False,False,False,False
3,payment_installments,int64,int64,2,24.0,True,False,False,False,False
4,payment_value,float64,float64,99.37,29077.0,True,False,False,False,False


## Comments:
This dataset has ***NO NULL VALUES!***

As shown in the figure above, table `olist_order_payments_dataset` is related to `olist_orders_dataset` (through FK: `order_id`).
1. `order_id` and `payment_sequential` columns do not have **ONLY** unique values (meaning none of them is PK), but the combination of both is unique. This can be considered as PK. **Is this optimal?**
2. `payment_sequential` is a correlative number specific for each order (max value=29, meaning the biggest order contains 21 items). **See below.**
3. `order_id` column has 103886 values (>99441 in `olist_orders_dataset` -> this is OK considering 1 order may have more than 1 item). However, `unique_value_count` = 99940 < 99441, meaning some orders in `olist_orders_dataset` are not in `olist_order_payments_dataset`. **Why?** Missing orders in `olist_order_payments_dataset` are not related to any 'order_status' category, they are just missing. We have to agree about what to do here. Maybe, remove these orders from `olist_orders_dataset`. ***ONLY ONE ROW IS MISSING*** **(See below!)**

### ***Checking point list 1***

In [10]:
df_new_index = (df['order_id'] + '_' + df['payment_sequential'].astype(str)).to_frame('order_id + payment_sequential')
info_df = tables_info(df_new_index)
info_df

Unnamed: 0,column_name,dtype,rows_count,non_null_rows_count,null_row_count,unique_value_count
0,order_id + payment_sequential,object,103886.0,103886.0,0.0,103886.0


In [11]:
info_df = tables_info2(df_new_index)
info_df

Unnamed: 0,column_name,dtype,subtype,example,unique_value_count,is_not_null,is_unique,is_primary_key,is_foreign_key,is_id
0,order_id + payment_sequential,object,VARCHAR(255),5c748cfa838e65b06c230b1f917ec95e_1,103886.0,True,True,True,False,True


### ***Checking point list 2***

In [12]:
# Order per number of items -> Mostly 1 item per order (~87.6% of orders)
df['payment_sequential'].value_counts()

1     99360
2      3039
3       581
4       278
5       170
6       118
7        82
8        54
9        43
10       34
11       29
12       21
13       13
14       10
15        8
18        6
19        6
16        6
17        6
21        4
20        4
22        3
26        2
24        2
23        2
25        2
29        1
28        1
27        1
Name: payment_sequential, dtype: int64

In [13]:
# Items per order -> Biggest order contains 29 items
df['order_id'].value_counts()

fa65dad1b0e818e3ccc5cb0e39231352    29
ccf804e764ed5650cd8759557269dc13    26
285c2e15bebd4ac83635ccc563dc71f4    22
895ab968e7bb0d5659d16cd74cd1650c    21
fedcd9f7ccdc8cba3a18defedd1a5547    19
                                    ..
6d2a30c9b7dcee3ed507dc9a601f99e7     1
a7737f6d9208dd56ea498a322ed3c37f     1
646e62df54f3e236eb6d5ff3b31429b8     1
e115da7a49ec2acf622e1f31da65cfb9     1
28bbae6599b09d39ca406b747b6632b1     1
Name: order_id, Length: 99440, dtype: int64

#### ***Taking a look at biggest order***

In [14]:
idx = 0
info_df = tables_info(df[df['order_id']==df['order_id'].value_counts().index[idx]])
info_df

Unnamed: 0,column_name,dtype,rows_count,non_null_rows_count,null_row_count,unique_value_count
0,order_id,object,29.0,29.0,0.0,1.0
1,payment_sequential,int64,29.0,29.0,0.0,29.0
2,payment_type,object,29.0,29.0,0.0,1.0
3,payment_installments,int64,29.0,29.0,0.0,1.0
4,payment_value,float64,29.0,29.0,0.0,25.0


In [15]:
df[df['order_id']==df['order_id'].value_counts().index[idx]].sort_values('payment_sequential')

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
14321,fa65dad1b0e818e3ccc5cb0e39231352,1,voucher,1,3.71
23074,fa65dad1b0e818e3ccc5cb0e39231352,2,voucher,1,8.51
65641,fa65dad1b0e818e3ccc5cb0e39231352,3,voucher,1,2.95
9985,fa65dad1b0e818e3ccc5cb0e39231352,4,voucher,1,29.16
28330,fa65dad1b0e818e3ccc5cb0e39231352,5,voucher,1,0.66
29648,fa65dad1b0e818e3ccc5cb0e39231352,6,voucher,1,5.02
82593,fa65dad1b0e818e3ccc5cb0e39231352,7,voucher,1,0.32
68853,fa65dad1b0e818e3ccc5cb0e39231352,8,voucher,1,26.02
17274,fa65dad1b0e818e3ccc5cb0e39231352,9,voucher,1,1.08
19565,fa65dad1b0e818e3ccc5cb0e39231352,10,voucher,1,12.86


### ***Checking point list 3***

In [16]:
# order_id` column has 112650 values (>99441 in `olist_orders_dataset` -> this is OK considering 1 order 
# may have more than 1 item). However, `unique_value_count` < 99441, meaning some orders in `olist_orders_dataset` 
# are not in `olist_orders_dataset`. **Why? Canceled orders?**

In [17]:
csv_file_name = 'olist_orders_dataset.csv'
csv_file_path = os.path.join(dataset_path, csv_file_name)
order_df = pd.read_csv(csv_file_path)

In [18]:
list_1 = order_df['order_id'].unique().tolist()
list_1.sort(key = str)
print(f'List 1 has {len(list_1)} elements')

List 1 has 99441 elements


In [19]:
list_2 = df['order_id'].unique().tolist()
list_2.sort(key = str)
print(f'List 2 has {len(list_2)} elements')

List 2 has 99440 elements


In [20]:
set_1 = set(list_1)
set_2 = set(list_2)
missing = list(sorted(set_1 - set_2))
print(f'There are {len(missing)} missing orders in olist_order_items_dataset. Why?')

There are 1 missing orders in olist_order_items_dataset. Why?


In [21]:
# Missing orders in olist_order_items_dataset
missing_orders_df = order_df[order_df['order_id'].isin(missing)]
missing_orders_df#.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
30710,bfbd0f9bdef84302105ad712db648a6c,86dc2ffce2dfff336de2f386a786e574,delivered,2016-09-15 12:16:38,2016-09-15 12:16:38,2016-11-07 17:11:53,2016-11-09 07:47:38,2016-10-04 00:00:00


In [22]:
# Checking their order_status -> not only 'unavailable' or 'canceled' are missing. Also 'shipped' items.
missing_orders_df['order_status'].value_counts() 

delivered    1
Name: order_status, dtype: int64

In [23]:
# On the other hand, not ALL 'canceled' or 'unavailable' missing.
order_df['order_status'].value_counts()

delivered      96478
shipped         1107
canceled         625
unavailable      609
invoiced         314
processing       301
created            5
approved           2
Name: order_status, dtype: int64

In [24]:
# Summary per 'order_status'
# Missing orders in olist_order_items_dataset are not defined by 'order_status' categories, they are just missing. 
# We have to agree about what to do here. Maybe, remove these from olist_orders_dataset
sum_order_status = order_df['order_status'].value_counts().to_frame('All orders')
sum_order_status['Missing orders in order_items_dataset'] = missing_orders_df['order_status'].value_counts()
sum_order_status['Missing orders in order_items_dataset'].fillna(0, inplace=True)
sum_order_status['Total orders in order_items_dataset'] = sum_order_status['All orders'] - sum_order_status['Missing orders in order_items_dataset']
sum_order_status.iloc[:, [0,2,1]]

Unnamed: 0,All orders,Total orders in order_items_dataset,Missing orders in order_items_dataset
delivered,96478,96477.0,1.0
shipped,1107,1107.0,0.0
canceled,625,625.0,0.0
unavailable,609,609.0,0.0
invoiced,314,314.0,0.0
processing,301,301.0,0.0
created,5,5.0,0.0
approved,2,2.0,0.0


In [25]:
status = 'delivered'
not_missing_orders_df = order_df[~order_df['order_id'].isin(missing)]
idx = 1
not_missing_orders_df[not_missing_orders_df['order_status'] == status].reset_index(drop=True).iloc[idx]

order_id                         53cdb2fc8bc7dce0b6741e2150273451
customer_id                      b0830fb4747a6c6d20dea0b8c802d7ef
order_status                                            delivered
order_purchase_timestamp                      2018-07-24 20:41:37
order_approved_at                             2018-07-26 03:24:27
order_delivered_carrier_date                  2018-07-26 14:31:00
order_delivered_customer_date                 2018-08-07 15:27:45
order_estimated_delivery_date                 2018-08-13 00:00:00
Name: 1, dtype: object

In [26]:
idxs = not_missing_orders_df[not_missing_orders_df['order_status'] == status].reset_index(drop=True).iloc[idx]
df[df['order_id'].isin(idxs)]

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
7652,53cdb2fc8bc7dce0b6741e2150273451,1,boleto,1,141.46
