In [1]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
dataset_path = "../../data/raw/" 

### ***Database table relationships (from kaggle)***
<img src="https://i.imgur.com/HRhd2Y0.png" alt="Database table relationships" style="height: 500px; width:900px;"/>

In [3]:
files = os.listdir(dataset_path)
print(f'The dataset contains {len(files)} files:')
for file in files:
    print(f'    * {file}')

The dataset contains 9 files:
    * olist_customers_dataset.csv
    * olist_geolocation_dataset.csv
    * olist_orders_dataset.csv
    * olist_order_items_dataset.csv
    * olist_order_payments_dataset.csv
    * olist_order_reviews_dataset.csv
    * olist_products_dataset.csv
    * olist_sellers_dataset.csv
    * product_category_name_translation.csv


In [4]:
def tables_info(df):
    # csv_file_name = 'olist_customers_dataset.csv'
    # csv_file_path = os.path.join(dataset_path, csv_file_name)
    # df = pd.read_csv(csv_file_path)
    # print(df.head(5))
    # print(df.columns.shape[0])    
    info_df = pd.DataFrame(data=[], index=df.columns)            
    #info_df.loc[:, 'table_name'] = csv_file_name.split('.')[0]
    #info_df.loc[:, 'entities'] = info_df['table_name'].str.split('_dataset').str[0].str.split('olist_').str[-1]
    info_df.loc[:, 'column_name'] = df.columns
    for column in df.columns:           
        info_df.loc[column, 'dtype'] = df[column].dtypes   
        info_df.loc[column,'rows_count'] = df[column].shape[0]
        info_df.loc[column,'non_null_rows_count'] =df[column].notnull().sum()
        info_df.loc[column,'null_row_count'] =df[column].isna().sum()
        info_df.loc[column,'unique_value_count'] = df[column].unique().shape[0]

    info_df.reset_index(drop=True, inplace=True)        
    return info_df

In [5]:
def data_subtype(df, column):
    # column = df.columns[4]
    # print(column)
    df = df[df[column].notnull()].reset_index(drop=True) #Remove possible not null values
    # dtype
    # print(df[column].dtype)
    if df[column].dtype == 'object':    
        #Date/time or str?
        try: #Date/time: Up to now only datetime  
            x = pd.to_datetime(df[column], infer_datetime_format=True)                   
            data_subtype = x.dtype

        except: #if it fails -> str
            if df[column].str.len().unique().shape[0] == 1: #All the elements have the same length if == 1
                data_subtype = f"CHAR({df[column].str.len().unique()[0]})"
            elif df[column].map(lambda x: len(x)).max() > 255: #Variable length higher than 255
                data_subtype = 'TEXT()' 
            else:
                # data_type = f"VARCHAR({df[column].map(lambda x: len(x)).max()})"   
                data_subtype = f"VARCHAR(255)"       
    elif df[column].dtype == 'float64':
        # print(df[column].dtype)
        data_subtype = df[column].dtype
    else: #int64
        # print(df[column].dtype)
        data_subtype = df[column].dtype
    # print(data_subtype)
    example = df[column][random.randint(0,df[column].shape[0])]
    return [data_subtype, example]

In [6]:
def tables_info2(df):
    # csv_file_name = 'olist_customers_dataset.csv'
    # csv_file_path = os.path.join(dataset_path, csv_file_name)
    # df = pd.read_csv(csv_file_path)
    # print(df.head(5))
    # print(df.columns.shape[0])    
    info_df = pd.DataFrame(data=[], index=df.columns)            
    #info_df.loc[:, 'table_name'] = csv_file_name.split('.')[0]
    #info_df.loc[:, 'entities'] = info_df['table_name'].str.split('_dataset').str[0].str.split('olist_').str[-1]
    info_df.loc[:, 'column_name'] = df.columns
    for column in df.columns:           
        info_df.loc[column, 'dtype'] = df[column].dtypes
        data_subtype_example = data_subtype(df, column)
        info_df.loc[column, 'subtype'] = data_subtype_example[0]
        info_df.loc[column, 'example'] = data_subtype_example[1]
        info_df.loc[column,'unique_value_count'] = df[column].unique().shape[0]
        info_df.loc[column,'is_not_null'] =~df[column].isnull().values.any()
        info_df.loc[column, 'is_unique'] = df[column].is_unique        
        info_df.loc[column, 'is_primary_key'] = (column.find('_id') != -1) & info_df.loc[column,'is_not_null'] & info_df.loc[column, 'is_unique']
        info_df.loc[column, 'is_foreign_key'] = (column.find('_id') != -1) & ~info_df.loc[column, 'is_primary_key']
        info_df.loc[column,'is_id'] = (column.find('_id') != -1)       

    info_df.reset_index(drop=True, inplace=True)        
    return info_df

In [7]:
csv_file_name = 'olist_order_items_dataset.csv'
csv_file_path = os.path.join(dataset_path, csv_file_name)
df = pd.read_csv(csv_file_path)

In [8]:
info_df = tables_info(df)
info_df

Unnamed: 0,column_name,dtype,rows_count,non_null_rows_count,null_row_count,unique_value_count
0,order_id,object,112650.0,112650.0,0.0,98666.0
1,order_item_id,int64,112650.0,112650.0,0.0,21.0
2,product_id,object,112650.0,112650.0,0.0,32951.0
3,seller_id,object,112650.0,112650.0,0.0,3095.0
4,shipping_limit_date,object,112650.0,112650.0,0.0,93318.0
5,price,float64,112650.0,112650.0,0.0,5968.0
6,freight_value,float64,112650.0,112650.0,0.0,6999.0


In [9]:
info_df = tables_info2(df)
info_df

Unnamed: 0,column_name,dtype,subtype,example,unique_value_count,is_not_null,is_unique,is_primary_key,is_foreign_key,is_id
0,order_id,object,CHAR(32),a917dadebfac700fb6fcbf80126d4f57,98666.0,True,False,False,True,True
1,order_item_id,int64,int64,3,21.0,True,False,False,True,True
2,product_id,object,CHAR(32),e19ddcc85537b41f22116c8d5425ef46,32951.0,True,False,False,True,True
3,seller_id,object,CHAR(32),71039d19d4303bf9054d69e9a9236699,3095.0,True,False,False,True,True
4,shipping_limit_date,object,datetime64[ns],2017-11-30 14:58:03,93318.0,True,False,False,False,False
5,price,float64,float64,25.0,5968.0,True,False,False,False,False
6,freight_value,float64,float64,14.68,6999.0,True,False,False,False,False


## Comments:
This dataset has ***NO NULL VALUES!***

As shown in the figure above, table `olist_order_items_dataset` is related to `olist_orders_dataset` (through FK: `order_id`),`olist_products_dataset` (through FK: `product_id`) and`olist_sellers_dataset` (through FK: `seller_id`).
1. `order_id` and `order_item_id` columns do not have **ONLY** unique values (meaning none of them is PK), but the combination of both is unique. This can be considered as PK. **Is this optimal?**
2. `order_item_id` is a correlative number specific for each order (max value=21, meaning the biggest order contains 21 items). **See below.**
3. `order_id` column has 112650 values (>99441 in `olist_orders_dataset` -> this is OK considering 1 order may have more than 1 item). However, `unique_value_count` = 98666 < 99441, meaning some orders in `olist_orders_dataset` are not in `olist_order_items_dataset`. **Why?** Missing orders in `olist_order_items_dataset` are not related to any 'order_status' category, they are just missing. We have to agree about what to do here. Maybe, remove these orders from `olist_orders_dataset`. **See below!**
4. `product_id` contains 32951 unique values (same values as `olist_products_dataset`) **See below!**
5. `seller_id` contains 3095 unique values (same values as `olist_sellers_dataset`) **See below!**

### ***Checking point list 1***

In [10]:
df_new_index = (df['order_id'] + '_' + df['order_item_id'].astype(str)).to_frame('order_id + order_item_id')
info_df = tables_info(df_new_index)
info_df

Unnamed: 0,column_name,dtype,rows_count,non_null_rows_count,null_row_count,unique_value_count
0,order_id + order_item_id,object,112650.0,112650.0,0.0,112650.0


In [11]:
info_df = tables_info2(df_new_index)
info_df

Unnamed: 0,column_name,dtype,subtype,example,unique_value_count,is_not_null,is_unique,is_primary_key,is_foreign_key,is_id
0,order_id + order_item_id,object,VARCHAR(255),2b4c32b76a976360b6260712b8765761_1,112650.0,True,True,True,False,True


### ***Checking point list 2***

In [12]:
# Order per number of items -> Mostly 1 item per order (~87.6% of orders)
df['order_item_id'].value_counts()

1     98666
2      9803
3      2287
4       965
5       460
6       256
7        58
8        36
9        28
10       25
11       17
12       13
13        8
14        7
15        5
16        3
17        3
18        3
19        3
20        3
21        1
Name: order_item_id, dtype: int64

In [13]:
# Items per order -> Biggest order contains 21 items
df['order_id'].value_counts()

8272b63d03f5f79c56e9e4120aec44ef    21
1b15974a0141d54e36626dca3fdc731a    20
ab14fdcfbe524636d65ee38360e22ce8    20
9ef13efd6949e4573a18964dd1bbe7f5    15
428a2f660dc84138d969ccd69a0ab6d5    15
                                    ..
5a0911d70c1f85d3bed0df1bf693a6dd     1
5a082b558a3798d3e36d93bfa8ca1eae     1
5a07264682e0b8fbb3f166edbbffc6e8     1
5a071192a28951b76774e5a760c8c9b7     1
fffe41c64501cc87c801fd61db3f6244     1
Name: order_id, Length: 98666, dtype: int64

#### ***Taking a look at biggest order***

In [14]:
idx = 0
info_df = tables_info(df[df['order_id']==df['order_id'].value_counts().index[idx]])
info_df

Unnamed: 0,column_name,dtype,rows_count,non_null_rows_count,null_row_count,unique_value_count
0,order_id,object,21.0,21.0,0.0,1.0
1,order_item_id,int64,21.0,21.0,0.0,21.0
2,product_id,object,21.0,21.0,0.0,3.0
3,seller_id,object,21.0,21.0,0.0,1.0
4,shipping_limit_date,object,21.0,21.0,0.0,1.0
5,price,float64,21.0,21.0,0.0,2.0
6,freight_value,float64,21.0,21.0,0.0,2.0


In [15]:
df[df['order_id']==df['order_id'].value_counts().index[idx]]

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
57297,8272b63d03f5f79c56e9e4120aec44ef,1,270516a3f41dc035aa87d220228f844c,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23,1.2,7.89
57298,8272b63d03f5f79c56e9e4120aec44ef,2,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23,1.2,7.89
57299,8272b63d03f5f79c56e9e4120aec44ef,3,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23,1.2,7.89
57300,8272b63d03f5f79c56e9e4120aec44ef,4,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23,1.2,7.89
57301,8272b63d03f5f79c56e9e4120aec44ef,5,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23,1.2,7.89
57302,8272b63d03f5f79c56e9e4120aec44ef,6,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23,1.2,7.89
57303,8272b63d03f5f79c56e9e4120aec44ef,7,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23,1.2,7.89
57304,8272b63d03f5f79c56e9e4120aec44ef,8,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23,1.2,7.89
57305,8272b63d03f5f79c56e9e4120aec44ef,9,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23,1.2,7.89
57306,8272b63d03f5f79c56e9e4120aec44ef,10,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23,1.2,7.89


### ***Checking point list 3***

In [16]:
# order_id` column has 112650 values (>99441 in `olist_orders_dataset` -> this is OK considering 1 order 
# may have more than 1 item). However, `unique_value_count` < 99441, meaning some orders in `olist_orders_dataset` 
# are not in `olist_orders_dataset`. **Why? Canceled orders?**

In [17]:
csv_file_name = 'olist_orders_dataset.csv'
csv_file_path = os.path.join(dataset_path, csv_file_name)
order_df = pd.read_csv(csv_file_path)

In [18]:
list_1 = order_df['order_id'].unique().tolist()
list_1.sort(key = str)
print(f'List 1 has {len(list_1)} elements')

List 1 has 99441 elements


In [19]:
list_2 = df['order_id'].unique().tolist()
list_2.sort(key = str)
print(f'List 2 has {len(list_2)} elements')

List 2 has 98666 elements


In [20]:
set_1 = set(list_1)
set_2 = set(list_2)
missing = list(sorted(set_1 - set_2))
print(f'There are {len(missing)} missing orders in olist_order_items_dataset. Why?')

There are 775 missing orders in olist_order_items_dataset. Why?


In [21]:
# Missing orders in olist_order_items_dataset
missing_orders_df = order_df[order_df['order_id'].isin(missing)]
missing_orders_df#['order_id']#.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
266,8e24261a7e58791d10cb1bf9da94df5c,64a254d30eed42cd0e6c36dddb88adf0,unavailable,2017-11-16 15:09:28,2017-11-16 15:26:57,,,2017-12-05 00:00:00
586,c272bcd21c287498b4883c7512019702,9582c5bbecc65eb568e2c1d839b5cba1,unavailable,2018-01-31 11:31:37,2018-01-31 14:23:50,,,2018-02-16 00:00:00
687,37553832a3a89c9b2db59701c357ca67,7607cd563696c27ede287e515812d528,unavailable,2017-08-14 17:38:02,2017-08-17 00:15:18,,,2017-09-05 00:00:00
737,d57e15fb07fd180f06ab3926b39edcd2,470b93b3f1cde85550fc74cd3a476c78,unavailable,2018-01-08 19:39:03,2018-01-09 07:26:08,,,2018-02-06 00:00:00
1130,00b1cb0320190ca0daa2c88b35206009,3532ba38a3fd242259a514ac2b6ae6b6,canceled,2018-08-28 15:26:39,,,,2018-09-12 00:00:00
...,...,...,...,...,...,...,...,...
99252,aaab15da689073f8f9aa978a390a69d1,df20748206e4b865b2f14a5eabbfcf34,unavailable,2018-01-16 14:27:59,2018-01-17 03:37:34,,,2018-02-06 00:00:00
99283,3a3cddda5a7c27851bd96c3313412840,0b0d6095c5555fe083844281f6b093bb,canceled,2018-08-31 16:13:44,,,,2018-10-01 00:00:00
99347,a89abace0dcc01eeb267a9660b5ac126,2f0524a7b1b3845a1a57fcf3910c4333,canceled,2018-09-06 18:45:47,,,,2018-09-27 00:00:00
99348,a69ba794cc7deb415c3e15a0a3877e69,726f0894b5becdf952ea537d5266e543,unavailable,2017-08-23 16:28:04,2017-08-28 15:44:47,,,2017-09-15 00:00:00


In [22]:
# Checking their order_status -> not only 'unavailable' or 'canceled' are missing. Also 'shipped' items.
missing_orders_df['order_status'].value_counts() 

unavailable    603
canceled       164
created          5
invoiced         2
shipped          1
Name: order_status, dtype: int64

In [23]:
# On the other hand, not ALL 'canceled' or 'unavailable' missing.
order_df['order_status'].value_counts()

delivered      96478
shipped         1107
canceled         625
unavailable      609
invoiced         314
processing       301
created            5
approved           2
Name: order_status, dtype: int64

In [24]:
# Summary per 'order_status'
# Missing orders in olist_order_items_dataset are not defined by 'order_status' categories, they are just missing. 
# We have to agree about what to do here. Maybe, remove these from olist_orders_dataset
sum_order_status = order_df['order_status'].value_counts().to_frame('All orders')
sum_order_status['Missing orders in order_items_dataset'] = missing_orders_df['order_status'].value_counts()
sum_order_status['Missing orders in order_items_dataset'].fillna(0, inplace=True)
sum_order_status['Total orders in order_items_dataset'] = sum_order_status['All orders'] - sum_order_status['Missing orders in order_items_dataset']
sum_order_status.iloc[:, [0,2,1]]

Unnamed: 0,All orders,Total orders in order_items_dataset,Missing orders in order_items_dataset
delivered,96478,96478.0,0.0
shipped,1107,1106.0,1.0
canceled,625,461.0,164.0
unavailable,609,6.0,603.0
invoiced,314,312.0,2.0
processing,301,301.0,0.0
created,5,0.0,5.0
approved,2,2.0,0.0


In [25]:
status = 'canceled'
not_missing_orders_df = order_df[~order_df['order_id'].isin(missing)]
idx = 1
not_missing_orders_df[not_missing_orders_df['order_status'] == status].reset_index(drop=True).iloc[idx]

order_id                         714fb133a6730ab81fa1d3c1b2007291
customer_id                      e3fe72696c4713d64d3c10afe71e75ed
order_status                                             canceled
order_purchase_timestamp                      2018-01-26 21:34:08
order_approved_at                             2018-01-26 21:58:39
order_delivered_carrier_date                  2018-01-29 22:33:25
order_delivered_customer_date                                 NaN
order_estimated_delivery_date                 2018-02-22 00:00:00
Name: 1, dtype: object

In [26]:
idxs = not_missing_orders_df[not_missing_orders_df['order_status'] == status].reset_index(drop=True).iloc[idx]
df[df['order_id'].isin(idxs)]

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
49872,714fb133a6730ab81fa1d3c1b2007291,1,a0b7d5a992ccda646f2d34e418fff5a0,95f83f51203c626648c875dd41874c7f,2018-02-01 21:58:39,69.9,26.11


### ***Checking point list 4***

In [27]:
#product_id contains 32951 unique values (same values as olist_products_dataset.csv)
csv_file_name = 'olist_products_dataset.csv'
csv_file_path = os.path.join(dataset_path, csv_file_name)
product_df = pd.read_csv(csv_file_path)

In [28]:
list_1 = product_df['product_id'].unique().tolist()
list_1.sort(key = str)
list_1[0:5]

['00066f42aeeb9f3007548bb9d3f33c38',
 '00088930e925c41fd95ebfe695fd2655',
 '0009406fd7479715e4bef61dd91f2462',
 '000b8f95fcb9e0096488278317764d19',
 '000d9be29b5207b54e86aa1b1ac54872']

In [29]:
list_2 = df['product_id'].unique().tolist()
list_2.sort(key = str)
list_2[0:5]

['00066f42aeeb9f3007548bb9d3f33c38',
 '00088930e925c41fd95ebfe695fd2655',
 '0009406fd7479715e4bef61dd91f2462',
 '000b8f95fcb9e0096488278317764d19',
 '000d9be29b5207b54e86aa1b1ac54872']

In [30]:
list_1.sort(key = str) == list_2.sort(key = str)

True

### ***Checking point list 5***

In [31]:
#seller_id contains 3095 unique values (same values as olist_sellers_dataset.csv) 
csv_file_name = 'olist_sellers_dataset.csv'
csv_file_path = os.path.join(dataset_path, csv_file_name)
seller_df = pd.read_csv(csv_file_path)

In [32]:
list_1 = seller_df['seller_id'].unique().tolist()
list_1.sort(key = str)
list_1[0:10]

['0015a82c2db000af6aaaf3ae2ecb0532',
 '001cca7ae9ae17fb1caed9dfb1094831',
 '001e6ad469a905060d959994f1b41e4f',
 '002100f778ceb8431b7a1020ff7ab48f',
 '003554e2dce176b5555353e4f3555ac8',
 '004c9cd9d87a3c30c522c48c4fc07416',
 '00720abe85ba0859807595bbf045a33b',
 '00ab3eff1b5192e5f1a63bcecfee11c8',
 '00d8b143d12632bad99c0ad66ad52825',
 '00ee68308b45bc5e2660cd833c3f81cc']

In [33]:
list_2 = df['seller_id'].unique().tolist()
list_2.sort(key = str)
list_2[0:10]

['0015a82c2db000af6aaaf3ae2ecb0532',
 '001cca7ae9ae17fb1caed9dfb1094831',
 '001e6ad469a905060d959994f1b41e4f',
 '002100f778ceb8431b7a1020ff7ab48f',
 '003554e2dce176b5555353e4f3555ac8',
 '004c9cd9d87a3c30c522c48c4fc07416',
 '00720abe85ba0859807595bbf045a33b',
 '00ab3eff1b5192e5f1a63bcecfee11c8',
 '00d8b143d12632bad99c0ad66ad52825',
 '00ee68308b45bc5e2660cd833c3f81cc']

In [34]:
list_1.sort(key = str) == list_2.sort(key = str)

True