In [1]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
dataset_path = "../../data/raw/" 

### ***Analyzing the files***

In [3]:
files = os.listdir(dataset_path)
print(f'The dataset contains {len(files)} files:')
for file in files:
    print(f'    * {file}')

The dataset contains 9 files:
    * olist_customers_dataset.csv
    * olist_geolocation_dataset.csv
    * olist_orders_dataset.csv
    * olist_order_items_dataset.csv
    * olist_order_payments_dataset.csv
    * olist_order_reviews_dataset.csv
    * olist_products_dataset.csv
    * olist_sellers_dataset.csv
    * product_category_name_translation.csv


### ***Getting info from tables***
#### column_name, dtype, rows_count, non_null_rows_count, null_row_count, unique_value_count

In [4]:
def tables_info(dataset_path, csv_file_name):
    # csv_file_name = 'olist_customers_dataset.csv'
    csv_file_path = os.path.join(dataset_path, csv_file_name)
    df = pd.read_csv(csv_file_path)
    # print(df.head(5))
    # print(df.columns.shape[0])    
    info_df = pd.DataFrame(data=[], index=df.columns)            
    info_df.loc[:, 'table_name'] = csv_file_name.split('.')[0]
    info_df.loc[:, 'entities'] = info_df['table_name'].str.split('_dataset').str[0].str.split('olist_').str[-1]
    info_df.loc[:, 'column_name'] = df.columns
    for column in df.columns:           
        info_df.loc[column, 'dtype'] = df[column].dtypes   
        info_df.loc[column,'rows_count'] = df[column].shape[0]
        info_df.loc[column,'non_null_rows_count'] =df[column].notnull().sum()
        info_df.loc[column,'null_row_count'] =df[column].isna().sum()
        info_df.loc[column,'unique_value_count'] = df[column].unique().shape[0]

    info_df.reset_index(drop=True, inplace=True)        
    return info_df

#### ***For only one table***

In [5]:
info_df = tables_info(dataset_path, csv_file_name='olist_orders_dataset.csv')
info_df

Unnamed: 0,table_name,entities,column_name,dtype,rows_count,non_null_rows_count,null_row_count,unique_value_count
0,olist_orders_dataset,orders,order_id,object,99441.0,99441.0,0.0,99441.0
1,olist_orders_dataset,orders,customer_id,object,99441.0,99441.0,0.0,99441.0
2,olist_orders_dataset,orders,order_status,object,99441.0,99441.0,0.0,8.0
3,olist_orders_dataset,orders,order_purchase_timestamp,object,99441.0,99441.0,0.0,98875.0
4,olist_orders_dataset,orders,order_approved_at,object,99441.0,99281.0,160.0,90734.0
5,olist_orders_dataset,orders,order_delivered_carrier_date,object,99441.0,97658.0,1783.0,81019.0
6,olist_orders_dataset,orders,order_delivered_customer_date,object,99441.0,96476.0,2965.0,95665.0
7,olist_orders_dataset,orders,order_estimated_delivery_date,object,99441.0,99441.0,0.0,459.0


In [6]:
info_df = pd.DataFrame([])
# print(info_df)
for file in files:
    print(f"{file}")
    info_df1 = tables_info(dataset_path, file)    
    if info_df.empty:
        # print('DataFrame is empty!')
        info_df = info_df1.copy()
    else:
        info_df = pd.concat([info_df, info_df1], axis=0)

info_df.reset_index(drop=True, inplace=True)


olist_customers_dataset.csv
olist_geolocation_dataset.csv
olist_orders_dataset.csv
olist_order_items_dataset.csv
olist_order_payments_dataset.csv
olist_order_reviews_dataset.csv
olist_products_dataset.csv
olist_sellers_dataset.csv
product_category_name_translation.csv


#### ***For only one table***

In [7]:
info_df

Unnamed: 0,table_name,entities,column_name,dtype,rows_count,non_null_rows_count,null_row_count,unique_value_count
0,olist_customers_dataset,customers,customer_id,object,99441.0,99441.0,0.0,99441.0
1,olist_customers_dataset,customers,customer_unique_id,object,99441.0,99441.0,0.0,96096.0
2,olist_customers_dataset,customers,customer_zip_code_prefix,int64,99441.0,99441.0,0.0,14994.0
3,olist_customers_dataset,customers,customer_city,object,99441.0,99441.0,0.0,4119.0
4,olist_customers_dataset,customers,customer_state,object,99441.0,99441.0,0.0,27.0
5,olist_geolocation_dataset,geolocation,geolocation_zip_code_prefix,int64,1000163.0,1000163.0,0.0,19015.0
6,olist_geolocation_dataset,geolocation,geolocation_lat,float64,1000163.0,1000163.0,0.0,717360.0
7,olist_geolocation_dataset,geolocation,geolocation_lng,float64,1000163.0,1000163.0,0.0,717613.0
8,olist_geolocation_dataset,geolocation,geolocation_city,object,1000163.0,1000163.0,0.0,8011.0
9,olist_geolocation_dataset,geolocation,geolocation_state,object,1000163.0,1000163.0,0.0,27.0


#### ***Total number of columns***

In [8]:
info_df.groupby('table_name')['column_name'].count().sum()

52

#### ***Total number of dtypes***

In [9]:
info_df['dtype'].value_counts()

object     33
float64    12
int64       7
Name: dtype, dtype: int64

#### ***Most repeated columns in tables***

In [10]:
info_df['column_name'].value_counts()[0:6]

order_id                 4
customer_id              2
product_category_name    2
seller_id                2
product_id               2
payment_value            1
Name: column_name, dtype: int64

#### ***Looking for related columns and tables***

In [11]:
info_df_new = info_df[['entities', 'column_name']].copy()
info_df_new['present_at_table'] = np.nan
info_df_new['present_#_tables'] = np.nan
for i, row in info_df_new.iterrows():
    item = row['column_name']
    # print(item)
    value = info_df_new[info_df_new['column_name']== item]['entities'].to_list()
    info_df_new['present_at_table'][i] = value    
    info_df_new['present_#_tables'][i] = len(value)
info_df_new_sort = info_df_new.sort_values(by='present_#_tables', ascending=False)
info_df_new_clean = info_df_new_sort.drop_duplicates(subset=['column_name'])[['column_name', 'present_at_table', 'present_#_tables']]
info_df_new_clean = info_df_new_clean[info_df_new_clean['present_#_tables'] > 1].reset_index(drop=True)
info_df_new_clean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  info_df_new['present_at_table'][i] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  info_df_new['present_#_tables'][i] = len(value)


Unnamed: 0,column_name,present_at_table,present_#_tables
0,order_id,"[orders, order_items, order_payments, order_re...",4.0
1,customer_id,"[customers, orders]",2.0
2,product_id,"[order_items, products]",2.0
3,seller_id,"[order_items, sellers]",2.0
4,product_category_name,"[products, product_category_name_translation]",2.0


In [12]:
info_df_new_clean['present_at_table'][0]

['orders', 'order_items', 'order_payments', 'order_reviews']

In [13]:
info_df[info_df['column_name'] == "order_id"]

Unnamed: 0,table_name,entities,column_name,dtype,rows_count,non_null_rows_count,null_row_count,unique_value_count
10,olist_orders_dataset,orders,order_id,object,99441.0,99441.0,0.0,99441.0
18,olist_order_items_dataset,order_items,order_id,object,112650.0,112650.0,0.0,98666.0
25,olist_order_payments_dataset,order_payments,order_id,object,103886.0,103886.0,0.0,99440.0
31,olist_order_reviews_dataset,order_reviews,order_id,object,99224.0,99224.0,0.0,98673.0


### ***Database table relationships (from kaggle)***
<img src="https://i.imgur.com/HRhd2Y0.png" alt="Database table relationships" style="height: 600px; width:1000px;"/>


### ***Getting columns data types and constraints***

In [14]:
def data_subtype(df, column):
    # column = df.columns[4]
    # print(column)
    df = df[df[column].notnull()].reset_index(drop=True) #Remove possible not null values
    # dtype
    # print(df[column].dtype)
    if df[column].dtype == 'object':    
        #Date/time or str?
        try: #Date/time: Up to now only datetime
            x = pd.to_datetime(df[column], infer_datetime_format=True)                   
            data_subtype = x.dtype

        except: #if it fails -> str
            if df[column].str.len().unique().shape[0] == 1: #All the elements have the same length if == 1
                data_subtype = f"CHAR({df[column].str.len().unique()[0]})"
            elif df[column].map(lambda x: len(x)).max() > 255: #Variable length higher than 255
                data_subtype = 'TEXT()' 
            else:
                # data_type = f"VARCHAR({df[column].map(lambda x: len(x)).max()})"   
                data_subtype = f"VARCHAR(255)"       
    elif df[column].dtype == 'float64':
        # print(df[column].dtype)
        data_subtype = df[column].dtype
    else: #int64
        # print(df[column].dtype)
        data_subtype = df[column].dtype
    # print(data_subtype)
    example = df[column][random.randint(0,df[column].shape[0])]
    return [data_subtype, example]

In [15]:
def tables_info2(dataset_path, csv_file_name):
    # csv_file_name = 'olist_customers_dataset.csv'
    csv_file_path = os.path.join(dataset_path, csv_file_name)
    df = pd.read_csv(csv_file_path)
    # print(df.head(5))
    # print(df.columns.shape[0])    
    info_df = pd.DataFrame(data=[], index=df.columns)            
    info_df.loc[:, 'table_name'] = csv_file_name.split('.')[0]
    info_df.loc[:, 'entities'] = info_df['table_name'].str.split('_dataset').str[0].str.split('olist_').str[-1]
    info_df.loc[:, 'column_name'] = df.columns
    for column in df.columns:           
        info_df.loc[column, 'dtype'] = df[column].dtypes
        data_subtype_example = data_subtype(df, column)
        info_df.loc[column, 'subtype'] = data_subtype_example[0]
        info_df.loc[column, 'example'] = data_subtype_example[1]
        info_df.loc[column,'unique_value_count'] = df[column].unique().shape[0]
        info_df.loc[column,'is_not_null'] =~df[column].isnull().values.any()
        info_df.loc[column, 'is_unique'] = df[column].is_unique        
        info_df.loc[column, 'is_primary_key'] = (column.find('_id') != -1) & info_df.loc[column,'is_not_null'] & info_df.loc[column, 'is_unique']
        info_df.loc[column, 'is_foreign_key'] = (column.find('_id') != -1) & ~info_df.loc[column, 'is_primary_key']
        info_df.loc[column,'is_id'] = (column.find('_id') != -1)       

    info_df.reset_index(drop=True, inplace=True)        
    return info_df

#### ***For only one table***

In [16]:
csv_file_name='olist_orders_dataset.csv'
# csv_file_name = 'olist_order_items_dataset.csv'
df = tables_info2(dataset_path, csv_file_name)
df

Unnamed: 0,table_name,entities,column_name,dtype,subtype,example,unique_value_count,is_not_null,is_unique,is_primary_key,is_foreign_key,is_id
0,olist_orders_dataset,orders,order_id,object,CHAR(32),c7b4a7a4d974418197e38cc0f6c2ea85,99441.0,True,True,True,False,True
1,olist_orders_dataset,orders,customer_id,object,CHAR(32),431bdad95e6781a18b295b98d87335fc,99441.0,True,True,True,False,True
2,olist_orders_dataset,orders,order_status,object,VARCHAR(255),delivered,8.0,True,False,False,False,False
3,olist_orders_dataset,orders,order_purchase_timestamp,object,datetime64[ns],2018-02-21 12:23:15,98875.0,True,False,False,False,False
4,olist_orders_dataset,orders,order_approved_at,object,datetime64[ns],2017-07-17 18:23:33,90734.0,False,False,False,False,False
5,olist_orders_dataset,orders,order_delivered_carrier_date,object,datetime64[ns],2018-01-25 21:42:52,81019.0,False,False,False,False,False
6,olist_orders_dataset,orders,order_delivered_customer_date,object,datetime64[ns],2017-05-04 17:34:52,95665.0,False,False,False,False,False
7,olist_orders_dataset,orders,order_estimated_delivery_date,object,datetime64[ns],2018-07-24 00:00:00,459.0,True,False,False,False,False


#### ***For all the tables***

In [18]:
info_df = pd.DataFrame([])
# print(info_df)
for file in files:
    print(f"{file}")
    info_df1 = tables_info2(dataset_path, file)    
    if info_df.empty:
        # print('DataFrame is empty!')
        info_df = info_df1.copy()
    else:
        info_df = pd.concat([info_df, info_df1], axis=0)

info_df.reset_index(drop=True, inplace=True)

olist_customers_dataset.csv
olist_geolocation_dataset.csv
olist_orders_dataset.csv
olist_order_items_dataset.csv
olist_order_payments_dataset.csv
olist_order_reviews_dataset.csv
olist_products_dataset.csv
olist_sellers_dataset.csv
product_category_name_translation.csv


In [19]:
info_df

Unnamed: 0,table_name,entities,column_name,dtype,subtype,example,unique_value_count,is_not_null,is_unique,is_primary_key,is_foreign_key,is_id
0,olist_customers_dataset,customers,customer_id,object,CHAR(32),59c56e683b39c2ca0c21348d5e5eac99,99441.0,True,True,True,False,True
1,olist_customers_dataset,customers,customer_unique_id,object,CHAR(32),e6eb191ddc04af07531ff36d2caca1ff,96096.0,True,False,False,True,True
2,olist_customers_dataset,customers,customer_zip_code_prefix,int64,int64,13566,14994.0,True,False,False,False,False
3,olist_customers_dataset,customers,customer_city,object,VARCHAR(255),fortaleza,4119.0,True,False,False,False,False
4,olist_customers_dataset,customers,customer_state,object,CHAR(2),SP,27.0,True,False,False,False,False
5,olist_geolocation_dataset,geolocation,geolocation_zip_code_prefix,int64,int64,11900.0,19015.0,True,False,False,False,False
6,olist_geolocation_dataset,geolocation,geolocation_lat,float64,float64,-22.90042,717360.0,True,False,False,False,False
7,olist_geolocation_dataset,geolocation,geolocation_lng,float64,float64,-46.576368,717613.0,True,False,False,False,False
8,olist_geolocation_dataset,geolocation,geolocation_city,object,VARCHAR(255),ivinhema,8011.0,True,False,False,False,False
9,olist_geolocation_dataset,geolocation,geolocation_state,object,CHAR(2),RJ,27.0,True,False,False,False,False


##### ***Checking why not every table has its own primary_key***

In [20]:
info_df[info_df["is_primary_key"]==True]

Unnamed: 0,table_name,entities,column_name,dtype,subtype,example,unique_value_count,is_not_null,is_unique,is_primary_key,is_foreign_key,is_id
0,olist_customers_dataset,customers,customer_id,object,CHAR(32),59c56e683b39c2ca0c21348d5e5eac99,99441.0,True,True,True,False,True
10,olist_orders_dataset,orders,order_id,object,CHAR(32),38eb04fead5a8c966e7e01b87830b50e,99441.0,True,True,True,False,True
11,olist_orders_dataset,orders,customer_id,object,CHAR(32),04ba9496f04b0eaa070def5b5ab662ac,99441.0,True,True,True,False,True
37,olist_products_dataset,products,product_id,object,CHAR(32),6fa8b8abbb489a0abba6ad7b79c7ecd9,32951.0,True,True,True,False,True
46,olist_sellers_dataset,sellers,seller_id,object,CHAR(32),891071be6ba827b591264c90c2ae8a63,3095.0,True,True,True,False,True


In [21]:
info_df[info_df["is_id"]==True]

Unnamed: 0,table_name,entities,column_name,dtype,subtype,example,unique_value_count,is_not_null,is_unique,is_primary_key,is_foreign_key,is_id
0,olist_customers_dataset,customers,customer_id,object,CHAR(32),59c56e683b39c2ca0c21348d5e5eac99,99441.0,True,True,True,False,True
1,olist_customers_dataset,customers,customer_unique_id,object,CHAR(32),e6eb191ddc04af07531ff36d2caca1ff,96096.0,True,False,False,True,True
10,olist_orders_dataset,orders,order_id,object,CHAR(32),38eb04fead5a8c966e7e01b87830b50e,99441.0,True,True,True,False,True
11,olist_orders_dataset,orders,customer_id,object,CHAR(32),04ba9496f04b0eaa070def5b5ab662ac,99441.0,True,True,True,False,True
18,olist_order_items_dataset,order_items,order_id,object,CHAR(32),6dd0712b5d9d31ab2b006f7b510e8720,98666.0,True,False,False,True,True
19,olist_order_items_dataset,order_items,order_item_id,int64,int64,1,21.0,True,False,False,True,True
20,olist_order_items_dataset,order_items,product_id,object,CHAR(32),19c91ef95d509ea33eda93495c4d3481,32951.0,True,False,False,True,True
21,olist_order_items_dataset,order_items,seller_id,object,CHAR(32),744dac408745240a2c2528fb1b6028f3,3095.0,True,False,False,True,True
25,olist_order_payments_dataset,order_payments,order_id,object,CHAR(32),51e827dc634ca855acba1d305c4649af,99440.0,True,False,False,True,True
30,olist_order_reviews_dataset,order_reviews,review_id,object,CHAR(32),2d5f45068d4eb8c87f3d9bfa87311a5c,98410.0,True,False,False,True,True
