In [1]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import difflib

In [2]:
dataset_path = "../../data/raw/" 

### ***Database table relationships (from kaggle)***
<img src="https://i.imgur.com/HRhd2Y0.png" alt="Database table relationships" style="height: 500px; width:900px;"/>

In [3]:
files = os.listdir(dataset_path)
print(f'The dataset contains {len(files)} files:')
for file in files:
    print(f'    * {file}')

The dataset contains 9 files:
    * olist_customers_dataset.csv
    * olist_geolocation_dataset.csv
    * olist_orders_dataset.csv
    * olist_order_items_dataset.csv
    * olist_order_payments_dataset.csv
    * olist_order_reviews_dataset.csv
    * olist_products_dataset.csv
    * olist_sellers_dataset.csv
    * product_category_name_translation.csv


In [4]:
def tables_info(df):
    # csv_file_name = 'olist_customers_dataset.csv'
    # csv_file_path = os.path.join(dataset_path, csv_file_name)
    # df = pd.read_csv(csv_file_path)
    # print(df.head(5))
    # print(df.columns.shape[0])    
    info_df = pd.DataFrame(data=[], index=df.columns)            
    #info_df.loc[:, 'table_name'] = csv_file_name.split('.')[0]
    #info_df.loc[:, 'entities'] = info_df['table_name'].str.split('_dataset').str[0].str.split('olist_').str[-1]
    info_df.loc[:, 'column_name'] = df.columns
    for column in df.columns:           
        info_df.loc[column, 'dtype'] = df[column].dtypes   
        info_df.loc[column,'rows_count'] = df[column].shape[0]
        info_df.loc[column,'non_null_rows_count'] =df[column].notnull().sum()
        info_df.loc[column,'null_row_count'] =df[column].isna().sum()
        info_df.loc[column,'unique_value_count'] = df[column].unique().shape[0]

    info_df.reset_index(drop=True, inplace=True)        
    return info_df

In [5]:
def data_subtype(df, column):
    # column = df.columns[4]
    # print(column)
    df = df[df[column].notnull()].reset_index(drop=True) #Remove possible not null values
    # dtype
    # print(df[column].dtype)
    if df[column].dtype == 'object':    
        #Date/time or str?
        try: #Date/time: Up to now only datetime  
            x = pd.to_datetime(df[column], infer_datetime_format=True)                   
            data_subtype = x.dtype

        except: #if it fails -> str
            if df[column].str.len().unique().shape[0] == 1: #All the elements have the same length if == 1
                data_subtype = f"CHAR({df[column].str.len().unique()[0]})"
            elif df[column].map(lambda x: len(x)).max() > 255: #Variable length higher than 255
                data_subtype = 'TEXT()' 
            else:
                # data_type = f"VARCHAR({df[column].map(lambda x: len(x)).max()})"   
                data_subtype = f"VARCHAR(255)"       
    elif df[column].dtype == 'float64':
        # print(df[column].dtype)
        data_subtype = df[column].dtype
    else: #int64
        # print(df[column].dtype)
        data_subtype = df[column].dtype
    # print(data_subtype)
    example = df[column][random.randint(0,df[column].shape[0])]
    return [data_subtype, example]

In [6]:
def tables_info2(df):
    # csv_file_name = 'olist_customers_dataset.csv'
    # csv_file_path = os.path.join(dataset_path, csv_file_name)
    # df = pd.read_csv(csv_file_path)
    # print(df.head(5))
    # print(df.columns.shape[0])    
    info_df = pd.DataFrame(data=[], index=df.columns)            
    #info_df.loc[:, 'table_name'] = csv_file_name.split('.')[0]
    #info_df.loc[:, 'entities'] = info_df['table_name'].str.split('_dataset').str[0].str.split('olist_').str[-1]
    info_df.loc[:, 'column_name'] = df.columns
    for column in df.columns:           
        info_df.loc[column, 'dtype'] = df[column].dtypes
        data_subtype_example = data_subtype(df, column)
        info_df.loc[column, 'subtype'] = data_subtype_example[0]
        info_df.loc[column, 'example'] = data_subtype_example[1]
        info_df.loc[column,'unique_value_count'] = df[column].unique().shape[0]
        info_df.loc[column,'is_not_null'] =~df[column].isnull().values.any()
        info_df.loc[column, 'is_unique'] = df[column].is_unique        
        info_df.loc[column, 'is_primary_key'] = (column.find('_id') != -1) & info_df.loc[column,'is_not_null'] & info_df.loc[column, 'is_unique']
        info_df.loc[column, 'is_foreign_key'] = (column.find('_id') != -1) & ~info_df.loc[column, 'is_primary_key']
        info_df.loc[column,'is_id'] = (column.find('_id') != -1)       

    info_df.reset_index(drop=True, inplace=True)        
    return info_df

In [7]:
csv_file_name = 'olist_sellers_dataset.csv'
csv_file_path = os.path.join(dataset_path, csv_file_name)
df = pd.read_csv(csv_file_path)

In [8]:
info_df = tables_info(df)
info_df

Unnamed: 0,column_name,dtype,rows_count,non_null_rows_count,null_row_count,unique_value_count
0,seller_id,object,3095.0,3095.0,0.0,3095.0
1,seller_zip_code_prefix,int64,3095.0,3095.0,0.0,2246.0
2,seller_city,object,3095.0,3095.0,0.0,611.0
3,seller_state,object,3095.0,3095.0,0.0,23.0


In [9]:
info_df = tables_info2(df)
info_df

Unnamed: 0,column_name,dtype,subtype,example,unique_value_count,is_not_null,is_unique,is_primary_key,is_foreign_key,is_id
0,seller_id,object,CHAR(32),101a3a003516bc11253298b2fad3cb65,3095.0,True,True,True,False,True
1,seller_zip_code_prefix,int64,int64,86801,2246.0,True,False,False,False,False
2,seller_city,object,VARCHAR(255),curitiba,611.0,True,False,False,False,False
3,seller_state,object,CHAR(2),SP,23.0,True,False,False,False,False


## Comments:
This dataset has ***NO NULL VALUES!***

As shown in the figure above, table `olist_sellers_dataset` is related to `olist_order_items_dataset` (by providing `seller_id` a FK).
1. `seller_id` in tables`olist_sellers_dataset` and `olist_order_items_dataset` are exactly the same: same number of elements and elements themselves. This means All sellers have sold at list one item. **See below**

### ***Checking point list 1***

In [10]:
csv_file_name = 'olist_order_items_dataset.csv'
csv_file_path = os.path.join(dataset_path, csv_file_name)
order_df = pd.read_csv(csv_file_path)

In [11]:
list_1 = order_df['seller_id'].unique().tolist()
list_1.sort(key = str)
print(f'List 1 has {len(list_1)} elements')
list_1[0:5]

List 1 has 3095 elements


['0015a82c2db000af6aaaf3ae2ecb0532',
 '001cca7ae9ae17fb1caed9dfb1094831',
 '001e6ad469a905060d959994f1b41e4f',
 '002100f778ceb8431b7a1020ff7ab48f',
 '003554e2dce176b5555353e4f3555ac8']

In [12]:
list_2 = df['seller_id'].tolist()
list_2.sort(key = str)
print(f'List 2 has {len(list_2)} elements')
list_2[0:5]

List 2 has 3095 elements


['0015a82c2db000af6aaaf3ae2ecb0532',
 '001cca7ae9ae17fb1caed9dfb1094831',
 '001e6ad469a905060d959994f1b41e4f',
 '002100f778ceb8431b7a1020ff7ab48f',
 '003554e2dce176b5555353e4f3555ac8']

In [13]:
# customer_id` in tables`olist_customers_dataset` and `olist_orders_dataset` are exactly the same: 
# same number of elements and elements themselves. This means that each row in `olist_customers_dataset` 
# actually is a customer attached to a single order (regardless the status of this order). 
list_1.sort(key = str) == list_2.sort(key = str)

True

To do next:
- Change table format to the one agreed with the team mates.
- 4 columns: seller_id, seller_zip_prefix_id, seller_state_id, seller_city_state_id
