In [22]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import difflib

In [23]:
# Folder containing the raw CSV files. This is a relative path, so it is
# resolved against the kernel's current working directory.
dataset_path = "../../data/raw/"

### ***Database table relationships (from Kaggle)***
<img src="https://i.imgur.com/HRhd2Y0.png" alt="Database table relationships" style="height: 500px; width:900px;"/>

In [24]:
# os.listdir() returns directory entries in arbitrary order, so sort them to
# get the same listing on every machine and on every run.
files = sorted(os.listdir(dataset_path))
print(f'The dataset contains {len(files)} files:')
for file in files:
    print(f'    * {file}')

The dataset contains 9 files:
    * olist_customers_dataset.csv
    * olist_geolocation_dataset.csv
    * olist_orders_dataset.csv
    * olist_order_items_dataset.csv
    * olist_order_payments_dataset.csv
    * olist_order_reviews_dataset.csv
    * olist_products_dataset.csv
    * olist_sellers_dataset.csv
    * product_category_name_translation.csv


In [25]:
def tables_info(df):
    """Summarise every column of ``df`` as one row of a new DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to profile.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (fresh 0..n-1 index) with the columns
        ``column_name``, ``dtype``, ``rows_count``, ``non_null_rows_count``,
        ``null_row_count`` and ``unique_value_count``. A missing value counts
        as one distinct value in ``unique_value_count``. All counts are
        integers. An empty ``df`` yields an empty frame that still carries
        these six columns.
    """
    summary_columns = [
        'column_name',
        'dtype',
        'rows_count',
        'non_null_rows_count',
        'null_row_count',
        'unique_value_count',
    ]
    total_rows = df.shape[0]

    # Build all rows first and create the frame once: filling cells one by one
    # through .loc creates NaN-padded columns and turns the counts into floats.
    records = [
        {
            'column_name': column,
            'dtype': df[column].dtype,
            'rows_count': total_rows,
            'non_null_rows_count': int(df[column].notnull().sum()),
            'null_row_count': int(df[column].isna().sum()),
            'unique_value_count': int(df[column].unique().shape[0]),
        }
        for column in df.columns
    ]
    return pd.DataFrame(records, columns=summary_columns)

In [26]:
def data_subtype(df, column):
    """Guess an SQL-flavoured subtype for ``df[column]`` and pick a sample value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains ``column``.
    column : hashable
        Label of the column to inspect.

    Returns
    -------
    list
        ``[subtype, example]``. For ``object`` columns ``subtype`` is the
        datetime dtype when every non-null value parses as a date/time;
        otherwise it is ``"CHAR(n)"`` when all values share the same length
        ``n``, ``"TEXT()"`` when the longest value exceeds 255 characters, and
        ``"VARCHAR(255)"`` in the remaining cases. For any other dtype the
        column dtype is returned unchanged. ``example`` is a randomly chosen
        non-null value, or ``None`` when the column has no non-null values
        (in which case ``subtype`` is simply the column dtype).
    """
    # Null entries carry no type information, so work on the non-null values only.
    values = df[column].dropna().reset_index(drop=True)
    if values.empty:
        return [df[column].dtype, None]

    if values.dtype == 'object':
        try:
            # Date/time check: succeeds only if every value can be parsed.
            subtype = pd.to_datetime(values).dtype
        except (ValueError, TypeError, OverflowError):
            # Not parseable as date/time -> classify as a string type by length.
            lengths = values.str.len()
            distinct_lengths = lengths.unique()
            if distinct_lengths.shape[0] == 1:
                subtype = f"CHAR({distinct_lengths[0]})"
            elif lengths.max() > 255:
                subtype = 'TEXT()'
            else:
                subtype = "VARCHAR(255)"
    else:
        # Numeric (and any other non-object) columns keep their pandas dtype.
        subtype = values.dtype

    # randrange excludes its upper bound; randint(0, len) could return len
    # itself, which is one past the last valid position.
    example = values.iloc[random.randrange(len(values))]
    return [subtype, example]

In [27]:
def tables_info2(df):
    """Profile every column of ``df`` with subtype and key heuristics.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to profile. Column labels are expected to be strings because
        the key heuristics look for the substring ``'_id'`` in them.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (fresh 0..n-1 index) with the columns
        ``column_name``, ``dtype``, ``subtype``, ``example``,
        ``unique_value_count``, ``is_not_null``, ``is_unique``,
        ``is_primary_key``, ``is_foreign_key`` and ``is_id``.

    Notes
    -----
    ``subtype`` and ``example`` come from ``data_subtype``. The key flags are
    name-based heuristics: a column is an id when its name contains ``'_id'``,
    a primary key when it is an id with no nulls and no repeated values, and a
    foreign key when it is an id that is not a primary key.
    """
    summary_columns = [
        'column_name',
        'dtype',
        'subtype',
        'example',
        'unique_value_count',
        'is_not_null',
        'is_unique',
        'is_primary_key',
        'is_foreign_key',
        'is_id',
    ]

    records = []
    for column in df.columns:
        series = df[column]
        subtype, example = data_subtype(df, column)

        # Plain Python bools combined with not/and: bitwise ~ on a Python bool
        # yields -2/-1 instead of a truth value, so it is avoided here.
        is_id = '_id' in column
        is_not_null = not series.isnull().any()
        is_unique = bool(series.is_unique)
        is_primary_key = is_id and is_not_null and is_unique
        is_foreign_key = is_id and not is_primary_key

        records.append({
            'column_name': column,
            'dtype': series.dtype,
            'subtype': subtype,
            'example': example,
            'unique_value_count': int(series.unique().shape[0]),
            'is_not_null': is_not_null,
            'is_unique': is_unique,
            'is_primary_key': is_primary_key,
            'is_foreign_key': is_foreign_key,
            'is_id': is_id,
        })

    # Create the frame in one go so counts stay integers and flags stay booleans.
    return pd.DataFrame(records, columns=summary_columns)

In [28]:
# Load the customers table; the cells below refer to this frame as `df`.
csv_file_name = 'olist_customers_dataset.csv'
csv_file_path = os.path.join(dataset_path, csv_file_name)
df = pd.read_csv(csv_file_path)

In [29]:
# Per-column overview of the customers table: dtype, row/null counts and
# number of distinct values.
info_df = tables_info(df)
info_df

Unnamed: 0,column_name,dtype,rows_count,non_null_rows_count,null_row_count,unique_value_count
0,customer_id,object,99441.0,99441.0,0.0,99441.0
1,customer_unique_id,object,99441.0,99441.0,0.0,96096.0
2,customer_zip_code_prefix,int64,99441.0,99441.0,0.0,14994.0
3,customer_city,object,99441.0,99441.0,0.0,4119.0
4,customer_state,object,99441.0,99441.0,0.0,27.0


In [30]:
# Extended overview: inferred subtype, a random example value and the
# id / primary-key / foreign-key heuristics.
# Note: this rebinds `info_df`, replacing the summary built in the previous cell.
info_df = tables_info2(df)
info_df

Unnamed: 0,column_name,dtype,subtype,example,unique_value_count,is_not_null,is_unique,is_primary_key,is_foreign_key,is_id
0,customer_id,object,CHAR(32),fe4b77a5964aa7c2e9980def32a67e95,99441.0,True,True,True,False,True
1,customer_unique_id,object,CHAR(32),a84b91f1f91aed1b256f58bf4007e2f9,96096.0,True,False,False,True,True
2,customer_zip_code_prefix,int64,int64,9061,14994.0,True,False,False,False,False
3,customer_city,object,VARCHAR(255),niteroi,4119.0,True,False,False,False,False
4,customer_state,object,CHAR(2),MG,27.0,True,False,False,False,False


## Comments:
This dataset has ***NO NULL VALUES!***

As shown in the figure above, table `olist_customers_dataset` is related to `olist_orders_dataset` (to which it provides `customer_id` as a FK) and to `olist_geolocation_dataset` (through the FK `geolocation_zip_code_prefix`).
1. `customer_id` in tables `olist_customers_dataset` and `olist_orders_dataset` is exactly the same: same number of elements and the same elements. This means that each row in `olist_customers_dataset` is actually a customer ***attached to a single order*** (regardless of the status of this order). **See below**
2. `customer_unique_id` (currently a FK in this table) is the real unique id of a customer (like a registered customer on the e-commerce platform). **See below**

~~___From points 1 and 2, we should consider modifying this table and (maybe) renaming it to `customers_unique` or `registered_customers`, where `customer_unique_id` is the PK. The remaining columns could be kept in the same format. This table would be related to `olist_orders_dataset` by `customer_unique_id`.
The latter would be more consistent with theory and good practice.___~~

**In the end, what was proposed above is not possible. The number of unique `customer_unique_id` values is not the same as the number of unique rows: sometimes the same `customer_unique_id` has different `customer_zip_code_prefix` values, which seem to be delivery zip codes. Table `olist_customers_dataset` is therefore kept in its original format with the same relationships.**


### ***Checking point 1 of the list***

In [31]:
# Load the orders table into `order_df`.
# Note: this rebinds `csv_file_name` and `csv_file_path` to the orders file.
csv_file_name = 'olist_orders_dataset.csv'
csv_file_path = os.path.join(dataset_path, csv_file_name)
order_df = pd.read_csv(csv_file_path)

In [32]:
# customer_id values referenced by the orders table, ordered by their string form.
list_1 = sorted(order_df['customer_id'].tolist(), key=str)
print(f'List 1 has {len(list_1)} elements')
list_1[:5]

List 1 has 99441 elements


['00012a2ce6f8dcda20d059ce98491703',
 '000161a058600d5901f007fab4c27140',
 '0001fd6190edaaf884bcaf3d49edf079',
 '0002414f95344307404f0ace7a26f1d5',
 '000379cdec625522490c315e70c7a9fb']

In [33]:
# customer_id values present in the customers table, ordered by their string form.
list_2 = sorted(df['customer_id'].tolist(), key=str)
print(f'List 2 has {len(list_2)} elements')
list_2[:5]

List 2 has 99441 elements


['00012a2ce6f8dcda20d059ce98491703',
 '000161a058600d5901f007fab4c27140',
 '0001fd6190edaaf884bcaf3d49edf079',
 '0002414f95344307404f0ace7a26f1d5',
 '000379cdec625522490c315e70c7a9fb']

In [34]:
# Check that `customer_id` in tables `olist_customers_dataset` and `olist_orders_dataset`
# is exactly the same: same number of elements and the same elements. If this is True, each row
# in `olist_customers_dataset` is a customer attached to a single order (regardless of its status).
#
# list.sort() sorts in place and returns None, so comparing the return values of two
# .sort() calls evaluates `None == None` and is always True. Compare sorted copies instead.
sorted(list_1, key=str) == sorted(list_2, key=str)

True

### ***Checking point 2 of the list***

In [35]:
# `customer_unique_id` (currently flagged as a FK in this table) is the real unique id of a customer
# (like registered customers in the e-commerce platform).
# Show the ids that appear on the largest number of rows.
df['customer_unique_id'].value_counts().head()

8d50f5eadf50201ccdcedfb9e2ac8455    17
3e43e6105506432c953e165fb2acf44c     9
1b6c7548a2a1f9037c1fd3ddfed95f33     7
ca77025e7201e3b30c44b472ff346268     7
6469f99c1f9dfae7733b25662e7f1782     7
Name: customer_unique_id, dtype: int64

In [36]:
# Author's note: this column works as a dictionary key to obtain the remaining 2 columns
# (customer_city and customer_state) from olist_geolocation_dataset.
# Adding up the per-value counts gives the number of non-null zip code prefixes.
zip_prefix_counts = df['customer_zip_code_prefix'].value_counts()
zip_prefix_counts.sum()

99441

In [37]:
# Leave `df` untouched: build a new frame without the `customer_id` column, then
# collapse rows that are identical in every remaining column, keeping the last occurrence.
df_to_clean = df.drop(columns='customer_id').drop_duplicates(keep='last')

In [38]:
# Compare the number of distinct rows with the number of distinct customer_unique_id values:
# a non-zero difference means some ids appear on more than one distinct row.
total_rows = df.shape[0]
distinct_rows = df_to_clean.shape[0]
distinct_ids = df_to_clean['customer_unique_id'].unique().shape[0]
print(f"Number of rows: {total_rows}")
print(f"Number of unique rows: {distinct_rows}")
print(f"Number of unique customer_unique_id: {distinct_ids}")
print(f"Number of NON-unique customer_unique_id in unique rows: {distinct_rows - distinct_ids}")

Number of rows: 99441
Number of unique rows: 96352
Number of unique customer_unique_id: 96096
Number of NON-unique customer_unique_id in unique rows: 256


In [39]:
# Occurrences of each customer_unique_id among the de-duplicated rows. A count above 1
# means the same id appears with different values in the remaining columns.
df_to_clean['customer_unique_id'].value_counts()

b9badb100ff8ecc16a403111209e3a06    3
3e43e6105506432c953e165fb2acf44c    3
d44ccec15f5f86d14d6a2cfa67da1975    3
9832ae2f7d3e5fa4c7a1a06e9551bc61    3
f34cd7fd85a1f8baff886edf09567be3    2
                                   ..
594fe19663e0da85c54704b990aa78dc    1
f4ae67524bc262b9fe7574d4007df896    1
f06a1ab06893b6c407e57dde5b62fd39    1
2b50264af857ad93e50606f1716ee8d3    1
84732c5050c01db9b23e19ba39899398    1
Name: customer_unique_id, Length: 96096, dtype: int64

In [40]:
# Every repeat occurrence (second and later) of a customer_unique_id in the de-duplicated frame.
repeat_mask = df_to_clean['customer_unique_id'].duplicated(keep='first')
id_duplicated = df_to_clean.loc[repeat_mask, 'customer_unique_id']
id_duplicated

8362     b26fa76ddf33e534491e4ec46f51bc64
11194    2b952792a20f6076f6e7a9c6a27ade02
15084    340d924858c395848c127b3e772b6bfd
15285    1291474366a550ebc251d9187e763d62
17254    2b174670dbec666bbc68b6e2a4062740
                       ...               
99227    13abc50b97af7425b5066e405d7cd760
99264    8c21dd8c37144807c601f99f2a209dfb
99297    bc7b9e0d078c0c01f622b38cfcd7ee9c
99353    0ceb502fc33a2ad327b08288c5310e2e
99406    5cbfdb85ec130898108b32c50d619c39
Name: customer_unique_id, Length: 256, dtype: object

In [41]:
# Inspect the rows whose customer_unique_id occurs more than once, grouped by id.
(
    df_to_clean
    .loc[df_to_clean['customer_unique_id'].isin(id_duplicated)]
    .sort_values('customer_unique_id')
    .head(25)
)

Unnamed: 0,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
87012,004b45ec5c64187465168251cd1c9c2f,57035,maceio,AL
72451,004b45ec5c64187465168251cd1c9c2f,57055,maceio,AL
61403,0058f300f57d7b93c477a131a59b36c3,40731,salvador,BA
36269,0058f300f57d7b93c477a131a59b36c3,41370,salvador,BA
54038,012452d40dafae4df401bced74cdb490,3984,sao paulo,SP
45307,012452d40dafae4df401bced74cdb490,3220,sao paulo,SP
57218,0178b244a5c281fb2ade54038dd4b161,14960,novo horizonte,SP
91727,0178b244a5c281fb2ade54038dd4b161,12518,guaratingueta,SP
64800,018b5a7502c30eb5f230f1b4eb23a156,74976,aparecida de goiania,GO
82312,018b5a7502c30eb5f230f1b4eb23a156,74936,aparecida de goiania,GO


To do next:
- Change the table format to the one agreed with the teammates.
- 5 columns: customer_id, customer_unique_id, customer_zip_prefix_id, customer_state_id, customer_city_state_id
