# Importing Libraries

In [17]:
import os
from io import StringIO
from multiprocessing import Pool

import numpy as np
import pandas as pd
import psycopg2
from geopy.geocoders import Nominatim
from psycopg2 import OperationalError

# Functions

In [18]:
# Dataframe Overview
def dataframe_overview(dataframe):
    """Print a short summary of ``dataframe`` and return its first five rows.

    The printed summary contains the number of rows and columns, the total
    count of null cells and the dtype of every column.
    """
    n_rows, n_cols = dataframe.shape[0], dataframe.shape[1]
    total_nulls = dataframe.isnull().sum().sum()

    print(f"Number of Lines {n_rows}, Number of Columns {n_cols}.")
    print(f"NaN Values in Dataframe {total_nulls}.\n")
    print("Data types of the Dataframe")
    print(f"{dataframe.dtypes}\n")
    print("First 5 Lines of Data")
    return dataframe.head(5)

# Helpful while identifying the Dimensions and Facts.
# Make sure the column types are correct before drawing conclusions.
def categorical_overview(dataframe):
    """Return ``describe()`` restricted to the object-dtype columns of ``dataframe``."""
    object_columns = dataframe.select_dtypes('object').columns
    categorical_view = dataframe[list(object_columns)]
    return categorical_view.describe()


# Check the main characteristics of all the listed datasets before transferring them to the Database.
# Dataframe Name, Column, Null Values, First Value, D.Type and Max number of characters
# (useful when sizing the columns of the SQL tables).
def check_dataframes(dfs, list_dfs):
    """Summarise every column of every dataframe in ``dfs``.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Dataframes to inspect.
    list_dfs : sequence of str
        Display name of each dataframe, in the same order as ``dfs``.

    Returns
    -------
    pandas.DataFrame
        One row per (dataframe, column) with the columns 'DataFrame',
        'Column', 'Null Values', 'First Value', 'D.Type' and 'Max Length'.
        'Null Values' and 'Max Length' are integers.
    """
    summary_columns = ['DataFrame', 'Column', 'Null Values',
                       'First Value', 'D.Type', 'Max Length']
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = []
    for n, df in enumerate(dfs):
        for column in df.columns:
            series = df[column]
            records.append({
                'DataFrame': list_dfs[n],
                'Column': column,
                'Null Values': int(series.isnull().sum()),
                # Positional access: works for any index and for empty frames.
                'First Value': series.iloc[0] if len(series) else None,
                'D.Type': series.dtype,
                # Longest textual representation among the values.
                'Max Length': max((len(str(value)) for value in series), default=0),
            })
    return pd.DataFrame(records, columns=summary_columns)
    
# Create connection to the Database, necessary to execute queries
def create_connection(db_name, db_user, db_password, db_host, db_port):
    """Open a psycopg2 connection and return it.

    A success message is printed when the connection is established. When
    psycopg2 raises ``OperationalError`` the error is printed and ``None``
    is returned instead.
    """
    connect_kwargs = {
        "database": db_name,
        "user": db_user,
        "password": db_password,
        "host": db_host,
        "port": db_port,
    }
    try:
        connection = psycopg2.connect(**connect_kwargs)
    except OperationalError as e:
        print(f"The error '{e}' occurred")
        return None
    print("Connection to PostgreSQL DB successful")
    return connection

# Execute the Query in the Database
def execute_query(connection, query):
    """Run ``query`` on ``connection`` with autocommit enabled.

    Prints a success message when the statement runs. An
    ``OperationalError`` is printed instead of raised; any other exception
    propagates to the caller. The cursor is always closed before returning.
    """
    connection.autocommit = True
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        print("Query executed successfully")
    except OperationalError as e:
        print(f"The error '{e}' occurred")
    finally:
        # Release the cursor on every path (it was previously left open).
        cursor.close()
        
# Function to upload the Data on the Database.
def copy_from_stringio(conn, df, table):
    """
    Bulk-load ``df`` into ``table`` through an in-memory CSV buffer.

    The frame is written without index or header and streamed with
    ``cursor.copy_from`` using "," as the column separator. On success the
    transaction is committed and None is returned; on any error the message
    is printed, the transaction is rolled back and 1 is returned.
    """
    # save dataframe to an in memory buffer
    csv_buffer = StringIO()
    df.to_csv(csv_buffer, index=False, header=False)
    csv_buffer.seek(0)

    cur = conn.cursor()
    try:
        cur.copy_from(csv_buffer, table, sep=",")
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cur.close()
        return 1
    else:
        print("copy_from_stringio() done")
        cur.close()
        return None

# Extraction of Data

In [19]:
# Folder that holds the Olist CSV files. It can be overridden with the
# OLIST_DATA_DIR environment variable; the default is the original location.
DATA_DIR = os.environ.get(
    'OLIST_DATA_DIR',
    r'C:\Users\elwes\OneDrive\Documentos\Projetos\Olist\Datasets',
)

# Definition of Paths to the files
path_customers      = os.path.join(DATA_DIR, 'olist_customers_dataset.csv')
path_geolocation    = os.path.join(DATA_DIR, 'olist_geolocation_dataset.csv')
path_order_items    = os.path.join(DATA_DIR, 'olist_order_items_dataset.csv')
path_order_payments = os.path.join(DATA_DIR, 'olist_order_payments_dataset.csv')
path_order_reviews  = os.path.join(DATA_DIR, 'olist_order_reviews_dataset.csv')
path_orders         = os.path.join(DATA_DIR, 'olist_orders_dataset.csv')
path_products       = os.path.join(DATA_DIR, 'olist_products_dataset.csv')
path_sellers        = os.path.join(DATA_DIR, 'olist_sellers_dataset.csv')
path_category       = os.path.join(DATA_DIR, 'product_category_name_translation.csv')

# Loading in Dataframes
df_customers      = pd.read_csv(path_customers)
df_geolocation    = pd.read_csv(path_geolocation)
df_order_items    = pd.read_csv(path_order_items)
df_order_payments = pd.read_csv(path_order_payments)
df_order_reviews  = pd.read_csv(path_order_reviews)
df_orders         = pd.read_csv(path_orders)
df_products       = pd.read_csv(path_products)
df_sellers        = pd.read_csv(path_sellers)
df_category       = pd.read_csv(path_category)

# Initial Overview

In [20]:
# Map each display name to its dataframe, then derive the two parallel
# lists that check_dataframes expects.
initial_frames_by_name = {
    'df_customers': df_customers,
    'df_geolocation': df_geolocation,
    'df_order_items': df_order_items,
    'df_order_payments': df_order_payments,
    'df_order_reviews': df_order_reviews,
    'df_orders': df_orders,
    'df_products': df_products,
    'df_sellers': df_sellers,
    'df_category': df_category,
}

df_list = list(initial_frames_by_name.values())
df_list_str = list(initial_frames_by_name.keys())

check_dataframes(df_list, df_list_str)

Unnamed: 0,DataFrame,Column,Null Values,First Value,D.Type,Max Length
0,df_customers,customer_id,0.0,06b8999e2fba1a1fbc88172c00ba8bc7,object,32.0
1,df_customers,customer_unique_id,0.0,861eff4711a542e4b93843c6dd7febb0,object,32.0
2,df_customers,customer_zip_code_prefix,0.0,14409,int64,5.0
3,df_customers,customer_city,0.0,franca,object,32.0
4,df_customers,customer_state,0.0,SP,object,2.0
5,df_geolocation,geolocation_zip_code_prefix,0.0,1037,int64,5.0
6,df_geolocation,geolocation_lat,0.0,-23.5456,float64,22.0
7,df_geolocation,geolocation_lng,0.0,-46.6393,float64,19.0
8,df_geolocation,geolocation_city,0.0,sao paulo,object,38.0
9,df_geolocation,geolocation_state,0.0,SP,object,2.0


# Data Transformation
Check each dataset for inconsistencies and perform the necessary adjustments.

## Customers

In [21]:
# Print shape, null count and dtypes of the customers data and show its first five rows.
dataframe_overview(df_customers)

Number of Lines 99441, Number of Columns 5.
NaN Values in Dataframe 0.

Data types of the Dataframe
customer_id                 object
customer_unique_id          object
customer_zip_code_prefix     int64
customer_city               object
customer_state              object
dtype: object

First 5 Lines of Data


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [22]:
# describe() of the object-dtype columns of the customers data.
categorical_overview(df_customers)

Unnamed: 0,customer_id,customer_unique_id,customer_city,customer_state
count,99441,99441,99441,99441
unique,99441,96096,4119,27
top,f5fb607023e0fc2c3b8443697e399403,8d50f5eadf50201ccdcedfb9e2ac8455,sao paulo,SP
freq,1,17,15540,41746


In [23]:
# Adjust data types: the zip code prefix is an identifier, so it is stored as text.
# NOTE(review): the preview shows prefixes such as 9790 and 1151 while the longest
# value has 5 characters; astype(str) keeps them as 4-character strings - confirm
# whether zero-padding to 5 characters is wanted.
df_customers['customer_zip_code_prefix'] = df_customers['customer_zip_code_prefix'].astype(str)

## Geolocation

In [24]:
# Print shape, null count and dtypes of the geolocation data and show its first five rows.
dataframe_overview(df_geolocation)

Number of Lines 1000163, Number of Columns 5.
NaN Values in Dataframe 0.

Data types of the Dataframe
geolocation_zip_code_prefix      int64
geolocation_lat                float64
geolocation_lng                float64
geolocation_city                object
geolocation_state               object
dtype: object

First 5 Lines of Data


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP


In [25]:
# describe() of the object-dtype columns of the geolocation data.
categorical_overview(df_geolocation)

Unnamed: 0,geolocation_city,geolocation_state
count,1000163,1000163
unique,8011,27
top,sao paulo,SP
freq,135800,404268


In [26]:
# Adjust data types: zip code prefix and city are stored as text.
for geo_text_column in ('geolocation_zip_code_prefix', 'geolocation_city'):
    df_geolocation[geo_text_column] = df_geolocation[geo_text_column].astype(str)

# Replace "," with "-": the data is later uploaded with "," as the column
# separator, so a comma inside a value could be a problem.
for geo_text_column in ('geolocation_city', 'geolocation_state'):
    df_geolocation[geo_text_column] = df_geolocation[geo_text_column].apply(
        lambda value: value.replace(',', '-')
    )

#### Geolocation - Dimension Table
In df_geolocation, a single zip code prefix can have more than one (Lat, Lng) coordinate pair.<br/>
To fix this, the dimension table below keeps a single mean (Lat, Lng) coordinate for each zip code prefix.

In [27]:
# Mean latitude and longitude per zip code prefix.
zip_prefix_groups = df_geolocation.groupby('geolocation_zip_code_prefix')
df_lat = zip_prefix_groups['geolocation_lat'].mean()
df_lng = zip_prefix_groups['geolocation_lng'].mean()

# Join the two aggregates on the zip code prefix into one dataframe and
# turn the prefix back into a regular column.
df_d_geolocation = (
    df_lat.to_frame()
    .merge(df_lng.to_frame(), on='geolocation_zip_code_prefix')
    .reset_index()
)

In [28]:
# Create the columns that are filled in later, with 'Empty' as placeholder.
for placeholder_column in ('city', 'state', 'region', 'country'):
    df_d_geolocation[placeholder_column] = 'Empty'

In [29]:
# With the mean coordinates we can look up City, State, Region and Country
# in a standardized way. Other address fields are available in the response,
# but only these four are used here.
ADDRESS_FIELDS = ('city', 'state', 'region', 'country')

# Only the first 20 rows are geocoded (fewer if the table is shorter).
# Per the original notes each lookup takes around 0.4 s and the table has
# around 20K rows, so a full run takes a long time.
for i in range(min(20, len(df_d_geolocation))):
    # The geolocator is created inside the loop to reconnect on every
    # iteration; per the original notes a shared connection failed after a
    # couple of hours.
    geolocator = Nominatim(user_agent="Adress")

    lat = df_d_geolocation.loc[i, 'geolocation_lat']
    lng = df_d_geolocation.loc[i, 'geolocation_lng']

    location = geolocator.reverse(str(lat) + "," + str(lng))
    if location is None:
        # No result for this coordinate: keep the 'Empty' placeholders.
        continue

    address = location.raw.get('address', {})
    # Each field is filled independently, so a response without 'city'
    # still provides state, region and country when they are present.
    for field in ADDRESS_FIELDS:
        if field in address:
            df_d_geolocation.loc[i, field] = address[field]

# Order Items

In [30]:
# Print shape, null count and dtypes of the order items data and show its first five rows.
dataframe_overview(df_order_items)

Number of Lines 112650, Number of Columns 7.
NaN Values in Dataframe 0.

Data types of the Dataframe
order_id                object
order_item_id            int64
product_id              object
seller_id               object
shipping_limit_date     object
price                  float64
freight_value          float64
dtype: object

First 5 Lines of Data


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14


In [31]:
# describe() of the object-dtype columns of the order items data.
categorical_overview(df_order_items)

Unnamed: 0,order_id,product_id,seller_id,shipping_limit_date
count,112650,112650,112650,112650
unique,98666,32951,3095,93318
top,8272b63d03f5f79c56e9e4120aec44ef,aca2eb7d00ea1a7b8ebd4e68314663af,6560211a19b47992c3666cc44a7e94c0,2018-03-01 02:50:48
freq,21,527,2033,21


In [32]:
# Adjust data types: the item sequence number is stored as an object column
# instead of int64.
df_order_items['order_item_id'] = df_order_items['order_item_id'].astype(object)
# Parse the shipping limit (read from the CSV as text) into datetime values.
df_order_items['shipping_limit_date'] = pd.to_datetime(df_order_items['shipping_limit_date'])

# Order Payments

In [33]:
# Print shape, null count and dtypes of the order payments data and show its first five rows.
dataframe_overview(df_order_payments)

Number of Lines 103886, Number of Columns 5.
NaN Values in Dataframe 0.

Data types of the Dataframe
order_id                 object
payment_sequential        int64
payment_type             object
payment_installments      int64
payment_value           float64
dtype: object

First 5 Lines of Data


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45


In [34]:
# describe() of the object-dtype columns of the order payments data.
categorical_overview(df_order_payments)

Unnamed: 0,order_id,payment_type
count,103886,103886
unique,99440,5
top,fa65dad1b0e818e3ccc5cb0e39231352,credit_card
freq,29,76795


# Order Reviews

In [35]:
# Print shape, null count and dtypes of the order reviews data and show its first five rows.
dataframe_overview(df_order_reviews)

Number of Lines 100000, Number of Columns 7.
NaN Values in Dataframe 146532.

Data types of the Dataframe
review_id                  object
order_id                   object
review_score                int64
review_comment_title       object
review_comment_message     object
review_creation_date       object
review_answer_timestamp    object
dtype: object

First 5 Lines of Data


Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [36]:
# describe() of the object-dtype columns of the order reviews data.
categorical_overview(df_order_reviews)

Unnamed: 0,review_id,order_id,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
count,100000,100000,11715,41753,100000,100000
unique,99173,99441,4600,36921,637,99010
top,3415c9f764e478409e8e0660ae816dd2,03c939fd7fd3b38f8485a0f95798f1f6,Recomendo,Muito bom,2017-12-19 00:00:00,2017-06-15 23:21:05
freq,3,3,426,230,466,4


In [37]:
# 'review_comment_title' and 'review_comment_message' are not the object of
# this study, so they are dropped because of their NaN values.
df_order_reviews.drop(columns=['review_comment_title', 'review_comment_message'], inplace=True)

# Parse the two review date columns into datetime values.
for review_date_column in ('review_creation_date', 'review_answer_timestamp'):
    df_order_reviews[review_date_column] = pd.to_datetime(df_order_reviews[review_date_column])

# Orders

In [38]:
# Print shape, null count and dtypes of the orders data and show its first five rows.
dataframe_overview(df_orders)

Number of Lines 99441, Number of Columns 8.
NaN Values in Dataframe 4908.

Data types of the Dataframe
order_id                         object
customer_id                      object
order_status                     object
order_purchase_timestamp         object
order_approved_at                object
order_delivered_carrier_date     object
order_delivered_customer_date    object
order_estimated_delivery_date    object
dtype: object

First 5 Lines of Data


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00


In [39]:
# describe() of the object-dtype columns of the orders data.
categorical_overview(df_orders)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
count,99441,99441,99441,99441,99281,97658,96476,99441
unique,99441,99441,8,98875,90733,81018,95664,459
top,9fce087acf59b40f00b151c3720178cb,6a5c19d132ee5eddb9406fc88a0052c7,delivered,2018-04-11 10:48:14,2018-02-27 04:31:10,2018-05-09 15:48:00,2016-10-27 17:32:07,2017-12-20 00:00:00
freq,1,1,96478,3,9,47,3,522


In [40]:
# Number of null values in each column of the orders data.
df_orders.isnull().sum()

order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

In [41]:
# Convert columns to Datetime type.
for order_date_column in ('order_purchase_timestamp',
                          'order_approved_at',
                          'order_delivered_carrier_date',
                          'order_delivered_customer_date',
                          'order_estimated_delivery_date'):
    df_orders[order_date_column] = pd.to_datetime(df_orders[order_date_column])


def _fill_missing_with_mean_lag(frame, start_col, end_col):
    """Fill nulls in ``end_col`` with ``start_col`` plus the mean observed lag.

    The lag is ``end_col - start_col`` averaged over the rows where it is
    known. ``frame`` is modified in place and the mean lag is returned.
    """
    mean_lag = (frame[end_col] - frame[start_col]).mean()
    frame[end_col] = frame[end_col].fillna(frame[start_col] + mean_lag)
    return mean_lag


# The three steps run in order on purpose: each one uses the column that
# the previous step has just completed as its starting point.

# Fix the null values in 'order_approved_at':
# purchase timestamp + mean time between purchase and approval.
avg_time_approval = _fill_missing_with_mean_lag(
    df_orders, 'order_purchase_timestamp', 'order_approved_at')

# Fix the null values in 'order_delivered_carrier_date':
# approval time + mean time between approval and hand-over to the carrier.
avg_time_carrier = _fill_missing_with_mean_lag(
    df_orders, 'order_approved_at', 'order_delivered_carrier_date')

# Fix the null values in 'order_delivered_customer_date':
# carrier date + mean time between carrier hand-over and customer delivery.
# (The mean is measured from the carrier date, so it is added to the carrier
# date; it was previously added to 'order_approved_at'.)
avg_time_customer = _fill_missing_with_mean_lag(
    df_orders, 'order_delivered_carrier_date', 'order_delivered_customer_date')

# Products

In [42]:
# Print shape, null count and dtypes of the products data and show its first five rows.
dataframe_overview(df_products)

Number of Lines 32951, Number of Columns 9.
NaN Values in Dataframe 2448.

Data types of the Dataframe
product_id                     object
product_category_name          object
product_name_lenght           float64
product_description_lenght    float64
product_photos_qty            float64
product_weight_g              float64
product_length_cm             float64
product_height_cm             float64
product_width_cm              float64
dtype: object

First 5 Lines of Data


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0


In [43]:
# describe() of the object-dtype columns of the products data.
categorical_overview(df_products)

Unnamed: 0,product_id,product_category_name
count,32951,32341
unique,32951,73
top,6df23f718baba6b30c0f3a6a516ae0bf,cama_mesa_banho
freq,1,3029


In [44]:
# Number of null values in each column of the products data.
df_products.isnull().sum()

product_id                      0
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
dtype: int64

In [45]:
# Replace the Null Values in ['product_category_name'] with "indisponivel".
# The result is assigned back to the column: calling an inplace method on
# df[col] is not guaranteed to update the dataframe (it does not under
# pandas Copy-on-Write).
df_products['product_category_name'] = df_products['product_category_name'].fillna('indisponivel')

# Replace the Null Values of the numeric columns with the column mean
# (mean() skips NaN by default).
for numeric_column in ('product_name_lenght',
                       'product_description_lenght',
                       'product_photos_qty',
                       'product_weight_g',
                       'product_length_cm',
                       'product_height_cm',
                       'product_width_cm'):
    df_products[numeric_column] = df_products[numeric_column].fillna(
        df_products[numeric_column].mean()
    )

# Adjustments of Datatypes accordingly: these three columns are counts, so
# they are stored as integers (an imputed mean is truncated by astype).
for count_column in ('product_name_lenght',
                     'product_description_lenght',
                     'product_photos_qty'):
    df_products[count_column] = df_products[count_column].astype(np.int64)

# Sellers

In [46]:
# Print shape, null count and dtypes of the sellers data and show its first five rows.
dataframe_overview(df_sellers)

Number of Lines 3095, Number of Columns 4.
NaN Values in Dataframe 0.

Data types of the Dataframe
seller_id                 object
seller_zip_code_prefix     int64
seller_city               object
seller_state              object
dtype: object

First 5 Lines of Data


Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP


In [47]:
# describe() of the object-dtype columns of the sellers data.
categorical_overview(df_sellers)

Unnamed: 0,seller_id,seller_city,seller_state
count,3095,3095,3095
unique,3095,611,23
top,5670f4db5b62c43d542e1b2d56b0cf7c,sao paulo,SP
freq,1,694,1849


In [48]:
# Store the zip code prefix as an object column instead of int64.
df_sellers['seller_zip_code_prefix'] = df_sellers['seller_zip_code_prefix'].astype(object)

# Replace "," with "-": the data is later uploaded with "," as the column
# separator, so a comma inside a value could be a problem.
for seller_text_column in ('seller_city', 'seller_state'):
    df_sellers[seller_text_column] = df_sellers[seller_text_column].apply(
        lambda value: value.replace(',', '-')
    )

# Category

In [49]:
# Print shape, null count and dtypes of the category translation data and show its first five rows.
dataframe_overview(df_category)

Number of Lines 71, Number of Columns 2.
NaN Values in Dataframe 0.

Data types of the Dataframe
product_category_name            object
product_category_name_english    object
dtype: object

First 5 Lines of Data


Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto
3,cama_mesa_banho,bed_bath_table
4,moveis_decoracao,furniture_decor


In [50]:
# describe() of the object-dtype columns of the category translation data.
categorical_overview(df_category)

Unnamed: 0,product_category_name,product_category_name_english
count,71,71
unique,71,71
top,moveis_sala,costruction_tools_tools
freq,1,1


# Final Overview

In [51]:
# Map each display name to its (now transformed) dataframe, including the
# new geolocation dimension table, then derive the two parallel lists that
# check_dataframes expects.
final_frames_by_name = {
    'df_customers': df_customers,
    'df_geolocation': df_geolocation,
    'df_order_items': df_order_items,
    'df_order_payments': df_order_payments,
    'df_order_reviews': df_order_reviews,
    'df_orders': df_orders,
    'df_products': df_products,
    'df_sellers': df_sellers,
    'df_category': df_category,
    'df_d_geolocation': df_d_geolocation,
}

df_list = list(final_frames_by_name.values())
df_list_str = list(final_frames_by_name.keys())

df_upload_review = check_dataframes(df_list, df_list_str)

df_upload_review

Unnamed: 0,DataFrame,Column,Null Values,First Value,D.Type,Max Length
0,df_customers,customer_id,0.0,06b8999e2fba1a1fbc88172c00ba8bc7,object,32.0
1,df_customers,customer_unique_id,0.0,861eff4711a542e4b93843c6dd7febb0,object,32.0
2,df_customers,customer_zip_code_prefix,0.0,14409,object,5.0
3,df_customers,customer_city,0.0,franca,object,32.0
4,df_customers,customer_state,0.0,SP,object,2.0
5,df_geolocation,geolocation_zip_code_prefix,0.0,1037,object,5.0
6,df_geolocation,geolocation_lat,0.0,-23.5456,float64,22.0
7,df_geolocation,geolocation_lng,0.0,-46.6393,float64,19.0
8,df_geolocation,geolocation_city,0.0,sao paulo,object,38.0
9,df_geolocation,geolocation_state,0.0,SP,object,2.0


## / ! \ ! / ! \ / ! \ ! / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \

#  Load  

- PostgreSQL in AWS:<br/>
    Endpoint: main.czqwwerei65b.us-east-2.rds.amazonaws.com<br/>
    Port: 5432<br/>
    License model: Postgresql License<br/>
    Engine version: 12.6

## / ! \ ! / ! \ / ! \ ! / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \ / ! \

## Create a Database

In [974]:
# Create Connection to the database.
# An Amazon Web Services PostgreSQL instance is used here.
# User, password, host and port are read from the environment when the
# OLIST_DB_* variables are set, so real credentials do not have to be
# written in the notebook; the defaults are the original values.
connection = create_connection(
    'postdb',
    os.environ.get('OLIST_DB_USER', 'elweshonorato'),
    os.environ.get('OLIST_DB_PASSWORD', 'XXXXXX'),
    os.environ.get('OLIST_DB_HOST', 'main.czqwwerei65b.us-east-2.rds.amazonaws.com'),
    os.environ.get('OLIST_DB_PORT', '5432'),
)


# Create Query text in SQL: create the project database.
create_database_query = "CREATE DATABASE olist"
# With the connection, pass the query to run the SQL in the Database.
# create_connection returns None (after printing the error) when the
# connection fails, so the query is only sent when a connection exists.
if connection is not None:
    execute_query(connection, create_database_query)

Connection to PostgreSQL DB successful


## Create Tables
- Create tables to upload the Dataframes to the Database

In [1003]:
# Create Connection to the 'olist' database.
# User, password, host and port are read from the environment when the
# OLIST_DB_* variables are set, so real credentials do not have to be
# written in the notebook; the defaults are the original values.
connection = create_connection(
    'olist',
    os.environ.get('OLIST_DB_USER', 'elweshonorato'),
    os.environ.get('OLIST_DB_PASSWORD', 'XXXXXX'),
    os.environ.get('OLIST_DB_HOST', 'main.czqwwerei65b.us-east-2.rds.amazonaws.com'),
    os.environ.get('OLIST_DB_PORT', '5432'),
)

# Create Query text in SQL
create_customers_table = """
    CREATE TABLE IF NOT EXISTS customers (
        customer_id              VARCHAR(32),
        customer_unique_id       VARCHAR(32),
        customer_zip_code_prefix VARCHAR(5),
        customer_city            VARCHAR(50),
        customer_state           CHAR(2),
        PRIMARY KEY (customer_id)
    )
"""
# With the connection, pass the query to proceed with the SQL in the Database.
execute_query(connection, create_customers_table)

# Create Query text in SQL
create_geolocation_table = """
    CREATE TABLE IF NOT EXISTS geolocation (
        geolocation_zip_code_prefix   VARCHAR(5),
        geolocation_lat               FLOAT(25),
        geolocation_lng               FLOAT(25),
        geolocation_city              VARCHAR(50),
        geolocation_state             CHAR(2)
    )
"""
# With the connection, pass the query to proceed with the SQL in the Database.
execute_query(connection, create_geolocation_table)

# Create Query text in SQL
create_order_items_table = """
    CREATE TABLE IF NOT EXISTS order_items (
        order_id             VARCHAR(32),
        order_item_id        VARCHAR(2),
        product_id           VARCHAR(32),
        seller_id            VARCHAR(32),
        shipping_limit_date  TIMESTAMP,
        price                DECIMAL(10),
        freight_value        DECIMAL(10)
    )
"""
# With the connection, pass the query to proceed with the SQL in the Database.
execute_query(connection, create_order_items_table)

# DDL for the `order_payments` table. No primary key is declared.
# `payment_value` needs an explicit scale: in PostgreSQL DECIMAL(10) means
# scale 0, so fractional amounts would be rounded to whole numbers when stored.
# DECIMAL(10, 2) keeps two decimal places. The two DECIMAL(2) columns are left
# as they are.
# NOTE(review): `payment_sequential` and `payment_installments` presumably hold
# whole-number counts, where scale 0 is harmless — confirm against the source data.
# NOTE: IF NOT EXISTS does not change a table that was already created with the
# old definition; drop or ALTER an existing `order_payments` table for the new
# column type to take effect.
create_order_payments_table = """
    CREATE TABLE IF NOT EXISTS order_payments (
        order_id              VARCHAR(32),
        payment_sequential    DECIMAL(2),
        payment_type          VARCHAR(15),
        payment_installments  DECIMAL(2),
        payment_value         DECIMAL(10, 2)
    )
"""
# Send the statement to the database through the open connection
# (`execute_query` and `connection` are defined outside this block).
execute_query(connection, create_order_payments_table)

# DDL for the `order_reviews` table. No primary key is declared.
# NOTE(review): `review_score` is DECIMAL(2), i.e. scale 0 — fine as long as the
# scores are whole numbers; confirm against the source data.
create_order_reviews_table = """
    CREATE TABLE IF NOT EXISTS order_reviews (
        review_id               VARCHAR(32),
        order_id                VARCHAR(32),
        review_score            DECIMAL(2),
        review_creation_date    TIMESTAMP,
        review_answer_timestamp TIMESTAMP
    )
"""
# Send the statement to the database through the open connection
# (`execute_query` and `connection` are defined outside this block).
execute_query(connection, create_order_reviews_table)

# DDL for the `orders` table; `order_id` is declared as the primary key.
# All five date/time columns are plain TIMESTAMP (no time zone).
create_orders_table = """
    CREATE TABLE IF NOT EXISTS orders (
        order_id                      VARCHAR(32),
        customer_id                   VARCHAR(32),
        order_status                  VARCHAR(15),
        order_purchase_timestamp      TIMESTAMP,
        order_approved_at             TIMESTAMP,
        order_delivered_carrier_date  TIMESTAMP,
        order_delivered_customer_date TIMESTAMP,
        order_estimated_delivery_date TIMESTAMP,
        PRIMARY KEY (order_id)
    )
"""
# Send the statement to the database through the open connection
# (`execute_query` and `connection` are defined outside this block).
execute_query(connection, create_orders_table)

# DDL for the `products` table; `product_id` is declared as the primary key.
# NOTE(review): the `lenght` spelling in two column names presumably mirrors the
# headers of the source data — verify before renaming.
# NOTE(review): DECIMAL(20) has scale 0 in PostgreSQL, so fractional weights or
# dimensions would be rounded to whole numbers — confirm the source values are integral.
create_products_table = """
    CREATE TABLE IF NOT EXISTS products (
        product_id                 VARCHAR(32),
        product_category_name      VARCHAR(50),
        product_name_lenght        INT,
        product_description_lenght INT,
        product_photos_qty         INT,
        product_weight_g           DECIMAL(20),
        product_length_cm          DECIMAL(20),
        product_height_cm          DECIMAL(20),
        product_width_cm           DECIMAL(20),
        PRIMARY KEY (product_id)
    )
"""
# Send the statement to the database through the open connection
# (`execute_query` and `connection` are defined outside this block).
execute_query(connection, create_products_table)

# DDL for the `sellers` table; `seller_id` is declared as the primary key.
# `seller_zip_code_prefix` has the same type as `geolocation_zip_code_prefix`
# in the geolocation tables (VARCHAR(5)).
create_sellers_table = """
    CREATE TABLE IF NOT EXISTS sellers (
        seller_id              VARCHAR(32),
        seller_zip_code_prefix VARCHAR(5),
        seller_city            VARCHAR(50),
        seller_state           CHAR(2),
        PRIMARY KEY (seller_id)
    )
"""
# Send the statement to the database through the open connection
# (`execute_query` and `connection` are defined outside this block).
execute_query(connection, create_sellers_table)

# DDL for the `category` table: two text columns, no primary key declared.
# NOTE(review): presumably a lookup from the original category name to its English
# translation, joined on `product_category_name` — confirm against the data.
create_category_table = """
    CREATE TABLE IF NOT EXISTS category (
        product_category_name          VARCHAR(50),
        product_category_name_english  VARCHAR(50)
    )
"""
# Send the statement to the database through the open connection
# (`execute_query` and `connection` are defined outside this block).
execute_query(connection, create_category_table)

# DDL for the `d_geolocation` table: a zip code prefix with `avg_lat` / `avg_lng`.
# IF NOT EXISTS matches the other CREATE TABLE statements in this cell, so
# re-running the cell no longer errors on this statement when the table is
# already present.
# NOTE(review): the columns suggest one averaged coordinate pair per zip code
# prefix — confirm against the code that builds the data for this table.
create_d_geolocation = """
CREATE TABLE IF NOT EXISTS d_geolocation(
        geolocation_zip_code_prefix   VARCHAR(5),
        avg_lat                       FLOAT(25),
        avg_lng                       FLOAT(25)
)
"""
# Send the statement to the database through the open connection
# (`execute_query` and `connection` are defined outside this block).
execute_query(connection, create_d_geolocation)

Connection to PostgreSQL DB successful
Query executed successfully


### Upload of the Dataframes to the respective Tables

In [1005]:
# Bulk-load each dataframe into the table of the same name
# (`copy_from_stringio` and `connection` are defined outside this block).
copy_from_stringio(connection, df_customers,      'customers')
copy_from_stringio(connection, df_geolocation,    'geolocation')
copy_from_stringio(connection, df_order_items,    'order_items')
copy_from_stringio(connection, df_order_payments, 'order_payments')
copy_from_stringio(connection, df_order_reviews,  'order_reviews')
copy_from_stringio(connection, df_orders,         'orders')
copy_from_stringio(connection, df_products,       'products')
copy_from_stringio(connection, df_sellers,        'sellers')
copy_from_stringio(connection, df_category,       'category')
# Fixed: this call used to send `df_category` to a table named 'df_d_geolocation',
# which is never created. The table created for this data is `d_geolocation`.
# NOTE(review): assumes the aggregated frame is named `df_d_geolocation` and has the
# columns (geolocation_zip_code_prefix, avg_lat, avg_lng) in that order — confirm
# against the cell that builds it.
copy_from_stringio(connection, df_d_geolocation,  'd_geolocation')

copy_from_stringio() done


### Joins
To make future queries easier, some tables will be consolidated.<br/>
The cell below creates the consolidated tables and inserts the data into them.

In [137]:
# Open the database connection used by the statements in this cell.
# NOTE(review): the arguments are presumably database name, user, password, host
# and port — confirm against `create_connection`, which is defined outside this block.
# NOTE(review): the password looks like a redacted placeholder; prefer reading
# credentials from the environment (or `getpass`) over hard-coding them in the notebook.
connection = create_connection('olist',
                  'elweshonorato',
                  'XXXXXX',
                  'main.czqwwerei65b.us-east-2.rds.amazonaws.com',
                  '5432')

# DDL for `f_orders_sellers_geo`, the consolidated table filled from
# order_items, sellers and d_geolocation by the INSERT that follows.
# Column types mirror the tables the data is selected from:
#   * `avg_lat` / `avg_lng` are FLOAT(25), as in `d_geolocation`, so the
#     coordinates stay numeric instead of being cast to text on insert.
#   * `price` / `freight_value` are DECIMAL(10, 2); in PostgreSQL DECIMAL(10)
#     means scale 0 and would round fractional amounts to whole numbers.
# NOTE(review): there is no IF NOT EXISTS, so PostgreSQL raises an error if the
# table already exists. Adding it alone would let the INSERT that follows append
# duplicate rows on a re-run; consider a DROP TABLE IF EXISTS before this statement.
create_f_orders_sellers_geo = """
CREATE TABLE f_orders_sellers_geo(
        order_id                    VARCHAR(32),
        order_item_id               VARCHAR(2),
        product_id                  VARCHAR(32),
        shipping_limit_date         TIMESTAMP,
        price                       DECIMAL(10, 2),
        freight_value               DECIMAL(10, 2),
        seller_id                   VARCHAR(32),
        geolocation_zip_code_prefix VARCHAR(5),
        avg_lat                     FLOAT(25),
        avg_lng                     FLOAT(25)
)
"""
# Send the statement to the database through the open connection
# (`execute_query` is defined outside this block).
execute_query(connection, create_f_orders_sellers_geo)

# Fill `f_orders_sellers_geo` from order_items, sellers and d_geolocation.
# Both joins are LEFT JOINs, so every order_items row is kept; the seller and
# geolocation columns are NULL where no match is found. `seller_id` is taken from
# `sellers`, not from `order_items`, so it is NULL too when the seller is missing.
# NOTE(review): assumes `d_geolocation` has at most one row per zip code prefix;
# otherwise order items are repeated once per matching row — confirm.
insert_f_orders_sellers_geo = """
INSERT INTO f_orders_sellers_geo(
    order_id,
    order_item_id,
    product_id,
    shipping_limit_date,
    price,
    freight_value,
    seller_id,
    geolocation_zip_code_prefix,
    avg_lat,
    avg_lng)
SELECT 
    oi.order_id,
    oi.order_item_id,
    oi.product_id,
    oi.shipping_limit_date,
    oi.price,
    oi.freight_value,
    s.seller_id,
    dg.geolocation_zip_code_prefix,
    dg.avg_lat, 
    dg.avg_lng
    
    FROM order_items as oi
    LEFT JOIN sellers as s
        ON oi.seller_id = s.seller_id
    LEFT JOIN d_geolocation as dg
        ON s.seller_zip_code_prefix = dg.geolocation_zip_code_prefix

"""
# Send the statement to the database through the open connection
# (`execute_query` and `connection` are defined outside this block).
execute_query(connection, insert_f_orders_sellers_geo)

# DDL for `f_products_orditems_orders_reviews`, the consolidated table filled from
# products, order_items, orders and order_reviews by the INSERT that follows.
# `price` / `freight_value` are DECIMAL(10, 2): in PostgreSQL DECIMAL(10) means
# scale 0 and would round fractional amounts to whole numbers when stored.
# The remaining column types are copied from the four source tables.
# NOTE(review): there is no IF NOT EXISTS, so PostgreSQL raises an error if the
# table already exists. Adding it alone would let the INSERT that follows append
# duplicate rows on a re-run; consider a DROP TABLE IF EXISTS before this statement.
create_f_products_orditems_orders_reviews = """
CREATE TABLE f_products_orditems_orders_reviews(
        product_id                    VARCHAR(32),
        product_category_name         VARCHAR(50),
        product_name_lenght           INT,
        product_description_lenght    INT,
        product_photos_qty            INT,
        product_weight_g              DECIMAL(20),
        product_length_cm             DECIMAL(20),
        product_height_cm             DECIMAL(20),
        product_width_cm              DECIMAL(20),
        order_id                      VARCHAR(32),
        order_item_id                 VARCHAR(2),
        seller_id                     VARCHAR(32),
        shipping_limit_date           TIMESTAMP,
        price                         DECIMAL(10, 2),
        freight_value                 DECIMAL(10, 2),
        customer_id                   VARCHAR(32),
        order_status                  VARCHAR(15),
        order_purchase_timestamp      TIMESTAMP,
        order_approved_at             TIMESTAMP,
        order_delivered_carrier_date  TIMESTAMP,
        order_delivered_customer_date TIMESTAMP,
        order_estimated_delivery_date TIMESTAMP,
        review_id                     VARCHAR(32),
        review_score                  DECIMAL(2),
        review_creation_date          TIMESTAMP,
        review_answer_timestamp       TIMESTAMP
)
"""
# Send the statement to the database through the open connection
# (`execute_query` and `connection` are defined outside this block).
execute_query(connection, create_f_products_orditems_orders_reviews)

# Fill `f_products_orditems_orders_reviews` from products, order_items, orders
# and order_reviews.
# All three joins are INNER JOINs, so an order item is kept only if it has a
# matching product, order and review. An order with several reviews yields one
# row per (order item, review) pair.
# The SELECT list follows the same column order as the INSERT column list.
insert_f_products_orditems_orders_reviews = """
INSERT INTO f_products_orditems_orders_reviews(
        product_id,
        product_category_name,
        product_name_lenght,
        product_description_lenght,
        product_photos_qty,
        product_weight_g,
        product_length_cm,
        product_height_cm,
        product_width_cm,
        order_id,
        order_item_id,
        seller_id,
        shipping_limit_date,
        price,
        freight_value,
        customer_id,
        order_status,
        order_purchase_timestamp,
        order_approved_at,
        order_delivered_carrier_date,
        order_delivered_customer_date,
        order_estimated_delivery_date,
        review_id,
        review_score,
        review_creation_date,
        review_answer_timestamp
)
SELECT
         pr.product_id,
         pr.product_category_name,
         pr.product_name_lenght,
         pr.product_description_lenght,
         pr.product_photos_qty,
         pr.product_weight_g,
         pr.product_length_cm,
         pr.product_height_cm,
         pr.product_width_cm,
         oi.order_id,
         oi.order_item_id,
         oi.seller_id,
         oi.shipping_limit_date,
         oi.price,
         oi.freight_value,
         o.customer_id,
         o.order_status,
         o.order_purchase_timestamp,
         o.order_approved_at,
         o.order_delivered_carrier_date,
         o.order_delivered_customer_date,
         o.order_estimated_delivery_date,
         rv.review_id,
         rv.review_score,
         rv.review_creation_date,
         rv.review_answer_timestamp
FROM products as pr
INNER JOIN order_items as oi
ON pr.product_id = oi.product_id
INNER JOIN orders as o
ON oi.order_id = o.order_id
INNER JOIN order_reviews as rv
ON o.order_id = rv.order_id

"""
# Send the statement to the database through the open connection
# (`execute_query` and `connection` are defined outside this block).
execute_query(connection, insert_f_products_orditems_orders_reviews)

Connection to PostgreSQL DB successful
Query executed successfully
