## Setup

In [9]:
# Importing libraries
import os
from dotenv import load_dotenv
import pandas as pd

In [10]:
# Load variables from a .env file (when one is found) into the process environment,
# so DATA_PATH can be supplied without being hardcoded in the notebook
load_dotenv()

# Base location of the dataset files read and written below.
# Indexing os.environ raises KeyError if DATA_PATH is not defined.
DATA_PATH = os.environ['DATA_PATH']

In [11]:
# Load the raw dataset from the configured data location
raw_path = f'{DATA_PATH}/raw.parquet'
df = pd.read_parquet(raw_path)

## Data Preprocessing

In [12]:
# Parse the timestamp columns into datetime values, one column at a time
datetime_cols = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
    'review_creation_date',
    'review_answer_timestamp',
    'shipping_limit_date',
]

# DataFrame.apply hands each selected column to pd.to_datetime as a Series
df[datetime_cols] = df[datetime_cols].apply(pd.to_datetime)

In [13]:
# Strip line breaks from every object-dtype column:
# carriage returns are dropped first, then each newline becomes a single space
text_cols = df.select_dtypes(include=['object']).columns
for col in text_cols:
    df[col] = (
        df[col]
        .str.replace('\r', '', regex=False)
        .str.replace('\n', ' ', regex=False)
    )

In [14]:
# Sanity check: (rows, columns) of the dataframe after preprocessing,
# shown as the cell output
df.shape

(119143, 39)

In [15]:
# Sanity check: per-column dtypes after preprocessing, to confirm that the
# timestamp columns converted above are now datetime64
df.dtypes

order_id                                 object
customer_id                              object
order_status                             object
order_purchase_timestamp         datetime64[ns]
order_approved_at                datetime64[ns]
order_delivered_carrier_date     datetime64[ns]
order_delivered_customer_date    datetime64[ns]
order_estimated_delivery_date    datetime64[ns]
review_id                                object
review_score                            float64
review_comment_title                     object
review_comment_message                   object
review_creation_date             datetime64[ns]
review_answer_timestamp          datetime64[ns]
payment_sequential                      float64
payment_type                             object
payment_installments                    float64
payment_value                           float64
order_item_id                           float64
product_id                               object
seller_id                               

In [16]:
# Persist the preprocessed dataframe as parquet, leaving the index out of the file
preprocessed_path = f'{DATA_PATH}/preprocessed.parquet'
df.to_parquet(preprocessed_path, index=False)