In [3]:
import pandas as pd
import numpy as np

**1. Define the folder path and load the files**

In [41]:
# Folder that holds the raw CSV files, relative to this notebook
root_path = "../DataSet/"

# Timestamp columns of the orders file that pandas should parse as datetimes on load
order_date_columns = [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]

# Orders: one CSV, with the date columns parsed while reading
orders = pd.read_csv(
    f"{root_path}olist_orders_dataset.csv",
    parse_dates=order_date_columns,
)

# Order items, payments and reviews are read with pandas' default type inference
order_items = pd.read_csv(f"{root_path}olist_order_items_dataset.csv")
order_payments = pd.read_csv(f"{root_path}olist_order_payments_dataset.csv")
order_reviews = pd.read_csv(f"{root_path}olist_order_reviews_dataset.csv")

**2. Display samples of the data**

In [42]:
# Print the first three rows of every loaded table under its own banner.
# The banners are kept verbatim so the printed output is unchanged.
sample_sections = [
    ("----- Orders -----", orders),
    ("\n----- Order Items -----", order_items),
    ("\n----- Order Payments -----", order_payments),
    ("\n----- Order Reviews -----", order_reviews),
]
for banner, table in sample_sections:
    print(banner)
    print(table.head(3))

----- Orders -----
                           order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   

  order_status order_purchase_timestamp   order_approved_at  \
0    delivered      2017-10-02 10:56:00 2017-10-02 11:07:00   
1    delivered      2018-07-24 20:41:00 2018-07-26 03:24:00   
2    delivered      2018-08-08 08:38:00 2018-08-08 08:55:00   

  order_delivered_carrier_date order_delivered_customer_date  \
0          2017-10-04 19:55:00           2017-10-10 21:25:00   
1          2018-07-26 14:31:00           2018-08-07 15:27:00   
2          2018-08-08 13:50:00           2018-08-17 18:06:00   

  order_estimated_delivery_date  
0                    2017-10-18  
1                    2018-08-13  
2                    2018-09-04  

----- Order Items -----
                 

**3. Calculate and print the number of missing values in each file**

In [43]:
# Report the per-column count of null cells for each loaded table.
missing_reports = [
    ("\nMissing values in orders:", orders),
    ("\nMissing values in order_items:", order_items),
    ("\nMissing values in order_payments:", order_payments),
    ("\nMissing values in order_reviews:", order_reviews),
]
for label, table in missing_reports:
    print(label)
    # isna() is the pandas alias of isnull(); summing the boolean mask counts nulls per column
    print(table.isna().sum())


Missing values in orders:
order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

Missing values in order_items:
order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64

Missing values in order_payments:
order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64

Missing values in order_reviews:
review_id                      0
order_id                       0
review_score                   0
review_comment_title       87656
review_comment_message     58247
review_creation_date           0
review_answer_timestamp   

**4. Calculate missing value percentages in the orders file**

In [44]:
# Share of null cells per column of `orders`, expressed as a percentage:
# the mean of the boolean null mask is the null fraction, scaled by 100.
orders_null_fraction = orders.isna().mean()
orders_missing_pct = orders_null_fraction.mul(100)
print("\nMissing values percentage in orders:\n", orders_missing_pct)


Missing values percentage in orders:
 order_id                         0.000000
customer_id                      0.000000
order_status                     0.000000
order_purchase_timestamp         0.000000
order_approved_at                0.160899
order_delivered_carrier_date     1.793023
order_delivered_customer_date    2.981668
order_estimated_delivery_date    0.000000
dtype: float64


**5. Analyze the status of orders missing critical values**

In [45]:
# Which order statuses do the rows without an approval timestamp have?
approved_is_null = orders['order_approved_at'].isna()
missing_approved = orders.loc[approved_is_null]
print("\nOrder status distribution for missing order_approved_at:")
print(missing_approved['order_status'].value_counts())

# Same breakdown for rows without a customer delivery timestamp
delivered_is_null = orders['order_delivered_customer_date'].isna()
missing_delivered = orders.loc[delivered_is_null]
print("\nOrder status distribution for missing order_delivered_customer_date:")
print(missing_delivered['order_status'].value_counts())


Order status distribution for missing order_approved_at:
order_status
canceled     141
delivered     14
created        5
Name: count, dtype: int64

Order status distribution for missing order_delivered_customer_date:
order_status
shipped        1107
canceled        619
unavailable     609
invoiced        314
processing      301
delivered         8
created           5
approved          2
Name: count, dtype: int64


**6. Re-coerce the orders date columns to datetime (safety recheck)**

In [46]:
# Re-coerce every date column of `orders` to datetime.
# The list matches the columns passed to `parse_dates` when the file was loaded;
# the previous version of this cell skipped `order_delivered_carrier_date`.
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising,
# so compare null counts before/after if silent data loss is a concern.
order_datetime_columns = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]
for column in order_datetime_columns:
    orders[column] = pd.to_datetime(orders[column], errors='coerce')

**7. Handle missing values in the orders file**

In [47]:
# Shared building block: rows whose status is 'delivered'
is_delivered = orders['order_status'] == 'delivered'

# (a) Handle order_approved_at:
# Delivered orders that have no approval timestamp
approval_missing_mask = is_delivered & orders['order_approved_at'].isnull()
delivered_missing_approval = orders[approval_missing_mask]
print("\nNumber of orders delivered without order_approved_at:", delivered_missing_approval.shape[0])

# (b) Handle order_delivered_customer_date:
# Delivered orders that have no customer delivery timestamp
mask = is_delivered & orders['order_delivered_customer_date'].isnull()
delivered_missing_delivery = orders[mask]
print("Number of orders delivered without order_delivered_customer_date:", delivered_missing_delivery.shape[0])

# Use the estimated delivery date as a temporary substitute for those delivered orders.
# NOTE: this mutates `orders` in place, so re-running this cell without reloading the
# data will report 0 for the count printed just above.
orders.loc[mask, 'order_delivered_customer_date'] = orders.loc[mask, 'order_estimated_delivery_date']

# Verify that no delivered order is left without a customer delivery date
still_missing_mask = is_delivered & orders['order_delivered_customer_date'].isnull()
missing_delivered_after = int(still_missing_mask.sum())
print("After imputation, number of orders delivered without order_delivered_customer_date:", missing_delivered_after)

# (c) Build the subset used for approval-time analysis.
# Boolean indexing returns a copy, so this must run AFTER the imputation above;
# taking the subset earlier would leave it (and anything saved from it) with the
# un-imputed delivery dates. The excluded rows are the delivered orders that lack
# an approval timestamp (a very small share of the data).
orders_valid_approval = orders[~approval_missing_mask]


Number of orders delivered without order_approved_at: 14
Number of orders delivered without order_delivered_customer_date: 8
After imputation, number of orders delivered without order_delivered_customer_date: 0


**8. Handle missing values in the order_reviews file**

In [48]:
# Many reviews carry no free-text comment; substitute a fixed placeholder
# so later analysis does not have to special-case nulls in this column.
comment_column = 'review_comment_message'
filled_comments = order_reviews[comment_column].fillna("No Comment")
order_reviews[comment_column] = filled_comments

**9. Save the processed data**

In [None]:
# Snapshot of the approval-valid orders subset; copied so it is independent of its source frame
orders_clean = orders_valid_approval.copy()

# Write both processed tables to CSV in the working directory, without the index column
csv_exports = [
    (orders_clean, "cleaned_orders_dataset.csv"),
    (order_reviews, "cleaned_order_reviews_dataset.csv"),
]
for table, filename in csv_exports:
    table.to_csv(filename, index=False)

**10. Display a sample of the processed data for verification**

In [50]:
# Show the first rows (head() default of five) of each processed table for a quick visual check.
cleaned_sections = [
    ("\n----- Cleaned Orders Sample -----", orders_clean),
    ("\n----- Cleaned Order Reviews Sample -----", order_reviews),
]
for banner, table in cleaned_sections:
    print(banner)
    print(table.head())


----- Cleaned Orders Sample -----
                           order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
4  ad21c59c0840e6cb83a9ceb5573f8159  8ab97904e6daea8866dbdbc4fb7aad2c   

  order_status order_purchase_timestamp   order_approved_at  \
0    delivered      2017-10-02 10:56:00 2017-10-02 11:07:00   
1    delivered      2018-07-24 20:41:00 2018-07-26 03:24:00   
2    delivered      2018-08-08 08:38:00 2018-08-08 08:55:00   
3    delivered      2017-11-18 19:28:00 2017-11-18 19:45:00   
4    delivered      2018-02-13 21:18:00 2018-02-13 22:20:00   

  order_delivered_carrier_date order_delivered_customer_date  \
0          2017-10-04 19:55:00           2017-10-10 21:25:00   
1          2018-07-