In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

**Load the dataset**

In [2]:
# Folder holding the raw CSV files, relative to this notebook.
root_path = "../DataSet/"

# Raw orders table. No dtype or date-parsing options are passed here;
# column types are handled in a later cell.
orders = pd.read_csv(f"{root_path}olist_orders_dataset.csv")

**Inspect the data**

In [3]:
# Structural overview: column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly -- wrapping it in print() emits a stray "None" line.
print("Initial Orders Dataset Info:")
orders.info()

# Number of missing values in each column.
print("\nMissing Values:")
print(orders.isnull().sum())

Initial Orders Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   order_id                       99441 non-null  object
 1   customer_id                    99441 non-null  object
 2   order_status                   99441 non-null  object
 3   order_purchase_timestamp       99441 non-null  object
 4   order_approved_at              99281 non-null  object
 5   order_delivered_carrier_date   97658 non-null  object
 6   order_delivered_customer_date  96476 non-null  object
 7   order_estimated_delivery_date  99441 non-null  object
dtypes: object(8)
memory usage: 6.1+ MB
None

Missing Values:
order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

**Drop rows with missing order_id**

In [4]:
# Keep only the rows that have an order_id; rows where it is missing are removed.
orders = orders.dropna(axis='index', how='any', subset=['order_id'])

**Fill missing timestamps with a sentinel date (2099-12-31)**

In [5]:
# Columns that hold date/time values.
timestamp_cols = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]

# Replace every missing value in those columns with the placeholder
# date '2099-12-31' (one fill value per column, applied in a single call).
# NOTE(review): the placeholder parses as a real date, so later date
# arithmetic on these columns should treat it as "missing" -- confirm intent.
orders = orders.fillna({col: '2099-12-31' for col in timestamp_cols})

**Convert data types**

In [6]:
# Cast the identifier and status columns to str in one call.
orders = orders.astype({
    'order_id': str,
    'customer_id': str,
    'order_status': str,
})

# Parse each timestamp column into datetimes; values that cannot be parsed
# become NaT because of errors='coerce'.
orders = orders.assign(
    **{col: pd.to_datetime(orders[col], errors='coerce') for col in timestamp_cols}
)

**Creative addition: Calculate delivery time**

In [7]:
# Missing timestamps were filled with the placeholder date 2099-12-31 in the
# "Fill missing timestamps" cell. Subtracting a real purchase date from that
# placeholder would report a delivery time of tens of thousands of days for
# orders that were never delivered, so placeholder dates are turned back into
# NaT before the subtraction.
missing_date_placeholder = pd.Timestamp('2099-12-31')

delivered_at = orders['order_delivered_customer_date']
delivered_at = delivered_at.mask(delivered_at == missing_date_placeholder)

purchased_at = orders['order_purchase_timestamp']
purchased_at = purchased_at.mask(purchased_at == missing_date_placeholder)

# Whole days between purchase and delivery to the customer. The nullable
# Int64 dtype keeps the values as integers while letting undelivered orders
# hold <NA> (written as an empty field by to_csv).
orders['delivery_time_days'] = (delivered_at - purchased_at).dt.days.astype('Int64')

**Save the cleaned file**

In [8]:
# Single source of truth for the output location, so the file that is written
# and the path that is reported cannot drift apart.
output_path = './Data_Cleaned/cleaned_olist_orders_dataset.csv'

# to_csv does not create missing folders; make sure the target directory
# exists so the cell also works on a fresh checkout.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Write the cleaned table without the DataFrame index column.
orders.to_csv(output_path, index=False)
print(f"Saved cleaned orders dataset as '{output_path}'")

Saved cleaned orders dataset as './Data_Cleaned/cleaned_olist_orders_dataset.csv'
