In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

**Load the dataset**

In [2]:
# Folder the input CSV is read from (relative to the current working directory).
root_path = "../DataSet/"

# Read the order-items table into a DataFrame.
order_items_csv = f"{root_path}olist_order_items_dataset.csv"
order_items = pd.read_csv(order_items_csv)

**Inspect the data**

In [3]:
# Summarise the freshly loaded frame: structure and dtypes first, then the
# number of missing values per column.
print("Initial Order Items Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
order_items.info()
print("\nMissing Values:")
print(order_items.isnull().sum())

Initial Order Items Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   order_id             112650 non-null  object 
 1   order_item_id        112650 non-null  int64  
 2   product_id           112650 non-null  object 
 3   seller_id            112650 non-null  object 
 4   shipping_limit_date  112650 non-null  object 
 5   price                112650 non-null  float64
 6   freight_value        112650 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 6.0+ MB
None

Missing Values:
order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64


**Drop rows with missing critical IDs**

In [4]:
# Keep only rows in which all three identifier columns are present; a row is
# discarded as soon as any one of them is missing.
critical_id_columns = ['order_id', 'product_id', 'seller_id']
order_items = order_items.dropna(subset=critical_id_columns, how='any')

**Fill missing shipping_limit_date**

In [5]:
# Replace missing shipping-limit dates with the placeholder '2099-12-31'.
# NOTE(review): the placeholder is a far-future sentinel, not a real deadline;
# downstream date analysis should treat it as "unknown".
order_items = order_items.assign(
    shipping_limit_date=order_items['shipping_limit_date'].fillna('2099-12-31')
)

**Convert data types**

In [6]:
# Identifier columns are cast to str.
for id_column in ('order_id', 'product_id', 'seller_id'):
    order_items[id_column] = order_items[id_column].astype(str)

# Parse the shipping limit as a datetime; values that cannot be parsed become
# NaT (errors='coerce') instead of raising.
order_items['shipping_limit_date'] = pd.to_datetime(
    order_items['shipping_limit_date'],
    errors='coerce',
)

# Monetary columns are cast to float.
for amount_column in ('price', 'freight_value'):
    order_items[amount_column] = order_items[amount_column].astype(float)

**Clip negative price and freight values to zero**

In [7]:
# Floor both monetary columns at zero: any negative entry is replaced by 0,
# all other values are left untouched.
for money_column in ('price', 'freight_value'):
    order_items[money_column] = order_items[money_column].clip(lower=0)

**Derived feature: total cost per order item (price + freight)**

In [8]:
# New column: item price plus its freight charge, computed row by row.
order_items['total_cost'] = order_items['price'].add(order_items['freight_value'])

**Save the cleaned file**

In [9]:
# Write the cleaned frame to disk. The destination folder is created first
# because DataFrame.to_csv does not create missing parent directories and
# would otherwise raise OSError on a fresh checkout.
output_path = './Data_Cleaned/cleaned_olist_order_items_dataset.csv'
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False leaves the DataFrame's row index out of the CSV.
order_items.to_csv(output_path, index=False)
print(f"Saved cleaned order items dataset as '{output_path}'")

Saved cleaned order items dataset as './Data_Cleaned/cleaned_olist_order_items_dataset.csv'
