In [1]:
import os

import numpy as np
import pandas as pd

**Load the dataset**

In [2]:
# Directory that holds the raw CSV files, relative to the working directory.
root_path = "../DataSet/"

# Read the raw order-payments table into a DataFrame.
order_payments = pd.read_csv(f"{root_path}olist_order_payments_dataset.csv")

**Inspect the data**

In [3]:
# Summarize the freshly loaded frame: column dtypes / non-null counts first,
# then the number of missing values per column.
print("Initial Order Payments Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
order_payments.info()
print("\nMissing Values:")
print(order_payments.isnull().sum())

Initial Order Payments Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB
None

Missing Values:
order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64


**Drop rows with missing order_id**

In [4]:
# Discard every row whose order_id is missing; nulls in other columns do not
# cause a row to be dropped here.
order_payments = order_payments.dropna(axis='index', how='any', subset=['order_id'])

**Fill missing payment_type**

In [5]:
# Replace missing payment types with the placeholder label 'Unknown'.
order_payments['payment_type'] = (
    order_payments['payment_type'].fillna(value='Unknown')
)

**Convert data types**

In [6]:
# Cast the identifier and payment-type columns to str and the amount column to
# float in a single pass; columns not listed keep their current dtype.
order_payments = order_payments.astype({
    'order_id': str,
    'payment_type': str,
    'payment_value': float,
})

**Ensure non-negative payment values (negative amounts are clipped to 0)**

In [7]:
# Floor payment_value at zero: negative amounts become 0, all other values are
# left unchanged (no upper bound is applied).
order_payments['payment_value'] = (
    order_payments['payment_value'].clip(lower=0, upper=None)
)

**Add a derived column: categorize each payment as Card or Other**

In [8]:
# Derive a coarse payment_category column: any payment_type that contains
# "card" (case-insensitive, literal substring match) is labelled 'Card',
# everything else 'Other'.
order_payments['payment_category'] = (
    order_payments['payment_type']
    .str.lower()
    .str.contains('card', regex=False)
    .map({True: 'Card', False: 'Other'})
)

**Save the cleaned file**

In [9]:
# Destination of the cleaned table; defined once so the write and the
# confirmation message cannot drift apart.
output_path = './Data_Cleaned/cleaned_olist_order_payments_dataset.csv'

# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write without the index column so the file contains only the data columns.
order_payments.to_csv(output_path, index=False)
print(f"Saved cleaned order payments dataset as '{output_path}'")

Saved cleaned order payments dataset as './Data_Cleaned/cleaned_olist_order_payments_dataset.csv'
