In [1]:
import pandas as pd
import numpy as np

**Load the dataset**

In [2]:
root_path = "../DataSet/"

customers = pd.read_csv(root_path + 'olist_customers_dataset.csv')

**Inspect the data**

In [3]:
print("Initial Customers Dataset Info:")
print(customers.info())
print("\nMissing Values:")
print(customers.isnull().sum())

Initial Customers Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB
None

Missing Values:
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64


**Drop rows with missing critical IDs**

In [4]:
customers = customers.dropna(subset=['customer_id', 'customer_unique_id'])

**Fill missing geographic data with 'Unknown'**

In [5]:
customers['customer_zip_code_prefix'] = customers['customer_zip_code_prefix'].fillna('Unknown')
customers['customer_city'] = customers['customer_city'].fillna('Unknown')
customers['customer_state'] = customers['customer_state'].fillna('Unknown')

**Convert to consistent data types**

In [6]:
customers['customer_id'] = customers['customer_id'].astype(str)
customers['customer_unique_id'] = customers['customer_unique_id'].astype(str)
customers['customer_zip_code_prefix'] = customers['customer_zip_code_prefix'].astype(str)

**Standardize text**

In [7]:
customers['customer_city'] = customers['customer_city'].str.strip().str.title()
customers['customer_state'] = customers['customer_state'].str.strip().str.upper()

**Remove duplicate customer_ids**

In [8]:
customers = customers.drop_duplicates(subset=['customer_id'], keep='first')

**Creative addition: Validate Brazilian state codes**

In [9]:
valid_states = {'AC', 'AL', 'AP', 'AM', 'BA', 'CE', 'DF', 'ES', 'GO', 'MA', 'MT', 
                'MS', 'MG', 'PA', 'PB', 'PR', 'PE', 'PI', 'RJ', 'RN', 'RS', 'RO', 
                'RR', 'SC', 'SP', 'SE', 'TO'}
customers['is_valid_state'] = customers['customer_state'].isin(valid_states).astype(int)

**Save the cleaned file**

In [10]:
customers.to_csv('./Data_Cleaned/cleaned_olist_customers_dataset.csv', index=False)
print("Saved cleaned customers dataset as './Data_Cleaned/cleaned_olist_customers_dataset.csv'")

Saved cleaned customers dataset as './Data_Cleaned/cleaned_olist_customers_dataset.csv'
