In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

**Load the dataset**

In [2]:
# Directory holding the raw Olist CSV files, relative to this notebook.
root_path = "../DataSet/"

# Read the raw products table into a DataFrame.
products = pd.read_csv(f"{root_path}olist_products_dataset.csv")

**Step 1: Inspect the data**

In [3]:
# Summarise structure, missing values and key uniqueness before any cleaning.
print("Initial Products Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
products.info()
print("\nMissing Values:")
print(products.isnull().sum())
print("\nDuplicate Product IDs:")
print(products['product_id'].duplicated().sum())

Initial Products Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32951 non-null  object 
 1   product_category_name       32341 non-null  object 
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
 5   product_weight_g            32949 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_height_cm           32949 non-null  float64
 8   product_width_cm            32949 non-null  float64
dtypes: float64(7), object(2)
memory usage: 2.3+ MB
None

Missing Values:
product_id                      0
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty      

**Step 2: Handle missing values**

In [4]:
# Keep only rows that have a product_id, then label products without a
# category as 'Unknown' so the category column has no gaps.
products = (
    products
    .dropna(subset=['product_id'])
    .assign(
        product_category_name=lambda frame: frame['product_category_name'].fillna('Unknown')
    )
)
print("Dropped rows with missing product_id")

Dropped rows with missing product_id


In [5]:
# Numeric columns are imputed with their own median, which is less sensitive
# to extreme values than the mean.
numeric_cols = [
    'product_name_lenght',
    'product_description_lenght',
    'product_photos_qty',
    'product_weight_g',
    'product_length_cm',
    'product_height_cm',
    'product_width_cm',
]

# One median per numeric column, computed before any value is filled.
column_medians = products[numeric_cols].median()
products[numeric_cols] = products[numeric_cols].fillna(column_medians)

for name in numeric_cols:
    print(f"Imputed missing values in {name} with median")

Imputed missing values in product_name_lenght with median
Imputed missing values in product_description_lenght with median
Imputed missing values in product_photos_qty with median
Imputed missing values in product_weight_g with median
Imputed missing values in product_length_cm with median
Imputed missing values in product_height_cm with median
Imputed missing values in product_width_cm with median


**Step 3: Ensure data type consistency**

In [6]:
# Identifier and category are stored as strings; every numeric column is
# stored as float so fractional values (e.g. medians) are representable.
dtype_map = {'product_id': str, 'product_category_name': str}
dtype_map.update({name: float for name in numeric_cols})
products = products.astype(dtype_map)

**Step 4: Validate numerical data**

In [8]:
# Validate each numeric column: negative values are physically impossible
# here, and extreme highs are capped at the column's 99th percentile.
for col in numeric_cols:
    negative_mask = products[col] < 0
    if negative_mask.any():
        # Take the median of the non-negative values only, so the invalid
        # entries being replaced do not skew the replacement value.
        valid_median = products.loc[~negative_mask, col].median()
        products.loc[negative_mask, col] = valid_median
        print(f"Replaced negative values in {col} with median")

    # Cap extreme outliers (values above the 99th percentile).
    upper_limit = products[col].quantile(0.99)
    products[col] = products[col].clip(lower=0, upper=upper_limit)
    print(f"Capped {col} at 99th percentile: {upper_limit}")


Capped product_name_lenght at 99th percentile: 63.0
Capped product_description_lenght at 99th percentile: 3274.5
Capped product_photos_qty at 99th percentile: 8.0
Capped product_weight_g at 99th percentile: 22537.5
Capped product_length_cm at 99th percentile: 100.0
Capped product_height_cm at 99th percentile: 69.0
Capped product_width_cm at 99th percentile: 63.0


**Step 5: Check for duplicates**

In [9]:
# Count duplicates BEFORE dropping them: counting on the de-duplicated frame
# would always report 0, whatever was actually removed.
# duplicated() defaults to keep='first', matching drop_duplicates below.
duplicate_count = products['product_id'].duplicated().sum()
products = products.drop_duplicates(subset=['product_id'], keep='first')
print(f"Removed {duplicate_count} duplicate product IDs")

Removed 0 duplicate product IDs


**Step 6: Creative Addition - Calculate product volume**

In [None]:
# Derive a volume column as length x height x width of the product.
length_cm = products['product_length_cm']
height_cm = products['product_height_cm']
width_cm = products['product_width_cm']
products['product_volume_cm3'] = length_cm.mul(height_cm).mul(width_cm)
print("Added product_volume_cm3 for size-related analysis")

Added product_volume_cm3 for size-related analysis


**Step 7: Creative Addition - Flag heavy products**

In [11]:
# Flag products whose weight is strictly above the 75th percentile
# (1 = heavy, 0 = not heavy).
heavy_cutoff = products['product_weight_g'].quantile(0.75)
products['is_heavy'] = products['product_weight_g'].gt(heavy_cutoff).astype(int)
print("Added is_heavy flag for logistics insights")

Added is_heavy flag for logistics insights


**Step 8: Save the cleaned dataset**

In [14]:
# Write the cleaned table to disk without the index column.
output_path = Path('./Data_Cleaned/cleaned_products_dataset.csv')
# to_csv does not create missing directories, so make sure the target folder
# exists; otherwise a fresh checkout fails here at the very last step.
output_path.parent.mkdir(parents=True, exist_ok=True)
products.to_csv(output_path, index=False)
print("Products dataset cleaned and saved as './Data_Cleaned/cleaned_products_dataset.csv'")

Products dataset cleaned and saved as './Data_Cleaned/cleaned_products_dataset.csv'
