In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

**Load the dataset**

In [3]:
# Folder containing the raw CSV files. The path is relative, so it is
# resolved against the directory the notebook kernel is running in.
root_path = "../DataSet/"

# Read the category-name translation table (Portuguese name -> English name).
translation_csv = f"{root_path}product_category_name_translation.csv"
categories = pd.read_csv(translation_csv)

**Step 1: Inspect the data**

In [4]:
# Summarize the raw translation table before any cleaning: schema, null
# counts per column, and how many Portuguese names are repeated.
print("Initial Categories Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
categories.info()
print("\nMissing Values:")
print(categories.isnull().sum())
print("\nDuplicate Categories:")
# duplicated() flags every repeat after the first occurrence of a name.
print(categories['product_category_name'].duplicated().sum())

Initial Categories Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 2 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   product_category_name          71 non-null     object
 1   product_category_name_english  71 non-null     object
dtypes: object(2)
memory usage: 1.2+ KB
None

Missing Values:
product_category_name            0
product_category_name_english    0
dtype: int64

Duplicate Categories:
0


**Step 2: Handle missing values**

In [5]:
# Drop rows with missing category names (either Portuguese or English).
# The row count is captured *before* dropping: counting nulls on the
# already-filtered frame would always report 0, whatever was removed.
rows_before_dropna = len(categories)
categories = categories.dropna(subset=['product_category_name', 'product_category_name_english'])
print(f"Dropped {rows_before_dropna - len(categories)} rows with missing category names")

Dropped 0 rows with missing category names


**Step 3: Ensure data type consistency**

In [6]:
# Cast both name columns to str so the .str accessors used in the next step
# operate on uniform string values.
for column_name in ('product_category_name', 'product_category_name_english'):
    categories[column_name] = categories[column_name].astype(str)

**Step 4: Standardize text data**

In [7]:
# Normalize both name columns: trim surrounding whitespace and lower-case.
portuguese_names = categories['product_category_name']
english_names = categories['product_category_name_english']

categories['product_category_name'] = portuguese_names.str.strip().str.lower()

# English names additionally get spaces replaced by underscores so they
# follow a snake_case form.
categories['product_category_name_english'] = (
    english_names
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)

**Step 5: Remove duplicate categories**

In [8]:
# Remove duplicate Portuguese category names, keeping the first translation.
# The duplicates are counted *before* dropping: counting them on the
# de-duplicated frame would always report 0. duplicated(keep='first') flags
# exactly the rows that drop_duplicates(keep='first') removes.
duplicate_count = categories['product_category_name'].duplicated(keep='first').sum()
categories = categories.drop_duplicates(subset=['product_category_name'], keep='first')
print(f"Removed {duplicate_count} duplicate category names")

Removed 0 duplicate category names


**Step 6: Creative Addition - Add category grouping**

In [9]:
# Roll the fine-grained English category names up into broader
# supercategories for advanced analysis. Each supercategory lists its members.
supercategory_groups = {
    'Personal Care': ['health_beauty', 'perfumery', 'baby'],
    'Electronics': [
        'computers_accessories', 'telephony', 'fixed_telephony',
        'electronics', 'home_appliances', 'home_appliances_2',
    ],
    'Automotive': ['auto'],
    'Home & Garden': [
        'construction_tools_construction', 'garden_tools',
        'bed_bath_table', 'furniture_decor', 'housewares',
    ],
    'Sports & Leisure': ['sports_leisure'],
    'Fashion': [
        'fashion_bags_accessories', 'fashion_shoes', 'fashion_male_clothing',
        'fashion_underwear_beach', 'fashion_sport',
        'fashion_female_clothing', 'fashion_childrens_clothes',
    ],
    'Entertainment': ['toys', 'consoles_games', 'musical_instruments'],
    'Food & Beverage': ['food_drink', 'food', 'drinks'],
    # Add more mappings as needed
}

# Invert to the category -> supercategory lookup that Series.map expects.
supercategory_mapping = {
    category: supercategory
    for supercategory, members in supercategory_groups.items()
    for category in members
}

# Any English name without an entry above falls into 'Other'.
# NOTE(review): keys must match the normalized English names exactly; confirm
# spellings against the CSV so no category lands in 'Other' unintentionally.
english_names = categories['product_category_name_english']
categories['supercategory'] = english_names.map(supercategory_mapping).fillna('Other')
print("Added supercategory column for broader category analysis")

Added supercategory column for broader category analysis


**Step 7: Save the cleaned dataset**

In [None]:
# Write the cleaned table to disk without the DataFrame index.
output_path = Path('./Data_Cleaned/cleaned_product_category_name_translation.csv')
# to_csv does not create missing parent directories, so make sure the output
# folder exists; exist_ok keeps re-runs from failing.
output_path.parent.mkdir(parents=True, exist_ok=True)
categories.to_csv(output_path, index=False)
print("Categories dataset cleaned and saved as './Data_Cleaned/cleaned_product_category_name_translation.csv'")

Categories dataset cleaned and saved as 'cleaned_product_category_name_translation.csv'
