In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

**Load the dataset**

In [7]:
# Relative directory the raw CSV files are read from.
root_path = "../DataSet/"

# Build the full path first, then read the sellers table into a DataFrame.
sellers_csv_path = root_path + 'olist_sellers_dataset.csv'
sellers = pd.read_csv(sellers_csv_path)

**Step 1: Inspect the data**

In [8]:
# Structural overview of the raw table before any cleaning is applied.
print("Initial Sellers Dataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() emits a stray "None" line.
sellers.info()

# Per-column count of missing values.
print("\nMissing Values:")
print(sellers.isnull().sum())

# Number of rows whose seller_id repeats an earlier row's seller_id.
print("\nDuplicate Seller IDs:")
print(sellers['seller_id'].duplicated().sum())

Initial Sellers Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: int64(1), object(3)
memory usage: 96.8+ KB
None

Missing Values:
seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64

Duplicate Seller IDs:
0


**Step 2: Handle missing values**

In [9]:
# Rows without a seller_id are discarded; the message is printed only
# when at least one such row was actually present.
has_missing_ids = sellers['seller_id'].isna().any()
if has_missing_ids:
    sellers = sellers.dropna(subset=['seller_id'])
    print("Dropped rows with missing seller_id")

In [10]:
# Location attributes are not dropped when missing: each gap is replaced by the
# placeholder string 'Unknown' so the seller row itself is preserved.
for location_column in ('seller_zip_code_prefix', 'seller_city', 'seller_state'):
    sellers[location_column] = sellers[location_column].fillna('Unknown')

**Step 3: Ensure data type consistency**

In [11]:
# Cast every column to str so the cleaned table carries uniform text values.
sellers['seller_id'] = sellers['seller_id'].astype(str)

# read_csv parsed the zip prefix as int64, which drops leading zeros
# (e.g. 01001 is read as 1001). After casting to str, left-pad purely numeric
# values back to 5 characters; non-numeric placeholders such as 'Unknown'
# are left untouched.
# NOTE(review): the 5-digit width follows the Olist dataset description of
# zip_code_prefix (first five CEP digits) — confirm against the data source.
zip_prefix = sellers['seller_zip_code_prefix'].astype(str)
sellers['seller_zip_code_prefix'] = zip_prefix.mask(
    zip_prefix.str.isdigit(), zip_prefix.str.zfill(5)
)

# City and state are free text / codes; make sure both are plain strings.
sellers['seller_city'] = sellers['seller_city'].astype(str)
sellers['seller_state'] = sellers['seller_state'].astype(str)

**Step 4: Standardize text data**

In [12]:
# Trim surrounding whitespace, then normalise casing:
# city names become Title Case, state codes become UPPER CASE.
city_trimmed = sellers['seller_city'].str.strip()
state_trimmed = sellers['seller_state'].str.strip()
sellers['seller_city'] = city_trimmed.str.title()
sellers['seller_state'] = state_trimmed.str.upper()

**Step 5: Check for and remove duplicates**

In [13]:
# Count repeated seller_id rows BEFORE dropping them: counting after
# drop_duplicates always yields 0, so the message would never report
# how many rows were actually removed.
duplicate_count = sellers['seller_id'].duplicated().sum()

# Keep the first occurrence of each seller_id and discard the rest.
sellers = sellers.drop_duplicates(subset=['seller_id'], keep='first')
print(f"Removed {duplicate_count} duplicate seller IDs")

Removed 0 duplicate seller IDs


**Step 6: Validate state codes (Creative Touch)**

In [14]:
# Two-letter state codes accepted as valid; anything else is flagged.
valid_states = {
    'AC', 'AL', 'AM', 'AP', 'BA', 'CE', 'DF', 'ES', 'GO',
    'MA', 'MG', 'MS', 'MT', 'PA', 'PB', 'PE', 'PI', 'PR',
    'RJ', 'RN', 'RO', 'RR', 'RS', 'SC', 'SE', 'SP', 'TO',
}

# Boolean mask computed once and reused for both the flag column and the report.
state_is_valid = sellers['seller_state'].isin(valid_states)

# 1 when the state code is in the accepted set, 0 otherwise.
sellers['is_valid_state'] = state_is_valid.astype(int)

# Distinct codes that failed validation; reported only when there are any.
invalid_states = sellers.loc[~state_is_valid, 'seller_state'].unique()
if len(invalid_states):
    print(f"Invalid state codes detected: {invalid_states}")

**Step 7: Derive region from state (Creative Addition)**

In [None]:
# Region -> member state codes, used to build the state -> region lookup below.
region_members = {
    'North': ('AC', 'AM', 'AP', 'PA', 'RO', 'RR', 'TO'),
    'Northeast': ('AL', 'BA', 'CE', 'MA', 'PB', 'PE', 'PI', 'RN', 'SE'),
    'Central-West': ('DF', 'GO', 'MT', 'MS'),
    'Southeast': ('ES', 'MG', 'RJ', 'SP'),
    'South': ('PR', 'RS', 'SC'),
}

# Invert the grouping into a flat state code -> region name dictionary.
state_to_region = {
    state_code: region_name
    for region_name, state_codes in region_members.items()
    for state_code in state_codes
}

# States absent from the lookup map to NaN and are then labelled 'Unknown'.
region_by_row = sellers['seller_state'].map(state_to_region)
sellers['seller_region'] = region_by_row.fillna('Unknown')
print("Added seller_region column for geographic insights")

**Step 8: Save the cleaned dataset**

In [None]:
# Destination of the cleaned table, held in one variable so the write and
# the confirmation message cannot drift apart.
output_path = './Data_Cleaned/cleaned_sellers_dataset.csv'

# to_csv does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame's row index out of the CSV.
sellers.to_csv(output_path, index=False)
print(f"Sellers dataset cleaned and saved as '{output_path}'")

Sellers dataset cleaned and saved as 'cleaned_sellers_dataset.csv'
