In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

**Load the dataset**

In [5]:
# Relative folder that holds the raw CSV files.
root_path = "../DataSet/"

# Build the full file path first, then read the raw geolocation table.
geolocation_csv = root_path + 'olist_geolocation_dataset.csv'
geolocation = pd.read_csv(geolocation_csv)

**Inspect the data**

In [6]:
# Summarise the raw frame: column dtypes / non-null counts, then nulls per column.
# `DataFrame.info()` writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
print("Initial Geolocation Dataset Info:")
geolocation.info()
print("\nMissing Values:")
print(geolocation.isnull().sum())

Initial Geolocation Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  int64  
 1   geolocation_lat              1000163 non-null  float64
 2   geolocation_lng              1000163 non-null  float64
 3   geolocation_city             1000163 non-null  object 
 4   geolocation_state            1000163 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 38.2+ MB
None

Missing Values:
geolocation_zip_code_prefix    0
geolocation_lat                0
geolocation_lng                0
geolocation_city               0
geolocation_state              0
dtype: int64


**Fill missing city and state (defensive step: the inspection above reports no missing values)**

In [7]:
# Replace any null city/state with the placeholder 'Unknown'.
# Each column is handled independently, one after the other.
for text_column in ('geolocation_city', 'geolocation_state'):
    geolocation[text_column] = geolocation[text_column].fillna('Unknown')

**Convert data types**

In [8]:
# Target dtype per column: the zip prefix becomes text (it is an identifier, not a
# quantity); latitude/longitude are cast to float (the info() output above already
# reports them as float64, so this cast only guards against a differently-typed input).
# NOTE(review): the zip prefix was parsed as int64 on load, so any leading zeros were
# already dropped before this cast and str() does not restore them — confirm whether
# downstream joins need zero-padded prefixes.
target_dtypes = {
    'geolocation_zip_code_prefix': str,
    'geolocation_lat': float,
    'geolocation_lng': float,
}
for column_name, column_dtype in target_dtypes.items():
    geolocation[column_name] = geolocation[column_name].astype(column_dtype)

**Standardize text**

In [9]:
# Normalise city names: trim surrounding whitespace, then Title Case.
city_trimmed = geolocation['geolocation_city'].str.strip()
geolocation['geolocation_city'] = city_trimmed.str.title()

# Normalise state codes: trim surrounding whitespace, then UPPER CASE.
state_trimmed = geolocation['geolocation_state'].str.strip()
geolocation['geolocation_state'] = state_trimmed.str.upper()

**Remove duplicate rows (same zip code prefix, latitude and longitude)**

In [10]:
# Two rows count as duplicates when zip prefix, latitude and longitude all match;
# city/state are not part of the key. The first occurrence of each key is kept.
duplicate_key = ['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']
geolocation = geolocation.drop_duplicates(subset=duplicate_key, keep='first')

**Creative addition: Flag whether lat/lng fall inside Brazil's bounding box (lat: -33.75 to 5.27, lng: -73.99 to -32.39); rows are only flagged (1 = inside, 0 = outside), not removed**

In [11]:
# Check each coordinate against its range separately (`between` includes both
# endpoints by default), then combine the two checks.
lat_in_range = geolocation['geolocation_lat'].between(-33.75, 5.27)
lng_in_range = geolocation['geolocation_lng'].between(-73.99, -32.39)

# Store the result as 1/0 rather than True/False.
geolocation['is_valid_coords'] = (lat_in_range & lng_in_range).astype(int)

**Save the cleaned file**

In [12]:
# Write the cleaned frame to disk without the index column.
# The output folder is created first: `to_csv` does not create missing parent
# directories, so on a fresh checkout the save would otherwise fail.
# The path is kept as a plain string so the file location and the confirmation
# message below always agree.
output_file = './Data_Cleaned/cleaned_olist_geolocation_dataset.csv'
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
geolocation.to_csv(output_file, index=False)
print(f"Saved cleaned geolocation dataset as '{output_file}'")

Saved cleaned geolocation dataset as './Data_Cleaned/cleaned_olist_geolocation_dataset.csv'
