## Yelp data cleaning
Applies cleaning rules for deduplication, missing values, standardization, and location validation to the Yelp business dataset.

In [1]:
import re
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

### Load Initial Data and Get Metrics

In [2]:
# Read the raw Yelp business export. lines=True tells pandas the file holds
# one JSON object per line rather than a single JSON document.
df = pd.read_json(
    "../data/raw/yelp_academic_dataset_business.json",
    lines=True,
)

# Baseline figures, kept so the summary cell can report before/after values.
starting_shape = df.shape
# deep=True also counts the contents of object (string) columns;
# dividing by 1024 * 1024 converts bytes to megabytes.
starting_memory = df.memory_usage(deep=True).sum() / (1024 * 1024)

print(f"Original dataset shape: {starting_shape}")
print(f"Memory usage: {starting_memory} MB")
df.head()


Original dataset shape: (150346, 14)
Memory usage: 157.45216464996338 MB


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


### Select only data that has the relevant columns

The main things we need from this dataset are each business's location and its categories, so records missing that information can be removed. We can also drop the other columns to make further processing faster. We will keep business_id and name as well, so the records stay human-readable.

In [3]:
# Columns kept for the rest of the notebook: identifiers (business_id, name),
# location fields, and categories.
relevant_columns = [
    "business_id",
    "name",
    "address",
    "city",
    "state",
    "postal_code",
    "latitude",
    "longitude",
    "categories"
]

# Keep only the columns in our list of relevant columns.
# The explicit .copy() makes df an independent frame, so the column assignments
# performed in later cells do not trigger pandas' SettingWithCopyWarning.
df = df[relevant_columns].copy()

print(f"Dataset shape: {df.shape}")
print(f"Removed {starting_shape[1] - df.shape[1]} columns")

Dataset shape: (150346, 9)
Removed 5 columns


### Normalization and Cleaning
We can now quickly clean the remaining fields by trimming whitespace, lower-casing text, and removing records whose critical fields are empty.

In [4]:
# Normalise column labels to lower snake_case.
df.columns = [col.lower().strip().replace(' ', '_') for col in df.columns]

string_cols = ['name', 'address', 'city', 'state', 'postal_code', 'categories']

# Trim and lower-case the text columns.
# astype(str) turns a missing value into the literal text 'None'/'nan', so any
# value that was missing before the conversion is masked back to NaN, together
# with strings that are empty after trimming. Masking on the original nulls
# (instead of replacing the words 'none'/'nan'/'null' afterwards) keeps genuine
# values that happen to spell one of those words, e.g. a business called "Nan".
normalized_text = {}
for col in string_cols:
    if col in df.columns:
        text = df[col].astype(str).str.strip().str.lower()
        normalized_text[col] = text.where(df[col].notna() & (text != ''))

# assign() builds a new frame rather than writing into df in place; df comes
# from a column selection, and in-place writes on it can raise
# SettingWithCopyWarning.
df = df.assign(**normalized_text)

# drop any lines that don't have our most critical fields
critical_fields = ['longitude', 'latitude', 'categories']
df = df.dropna(subset=critical_fields)

# normalize zip codes to 5 characters: keep the first run of five digits and
# set values without one to NaN. expand=False makes extract return a Series
# instead of a one-column DataFrame.
df = df.assign(
    postal_code=df['postal_code'].str.extract(r'(\d{5})', expand=False)
)

# No rows are dropped before this cell, so the difference from the starting
# row count is the number of records removed for missing critical fields.
missing_values_removed = starting_shape[0] - df.shape[0]

print(f"Removed {missing_values_removed} items")
print(f"Remaining records: {df.shape[0]}")

Removed 103 items
Remaining records: 150243


### Remove duplicates

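We group together businesses that have exactly the same name and lie within roughly 100 m of each other (approximated by rounding coordinates to three decimal places), and remove the potential duplicate records.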

In [5]:
# Coordinates are very specific; rounding to 3 decimal places groups points
# into grid cells of roughly 100 m. Rows are compared on (name, cell), so two
# copies that fall on opposite sides of a cell edge are not detected.
# The keys live in their own frame, so df never carries temporary columns.
dedupe_keys = df[['name']].assign(
    lat_bucket=df['latitude'].round(3),
    lon_bucket=df['longitude'].round(3),
)

# Find duplicates (the first occurrence of each key is kept).
# duplicated() treats missing values as equal to each other, so rows without
# a name are excluded: a missing name is not "the same name".
is_duplicate_bools = dedupe_keys.duplicated(keep='first') & df['name'].notna()

num_duplicates = is_duplicate_bools.sum()
print(f"Identified {num_duplicates} duplicate records")

# Remove the redundancies
df = df[~is_duplicate_bools].copy()

print(f"Remaining records: {df.shape[0]}")


Identified 265 duplicate records
Remaining records: 149978


### Location Validation

Next we need to make sure we only keep the records that are actually in our target city, Philadelphia.

In [6]:
# Bounding box used as the definition of "in Philadelphia" (decimal degrees).
# Defined once so the printed bounds and the filter below cannot drift apart.
PHILLY_LAT_MIN, PHILLY_LAT_MAX = 39.85, 40.15
PHILLY_LON_MIN, PHILLY_LON_MAX = -75.35, -74.95

print("Validating coordinates with Philadelphia geographic bounds:")
print(f"  Latitude: {PHILLY_LAT_MIN}° to {PHILLY_LAT_MAX}°N")
print(f"  Longitude: {PHILLY_LON_MIN}° to {PHILLY_LON_MAX}°W")

# between() is inclusive on both ends and evaluates to False for NaN, so a row
# with a missing coordinate counts as outside the box instead of slipping
# through the filter.
inside_bounds = (
    df['latitude'].between(PHILLY_LAT_MIN, PHILLY_LAT_MAX)
    & df['longitude'].between(PHILLY_LON_MIN, PHILLY_LON_MAX)
)
invalid_coords = ~inside_bounds

num_businesses_outside_range = invalid_coords.sum()
print(f"Will be removing {num_businesses_outside_range} rows not in Philadelphia")

# Drop invalid coordinates
df = df[~invalid_coords].copy()

print(f"Remaining records: {df.shape[0]}")

Validating coordinates with Philadelphia geographic bounds:
  Latitude: 39.85° to 40.15°N
  Longitude: -75.35° to -74.95°W
Will be removing 126579 rows not in Philadelphia
Remaining records: 23399


## Cleaning Summary

In [7]:
# Display metrics on the cleaning process, using the counters recorded by the
# earlier cells (starting_shape, starting_memory, missing_values_removed,
# num_duplicates, num_businesses_outside_range).

# Same measurement as the starting figure: deep byte count converted to megabytes.
ending_memory = df.memory_usage(deep=True).sum() / 1024**2

print(f"Removed {starting_shape[1] - df.shape[1]} irrelevant columns")
print(f"Detected {missing_values_removed} businesses with missing values")
print(f"Detected {num_duplicates} duplicate businesses")
print(f"Removed {num_businesses_outside_range} businesses outside of Philadelphia")

print(f"\n Went from {starting_shape[0]} rows and {starting_shape[1]} cols to {df.shape[0]} rows and {df.shape[1]} cols")
# "MB" (megabytes), matching the unit printed when the data was loaded.
print(f" {starting_memory:.2f} MB to {ending_memory:.2f} MB")


Removed 5 irrelevant columns
Detected 103 businesses with missing values
Detected 265 duplicate businesses
Removed 126579 businesses outside of Philadelpia

 Went from 150346 rows and 14 cols to 23399 rows and 9 cols
 157.45Mb to 12.39Mb


## Output Clean Dataset

In [8]:
outputLoc = '../data/processed/yelp_cleaned.json'

# Create the target folder if it does not exist yet, so the export also works
# on a fresh checkout; to_json does not create missing directories itself.
Path(outputLoc).parent.mkdir(parents=True, exist_ok=True)

# Save each row as a separate JSON object on its own line like yelp had originally
df.to_json(outputLoc, orient='records', lines=True)

print(f'Export complete at: {outputLoc}')

Export complete at: ../data/processed/yelp_cleaned.json
