In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the scraped Foursquare business listings from disk into a DataFrame
raw_csv_path = '/workspaces/Business_Data_Scraper/foursquare_business_data.csv'  # Replace with your actual CSV file path
df = pd.read_csv(raw_csv_path)

# Preview the top rows as a quick sanity check that the file parsed as expected
df.head()

Unnamed: 0,Name,Address,City,Category
0,Masala Library,Fifc Building,Mumbai,Indian Restaurant
1,Naaz Hotel,"Kantharia Mahal, Ground Floor, Lbs Road",Mumbai,Indian Restaurant
2,Smoke House Deli (BKC),Fifc Building,Mumbai,Deli
3,IVY Banquet,"Amar Mahal, Ghatkopar East, Mg Road",Mumbai,Restaurant
4,Tresind,"Ground Floor, Inspire Bkc, G Block",Mumbai,Restaurant


In [3]:
# Report the size of the loaded dataset: df.shape is a (rows, columns) tuple,
# so index 0 is the row count and index 1 is the column count.
print(f"Original Data has {df.shape[0]} rows and {df.shape[1]} columns.")


Original Data has 5000 rows and 4 columns.


In [4]:
def clean_data(df):
    """
    Clean the DataFrame by handling missing values, standardizing formats, and removing duplicates.

    The input DataFrame is NOT modified; a new DataFrame is returned, so the raw
    data stays available to the caller and re-running the call is idempotent.

    Steps, in order:
      1. Replace the literal string "N/A" with pd.NA.
      2. Title-case the 'Name', 'City', and 'Category' columns.
      3. Drop rows duplicated across 'Name', 'Address', 'City', and 'Category' (first occurrence kept).
      4. Drop rows whose 'Name' or 'Address' is missing.
      5. Reset the index to 0..n-1.

    :param df: The raw DataFrame to clean. Must contain the columns
               'Name', 'Address', 'City', and 'Category'.
    :return: A new, cleaned DataFrame.
    """
    text_columns = ["Name", "City", "Category"]
    key_columns = ["Name", "Address", "City", "Category"]

    # Non-inplace replace returns a new frame, so the caller's `df` is left untouched
    cleaned = df.replace("N/A", pd.NA)

    # Standardize text BEFORE de-duplicating so rows that differ only by letter
    # case collapse into one. Note that str.title() lowercases every letter
    # except the first of each word, so acronyms such as "BKC" become "Bkc".
    for column in text_columns:
        cleaned[column] = cleaned[column].str.title()

    cleaned = (
        cleaned
        # Keep the first occurrence of each (Name, Address, City, Category) combination
        .drop_duplicates(subset=key_columns, keep="first")
        # A listing without a name or an address is not usable downstream
        .dropna(subset=["Name", "Address"])
        # Dropped rows leave gaps in the index; renumber from zero
        .reset_index(drop=True)
    )

    return cleaned

# Run the cleaning routine on the loaded DataFrame and keep the returned frame as cleaned_df
cleaned_df = clean_data(df)

# Report the size of the cleaned result (shape[0] = rows, shape[1] = columns)
# so it can be compared against the original row count printed earlier
print(f"Cleaned DataFrame has {cleaned_df.shape[0]} rows and {cleaned_df.shape[1]} columns.")

Cleaned DataFrame has 50 rows and 4 columns.


In [5]:
# Persist the cleaned listings to a new CSV file; index=False omits the row index column
cleaned_csv_path = 'cleaned_foursquare_business_data.csv'
cleaned_df.to_csv(cleaned_csv_path, index=False)

# Preview the top rows of the cleaned data
cleaned_df.head()

Unnamed: 0,Name,Address,City,Category
0,Masala Library,Fifc Building,Mumbai,Indian Restaurant
1,Naaz Hotel,"Kantharia Mahal, Ground Floor, Lbs Road",Mumbai,Indian Restaurant
2,Smoke House Deli (Bkc),Fifc Building,Mumbai,Deli
3,Ivy Banquet,"Amar Mahal, Ghatkopar East, Mg Road",Mumbai,Restaurant
4,Tresind,"Ground Floor, Inspire Bkc, G Block",Mumbai,Restaurant
