# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw listings into a DataFrame.
df = pd.read_csv('NY-House-Dataset.csv')

# Report how many missing values each column contains.
missing_values = df.isnull().sum()
print("Missing Values:\n", missing_values)

# Strip the boilerplate text surrounding the broker name and the listing type.
# The vectorized ``.str`` accessor (pandas >= 1.4) leaves missing values (NaN)
# untouched; calling ``str.removeprefix`` on each element through ``apply``
# raises AttributeError as soon as a NaN (a float) is encountered.
df["BROKERTITLE"] = df["BROKERTITLE"].str.removeprefix("Brokered by ")
df["TYPE"] = df["TYPE"].str.removesuffix(" for sale")

# Map each original column header to a short lowercase name. The same mapping
# drives both the rename and the column selection, so only the listed columns
# are kept, in the order given here.
renamed_columns = {
    "TYPE": "type",
    "BROKERTITLE": "broker",
    "PRICE": "price",
    "BEDS": "beds",
    "BATH": "baths",
    "PROPERTYSQFT": "sqft",
    "ADDRESS": "address",
    "STATE": "state",
    "MAIN_ADDRESS": "address_full",
    "LATITUDE": "lat",
    "LONGITUDE": "lon",
    "SUBLOCALITY": "sub_locality",
}
df = df.rename(columns=renamed_columns)
df = df[list(renamed_columns.values())]

# Store the bathroom count as a 32-bit integer.
# NOTE(review): if the column holds fractional values, this cast discards the
# fractional part -- confirm that is intended.
df["baths"] = df["baths"].astype(np.int32)

# Normalize 'sub_locality' to county names. Each county lists every spelling
# (borough name, county name, or neighborhood) that should collapse into it.
county_aliases = {
    # Manhattan
    "New York County": ("New York", "New York County", "Manhattan"),
    # Brooklyn and its neighborhoods ("Snyder Avenue" is assumed to be in Brooklyn)
    "Kings County": (
        "Kings County",
        "Brooklyn",
        "Coney Island",
        "Brooklyn Heights",
        "Fort Hamilton",
        "Dumbo",
        "Snyder Avenue",
    ),
    # Queens and its neighborhoods
    "Queens County": (
        "Queens County",
        "Queens",
        "Flushing",
        "Jackson Heights",
        "Rego Park",
    ),
    # Staten Island
    "Richmond County": ("Richmond County", "Staten Island"),
    # The Bronx and its neighborhoods
    "Bronx County": ("Bronx County", "The Bronx", "Riverdale", "East Bronx"),
}

# Invert the table into a flat alias -> county lookup.
translations = {
    alias: county
    for county, aliases in county_aliases.items()
    for alias in aliases
}

# Values without an entry in the lookup are left unchanged.
df["sub_locality"] = df["sub_locality"].map(
    lambda name: translations.get(name, name)
)

# Check whether any 'sub_locality' value still lacks the word "County".
# ``na=False`` makes missing values yield False instead of NaN, so the mask is
# a pure boolean Series that can be inverted with ``~`` (inverting an object
# mask containing NaN raises TypeError). A missing value is therefore reported
# as a non-county entry. ``regex=False`` matches "County" as a literal string.
is_county = df["sub_locality"].str.contains("County", regex=False, na=False)
non_county_entries = (~is_county).any()
print("Non-County Entries Remaining:", non_county_entries)

# Show the first rows of the cleaned frame.
print("Cleaned Data Sample:\n", df.head())

# Descriptive statistics for the numeric listing attributes.
numeric_columns = ["price", "beds", "baths", "sqft"]
summary_stats = df.loc[:, numeric_columns].describe()
print("Summary Statistics:\n", summary_stats)

# Histogram (50 bins) of listing prices with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df["price"], bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Property Prices")
ax.set_xlabel("Price ($)")
ax.set_ylabel("Frequency")
plt.show()

# Listings with more than 10 bedrooms or more than 10 bathrooms are treated
# as outliers and printed for inspection.
outliers_beds = df.loc[df["beds"].gt(10)]
outliers_baths = df.loc[df["baths"].gt(10)]
print("Outliers in beds:\n", outliers_beds)
print("Outliers in baths:\n", outliers_baths)

# Number of listings handled by each broker, most active first.
broker_counts = df["broker"].value_counts()

# Each broker's share of all counted listings, expressed as a percentage.
total_listings = broker_counts.sum()
broker_market_share = broker_counts.div(total_listings).mul(100)
print("Top 10 Brokers by Market Share (%):\n", broker_market_share.head(10))

# Horizontal bar chart of the ten brokers with the largest market share.
top_brokers = broker_market_share.head(10)

fig, ax = plt.subplots(figsize=(12, 8))
# NOTE(review): recent seaborn releases may warn about `palette` being passed
# without `hue` -- confirm against the installed seaborn version.
sns.barplot(y=top_brokers.index, x=top_brokers.values, palette="viridis", ax=ax)
ax.set_title("Top 10 Brokers by Market Share (%)")
ax.set_xlabel("Market Share (%)")
ax.set_ylabel("Broker")

# Write the chart to a PNG file before displaying it.
fig.savefig('top_10_brokers_market_share.png')

plt.show()
