In [None]:

# Assignment ML10: Data Wrangling on Real Estate Market
# Educational notebook. Expects './real_estate.csv' or uses a synthetic example.

import pandas as pd
import numpy as np

# Load the real-estate table from `path`. If the file cannot be opened,
# fall back to a seeded synthetic sample with the same six columns so the
# rest of the notebook can still run.
path = "./real_estate.csv"
try:
    df = pd.read_csv(path)
    print("Loaded real estate data from", path)
except OSError as e:
    # Only I/O failures (missing file, no permission, ...) trigger the
    # fallback. Parsing/encoding errors from a file that does exist are
    # allowed to propagate so a corrupt dataset is never silently replaced
    # by synthetic numbers.
    print(f"Could not load {path!r} — creating a synthetic dataset. Error:", e)
    n = 500
    # Fixed seed: the synthetic sample is identical on every run.
    np.random.seed(42)
    df = pd.DataFrame({
        # Normally distributed values truncated to whole numbers.
        "price": np.random.normal(300000, 50000, n).astype(int),
        "area_sqft": np.random.normal(1500, 300, n).astype(int),
        # Categorical counts drawn with the given probabilities.
        "bedrooms": np.random.choice([1, 2, 3, 4], n, p=[0.1, 0.3, 0.4, 0.2]),
        "bathrooms": np.random.choice([1, 2, 3], n, p=[0.4, 0.5, 0.1]),
        # Uniform over 1950..2020 inclusive.
        "year_built": np.random.choice(range(1950, 2021), n),
        "city": np.random.choice(["CityA", "CityB", "CityC"], n),
    })

# Preview the first rows before any derived columns are added.
display(df.head())

# Basic wrangling steps
print("Missing values per column:\n", df.isna().sum())

# Feature engineering: price per sqft.
# Non-positive areas are masked to NaN before dividing, so a zero or negative
# area yields a missing ratio rather than inf or a negative value that would
# distort later aggregates.
df["price_per_sqft"] = df["price"] / df["area_sqft"].where(df["area_sqft"] > 0)

# Binning year_built into age groups.
# Age is measured against a fixed reference year so results are reproducible.
REFERENCE_YEAR = 2025
df["age"] = REFERENCE_YEAR - df["year_built"]
# Right-inclusive bins: (-1, 10] new, (10, 30] recent, (30, 60] mid, (60, inf) old.
# The last bin is open-ended so very old properties are labelled "old" instead
# of being left unassigned; ages of -1 or below still fall outside every bin
# and stay NaN.
df["age_group"] = pd.cut(
    df["age"],
    bins=[-1, 10, 30, 60, np.inf],
    labels=["new", "recent", "mid", "old"],
)
display(df.head())

# Aggregations: summarise the table per city and per bedroom count.

# Mean listing price within each city.
avg_price_by_city = df.groupby("city")["price"].mean()
print("\nAverage price per city:\n", avg_price_by_city)

# Median price per square foot within each bedroom count.
median_ppsf_by_bedrooms = df.groupby("bedrooms")["price_per_sqft"].median()
print("\nMedian price_per_sqft by bedrooms:\n", median_ppsf_by_bedrooms)
