In [16]:
import pandas as pd

def load_and_clean_data(path: str) -> pd.DataFrame:
    """Load a house-listing CSV and return a cleaned subset of its columns.

    Processing steps:

    1. Read the CSV at ``path`` and normalise the headers (lower-cased,
       surrounding whitespace stripped).
    2. Remove ``$`` and ``,`` from ``price`` and cast the column to float.
    3. Drop rows where ``price``, ``lat`` or ``long`` is missing.
    4. Add ``price_category`` by cutting ``price`` into three quantile bins
       labelled ``Low`` / ``Medium`` / ``High``.
    5. Keep only the columns listed under *Returns*.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A new DataFrame with the columns ``price``, ``lat``, ``long``,
        ``bedrooms``, ``bathrooms``, ``sqft_living``, ``yr_built`` and
        ``price_category``. The index is not reset, so dropped rows leave
        gaps in it.

    Raises:
        KeyError: If any required column is absent after header normalisation.
        ValueError: If a price cannot be parsed as a float, or propagated
            from ``pd.qcut`` (for example when the quantile bin edges are
            not unique).
    """
    source_columns = [
        "price", "lat", "long", "bedrooms", "bathrooms",
        "sqft_living", "yr_built",
    ]

    df = pd.read_csv(path)
    df.columns = df.columns.str.lower().str.strip()

    # Fail early with one clear message instead of a bare KeyError raised
    # part-way through the pipeline.
    missing = [col for col in source_columns if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s) in {path!r}: {missing}")

    # Raw string: "\$" is an invalid escape sequence in a normal literal.
    df["price"] = df["price"].replace(r"[\$,]", "", regex=True).astype(float)

    # Explicit copy so the column assignment below acts on an independent
    # frame rather than on the result of a row selection.
    df = df.dropna(subset=["price", "lat", "long"]).copy()

    # Three quantile-based price tiers.
    df["price_category"] = pd.qcut(
        df["price"], q=3, labels=["Low", "Medium", "High"]
    )

    # The selected names are already lower-case and stripped, so no further
    # header normalisation is needed.
    return df[source_columns + ["price_category"]]

In [17]:
# Preview the raw CSV as stored on disk, before any cleaning is applied.
pd.read_csv("../data/raw/house.csv")

Unnamed: 0,id,price,sqft_living,sqft_lot,bedrooms,bathrooms,floors,waterfront,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,"$221,900",1180,5650,3,1.00,1.0,0,Average,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,"$538,000",2570,7242,3,2.25,2.0,0,Average,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,"$180,000",770,10000,2,1.00,1.0,0,Average,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,"$604,000",1960,5000,4,3.00,1.0,0,Very Good,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,"$510,000",1680,8080,3,2.00,1.0,0,Average,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1483300570,"$905,000",3300,10250,4,2.50,1.0,0,Average,7,2390,910,1946,1991,98040,47.5873,-122.249,1950,6045
96,3422049190,"$247,500",1960,15681,3,1.75,1.0,0,Average,7,1960,0,1967,0,98032,47.3576,-122.277,1750,15616
97,1099611230,"$199,000",1160,6400,4,1.50,1.0,0,Good,7,1160,0,1975,0,98023,47.3036,-122.378,1160,6400
98,722079104,"$314,000",1810,41800,3,1.75,1.0,0,Very Good,7,1210,600,1980,0,98038,47.4109,-121.958,1650,135036


In [14]:
# Run the cleaning pipeline on the raw file. The returned frame is only
# displayed here; it is not bound to a variable.
load_and_clean_data("../data/raw/house.csv")

Unnamed: 0,price,lat,long,bedrooms,bathrooms,sqft_living,yr_built,price_category
0,221900.0,47.5112,-122.257,3,1.00,1180,1955,Low
1,538000.0,47.7210,-122.319,3,2.25,2570,1951,Medium
2,180000.0,47.7379,-122.233,2,1.00,770,1933,Low
3,604000.0,47.5208,-122.393,4,3.00,1960,1965,High
4,510000.0,47.6168,-122.045,3,2.00,1680,1987,Medium
...,...,...,...,...,...,...,...,...
95,905000.0,47.5873,-122.249,4,2.50,3300,1946,High
96,247500.0,47.3576,-122.277,3,1.75,1960,1967,Low
97,199000.0,47.3036,-122.378,4,1.50,1160,1975,Low
98,314000.0,47.4109,-121.958,3,1.75,1810,1980,Low


In [18]:
# NOTE(review): no visible cell assigns `df` at notebook scope (the call to
# load_and_clean_data only displays its result) — confirm `df` is defined
# elsewhere, otherwise this cell fails on a fresh kernel.
df.columns

Index(['price', 'lat', 'long', 'bedrooms', 'bathrooms', 'sqft_living',
       'yr_built', 'price_category'],
      dtype='object')