In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Seed for the notebook's stochastic steps — presumably meant to be passed as
# `random_state` wherever randomness is involved (TODO confirm every call site uses it).
RANDOM_STATE = 42

def regression_report(y_true, y_pred):
    """Summarise regression accuracy for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    dict
        ``{"MAE": ..., "RMSE": ..., "R2": ...}`` — mean absolute error,
        root mean squared error and coefficient of determination.

    Notes
    -----
    RMSE is computed as the square root of the MSE. For a single (1-D)
    target this is identical to the former ``squared=False`` result.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
    # (slated for removal in 1.6). Taking the square root explicitly works on
    # every scikit-learn version without warnings or a TypeError.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return {"MAE": mae, "RMSE": rmse, "R2": r2}


In [6]:
from pathlib import Path
# Show the absolute location that a relative "data.csv" maps to — a quick
# check of the kernel's current working directory.
Path("data.csv").resolve()


WindowsPath('C:/Users/annem/ML-Projects/House Price Prediction/data.csv')

In [7]:
# Load the raw housing data. The location is expressed relative to the
# notebook's working directory (the project folder) rather than as an absolute,
# user-specific path, so the notebook runs unchanged on another machine.
DATA_PATH = Path("House rate prediction Dataset") / "data.csv"
df = pd.read_csv(DATA_PATH)


In [8]:
# Jupyter renders only a cell's final expression automatically, so the row
# preview has to be displayed explicitly or its result is silently discarded.
# `display` is provided by the IPython kernel.
display(df.head())
# `info()` prints its summary itself (column names, non-null counts, dtypes).
df.info()
# Final expression: rendered as the cell's output.
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated
count,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0
mean,551963.0,3.40087,2.160815,2139.346957,14852.52,1.512065,0.007174,0.240652,3.451739,1827.265435,312.081522,1970.786304,808.608261
std,563834.7,0.908848,0.783781,963.206916,35884.44,0.538288,0.084404,0.778405,0.67723,862.168977,464.137228,29.731848,979.414536
min,0.0,0.0,0.0,370.0,638.0,1.0,0.0,0.0,1.0,370.0,0.0,1900.0,0.0
25%,322875.0,3.0,1.75,1460.0,5000.75,1.0,0.0,0.0,3.0,1190.0,0.0,1951.0,0.0
50%,460943.5,3.0,2.25,1980.0,7683.0,1.5,0.0,0.0,3.0,1590.0,0.0,1976.0,0.0
75%,654962.5,4.0,2.5,2620.0,11001.25,2.0,0.0,0.0,4.0,2300.0,610.0,1997.0,1999.0
max,26590000.0,9.0,8.0,13540.0,1074218.0,3.5,1.0,4.0,5.0,9410.0,4820.0,2014.0,2014.0


In [9]:
import pandas as pd

# Parse the sale date once; unparseable values become NaT (errors="coerce"),
# which in turn yields missing year/month values for those rows.
sale_dates = pd.to_datetime(df["date"], errors="coerce")

df["date"] = sale_dates
df["sale_year"] = sale_dates.dt.year
df["sale_month"] = sale_dates.dt.month


In [10]:
# Column the model learns to predict.
target = "price"

# Text columns (one-hot encoded downstream).
cat_features = ["street", "city", "statezip", "country"]

# Numeric columns, including the year/month derived from the sale date.
num_features = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "waterfront",
    "view",
    "condition",
    "sqft_above",
    "sqft_basement",
    "yr_built",
    "yr_renovated",
    "sale_year",
    "sale_month",
]

# NOTE(review): the describe() output for this dataset shows a minimum price of
# 0.0 and a maximum of 26,590,000 — consider whether such rows should be
# filtered before modelling.
X = df[[*cat_features, *num_features]]
y = df[target]


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The seed comes from the notebook's
# RANDOM_STATE constant instead of a repeated literal, so every stochastic step
# is controlled from one place.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Numeric columns: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipe = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" keeps transform from raising on categories
# that were not seen during fit.
# NOTE(review): one-hot encoding a free-text column such as "street" may yield
# a very wide, sparse matrix — confirm its cardinality.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipe = Pipeline(steps=categorical_steps)

# Route each column group through its own pipeline; numeric output comes first.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_features),
        ("cat", cat_pipe, cat_features),
    ]
)


In [13]:
from sklearn.ensemble import RandomForestRegressor

# End-to-end estimator: preprocessing followed by a random forest. The forest
# is seeded from the notebook's RANDOM_STATE constant rather than a repeated
# literal, keeping all seeds in one place.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(random_state=RANDOM_STATE)),
    ]
)

# Final expression: fits on the training split and renders the fitted pipeline.
model.fit(X_train, y_train)


In [16]:
from sklearn.metrics import root_mean_squared_error

from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

# Score the fitted pipeline on the held-out split.
y_pred = model.predict(X_test)

# (label, scorer) pairs, reported in this order.
metric_table = (
    ("MAE:", mean_absolute_error),
    ("RMSE:", root_mean_squared_error),
    ("R²:", r2_score),
)
for label, scorer in metric_table:
    print(label, scorer(y_test, y_pred))


MAE: 155796.1629964422
RMSE: 984184.3823702123
R²: 0.05023054902677848


In [17]:
import joblib
# Persist the fitted pipeline (preprocessing + regressor) to the working
# directory; the call returns the list of file names written.
joblib.dump(value=model, filename="house_price_model.joblib")


['house_price_model.joblib']

In [18]:
import joblib
# Round-trip check: reload the persisted pipeline (rebinding `model`) and
# confirm it can still produce predictions for the held-out rows.
model = joblib.load(filename="house_price_model.joblib")
pred = model.predict(X_test)

# Preview the first five predictions only.
print(pred[:5])


[ 452058.21        326431.75375    1091937.26        358505.4
  247725.14238095]


In [19]:
import numpy as np

# Pull the fitted pieces out of the pipeline.
rf = model.named_steps["regressor"]
ct = model.named_steps["preprocessor"]

# Read the column lists from the fitted ColumnTransformer, keyed by transformer
# name, instead of re-typing them or relying on positional indices. This keeps
# the labels in sync with whatever columns the model was actually fitted on.
fitted_columns = {name: columns for name, _, columns in ct.transformers_}

ohe = ct.named_transformers_["cat"].named_steps["encoder"]
cat_names = ohe.get_feature_names_out(fitted_columns["cat"])

# Output order follows the transformer order: numeric block, then one-hot block.
feature_names = np.concatenate([fitted_columns["num"], cat_names])

importances = rf.feature_importances_
# Guard against mislabelled importances if the preprocessing layout changes.
assert len(feature_names) == len(importances), "feature names and importances are misaligned"

# Indices of the ten largest importances, highest first.
indices = np.argsort(importances)[-10:][::-1]

for i in indices:
    print(feature_names[i], ":", round(importances[i], 4))


sqft_living : 0.4971
yr_built : 0.0446
sqft_above : 0.0345
sqft_basement : 0.0313
statezip_WA 98004 : 0.03
sqft_lot : 0.0252
view : 0.0236
city_Seattle : 0.022
bathrooms : 0.0177
city_Bellevue : 0.015
