In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso  # L1
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

# =========================================================
# 1. LOAD DATA
# =========================================================
df = pd.read_csv("data.csv")

# ---------------------------------------------------------
# Convert price to thousands
# ---------------------------------------------------------
df["price"] = df["price"] / 1000

# =========================================================
# 2. REMOVE OUTLIERS (IQR METHOD)
# =========================================================
Q1 = df["price"].quantile(0.25)
Q3 = df["price"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df[(df["price"] >= lower_bound) & (df["price"] <= upper_bound)]

# =========================================================
# 3. FEATURE ENGINEERING
# =========================================================

# Remove zero or negative prices
df = df[df["price"] > 0]

# Property age
df["age"] = 2025 - df["yr_built"]

# Time since renovation (0 if never renovated)
df["renovated_age"] = np.where(df["yr_renovated"] > 0, 2025 - df["yr_renovated"], 0)

# Avoid division by zero
df["sqft_per_bedroom"] = np.where(
    df["bedrooms"] > 0, df["sqft_living"] / df["bedrooms"], 0
)

df["sqft_per_bathroom"] = np.where(
    df["bathrooms"] > 0, df["sqft_living"] / df["bathrooms"], 0
)

# Interaction features
df["bed_bath_interaction"] = df["bedrooms"] * df["bathrooms"]
df["living_floors_interaction"] = df["sqft_living"] * df["floors"]

# =========================================================
# 4. TARGET & FEATURES
# =========================================================
y = np.log(df["price"])   # Log Transform
X = df.drop(columns=["price", "date", "street", "sqft_above", "sqft_basement"])

numeric_features = [
    "bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors",
    "waterfront", "view", "condition",
    "age", "renovated_age",
    "sqft_per_bedroom", "sqft_per_bathroom",
    "bed_bath_interaction", "living_floors_interaction"
]

categorical_features = ["city", "statezip", "country"]

# =========================================================
# 5. PREPROCESSING
# =========================================================
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

# =========================================================
# 6. PIPELINE (L1 - Lasso)
# =========================================================
model = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("reg", Lasso(alpha=0.00001))
    ]
)

# =========================================================
# 7. TRAIN / TEST SPLIT
# =========================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# =========================================================
# 8. TRAIN MODEL
# =========================================================
model.fit(X_train, y_train)

# =========================================================
# 9. EVALUATION
# =========================================================
y_pred = np.exp(model.predict(X_test))
y_test_actual = np.exp(y_test)

r2 = r2_score(y_test_actual, y_pred)
rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred))

print("\nMODEL PERFORMANCE")
print("--------------------")
print("R² Score:", round(r2, 4))
print("RMSE:", round(rmse, 2), "($ thousands)")

# =========================================================
# 10. COEFFICIENTS (FEATURE IMPORTANCE)
# =========================================================
num_features = numeric_features
cat_features = model.named_steps["prep"] \
    .named_transformers_["cat"] \
    .get_feature_names_out(categorical_features)

feature_names = list(num_features) + list(cat_features)
coefficients = model.named_steps["reg"].coef_

coef_table = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": coefficients
})

coef_table["Abs_Effect"] = coef_table["Coefficient"].abs()
coef_table = coef_table.sort_values("Abs_Effect", ascending=False)

print("\nTOP COEFFICIENTS")
print("-----------------")
print(coef_table[["Feature", "Coefficient"]].head(20))

# =========================================================
# 11. SAMPLE PREDICTIONS
# =========================================================
comparison = pd.DataFrame({
    "Actual_Price_(Thousands)": y_test_actual.values,
    "Predicted_Price_(Thousands)": y_pred
})

print("\nSAMPLE PREDICTIONS")
print("--------------------")
print(comparison.head(10))



MODEL PERFORMANCE
--------------------
R² Score: 0.758
RMSE: 106.52 ($ thousands)

TOP COEFFICIENTS
-----------------
                Feature  Coefficient
56    city_Yarrow Point    -0.798066
108   statezip_WA 98109     0.569347
14          city_Algona    -0.561498
129   statezip_WA 98198    -0.555546
23       city_Covington    -0.532672
50       city_Skykomish    -0.494963
109   statezip_WA 98112     0.481713
125   statezip_WA 98168    -0.463691
41         city_Pacific    -0.449897
114   statezip_WA 98119     0.443756
127   statezip_WA 98178    -0.442995
15          city_Auburn    -0.433347
26        city_Enumclaw    -0.431147
104   statezip_WA 98105     0.413056
102   statezip_WA 98102     0.408856
28     city_Federal Way    -0.389785
36   city_Mercer Island     0.386677
32            city_Kent    -0.376166
128   statezip_WA 98188    -0.370062
60    statezip_WA 98004     0.368834

SAMPLE PREDICTIONS
--------------------
   Actual_Price_(Thousands)  Predicted_Price_(Thousands)
0     

In [3]:
print(type(model.named_steps["reg"]))

<class 'sklearn.linear_model._coordinate_descent.Lasso'>


In [4]:
coef_table.head(10)

Unnamed: 0,Feature,Coefficient,Abs_Effect
127,statezip_WA 98178,-0.510294,0.510294
125,statezip_WA 98168,-0.441423,0.441423
105,statezip_WA 98106,-0.347325,0.347325
28,city_Federal Way,-0.325266,0.325266
17,city_Bellevue,0.317441,0.317441
23,city_Covington,-0.316921,0.316921
15,city_Auburn,-0.310406,0.310406
32,city_Kent,-0.303438,0.303438
48,city_Seattle,0.295183,0.295183
44,city_Redmond,0.269789,0.269789
