In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

# =========================================================
# 1. LOAD DATA
# =========================================================
df = pd.read_csv("/content/data.csv")

# ---------------------------------------------------------
# Convert price to thousands
# ---------------------------------------------------------
df["price"] = df["price"] / 1000

# =========================================================
# 2. REMOVE OUTLIERS (IQR METHOD)
# =========================================================
Q1 = df["price"].quantile(0.25)
Q3 = df["price"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df[(df["price"] >= lower_bound) & (df["price"] <= upper_bound)]

print("Data size after outlier removal:", df.shape)

# =========================================================
# 3. FEATURE SELECTION
# =========================================================
y = df["price"]

X = df.drop(columns=["price", "date", "street"])

numeric_features = [
    "bedrooms","bathrooms","sqft_living","sqft_lot","floors",
    "waterfront","view","condition",
    "sqft_above","sqft_basement",
    "yr_built","yr_renovated"
]

categorical_features = ["city", "statezip", "country"]

# =========================================================
# 4. PREPROCESSING
# =========================================================
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

# =========================================================
# 5. PIPELINE
# =========================================================
model = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("reg", LinearRegression())
    ]
)

# =========================================================
# 6. TRAIN / TEST SPLIT
# =========================================================
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42
)

# =========================================================
# 7. TRAIN
# =========================================================
model.fit(X_train, y_train)

# =========================================================
# 8. EVALUATE
# =========================================================
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("\nMODEL PERFORMANCE")
print("--------------------")
print("R² Score:", round(r2, 4))
print("RMSE:", round(rmse, 2), "($ thousands)")

# =========================================================
# 9. COEFFICIENTS
# =========================================================
num_features = numeric_features

cat_features = model.named_steps["prep"] \
    .named_transformers_["cat"] \
    .get_feature_names_out(categorical_features)

feature_names = list(num_features) + list(cat_features)

coefficients = model.named_steps["reg"].coef_

coef_table = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": coefficients
})

coef_table["Abs_Effect"] = coef_table["Coefficient"].abs()
coef_table = coef_table.sort_values("Abs_Effect", ascending=False)

print("\nTOP COEFFICIENTS")
print("-----------------")
print(coef_table[["Feature", "Coefficient"]].head(20))

# =========================================================
# 10. SAMPLE PREDICTIONS
# =========================================================
comparison = pd.DataFrame({
    "Actual_Price_(Thousands)": y_test.values,
    "Predicted_Price_(Thousands)": y_pred
})

print("\nSAMPLE PREDICTIONS")
print("--------------------")
print(comparison.head(10))


Data size after outlier removal: (4360, 18)

MODEL PERFORMANCE
--------------------
R² Score: 0.7009
RMSE: 121.74 ($ thousands)

TOP COEFFICIENTS
-----------------
                     Feature  Coefficient
108        statezip_WA 98109   268.601338
20           city_Clyde Hill  -253.272891
109        statezip_WA 98112   235.853072
14   city_Beaux Arts Village   214.860352
114        statezip_WA 98119   214.485155
129        statezip_WA 98198  -212.484970
127        statezip_WA 98178  -202.157426
34               city_Medina  -198.398244
80         statezip_WA 98039  -198.398244
15             city_Bellevue   194.635547
128        statezip_WA 98188  -182.578937
102        statezip_WA 98102   178.273082
59         statezip_WA 98004   171.246662
125        statezip_WA 98168  -163.553883
104        statezip_WA 98105   148.513443
43              city_Redmond   140.424265
77         statezip_WA 98033   138.267092
130        statezip_WA 98199   133.491294
115        statezip_WA 98122   132.008

fixed full code with the zero/negative price check to safely apply the log-transform, and with sqft_above and sqft_basement removed:

In [8]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

# =========================================================
# 1. LOAD DATA
# =========================================================
df = pd.read_csv("/content/data.csv")

# ---------------------------------------------------------
# Convert price to thousands
# ---------------------------------------------------------
df["price"] = df["price"] / 1000

# =========================================================
# 2. REMOVE OUTLIERS (IQR METHOD)
# =========================================================
Q1 = df["price"].quantile(0.25)
Q3 = df["price"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df[(df["price"] >= lower_bound) & (df["price"] <= upper_bound)]

# =========================================================
# 3. FEATURE SELECTION
# =========================================================
# Remove zero or negative prices before log-transform
df = df[df["price"] > 0]

# Log-transform price
y = np.log(df["price"])

# Drop target & unnecessary columns
X = df.drop(columns=["price", "date", "street"])

# Numeric features (drop redundant ones)
numeric_features = [
    "bedrooms","bathrooms","sqft_living","sqft_lot","floors",
    "waterfront","view","condition",
    "yr_built","yr_renovated"
]

categorical_features = ["city", "statezip", "country"]

# =========================================================
# 4. PREPROCESSING
# =========================================================
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

# =========================================================
# 5. PIPELINE
# =========================================================
model = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("reg", LinearRegression())
    ]
)

# =========================================================
# 6. TRAIN / TEST SPLIT
# =========================================================
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42
)

# =========================================================
# 7. TRAIN
# =========================================================
model.fit(X_train, y_train)

# =========================================================
# 8. EVALUATE
# =========================================================
# Exponentiate predictions to get actual price
y_pred = np.exp(model.predict(X_test))
y_test_actual = np.exp(y_test)

r2 = r2_score(y_test_actual, y_pred)
rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred))

print("\nMODEL PERFORMANCE")
print("--------------------")
print("R² Score:", round(r2, 4))
print("RMSE:", round(rmse, 2), "($ thousands)")

# =========================================================
# 9. COEFFICIENTS
# =========================================================
num_features = numeric_features

cat_features = model.named_steps["prep"] \
    .named_transformers_["cat"] \
    .get_feature_names_out(categorical_features)

feature_names = list(num_features) + list(cat_features)

coefficients = model.named_steps["reg"].coef_

coef_table = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": coefficients
})

coef_table["Abs_Effect"] = coef_table["Coefficient"].abs()
coef_table = coef_table.sort_values("Abs_Effect", ascending=False)

print("\nTOP COEFFICIENTS")
print("-----------------")
print(coef_table[["Feature", "Coefficient"]].head(20))

# =========================================================
# 10. SAMPLE PREDICTIONS
# =========================================================
comparison = pd.DataFrame({
    "Actual_Price_(Thousands)": y_test_actual.values,
    "Predicted_Price_(Thousands)": y_pred
})

print("\nSAMPLE PREDICTIONS")
print("--------------------")
print(comparison.head(10))



MODEL PERFORMANCE
--------------------
R² Score: 0.7574
RMSE: 106.65 ($ thousands)

TOP COEFFICIENTS
-----------------
                     Feature  Coefficient
52         city_Yarrow Point    -0.769939
125        statezip_WA 98198    -0.601598
104        statezip_WA 98109     0.555084
121        statezip_WA 98168    -0.508871
123        statezip_WA 98178    -0.473081
105        statezip_WA 98112     0.462038
110        statezip_WA 98119     0.425212
10               city_Algona    -0.408566
124        statezip_WA 98188    -0.404086
100        statezip_WA 98105     0.403164
12   city_Beaux Arts Village     0.386422
19            city_Covington    -0.376269
98         statezip_WA 98102     0.372064
18           city_Clyde Hill     0.358258
56         statezip_WA 98004     0.353483
13             city_Bellevue     0.325669
118        statezip_WA 98148    -0.323805
35        city_Normandy Park     0.319478
102        statezip_WA 98107     0.300129
111        statezip_WA 98122     0.29622

Feature engineering can unlock extra predictive power

In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

# =========================================================
# 1. LOAD DATA
# =========================================================
df = pd.read_csv("/content/data.csv")

# ---------------------------------------------------------
# Convert price to thousands
# ---------------------------------------------------------
df["price"] = df["price"] / 1000

# =========================================================
# 2. REMOVE OUTLIERS (IQR METHOD)
# =========================================================
Q1 = df["price"].quantile(0.25)
Q3 = df["price"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df[(df["price"] >= lower_bound) & (df["price"] <= upper_bound)]

# =========================================================
# 3. FEATURE ENGINEERING
# =========================================================
# Remove zero or negative prices
df = df[df["price"] > 0]

# Property age
df["age"] = 2025 - df["yr_built"]

# Time since renovation (0 if never renovated)
df["renovated_age"] = np.where(df["yr_renovated"] > 0, 2025 - df["yr_renovated"], 0)

# Avoid division by zero
df["sqft_per_bedroom"] = np.where(df["bedrooms"] > 0, df["sqft_living"] / df["bedrooms"], 0)
df["sqft_per_bathroom"] = np.where(df["bathrooms"] > 0, df["sqft_living"] / df["bathrooms"], 0)


# Interaction terms
df["bed_bath_interaction"] = df["bedrooms"] * df["bathrooms"]
df["living_floors_interaction"] = df["sqft_living"] * df["floors"]

# =========================================================
# 4. TARGET & FEATURES
# =========================================================
y = np.log(df["price"])  # log-transform

X = df.drop(columns=["price", "date", "street", "sqft_above", "sqft_basement"])

# Numeric features including engineered ones
numeric_features = [
    "bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors",
    "waterfront", "view", "condition",
    "age", "renovated_age",
    "sqft_per_bedroom", "sqft_per_bathroom",
    "bed_bath_interaction", "living_floors_interaction"
]

categorical_features = ["city", "statezip", "country"]

# =========================================================
# 5. PREPROCESSING
# =========================================================
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

# =========================================================
# 6. PIPELINE
# =========================================================
model = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("reg", LinearRegression())
    ]
)

# =========================================================
# 7. TRAIN / TEST SPLIT
# =========================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# =========================================================
# 8. TRAIN MODEL
# =========================================================
model.fit(X_train, y_train)

# =========================================================
# 9. EVALUATE
# =========================================================
y_pred = np.exp(model.predict(X_test))
y_test_actual = np.exp(y_test)

r2 = r2_score(y_test_actual, y_pred)
rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred))

print("\nMODEL PERFORMANCE")
print("--------------------")
print("R² Score:", round(r2, 4))
print("RMSE:", round(rmse, 2), "($ thousands)")

# =========================================================
# 10. COEFFICIENTS
# =========================================================
num_features = numeric_features
cat_features = model.named_steps["prep"].named_transformers_["cat"].get_feature_names_out(categorical_features)

feature_names = list(num_features) + list(cat_features)
coefficients = model.named_steps["reg"].coef_

coef_table = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": coefficients
})

coef_table["Abs_Effect"] = coef_table["Coefficient"].abs()
coef_table = coef_table.sort_values("Abs_Effect", ascending=False)

print("\nTOP COEFFICIENTS")
print("-----------------")
print(coef_table[["Feature", "Coefficient"]].head(20))

# =========================================================
# 11. SAMPLE PREDICTIONS
# =========================================================
comparison = pd.DataFrame({
    "Actual_Price_(Thousands)": y_test_actual.values,
    "Predicted_Price_(Thousands)": y_pred
})

print("\nSAMPLE PREDICTIONS")
print("--------------------")
print(comparison.head(10))



MODEL PERFORMANCE
--------------------
R² Score: 0.7582
RMSE: 106.48 ($ thousands)

TOP COEFFICIENTS
-----------------
                     Feature  Coefficient
56         city_Yarrow Point    -0.757567
129        statezip_WA 98198    -0.610055
108        statezip_WA 98109     0.547410
125        statezip_WA 98168    -0.494281
127        statezip_WA 98178    -0.468984
109        statezip_WA 98112     0.458429
114        statezip_WA 98119     0.420651
128        statezip_WA 98188    -0.402539
16   city_Beaux Arts Village     0.395179
104        statezip_WA 98105     0.389907
14               city_Algona    -0.388377
102        statezip_WA 98102     0.386680
23            city_Covington    -0.378184
39        city_Normandy Park     0.353887
22           city_Clyde Hill     0.349703
60         statezip_WA 98004     0.346101
122        statezip_WA 98148    -0.331023
17             city_Bellevue     0.324407
106        statezip_WA 98107     0.305992
115        statezip_WA 98122     0.28950