In [8]:
# ================================================
# 1. Imports
# ================================================
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [23]:
# ============================================
# 2. Load datasets
# Train split: feature_engineered_train.csv (used to fit the models)
# Dev split:   feature_engineered_dev.csv (used for validation)
# The holdout split is deliberately not loaded in this cell; keep it
# untouched until the very end.
# (Original notes: train covers 2012–2019 or 2012–2021 depending on the eval
#  split, eval covers 2020–2021, holdout covers 2022–2023 — confirm against
#  the script that produced the splits.)
# ============================================

# Single place to change when the processed data lives somewhere else.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# checkout (or make it relative to the repo root) before running elsewhere.
DATA_DIR = Path("/Users/aaditya.paliwal/Desktop/regression_ml_end2end/data/processed")

# Base feature-engineered frames; the first CSV column is used as the index.
train_df = pd.read_csv(DATA_DIR / "feature_engineered_train.csv", index_col=0)
dev_df = pd.read_csv(DATA_DIR / "feature_engineered_dev.csv", index_col=0)

# Clustering features
train_clustering_features = pd.read_csv(DATA_DIR / "feature_engineered_train_with_clustering.csv", index_col=0)
dev_clustering_features = pd.read_csv(DATA_DIR / "feature_engineered_dev_with_clustering.csv", index_col=0)

# Keep only the ten cluster-distance columns: cluster_dist_1 ... cluster_dist_10.
features_to_use = [f"cluster_dist_{k}" for k in range(1, 11)]

train_clustering_features = train_clustering_features[features_to_use]
dev_clustering_features = dev_clustering_features[features_to_use]

# PCA features
train_pca_df = pd.read_csv(DATA_DIR / "train_pca_features.csv", index_col=0)
dev_pca_df = pd.read_csv(DATA_DIR / "dev_pca_features.csv", index_col=0)


In [None]:
# ================================================
# 3. Dropping High VIF features
# ================================================
problematic_columns = ['Total Population', "Total Labor Force", "Total Families Below Poverty", "Total School Enrollment"]

# Also dropping 'month': its MI score was reported as 0, implying it does not
# contribute to the target variable.
# We might keep it later if the UI should show predictions by month as well.
columns_to_drop = [*problematic_columns, 'month']

# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell on the same kernel does not raise KeyError.
train_df = train_df.drop(columns=columns_to_drop, errors="ignore")
dev_df = dev_df.drop(columns=columns_to_drop, errors="ignore")

In [25]:
# ================================================
# 4. Define target & features
# ================================================
target = "price"

# Training split: the target column becomes y, every remaining column is a feature.
y_train = train_df[target]
X_train = train_df.drop(target, axis="columns")

# Dev split: same separation as for the training split.
y_dev = dev_df[target]
X_dev = dev_df.drop(target, axis="columns")

In [26]:
# ================================================
# 5. Standardization (fit on train, transform eval)
# ================================================
# Scaling statistics are learned from the training features only and then
# applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_dev_scaled = scaler.transform(X_dev)

In [27]:
# ================================================
# 6. Train & Evaluate Models
# ================================================

# --- Linear Regression ---
# Fit on the scaled training features, then predict on the scaled dev features.
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_dev_scaled)

# Dev-set error metrics (RMSE is the square root of the MSE).
print("Linear Regression:")
print(" MAE:", mean_absolute_error(y_true=y_dev, y_pred=y_pred_lr))
print(" RMSE:", np.sqrt(mean_squared_error(y_true=y_dev, y_pred=y_pred_lr)))
print(" R²:", r2_score(y_true=y_dev, y_pred=y_pred_lr))

Linear Regression:
 MAE: 57784.912862835896
 RMSE: 122825.79622546416
 R²: 0.883389994280884


In [42]:
# --- Ridge Regression ---
# Ridge with alpha=1.0, fit on the scaled training features.
ridge = Ridge(alpha=1.0).fit(X_train_scaled, y_train)
y_pred_ridge = ridge.predict(X_dev_scaled)

# Dev-set error metrics (RMSE is the square root of the MSE).
print("Ridge Regression:")
print(" MAE:", mean_absolute_error(y_true=y_dev, y_pred=y_pred_ridge))
print(" RMSE:", np.sqrt(mean_squared_error(y_true=y_dev, y_pred=y_pred_ridge)))
print(" R²:", r2_score(y_true=y_dev, y_pred=y_pred_ridge))

Ridge Regression:
 MAE: 57784.83984115952
 RMSE: 122825.81446890674
 R²: 0.8833899596404718


In [33]:
# --- Lasso Regression ---
# Lasso with alpha=1, fit on the scaled training features.
lasso = Lasso(alpha=1).fit(X_train_scaled, y_train)
y_pred_lasso = lasso.predict(X_dev_scaled)

# Dev-set error metrics (RMSE is the square root of the MSE).
print("\nLasso Regression:")
print(" MAE:", mean_absolute_error(y_true=y_dev, y_pred=y_pred_lasso))
print(" RMSE:", np.sqrt(mean_squared_error(y_true=y_dev, y_pred=y_pred_lasso)))
print(" R²:", r2_score(y_true=y_dev, y_pred=y_pred_lasso))


Lasso Regression:
 MAE: 57783.58635061123
 RMSE: 122826.22392707814
 R²: 0.8833891821652532


In [45]:
# --- ElasticNet ---
# ElasticNet with alpha=0.1 and l1_ratio=0.1, fit on the scaled training features.
elastic = ElasticNet(alpha=0.1, l1_ratio=0.1).fit(X_train_scaled, y_train)
y_pred_elastic = elastic.predict(X_dev_scaled)

# Dev-set error metrics (RMSE is the square root of the MSE).
print("\nElasticNet Regression:")
print(" MAE:", mean_absolute_error(y_true=y_dev, y_pred=y_pred_elastic))
print(" RMSE:", np.sqrt(mean_squared_error(y_true=y_dev, y_pred=y_pred_elastic)))
print(" R²:", r2_score(y_true=y_dev, y_pred=y_pred_elastic))


ElasticNet Regression:
 MAE: 58077.67385668247
 RMSE: 126354.20627254821
 R²: 0.8765940646036103


In [None]:
# Compare coefficients
# Print the fitted coefficient vectors of the linear, ridge and lasso models,
# one array after another, in that order.
print(lr.coef_, ridge.coef_, lasso.coef_, sep="\n")

# Observation from the original run: the coefficients are almost the same
# across the models.

[ 2.27574599e+04  2.43857950e+03  1.18251551e+05  4.01087932e+03
  8.63912815e+03  1.39993732e+04 -9.38514669e+02 -4.78200032e+03
  2.81043490e+03  7.52041839e+01 -3.28616039e+03  3.06653332e+03
 -3.03596700e+03  1.66212915e+04 -2.18249247e+03 -2.76952772e+03
  1.66969306e+03 -7.16281172e+03 -3.78133194e+03 -1.41123860e+04
  1.50669166e+04  8.42652269e+03 -4.91325394e+04  4.05329571e+04
 -3.40439581e+02 -2.55953819e+04  1.42920654e+05 -2.51036603e+03
 -1.51188393e+04 -7.10792587e+03 -5.72351351e+02 -6.48241233e+03
  3.80925471e+04]
[ 2.27573908e+04  2.43861233e+03  1.18251620e+05  4.01090461e+03
  8.63924956e+03  1.39988660e+04 -9.38516648e+02 -4.78162051e+03
  2.81047309e+03  7.52128788e+01 -3.28615162e+03  3.06647705e+03
 -3.03601147e+03  1.66208673e+04 -2.18248178e+03 -2.76944222e+03
  1.66969457e+03 -7.16267451e+03 -3.78101768e+03 -1.41122509e+04
  1.50667062e+04  8.42665795e+03 -4.91323274e+04  4.05332802e+04
 -3.40749575e+02 -2.55949463e+04  1.42919390e+05 -2.51052840e+03
 -1.511