In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

In [2]:
# Set random state for reproducibility
RANDOM_STATE = 42
all_results = []

## Function to evaluate model performance

In [3]:
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    print(f"MAE  : {mae:,.4f}")
    print(f"RMSE : {rmse:,.4f}")
    print(f"R²   : {r2:.4f}")
    return {"MAE": mae, "RMSE": rmse, "R2": r2}

## Data Preprocessing

In [4]:
# 1. Load dataset
df = pd.read_csv("House_Price_India.csv")

# 2. Drop ID-column
df = df.drop(columns=["id"], errors="ignore")

# 3. Handle missing values (Fills any potential NaN with the median of the column)
df = df.fillna(df.median(numeric_only=True))

# 4. Feature & target
X = df.drop(columns=["Price"])
y = df["Price"]

# 5. Scaling (required for many models)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=RANDOM_STATE
)

## Multiple Linear Regression

In [5]:
from sklearn.linear_model import LinearRegression
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)
y_pred = model_lr.predict(X_test)
metrics = evaluate_model(y_test, y_pred)
all_results.append({'Model': 'Multiple Linear Regression', **metrics})

MAE  : 125,293.0320
RMSE : 210,051.2958
R²   : 0.7019


## Polynomial Regression

In [6]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

poly = PolynomialFeatures(degree=2)
X_poly_train = poly.fit_transform(X_train)
X_poly_test = poly.transform(X_test)

model_poly = LinearRegression()
model_poly.fit(X_poly_train, y_train)
y_pred = model_poly.predict(X_poly_test) 
metrics = evaluate_model(y_test, y_pred)
all_results.append({'Model': 'Polynomial Regression (Degree 2)', **metrics})

MAE  : 101,309.0280
RMSE : 164,069.0754
R²   : 0.8181


## Ridge Regression (L2)

In [7]:
from sklearn.linear_model import Ridge
model_ridge = Ridge(alpha=1.0, random_state=RANDOM_STATE)
model_ridge.fit(X_train, y_train)
y_pred = model_ridge.predict(X_test)
metrics = evaluate_model(y_test, y_pred)
all_results.append({'Model': 'Ridge Regression (L2)', **metrics})

MAE  : 125,362.5326
RMSE : 210,019.7542
R²   : 0.7020


## Lasso Regression

In [8]:
from sklearn.linear_model import Lasso
model_lasso = Lasso(alpha=0.001, random_state=RANDOM_STATE)
model_lasso.fit(X_train, y_train)
y_pred = model_lasso.predict(X_test)
metrics = evaluate_model(y_test, y_pred)
all_results.append({'Model': 'Lasso Regression (L1)', **metrics})

MAE  : 125,366.1793
RMSE : 210,018.9068
R²   : 0.7020


  model = cd_fast.enet_coordinate_descent(


## Elastic Net (L1 + L2)

In [9]:
from sklearn.linear_model import ElasticNet
model_en = ElasticNet(alpha=0.001, l1_ratio=0.5, random_state=RANDOM_STATE)
model_en.fit(X_train, y_train)
y_pred = model_en.predict(X_test)
metrics = evaluate_model(y_test, y_pred)
all_results.append({'Model': 'Elastic Net (L1 + L2)', **metrics})

MAE  : 125,354.0690
RMSE : 210,019.9810
R²   : 0.7020


  model = cd_fast.enet_coordinate_descent(


## KNN Regression

In [10]:
from sklearn.neighbors import KNeighborsRegressor
model_knn = KNeighborsRegressor(n_neighbors=5)
model_knn.fit(X_train, y_train)
y_pred = model_knn.predict(X_test)
metrics = evaluate_model(y_test, y_pred)
all_results.append({'Model': 'KNN Regression', **metrics})

[WinError 2] The system cannot find the file specified
  File "C:\Users\prathmesh\anaconda3\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "C:\Users\prathmesh\anaconda3\lib\subprocess.py", line 505, in run
    with Popen(*popenargs, **kwargs) as process:
  File "C:\Users\prathmesh\anaconda3\lib\subprocess.py", line 951, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\prathmesh\anaconda3\lib\subprocess.py", line 1420, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


MAE  : 99,477.1502
RMSE : 186,241.2581
R²   : 0.7656


## Decision Tree Regressor

In [11]:
from sklearn.tree import DecisionTreeRegressor
model_dt = DecisionTreeRegressor(max_depth=10, random_state=RANDOM_STATE)
model_dt.fit(X_train, y_train)
y_pred = model_dt.predict(X_test)
metrics = evaluate_model(y_test, y_pred)
all_results.append({'Model': 'Decision Tree Regressor', **metrics})

MAE  : 91,431.9282
RMSE : 175,783.9004
R²   : 0.7912


## Random Forest Regressor

In [12]:
from sklearn.ensemble import RandomForestRegressor
model_rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    random_state=RANDOM_STATE
)
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)
metrics = evaluate_model(y_test, y_pred)
all_results.append({'Model': 'Random Forest Regressor', **metrics})

MAE  : 70,132.9743
RMSE : 132,699.0394
R²   : 0.8810


## Gradient Boosting Regressor

In [13]:
from sklearn.ensemble import GradientBoostingRegressor
model_gb = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=RANDOM_STATE
)
model_gb.fit(X_train, y_train)
y_pred = model_gb.predict(X_test)
metrics = evaluate_model(y_test, y_pred)
all_results.append({'Model': 'Gradient Boosting Regressor', **metrics})

MAE  : 74,606.1927
RMSE : 126,556.8615
R²   : 0.8918


## Support Vector Regression (SVR)

In [14]:
from sklearn.svm import SVR
model_svr = SVR(kernel='rbf', C=100, epsilon=0.1)
model_svr.fit(X_train, y_train)
y_pred = model_svr.predict(X_test)
metrics = evaluate_model(y_test, y_pred)
all_results.append({'Model': 'SVR (RBF Kernel)', **metrics})

MAE  : 200,019.0670
RMSE : 373,012.8845
R²   : 0.0598


## Final Comparison Table Generation

In [15]:
results_df = pd.DataFrame(all_results)
results_df = results_df.sort_values(by="R2", ascending=False)
results_df.to_csv('regression_full_case_study_results.csv', index=False)