In [53]:
import os

import pandas as pd
import numpy as np

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Save model
import joblib

In [54]:
# Read the processed train and test CSVs (in that order) into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/House_Price_Prediction/data/processed/clean_train_df.csv',
        '/content/drive/MyDrive/Colab Notebooks/House_Price_Prediction/data/processed/clean_test_df.csv',
    )
)

In [55]:
# Preview the first five training rows to sanity-check the loaded columns.
train_df.head(n=5)

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_semi-furnished,furnishingstatus_unfurnished,price
0,0.445856,0.055271,1.539173,2.587644,0.407155,-0.466773,-0.74642,-0.230521,1.501243,0.367957,-0.55262,-0.870669,-0.6769,7525000.0
1,1.040313,0.055271,1.539173,-0.912499,0.407155,-0.466773,1.339728,-0.230521,1.501243,2.709987,-0.55262,1.148542,-0.6769,6300000.0
2,-0.636055,-1.283514,-0.55795,-0.912499,0.407155,-0.466773,1.339728,-0.230521,1.501243,1.538972,-0.55262,-0.870669,-0.6769,3920000.0
3,-1.233484,0.055271,-0.55795,0.254215,0.407155,-0.466773,1.339728,-0.230521,-0.666115,-0.803059,1.809561,-0.870669,1.477322,3430000.0
4,-0.66875,0.055271,-0.55795,0.254215,0.407155,-0.466773,-0.74642,-0.230521,-0.666115,-0.803059,-0.55262,-0.870669,1.477322,3010000.0


In [56]:
# (rows, columns) of the train frame followed by the test frame.
tuple(frame.shape for frame in (train_df, test_df))

((436, 14), (109, 14))

In [57]:
# Split the training frame into the target ('price') and the remaining predictor columns.
y_train = train_df['price']
X_train = train_df.drop(columns=['price'])

In [58]:
# Split the test frame the same way: target ('price') vs. the remaining predictor columns.
y_test = test_df['price']
X_test = test_df.drop(columns=['price'])

In [59]:
# 5-fold splitter; rows are shuffled with a fixed seed so the fold assignment is repeatable.
kfold = KFold(
    shuffle=True,
    random_state=42,
    n_splits=5,
)

In [60]:
# Candidate regressors, keyed by the display name used in the result tables.
# `Pipeline` is imported in the notebook's import cell together with the other
# scikit-learn names, so this cell only declares the estimators.
models = {
    # Ordinary least squares baseline.
    'LinearRegression': LinearRegression(),

    # Degree-2 feature expansion followed by OLS. include_bias=False omits the
    # constant column because LinearRegression fits its own intercept.
    'PolynomialRegression(deg=2)': Pipeline([
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
        ('lr', LinearRegression())
    ]),

    # L2- and L1-regularised linear models with fixed (untuned) penalties.
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1)
}

In [61]:
# Mean R² over the folds of `kfold` for every candidate, keyed by model name.
cv_results = {
    model_name: cross_val_score(
        estimator, X_train, y_train,
        cv=kfold,
        scoring="r2"
    ).mean()
    for model_name, estimator in models.items()
}


In [62]:
# Report the mean cross-validated R² per model, to four decimals.
# The heading literal previously contained mis-encoded bytes instead of "R²".
print("Cross-Validation R² Scores:\n")

# cv_results maps model name -> mean score, so name the loop variables
# accordingly (the old `model` variable actually held a name string).
for model_name, mean_r2 in cv_results.items():
    print(f"{model_name}: {mean_r2:.4f}")

Cross-Validation R² Scores:

LinearRegression: 0.6586
PolynomialRegression(deg=2): 0.4015
Ridge: 0.6587
Lasso: 0.6586


“The housing data shows a largely linear relationship with price, so linear regression already performs well. Regularization (Ridge, Lasso) left the cross-validated R² essentially unchanged (≈0.66), while degree-2 polynomial features lowered it (≈0.40), which suggests the added complexity overfits and the plain linear model is the stable choice.”

In [63]:
# Fit each candidate on the full training set, then score its predictions
# on the held-out test set. One record per model is collected into `results`.
results = []

for name, model in models.items():
    # fit() returns the fitted estimator, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    results.append({
        'Model': name,
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'MAE': mean_absolute_error(y_test, y_pred),
        'R2_Score': r2_score(y_test, y_pred)
    })

# Lowest RMSE first, so the top row is the best model by test error.
results_df = pd.DataFrame(results).sort_values(by='RMSE')
print(results_df)


                         Model          RMSE            MAE  R2_Score
0             LinearRegression  1.136697e+06  881928.613759  0.680170
3                        Lasso  1.136697e+06  881928.623505  0.680170
2                        Ridge  1.136737e+06  881731.285081  0.680147
1  PolynomialRegression(deg=2)  1.221605e+06  937928.572079  0.630604


## 📊 Model Comparison – Key Findings

In this project, several regression models were tested to predict house prices, including **Linear Regression, Ridge Regression, Lasso Regression, and Polynomial Regression**.

Among all the models, **Linear Regression performed the best**, achieving an R² score of around **0.68**. Ridge and Lasso regression showed almost the same performance, indicating that regularization did not provide significant improvement. This suggests that the relationship between the features and house prices is mostly **linear**, with limited multicollinearity or overfitting.

Polynomial Regression with degree 2 resulted in a **lower R² score (~0.63)**, meaning that adding polynomial features increased model complexity without improving prediction accuracy. Polynomial Regression with degree 3 was reported to perform similarly to Linear Regression without offering any clear benefit; note that the degree-3 run is not included in the results shown in this notebook.

These results show that **a simpler model can perform better when the data follows a linear pattern**. Based on the evaluation results, **Linear Regression was chosen as the final model** due to its better generalization, simplicity, and interpretability.


In [64]:
# results_df is sorted by ascending RMSE, so its first row names the top model.
# NOTE(review): this ranking comes from test-set RMSE, so the test metrics that
# are reported also pick the model — consider selecting on cv_results instead.
best_model_name = results_df['Model'].iloc[0]
best_model_pipeline = models[best_model_name]

# Persist the fitted estimator with joblib.
joblib.dump(
    best_model_pipeline,
    '/content/drive/MyDrive/Colab Notebooks/House_Price_Prediction/models/best_model.pkl',
)
print(f"Best model ({best_model_name}) saved as best_model.pkl")

Best model (LinearRegression) saved as best_model.pkl


In [65]:
# Colab-only helper, imported here so the rest of the notebook can still be
# imported/run outside Colab.
from google.colab import files

# Common root of every artifact; previously repeated in each download call.
PROJECT_DIR = "/content/drive/MyDrive/Colab Notebooks/House_Price_Prediction"

# Artifacts this notebook reads or writes: a missing one should fail loudly.
required_artifacts = [
    f"{PROJECT_DIR}/data/processed/clean_train_df.csv",  # preprocessed train CSV
    f"{PROJECT_DIR}/data/processed/clean_test_df.csv",   # preprocessed test CSV
    f"{PROJECT_DIR}/models/best_model.pkl",              # trained model
]

# Artifacts that only exist "if used" (the scaler is not created in this notebook).
optional_artifacts = [
    f"{PROJECT_DIR}/models/scaler.pkl",
]

for artifact_path in required_artifacts:
    files.download(artifact_path)

for artifact_path in optional_artifacts:
    # Skip an absent optional file rather than aborting the cell on it.
    if os.path.exists(artifact_path):
        files.download(artifact_path)
    else:
        print(f"Skipping optional artifact (not found): {artifact_path}")




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>