### Importing Required Libraries

In [24]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [25]:
import warnings
# Silence every warning category for the remainder of the kernel session so the
# training output below stays readable.
# NOTE(review): a blanket 'ignore' presumably also hides solver/convergence
# warnings from the models fitted later — consider narrowing the filter.
warnings.filterwarnings('ignore')

### Load the Cleaned Dataset

In [26]:
# Read the preprocessed training data; the path is relative to the notebook's
# working directory.
df = pd.read_csv('../data/processed_train.csv')

### Split the Dataset 

In [27]:
# Feature matrix: everything except the row identifier and the two price columns.
# Target: the log-transformed sale price.
non_feature_columns = ['Id', 'SalePrice', 'SalePrice_Log']
X = df.drop(columns=non_feature_columns)
y = df['SalePrice_Log']

# Hold out 20% of the rows for the final test; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of every partition, training parts first.
print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_test: ", y_test.shape)


Shape of X_train:  (1163, 199)
Shape of y_train:  (1163,)
Shape of X_test:  (291, 199)
Shape of y_test:  (291,)


### Train the Models

In [28]:
# Registry of estimators keyed by display name. It starts with the unregularized
# baseline and the three regularized linear variants, all at default settings.
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Ridge", Ridge(random_state=42)),
    ("Lasso", Lasso(random_state=42)),
    ("ElasticNet", ElasticNet(random_state=42)),
])

print("Training linear models.....")
# Fit each estimator on the training split, in registration order.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(f"-{name} trained")
print("All models trained successfully")

Training linear models.....
-Linear Regression trained
-Ridge trained
-Lasso trained
-ElasticNet trained
All models trained successfully


In [29]:
# Tree-based and boosting estimators, seeded for reproducibility.
tree_models = {
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
}

# Register them alongside the linear models so later cells see a single dict.
models.update(tree_models)

print("Training Tree-based and Boosting models.....")
# Only the newly added estimators need fitting here.
for name, model in tree_models.items():
    model.fit(X_train, y_train)
    print(f"-{name} trained")
print("All models trained successfully")

Training Tree-based and Boosting models.....
-Decision Tree trained
-Random Forest trained
-XGBoost trained
All models trained successfully


### Evaluate the Models

In [30]:
# Shuffled 5-fold splitter; the fixed seed makes every model see the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-model summary of the cross-validation metrics, keyed by display name.
results = {}

# Both metrics are collected in a single cross-validation pass so each fold is
# fitted once per model rather than once per metric.
cv_scoring = {'neg_mse': 'neg_mean_squared_error', 'r2': 'r2'}

print("Evaluating models with 5-fold cross validation")
print("-"*50)

for name, model in models.items():
    cv_scores = cross_validate(model, X_train, y_train, cv=kf, scoring=cv_scoring)

    # scikit-learn reports MSE negated (higher is better); flip the sign and
    # take the root to obtain the per-fold RMSE.
    rmse_scores = np.sqrt(-cv_scores['test_neg_mse'])
    r2_scores = cv_scores['test_r2']

    results[name] = {
        "Mean RMSE": rmse_scores.mean(),
        "Std RMSE": rmse_scores.std(),
        "Mean R2": r2_scores.mean(),
        "Std R2": r2_scores.std()
    }

    print(f"{name} Evaluation: ")
    print(f"    Mean RMSE: {results[name]['Mean RMSE']:.4f}")
    print(f"    Mean R2: {results[name]['Mean R2']:.4f}")
    print("-"*50)


Evaluating models with 5-fold cross validation
--------------------------------------------------
Linear Regression Evaluation: 
    Mean RMSE: 0.1235
    Mean R2: 0.9005
--------------------------------------------------
Ridge Evaluation: 
    Mean RMSE: 0.1164
    Mean R2: 0.9117
--------------------------------------------------
Lasso Evaluation: 
    Mean RMSE: 0.3942
    Mean R2: -0.0005
--------------------------------------------------
ElasticNet Evaluation: 
    Mean RMSE: 0.3942
    Mean R2: -0.0005
--------------------------------------------------
Decision Tree Evaluation: 
    Mean RMSE: 0.2133
    Mean R2: 0.7055
--------------------------------------------------
Random Forest Evaluation: 
    Mean RMSE: 0.1408
    Mean R2: 0.8718
--------------------------------------------------
XGBoost Evaluation: 
    Mean RMSE: 0.1392
    Mean R2: 0.8744
--------------------------------------------------


## Model Evaluation Results (5-Fold Cross Validation)

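Each model was evaluated with 5-fold cross-validation on the training split, using RMSE (Root Mean Squared Error, computed on the log-transformed sale price) and R² (coefficient of determination). All models use their default hyperparameters at this stage.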

### Observations

- **Linear Regression**: Strong baseline with high R² (0.9005), but limited in capturing complex nonlinear patterns.  
- **Ridge Regression**: Best-performing model overall with the lowest RMSE (0.1164) and highest R² (0.9117), showing the benefit of regularization.  
- **Lasso Regression**: Extremely poor performance (negative R²), indicating severe underfitting due to excessive regularization.  
- **ElasticNet**: Similar to Lasso, failed to learn meaningful relationships, likely due to strong penalty terms.  
- **Decision Tree**: Moderate performance (R² ≈ 0.71), prone to overfitting and not competitive with Ridge or ensembles.  
- **Random Forest**: Solid performance (RMSE = 0.1408, R² = 0.8718), effectively models nonlinearity but less accurate than Ridge or XGBoost.  
- **XGBoost**: Best of the tree-based models (RMSE = 0.1392, R² = 0.8744), marginally ahead of Random Forest but, at default settings, clearly behind Ridge and also behind plain Linear Regression.  

---

### Best Performing Models
- **Ridge Regression** – best overall, offering the highest predictive accuracy.  
- **XGBoost** – best ensemble method, competitive with Ridge.  
- **Random Forest** – reliable model, but slightly weaker than XGBoost.  


### Broad HyperParameter Search

In [31]:
# Coarse search spaces for the three candidate models. Every entry is a finite
# list, so each space has a well-defined number of combinations.
ridge_param_grid = {'alpha': [0.1, 0.5, 1, 5, 10, 20, 50, 100]}

rf_param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

xgb_param_grid = {
    'learning_rate':[0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7],
    'n_estimators': [100, 200, 300, 500],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.01, 0.1, 1, 10],
    'reg_lambda': [0.5, 1, 5, 10]
}

# Display name -> (unfitted estimator, search space).
model_params= {
    'Ridge': (Ridge(random_state=42), ridge_param_grid),
    'RandomForest': (RandomForestRegressor(random_state=42), rf_param_grid),
    'XGBoost': (XGBRegressor(random_state=42), xgb_param_grid)
}

# Winning parameters and their cross-validated RMSE, keyed by display name.
best_params_random= {}
best_scores_random= {}

# Upper bound on the number of sampled candidates per model.
max_candidates = 50

for name, (model, param_grid) in model_params.items():
    # Total number of distinct combinations in this search space.
    grid_size = int(np.prod([len(values) for values in param_grid.values()]))

    # Never request more candidates than the space contains. The Ridge grid has
    # only 8 points; asking for 50 makes scikit-learn emit a warning, which the
    # notebook-wide filter would hide, before it falls back to the full grid.
    n_candidates = min(max_candidates, grid_size)

    print(f"\nPerforming broad search for {name}....")
    random_search= RandomizedSearchCV(
        estimator= model,
        param_distributions= param_grid,
        n_iter= n_candidates,
        cv= 5,
        scoring= 'neg_mean_squared_error',
        n_jobs= -1,
        verbose= 1,
        random_state= 42
    )
    random_search.fit(X_train, y_train)

    best_params_random[name]= random_search.best_params_
    # best_score_ is the negated mean MSE; convert it to an RMSE for reporting.
    best_scores_random[name]= np.sqrt(-random_search.best_score_)

    print(f"Best parameters for {name}: {best_params_random[name]}")
    print(f"Best cross-validated RMSE for {name}: {best_scores_random[name]: .4f}")



Performing broad search for Ridge....
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters for Ridge: {'alpha': 10}
Best cross-validated RMSE for Ridge:  0.1142

Performing broad search for RandomForest....
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters for RandomForest: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None}
Best cross-validated RMSE for RandomForest:  0.1397

Performing broad search for XGBoost....
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters for XGBoost: {'subsample': 0.6, 'reg_lambda': 0.5, 'reg_alpha': 0, 'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
Best cross-validated RMSE for XGBoost:  0.1185


### Fine-Tuning HyperParameters

In [32]:
# Exhaustive search over a narrow band of Ridge penalty strengths.
ridge_param_grid_fine = {'alpha': [8, 9, 10, 11, 12, 13]}

ridge_gs_fine = GridSearchCV(
    estimator=Ridge(random_state=42),
    param_grid=ridge_param_grid_fine,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

print("Performing fine-tuning for Ridge....")
ridge_gs_fine.fit(X_train, y_train)

# best_score_ is a negated MSE, hence the sign flip before taking the root.
print(f"Best parameters for Ridge (fine-tuned): {ridge_gs_fine.best_params_}")
print(f"Best cross-validated RMSE for Ridge (fine-tuned): {np.sqrt(-ridge_gs_fine.best_score_): .4f}")

Performing fine-tuning for Ridge....
Best parameters for Ridge (fine-tuned): {'alpha': 13}
Best cross-validated RMSE for Ridge (fine-tuned):  0.1142


In [33]:
# Exhaustive search over a narrow XGBoost grid.
# Seven parameters with three values each give 3**7 = 2187 candidates, i.e.
# 10,935 fits with 5-fold CV, so expect this cell to be slow.
xgb_param_grid_fine = dict(
    learning_rate=[0.04, 0.05, 0.06],
    max_depth=[3, 4, 5],
    n_estimators=[450, 500, 550],
    subsample=[0.5, 0.6, 0.7],
    colsample_bytree=[0.75, 0.8, 0.85],
    reg_lambda=[0.4, 0.5, 0.6],
    reg_alpha=[0, 0.001, 0.01],
)

xgb_gs_fine = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=xgb_param_grid_fine,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
print("Performing fine-tuning for XGBoost....")
xgb_gs_fine.fit(X_train, y_train)

# best_score_ is a negated MSE, hence the sign flip before taking the root.
print(f"Best parameters for XGBoost (fine-tuned): {xgb_gs_fine.best_params_}")
print(f"Best cross-validated RMSE for XGBoost (fine-tuned): {np.sqrt(-xgb_gs_fine.best_score_): .4f}")


Performing fine-tuning for XGBoost....
Best parameters for XGBoost (fine-tuned): {'colsample_bytree': 0.75, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 550, 'reg_alpha': 0.001, 'reg_lambda': 0.4, 'subsample': 0.6}
Best cross-validated RMSE for XGBoost (fine-tuned):  0.1163


In [34]:
# Exhaustive search over a narrow Random Forest grid (3 * 3 * 2 * 3 = 54 candidates).
rf_param_grid_fine = dict(
    n_estimators=[450, 500, 550],
    max_depth=[None, 20, 25],
    min_samples_split=[2, 3],
    min_samples_leaf=[1, 2, 3],
)

rf_gs_fine = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=rf_param_grid_fine,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
print("Performing fine-tuning for Random Forest....")
rf_gs_fine.fit(X_train, y_train)

# best_score_ is a negated MSE, hence the sign flip before taking the root.
print(f"Best parameters for Random Forest (fine-tuned): {rf_gs_fine.best_params_}")
print(f"Best cross-validated RMSE for Random Forest (fine_tuned): {np.sqrt(-rf_gs_fine.best_score_): .4f}")


Performing fine-tuning for Random Forest....
Best parameters for Random Forest (fine-tuned): {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 500}
Best cross-validated RMSE for Random Forest (fine_tuned):  0.1396


## Key Insights
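- **Ridge Regression** emerged as the best-performing model, achieving the lowest cross-validated RMSE (0.1142). Its L2 regularization helped it capture the underlying linear structure of the dataset effectively. Note that the selected `alpha = 13` sits on the upper edge of the fine-tuning grid, so a slightly larger value may perform equally well or better.  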
- **XGBoost** delivered competitive results, performing slightly worse than Ridge but still robust. It shows the strength of boosting on structured data.  
- **Random Forest** underperformed compared to Ridge and XGBoost, suggesting it struggled to generalize well on this dataset.  


### Retrain Final Models

In [35]:
# Fresh estimators configured with the winning parameters from each fine grid search.
ridge_final = Ridge(**ridge_gs_fine.best_params_, random_state=42)
xgb_final = XGBRegressor(**xgb_gs_fine.best_params_, random_state=42)
rf_final = RandomForestRegressor(**rf_gs_fine.best_params_, random_state=42)

# Display name -> tuned estimator; later cells iterate over this dict.
final_models = dict(
    Ridge=ridge_final,
    XGBoost=xgb_final,
    RandomForest=rf_final,
)

print("Retraining final models on the complete training dataset with best parameters....")
# Fit every tuned estimator on the whole training split (no folds held out).
for name, model in final_models.items():
    model.fit(X_train, y_train)
    print(f"-{name} retrained with best parameters")

Retraining final models on the complete training dataset with best parameters....
-Ridge retrained with best parameters
-XGBoost retrained with best parameters
-RandomForest retrained with best parameters


### Final Model Evaluation on Hold-Out Test Set

In [36]:
# Display name -> {"RMSE", "MAE", "R2"} measured on the hold-out split.
test_results = {}
separator = "-" * 50

print("Evaluating final models on the hold-out test set....")
print(separator)

for name, model in final_models.items():
    # Predictions are on the same (log) scale as y_test.
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    test_results[name] = dict(RMSE=rmse, MAE=mae, R2=r2)

    print(f"{name} Final Evaluation:")
    print(f"    RMSE: {rmse:.4f}")
    print(f"    MAE: {mae:.4f}")
    print(f"    R2: {r2:.4f}")
    print(separator)

# One row per model, one column per metric.
test_results_df = pd.DataFrame(test_results).T
print("Summary of Final Test Set Results: ")
print(test_results_df.round(4))


Evaluating final models on the hold-out test set....
--------------------------------------------------
Ridge Final Evaluation:
    RMSE: 0.1205
    MAE: 0.0801
    R2: 0.9154
--------------------------------------------------
XGBoost Final Evaluation:
    RMSE: 0.1285
    MAE: 0.0834
    R2: 0.9038
--------------------------------------------------
RandomForest Final Evaluation:
    RMSE: 0.1391
    MAE: 0.0915
    R2: 0.8872
--------------------------------------------------
Summary of Final Test Set Results: 
                RMSE     MAE      R2
Ridge         0.1205  0.0801  0.9154
XGBoost       0.1285  0.0834  0.9038
RandomForest  0.1391  0.0915  0.8872


### Key Observations
- **Ridge Regression** achieved the lowest RMSE and the highest R², confirming it as the most reliable model for this dataset.  
- **XGBoost** delivered competitive performance with RMSE of 0.1285 and R² of 0.9038, showing strong generalization but slightly behind Ridge.  
- **Random Forest** had the weakest results, suggesting it struggled to capture the structure of the data compared to Ridge and XGBoost.  

**Conclusion:** Ridge Regression is the top-performing model on the hold-out test set, with XGBoost close behind. Random Forest, despite tuning, underperformed in comparison.  


### Save the Required Outputs

In [37]:
# Save the three final trained models
# Create the output directory first: joblib.dump does not create missing parent
# directories and would fail on a checkout where '../models' does not exist.
os.makedirs('../models', exist_ok=True)

for name, model in final_models.items():
    joblib.dump(model, f'../models/{name}_model.pkl')
    print(f"{name} model saved to '../models/{name}_model.pkl'")

Ridge model saved to '../models/Ridge_model.pkl'
XGBoost model saved to '../models/XGBoost_model.pkl'
RandomForest model saved to '../models/RandomForest_model.pkl'


In [38]:
# Persist the hold-out features and target without the pandas index.
test_exports = (
    (X_test, '../data/X_test.csv'),
    (y_test, '../data/y_test.csv'),
)
for test_part, csv_path in test_exports:
    test_part.to_csv(csv_path, index=False)

In [39]:
# Save the Test Results
# Create the output directory first: to_csv does not create missing parent
# directories and would fail on a checkout where '../results' does not exist.
os.makedirs('../results', exist_ok=True)

# The index holds the model names, so it is written out as the first column.
test_results_df.to_csv('../results/test_results.csv', index= True)