In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time
import warnings
warnings.filterwarnings('ignore')

In [23]:
df = pd.read_csv('../data/processed/data_processed_housing_dataset.csv')
X = df.drop(columns=['log_price', 'price'])
y = df['log_price']

In [24]:
X.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode_freq,city_freq
0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,93,123
1,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,49,1573
2,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,100,185
3,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,50,286
4,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,135,235


In [25]:
y.head()

0    12.653962
1    14.684291
2    12.742569
3    12.948012
4    13.217675
Name: log_price, dtype: float64

In [26]:
X_train, X_test,y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state=42
)

In [27]:
print(f"Training data shape: {X_train.shape}, {y_train.shape}")

Training data shape: (3680, 14), (3680,)


In [28]:
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42, objective='reg:squarederror'),
    'SVR': SVR()
}

In [29]:
results = []
for name, model in models.items():
    start_time = time.time()
    
    # Cross-validation
    cv_scores = cross_val_score(
        model, X_train, y_train.values.ravel(), 
        cv=5, scoring='neg_root_mean_squared_error'
    )
    cv_rmse = -cv_scores.mean()
    
    # Full training
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_train)
    
    # Metrics
    rmse = np.sqrt(mean_squared_error(y_train, y_pred))
    r2 = r2_score(y_train, y_pred)
    mae = mean_absolute_error(y_train, y_pred)
    training_time = time.time() - start_time
    
    results.append({
        'model': name,
        'cv_rmse': cv_rmse,
        'rmse': rmse,
        'r2': r2,
        'training_time': training_time
    })
    
    print(f"{name} trained | CV RMSE: {cv_rmse:.4f} | R2: {r2:.4f} | Time: {training_time:.2f}s ")


LinearRegression trained | CV RMSE: 1.3998 | R2: 0.0583 | Time: 0.01s 
Ridge trained | CV RMSE: 1.3996 | R2: 0.0583 | Time: 0.01s 
Lasso trained | CV RMSE: 1.3978 | R2: 0.0536 | Time: 0.02s 
RandomForest trained | CV RMSE: 1.4722 | R2: 0.8404 | Time: 4.90s 
GradientBoosting trained | CV RMSE: 1.4703 | R2: 0.4497 | Time: 1.43s 
XGBoost trained | CV RMSE: 1.5834 | R2: 0.9813 | Time: 0.75s 
SVR trained | CV RMSE: 1.4075 | R2: 0.0337 | Time: 1.17s 


In [30]:
baseline_results = pd.DataFrame(results)
baseline_results.to_csv('../data/processed/baseline_model_results.csv', index=False)
display(baseline_results.sort_values('cv_rmse'))

Unnamed: 0,model,cv_rmse,rmse,r2,training_time
2,Lasso,1.397754,1.4174,0.053592,0.015485
1,Ridge,1.399605,1.413855,0.058319,0.007598
0,LinearRegression,1.399753,1.413855,0.05832,0.012733
6,SVR,1.4075,1.432255,0.03365,1.170936
4,GradientBoosting,1.470324,1.08085,0.449668,1.43254
3,RandomForest,1.472224,0.582039,0.840413,4.900891
5,XGBoost,1.583401,0.199113,0.981324,0.747661


In [31]:
top_models = baseline_results.sort_values('cv_rmse').head(3)['model'].values

param_grids = {
        'LinearRegression': {
        'fit_intercept': [True, False],
        'copy_X': [True, False],
        'positive': [True, False]  # Only available in scikit-learn>=0.24
    },
    'RandomForest': {
        'n_estimators': [100, 300, 500],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'GradientBoosting': {
        'n_estimators': [100, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'min_samples_split': [2, 5]
    },
    'XGBoost': {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'gamma': [0, 0.1, 0.2]
    },
    'Ridge': {
        'alpha': np.logspace(-3, 3, 7),  # [0.001, 0.01, 0.1, 1, 10, 100, 1000]
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    },
    'Lasso': {
        'alpha': np.logspace(-4, 0, 5),  # [0.0001, 0.001, 0.01, 0.1, 1]
        'selection': ['cyclic', 'random']
    },
    'SVR': {
        'C': [0.1, 1, 10],
        'epsilon': [0.01, 0.1, 0.5],
        'kernel': ['linear', 'rbf']
    }
}

In [32]:
tuned_models = {}
tuning_results = []

for model_name in top_models:
    print(f"\nTuning {model_name}...")
    start_time = time.time()
    
    # RandomizedSearchCV for faster tuning
    search = RandomizedSearchCV(
        models[model_name],
        param_distributions=param_grids[model_name],
        n_iter=50,
        cv=3,
        scoring='neg_root_mean_squared_error',
        random_state=42,
        n_jobs=-1,
        verbose=1
    )
    
    search.fit(X_train, y_train.values.ravel())
    
    # Best model
    best_model = search.best_estimator_
    tuned_models[model_name] = best_model
    
    # Metrics
    cv_rmse = -search.best_score_
    y_pred = best_model.predict(X_train)
    rmse = np.sqrt(mean_squared_error(y_train, y_pred))
    r2 = r2_score(y_train, y_pred)
    training_time = time.time() - start_time
    
    tuning_results.append({
        'model': model_name,
        'best_params': search.best_params_,
        'cv_rmse': cv_rmse,
        'rmse': rmse,
        'r2': r2,
        'training_time': training_time
    })
    
    print(f"Best params: {search.best_params_}")
    print(f"Tuned CV RMSE: {cv_rmse:.4f} | Improvement: {baseline_results[baseline_results['model'] == model_name]['cv_rmse'].values[0] - cv_rmse:.4f}")



Tuning Lasso...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best params: {'selection': 'cyclic', 'alpha': np.float64(0.01)}
Tuned CV RMSE: 1.4154 | Improvement: -0.0177

Tuning Ridge...
Fitting 3 folds for each of 49 candidates, totalling 147 fits




Best params: {'solver': 'auto', 'alpha': np.float64(1000.0)}
Tuned CV RMSE: 1.4152 | Improvement: -0.0156

Tuning LinearRegression...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best params: {'positive': False, 'fit_intercept': True, 'copy_X': True}
Tuned CV RMSE: 1.4163 | Improvement: -0.0165


In [33]:
# Save tuning results
tuning_results_df = pd.DataFrame(tuning_results)
tuning_results_df.to_csv('../data/processed/tuning_results.csv', index=False)
display(tuning_results_df)

Unnamed: 0,model,best_params,cv_rmse,rmse,r2,training_time
0,Lasso,"{'selection': 'cyclic', 'alpha': 0.01}",1.415406,1.4148,0.05706,2.38022
1,Ridge,"{'solver': 'auto', 'alpha': 1000.0}",1.415248,1.414832,0.057018,0.995134
2,LinearRegression,"{'positive': False, 'fit_intercept': True, 'co...",1.416255,1.413855,0.05832,0.028475


In [34]:
# Feature importance for tree-based models
for model_name, model in tuned_models.items():
    if hasattr(model, 'feature_importances_'):
        print(f"\nFeature Importance - {model_name}")
        feature_importance = pd.Series(
            model.feature_importances_,
            index=X_train.columns
        ).sort_values(ascending=False)
        
        # Plot top 20 features
        plt.figure(figsize=(12, 8))
        feature_importance.head(20).plot(kind='barh')
        plt.title(f'Top 20 Feature Importances - {model_name}')
        plt.tight_layout()
        plt.savefig(f'../reports/feature_importance_{model_name}.png')
        plt.show()
        
        # Save feature importance
        feature_importance.to_csv(f'../reports/feature_importance_{model_name}.csv')

In [35]:
print("\nSaving final models...")
for model_name, model in tuned_models.items():
    joblib.dump(model, f'../models/saved_models/{model_name}_tuned.pkl')


Saving final models...


In [36]:
# Save the best model overall
best_model_name = tuning_results_df.sort_values('cv_rmse').iloc[0]['model']
best_model = tuned_models[best_model_name]
joblib.dump(best_model, f'../models/saved_models/best_model.pkl')

print(f"Best model: {best_model_name} saved as best_model.pkl")
print("Model training complete!")

Best model: Ridge saved as best_model.pkl
Model training complete!
