In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from scipy.stats import randint as sp_randint


In [3]:
# Load data
data = pd.read_csv('housing.csv')

# Check data
print(data.head())
print(data.info())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                  41          880           129.0   
1    -122.22     37.86                  21         7099          1106.0   
2    -122.24     37.85                  52         1467           190.0   
3    -122.25     37.85                  52         1274           235.0   
4    -122.25     37.85                  52         1627           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0         322         126         8.3252              452600        NEAR BAY  
1        2401        1138         8.3014              358500        NEAR BAY  
2         496         177         7.2574              352100        NEAR BAY  
3         558         219         5.6431              341300        NEAR BAY  
4         565         259         3.8462              342200        NEAR BAY  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639

In [4]:
# Separate features and target variable
X = data.drop(columns=['median_house_value'])
y = data['median_house_value']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_features = X.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Apply preprocessing
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


In [5]:
# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'SVR': SVR()
}

# Train models
for name, model in models.items():
    model.fit(X_train_processed, y_train)
    train_pred = model.predict(X_train_processed)
    test_pred = model.predict(X_test_processed)
    train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
    print(f"{name}: Train RMSE = {train_rmse}, Test RMSE = {test_rmse}")


Linear Regression: Train RMSE = 68433.93736666226, Test RMSE = 70031.4878994794
Random Forest: Train RMSE = 18057.87746223084, Test RMSE = 48970.98700871799
Gradient Boosting: Train RMSE = 52901.31388484858, Test RMSE = 55961.91083557595
SVR: Train RMSE = 118406.21946397949, Test RMSE = 116917.38986872014


In [7]:
# Hyperparameters for Random Forest (Reduced search space)
param_dist_rf = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# RandomizedSearchCV for Random Forest (Fewer iterations)
rf_random = RandomizedSearchCV(RandomForestRegressor(), param_distributions=param_dist_rf, n_iter=20, cv=5, verbose=2, random_state=42, n_jobs=-1)
rf_random.fit(X_train_processed, y_train)

# Best parameters and score
print("Best Parameters for Random Forest:", rf_random.best_params_)
print("Best Score for Random Forest:", rf_random.best_score_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


55 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
55 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\hp\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\hp\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\hp\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\hp\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
skle

Best Parameters for Random Forest: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
Best Score for Random Forest: 0.8171535068004298


In [8]:
# Define pipeline with optimized Random Forest
pipeline_rf = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', rf_random.best_estimator_)])

# Train pipeline
pipeline_rf.fit(X_train, y_train)

# Evaluate pipeline
train_pred_rf = pipeline_rf.predict(X_train)
test_pred_rf = pipeline_rf.predict(X_test)
train_rmse_rf = np.sqrt(mean_squared_error(y_train, train_pred_rf))
test_rmse_rf = np.sqrt(mean_squared_error(y_test, test_pred_rf))
print("Random Forest Pipeline: Train RMSE = {}, Test RMSE = {}".format(train_rmse_rf, test_rmse_rf))


Random Forest Pipeline: Train RMSE = 13814.253543982799, Test RMSE = 49566.30870079155


In [9]:
# Ensemble model
ensemble_model = GradientBoostingRegressor()
ensemble_model.fit(X_train_processed, y_train)

# Predictions
train_pred_ensemble = ensemble_model.predict(X_train_processed)
test_pred_ensemble = ensemble_model.predict(X_test_processed)

# Evaluation
train_rmse_ensemble = np.sqrt(mean_squared_error(y_train, train_pred_ensemble))
test_rmse_ensemble = np.sqrt(mean_squared_error(y_test, test_pred_ensemble))
print("Ensemble Model: Train RMSE = {}, Test RMSE = {}".format(train_rmse_ensemble, test_rmse_ensemble))


Ensemble Model: Train RMSE = 52901.31388484858, Test RMSE = 55963.50954443141
