In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import matplotlib.pyplot as plt

# Load your dataset
merged_dfs = pd.read_csv('Merge_Economic_HouseTrend_Data.csv')

# Define predictors and target variable
predictors = ["Mortgage Interest", "Vacancy Rate", "CPI", "Median Sales Price"]
target = "Median House Price"

# Function to predict using RandomForestRegressor
def predict(train, test, predictors, target):
    rf = RandomForestRegressor(min_samples_split=10, random_state=1)
    rf.fit(train[predictors], train[target])
    preds = rf.predict(test[predictors])
    return preds

# Function for backtesting
def backtest(data, predictors, target, start=260, step=52):
    all_preds = []
    for i in range(start, data.shape[0], step):
        train = data.iloc[:i]
        test = data.iloc[i:(i+step)]
        all_preds.append(predict(train, test, predictors, target))
    
    preds = np.concatenate(all_preds)
    return preds, np.sqrt(mean_squared_error(data.iloc[start:][target], preds)), mean_absolute_error(data.iloc[start:][target], preds), r2_score(data.iloc[start:][target], preds)

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(merged_dfs[predictors], merged_dfs[target], test_size=0.2, random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestRegressor(random_state=1)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

# Predictions for the test set
best_preds = best_rf.predict(X_test)

# Example future features input by the user
future_features = {
    "Mortgage Interest": [4.26],
    "Vacancy Rate": [4.6],
    "CPI": [122.2],
    "Median Sales Price": [1413602.13]
}

# Convert future features to DataFrame
future_data = pd.DataFrame(future_features)

# Predict function for future data using the best model
def predict_future(model, future_data):
    return model.predict(future_data)

# Make prediction for the future data
future_prediction = predict_future(best_rf, future_data[predictors])

# Print prediction
print("Predicted median house price for next quarter:", future_prediction[0])

# Scatter plot of actual vs predicted median house prices
plt.figure(figsize=(8, 6))
plt.scatter(y_test, best_preds, color='blue', label='Predicted')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--', label='Actual')
plt.xlabel('Actual Median House Price')
plt.ylabel('Predicted Median House Price')
plt.title('Actual vs Predicted Median House Prices')
plt.legend()
plt.show()


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


2400 fits failed out of a total of 7200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2400 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Acer\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Acer\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\Acer\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Acer\AppData\Local\Programs\Python\Python39\lib\si

Cross-validation scores: [0.99801848 0.99493685 0.99597139 0.99748278 0.99734069]
Mean CV score: 0.9967500363896915


KeyError: "['Mortgage Interest^2', 'Mortgage Interest Vacancy Rate', 'Mortgage Interest CPI', 'Mortgage Interest Median Sales Price', 'Vacancy Rate^2', 'Vacancy Rate CPI', 'Vacancy Rate Median Sales Price', 'CPI^2', 'CPI Median Sales Price', 'Median Sales Price^2'] not in index"