In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Data array
# X: the 20 consecutive integer inputs 8..27; y: one target value per input.
# The targets span roughly 16 orders of magnitude (92 up to ~2.3e17).
# NOTE(review): y looks like the n-queens solution counts (OEIS A000170) with the
# last three entries rounded to ~15 significant digits -- confirm the data source.
X = np.array([8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27])
y = np.array([92, 352, 724, 2680, 14200, 73712, 365596, 2279184, 14772512, 95815104, 666090624,
              4968057848, 39029188884, 314666222712, 2691008701644, 24233937684440, 227514171973736,
              2207893435808350, 22317699616364000, 234907967154122000])

# Reshape X to 2D array: scikit-learn estimators take features as (n_samples, n_features)
X = X.reshape(-1, 1)

# Split data: training & testing sets
# test_size=0.2 on 20 samples holds out 4 points; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train Random Forest Regression model (100 trees, seeded for reproducibility)
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Predictions on test set
y_pred = rf_regressor.predict(X_test)

# Evaluate model
# Both metrics are computed on the raw (untransformed) targets, so they are
# dominated by the largest y values in the held-out set.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Plot data and Random Forest regression curve
# Evaluate the fitted model on a dense grid across the observed X range.
X_plot = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
y_plot = rf_regressor.predict(X_plot)

fig, ax = plt.subplots()
ax.scatter(X, y, label="Data")
ax.plot(X_plot, y_plot, color='red', label='Random Forest Regression')
# On a linear axis every point except the last few is squashed onto the baseline.
# A log axis makes the whole range readable; it is safe here because all targets
# (and therefore all forest predictions, which are averages of training targets)
# are strictly positive.
ax.set_yscale("log")
ax.set_xlabel("X")
ax.set_ylabel("y")
ax.legend()
ax.set_title("Random Forest Regression")
plt.show()

In [None]:
# RANDOM FOREST REGRESSION
# 1) Random Sampling of Data
# Bootstrap Sampling: create multiple subsets of training data through bootstrap sampling
    # Formula: Randomly select n data points with replacement from training dataset, where n = size of training dataset
    
# 2) Random Subset of Features
# Random Feature Selection: when constructing each decision tree, randomly select a subset of features for consideration at each split. This introduces randomness & decorrelates the trees
    # Formula: Randomly select m features from the total p features at each split, where typically m << p

# 3) Decision Tree Construction
# Decision Tree Construction: For each subset of data (obtained by bootstrap sampling) & each node in decision tree:
    # Find best feature among randomly selected features (Formula 2) to split data
    # Calculate impurity or error measure (mean squared error for regression) for potential splits
    # Choose feature that results in lowest impurity or error measure
    # Split data based on this feature
    # Continue recursively until a stopping criterion is met (maximum tree depth, minimum samples per leaf)
    
# 4) Aggregation of Predictions
# Aggregation of Predictions: combine the predictions from each decision tree. For regression tasks, the most common aggregation method is averaging
    # Formula: Calculate predicted value from each tree (Formula 3) & average these predictions to obtain final prediction
# Final Prediction (for regression):
    # y_pred_final = (1/N) * Σ y_pred_i
    # N = number of decision trees in forest
    # y_pred_i = predicted value from i-th decision tree