# Random Forest Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Remote CSV that is read straight into a DataFrame.
url = "https://code.datasciencedojo.com/datasciencedojo/datasets/raw/master/Online%20News%20Popularity/OnlineNewsPopularity.csv"
dataset = pd.read_csv(url)

# Every column except the last is a candidate feature; the last column is
# the regression target. Both are kept as plain NumPy arrays.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

## Removing the first two columns, which are not needed as features


In [3]:
# Rebuild the feature matrix from `dataset` rather than slicing `X` in
# place: the old `X = X[:, 2:]` dropped two MORE columns every time this
# cell was re-run. Selecting columns 2..-1 directly makes the cell
# idempotent and yields the same features on a first run (the first two
# columns and the final target column are excluded).
X = dataset.iloc[:, 2:-1].values

## Splitting the dataset into the Training set and Test set

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Random Forest Regression model on the Training set

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 100 trees with a fixed seed for the estimator.
regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=50,
)
# Fit on the training split only; the test split stays unseen until evaluation.
regressor.fit(X_train, y_train)

## Predicting the Test set results

In [7]:
# Predict on the held-out features.
y_pred = regressor.predict(X_test)

# Show predictions (left column) beside the true targets (right column),
# printed with two decimal places.
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

[[ 1289.83   919.  ]
 [ 2377.94  1600.  ]
 [ 6256.84 11700.  ]
 ...
 [ 2684.97 12700.  ]
 [ 1746.72   651.  ]
 [ 3777.99  2200.  ]]


## Evaluating the Model Performance

In [8]:
from sklearn.metrics import r2_score

# Coefficient of determination on the test split; displayed as the cell output.
r2_score(y_true=y_test, y_pred=y_pred)

-0.13304307719699038

In [9]:
def regression_metrics(y_test, y_pred):
    """Compute a set of common regression error metrics.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values. Lists, NumPy arrays, pandas Series and
        column vectors of shape (n, 1) are all accepted; the values are
        flattened to one dimension.
    y_pred : array-like
        Predicted values. Must contain as many elements as ``y_test``, or a
        single value that is compared against every target.

    Returns
    -------
    dict
        Keys ``'MAE'``, ``'MSE'``, ``'RMSE'``, ``'R2'``, ``'MAPE'`` (in
        percent), ``'Explained Variance'``, ``'Median Absolute Error'`` and
        ``'Max Error'``. ``'R2'`` and ``'Explained Variance'`` are not
        meaningful when ``y_test`` is constant (zero variance).

    Raises
    ------
    ValueError
        If ``y_test`` is empty or the two inputs have incompatible sizes.
    """
    # Coerce to flat float arrays so that plain lists work, pandas objects are
    # compared by position rather than by index label, and an (n,) target
    # paired with an (n, 1) prediction cannot silently broadcast to (n, n).
    y_true = np.asarray(y_test, dtype=float).ravel()
    y_hat = np.asarray(y_pred, dtype=float).ravel()

    if y_true.size == 0:
        raise ValueError("y_test must contain at least one value")
    if y_hat.size not in (1, y_true.size):
        raise ValueError(
            "y_test and y_pred must have the same number of elements "
            f"(got {y_true.size} and {y_hat.size})"
        )

    residuals = y_true - y_hat
    abs_residuals = np.abs(residuals)

    # Mean Absolute Error (MAE)
    mae = np.mean(abs_residuals)

    # Mean Squared Error (MSE)
    mse = np.mean(residuals ** 2)

    # Root Mean Squared Error (RMSE)
    rmse = np.sqrt(mse)

    # R-squared (R2)
    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    ss_residual = np.sum(residuals ** 2)
    r2 = 1 - (ss_residual / ss_total)

    # Mean Absolute Percentage Error (MAPE), in percent.
    # The denominator is floored at machine epsilon so a zero target yields a
    # finite value (0 for an exact prediction, very large otherwise) instead
    # of nan/inf poisoning the mean.
    eps = np.finfo(np.float64).eps
    mape = np.mean(abs_residuals / np.maximum(np.abs(y_true), eps)) * 100

    # Explained Variance Score
    explained_variance = 1 - (np.var(residuals) / np.var(y_true))

    # Median Absolute Error
    median_absolute_error = np.median(abs_residuals)

    # Max Error
    max_error = np.max(abs_residuals)

    return {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R2': r2,
        'MAPE': mape,
        'Explained Variance': explained_variance,
        'Median Absolute Error': median_absolute_error,
        'Max Error': max_error
    }


In [10]:
# Evaluate the test-set predictions with the helper defined above and
# print the resulting dictionary of scores.
metrics = regression_metrics(y_test=y_test, y_pred=y_pred)
print(metrics)

{'MAE': 3439.7838819523267, 'MSE': 86229450.67968811, 'RMSE': 9285.981406382854, 'R2': -0.13304307719699038, 'MAPE': 296.20361386957205, 'Explained Variance': -0.12774752569908698, 'Median Absolute Error': 1576.29, 'Max Error': 303935.0}
