# Random Forest Regression Process and Analysis for Wind Data

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerLine2D
import sklearn.metrics as metrics
from sklearn.model_selection import cross_validate, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import tree

## Data Preprocessing

First, we read in the dataset.

In [2]:
# Read the raw CSV (path is relative to the notebook's working directory).
data_path = "../data/solar.csv"
df = pd.read_csv(data_path)

# Show the first five rows as the cell output.
df.head()

Now we shuffle the dataset so that any ordering in the source file does not bias the train/test split.

In [3]:
# Shuffle all rows. The seed is fixed so that the shuffle -- and therefore the
# positional train/test split and every metric derived from it -- is the same
# on each Restart & Run All.
# NOTE: re-running only this cell shuffles the already-shuffled frame again.
df = df.sample(frac=1, random_state=0)
df.head(5)

Looking at each dataset, we can identify which variables we want to use
for our models.

In [4]:
# Columns used as model inputs.
feature_cols = ['lat', 'long', 'capacity']
# Columns the model predicts (two targets, fitted together).
target_cols = ['generated_energy', 'cost']

X = df[feature_cols]
y = df[target_cols]

Now we split into training and testing sets, reserving about 80% for
training and 20% for testing.

In [5]:
# Positional split point: rows before it are used for training, the rest for
# testing. The frame was shuffled earlier, so this is a random split.
split_at = 9500

X_train, X_test = X.iloc[:split_at], X.iloc[split_at:]
y_train, y_test = y.iloc[:split_at], y.iloc[split_at:]

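Many models perform better when the input features are on a comparable
scale. We standardize each feature to zero mean and unit variance; the
scaler is fitted on the training set only and then applied to both sets,
so no information from the test set leaks into preprocessing.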

In [6]:
# Learn the scaling statistics from the training features and apply them in
# one step; the test features reuse those same statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Display the scaled training matrix.
X_train

array([[ 0.05942354, -0.22478554,  0.35692648],
       [-0.03505453, -0.53086542, -0.43588483],
       [ 0.02943926, -0.16561264, -0.44388501],
       ...,
       [-1.65987101,  0.4749176 ,  0.35692648],
       [-0.40675385, -0.96669284, -0.43588483],
       [ 0.11236886,  0.3340536 , -0.44388501]])

## Training the Models

Now that the data is pre-processed accordingly, the models can be
trained and fit.

In [7]:
# Random forests are stochastic (bootstrap sampling and feature selection),
# so the seed is fixed to make the metrics, feature importances and the
# plotted tree reproducible. 0 matches the seed used by the K-fold splitter.
reg = RandomForestRegressor(random_state=0)
reg.fit(X_train, y_train)

With a trained model, predictions can now be made.

In [8]:
# Predict both targets for the held-out rows; column 0 is generated energy and
# column 1 is cost (same order as the columns of y).
preds = reg.predict(X_test)

# Re-index the actual values 0..n-1 so row i lines up with preds[i].
# (Deliberately not named `display`: that would shadow IPython's display().)
actual = y_test.reset_index(drop=True)

print("Predictions")
print("----------------------")
# Show up to three examples without failing on a very small test set.
for i in range(min(3, len(preds))):
    print(
        f"predicted energy: {preds[i][0]:.2f}\t"
        f"actual energy: {actual.at[i, 'generated_energy']:.2f}\t"
        f"predicted cost: {preds[i][1]:.2f}\t"
        f"actual cost: {actual.at[i, 'cost']:.2f}"
    )

Predictions
----------------------
predicted energy: 6132.00   actual energy: 6132.00  predicted cost: 5028240.00  actual cost: 5028240.00
predicted energy: 613200.00 actual energy: 613200.00    predicted cost: 416976000.00    actual cost: 416976000.00
predicted energy: 6132.00   actual energy: 6132.00  predicted cost: 5028240.00  actual cost: 5028240.00

## Testing and Analyzing the Models

This section gathers metrics and presents figures that visualize the
model and its results.

### Metrics

#### Scores and Error Values

The scores being recorded are the R2 score, Root Mean Squared Error
(RMSE), and Mean Absolute Percentage Error (MAPE).

In [9]:
# "raw_values" returns one score per target column instead of a single
# average across the two targets.
per_target = {"multioutput": "raw_values"}

r2 = metrics.r2_score(y_test, preds, **per_target)
rmse = metrics.root_mean_squared_error(y_test, preds, **per_target)
mape = metrics.mean_absolute_percentage_error(y_test, preds, **per_target)

print("Metric\tScore")
print("-----------------------")
for label, values in (("r2", r2), ("rmse", rmse), ("mape", mape)):
    print(f"{label}\t{values}")

Metric  Score
-----------------------
r2  [1.         0.99947801]
rmse    [1.89379452e-12 1.36960035e+07]
mape    [9.69285180e-16 4.32843758e-03]

#### Feature Importances

Feature importances indicate how much each feature contributes to the
splits made by the decision trees in the random forest (impurity-based
importances, which sum to 1 across features). Results are shown as
percentages.

In [10]:
# Feature names in the same column order the model was trained on.
features = ['lat', 'long', 'capacity']

# Importances from the fitted forest, and the positions that order them from
# least to most important.
importances = reg.feature_importances_
indices = np.argsort(importances)

# (name, percentage) pairs in ascending order of importance.
ranked = [(features[pos], importances[pos] * 100) for pos in indices]

print("Importances")
print('----------------------')
for feature_name, percent in ranked:
    print(f"{feature_name}: {percent}")

Importances
----------------------
long: 0.49924958098045885
lat: 0.7828307672340137
capacity: 98.71791965178554

#### K-Fold Cross Validation

This cross validation splits up the dataset into 10 unique folds, which
are then used to test a model. The model is then scored using the same
metrics outlined above: R2, RMSE, MAPE. This ensures the scoring is
rigorous, and the *entire* dataset is used.

In [11]:
# 10 shuffled folds with a fixed seed so the fold assignment is repeatable.
kf = KFold(n_splits=10, random_state=0, shuffle=True)

# One scorer per metric. cross_validate reports each one in a result column
# named "test_<key>".
scoring = {
    "r2": metrics.make_scorer(score_func=metrics.r2_score),
    "rmse": metrics.make_scorer(score_func=metrics.root_mean_squared_error),
    "mape": metrics.make_scorer(score_func=metrics.mean_absolute_percentage_error),
}

kf_cv_scores = cross_validate(reg, X, y, cv=kf, scoring=scoring)
kf_cv_df = pd.DataFrame.from_dict(kf_cv_scores)
means = kf_cv_df.mean()

print("10-Fold Cross Validation Scores")
print("----------------------------------------------------")
# Look the averages up by column label rather than by position, so the right
# metric is printed even if the set or order of result columns changes.
print(f"R2 Average: {means['test_r2']}")
print(f"RMSE Average: {means['test_rmse']}")
print(f"MAPE Average: {means['test_mape']}")

# Per-fold timings and scores.
kf_cv_df

10-Fold Cross Validation Scores
----------------------------------------------------
R2 Average: 0.999750656025779
RMSE Average: 6617044.390212841
MAPE Average: 0.0019449076617190596

### Graphs

Graphs of the Random Forest model fits on each of the input features,
for each target.

In [16]:
# Held-out rows in their original (unscaled) units; row i corresponds to
# preds[i] because the test set was built from these rows in the same order.
X_test_plot = X[9500:]

# Each feature of the held-out rows, sorted ascending. Kept under these names
# because the cost plots in the following cell read them.
plot_lat_x = X_test_plot.loc[:, ['lat']].sort_values(by=['lat'])
plot_long_x = X_test_plot.loc[:, ['long']].sort_values(by=['long'])
plot_cap_x = X_test_plot.loc[:, ['capacity']].sort_values(by=['capacity'])

# Energy predictions labelled with the held-out rows' index. Looking them up
# by the sorted frame's index keeps every x value paired with the prediction
# for the same row; sorting x and predictions independently would pair
# unrelated rows and always draw a monotone line.
energy_pred = pd.Series(preds[:, 0], index=X_test_plot.index)

# (feature column, sorted x frame, x-axis label, title) for each panel.
panels = [
    ("lat", plot_lat_x, "Latitude",
     "Random Forest Regression: Generated Energy vs. Latitude"),
    ("long", plot_long_x, "Longitude",
     "Random Forest Regression: Generated Energy vs. Longitude"),
    ("capacity", plot_cap_x, "Capacity(MW)",
     "Random Forest Regression: Generated Energy vs. Capacity"),
]

figure, axis = plt.subplots(3)
figure.set_size_inches(15, 15)

for ax, (col, sorted_x, xlabel, title) in zip(axis, panels):
    # Blue: every observed data point. Red: predictions on the held-out rows.
    ax.scatter(X[col], y['generated_energy'], color='blue', label='Data', s=5)
    ax.plot(sorted_x[col], energy_pred.loc[sorted_x.index],
            color='red', lw=2, label="Generated Energy Model")
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Generated Energy(MWh)")
    ax.set_title(title)
    ax.legend()

figure.subplots_adjust(left=0.1,
                       bottom=0.1,
                       right=0.9,
                       top=0.9,
                       wspace=.4,
                       hspace=.4)

plt.show()

In [17]:
# Held-out rows in their original (unscaled) units; row i corresponds to
# preds[i] because the test set was built from these rows in the same order.
X_test_plot = X[9500:]

# Cost predictions labelled with the held-out rows' index, so that after
# sorting by a feature each x value is still paired with the prediction for
# the same row. Sorting x and predictions independently would pair unrelated
# rows and always draw a monotone line.
cost_pred = pd.Series(preds[:, 1], index=X_test_plot.index)

# (feature column, x-axis label, title) for each panel.
panels = [
    ("lat", "Latitude", "Random Forest Regression: Cost vs. Latitude"),
    ("long", "Longitude", "Random Forest Regression: Cost vs. Longitude"),
    ("capacity", "Capacity(MW)", "Random Forest Regression: Cost vs. Capacity"),
]

figure, axis = plt.subplots(3)
figure.set_size_inches(15, 15)

for ax, (col, xlabel, title) in zip(axis, panels):
    sorted_x = X_test_plot[col].sort_values()
    # Blue: every observed data point. Red: predictions on the held-out rows.
    ax.scatter(X[col], y['cost'], color='blue', label='Data', s=5)
    ax.plot(sorted_x, cost_pred.loc[sorted_x.index],
            color='red', lw=2, label="Cost Model")
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Cost($)")
    ax.set_title(title)
    ax.legend()

figure.subplots_adjust(left=0.1,
                       bottom=0.1,
                       right=0.9,
                       top=0.9,
                       wspace=.4,
                       hspace=.4)

plt.show()

A graph of the feature importances. This helps to visualize the
magnitude of importance of each feature, and compare their impact
against one another.

In [14]:
# Horizontal bars, one per feature, ordered from least to most important.
fig, ax = plt.subplots()
bar_positions = list(range(len(indices)))

ax.barh(bar_positions, importances[indices], color='b', align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels([features[i] for i in indices])
ax.set_xlabel("Relative Importance")
ax.set_title("Feature Importances")
plt.show()

A graph of one of the decision trees in the random forest. This displays
the decision-making process the model follows to arrive at predictions.

In [15]:
# Feature names in the column order the forest was trained on.
fn = ['lat', 'long', 'capacity']

# Small figure at high DPI so the node text stays legible.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=800)

# Draw the top three levels of the first tree in the forest. `class_names` is
# not passed: it is a classification-only option, and the regression targets
# are not classes.
tree.plot_tree(reg.estimators_[0], feature_names=fn, filled=True, max_depth=3, ax=ax)
plt.show()