# Wind Machine Learning Model
This notebook creates a machine learning model for the Webberville Solar farm. The data is pulled from a Mongodb Atlas database using a connection string and user credentials. The data is then preprocessed before being split and fit to the model.

In [4]:
# Initial Imports
from path import Path
import requests
import json

# Data manipulation
import pandas as pd
import numpy as np

# Database Connection
import config
import pymongo

# datetime
from datetime import datetime
from datetime import timedelta

# ML Libraries
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, balanced_accuracy_score
import tensorflow as tf

# don't show warnings
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler

# Import Data from Database

In [5]:
# set string variables
DEFAULT_DATABASE = 'wind_solar_data' 
USERNAME = config.USERNAME
PASSWORD = config.PASSWORD

#create connection to database
client = pymongo.MongoClient(f"mongodb+srv://{USERNAME}:{PASSWORD}@austin-green-energy.pwzpm.mongodb.net/{DEFAULT_DATABASE}?retryWrites=true&w=majority")
try:
    client.server_info()
    print("Mongodb connected")
except:
    print("The Mongodb failed to connect. Check username/password in connection string.")

Mongodb connected


In [32]:
# select database
db = client.get_database('wind_solar_data')
# select collection
collection = db.solar_data

# pull collection into dataframe
solar_df = pd.DataFrame(list(collection.find()))
solar_df

Unnamed: 0,_id,Date_Time,Year,Month,Day,Hour,MWH,MWH_perPanel,Temperature_F,Humidity_percent,Sunhour,CloudCover_percent,uvIndex,Weather_Description
0,5f986632c1c5e33be42804c2,2019-01-01 00:00:00,2019,1,1,0,0.0,0.0,43,88,6.7,0,1,Clear
1,5f986632c1c5e33be42804c3,2019-01-01 01:00:00,2019,1,1,1,0.0,0.0,43,88,6.7,0,1,Clear
2,5f986632c1c5e33be42804c4,2019-01-01 02:00:00,2019,1,1,2,0.0,0.0,43,89,6.7,0,1,Clear
3,5f986632c1c5e33be42804c5,2019-01-01 03:00:00,2019,1,1,3,0.0,0.0,43,90,6.7,0,1,Clear
4,5f986632c1c5e33be42804c6,2019-01-01 04:00:00,2019,1,1,4,0.0,0.0,43,90,6.7,0,1,Clear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13866,5f986632c1c5e33be4283aec,2020-07-31 19:00:00,2020,7,31,19,0.0,0.0,79,58,6.9,73,1,Partly cloudy
13867,5f986632c1c5e33be4283aed,2020-07-31 20:00:00,2020,7,31,20,0.0,0.0,79,62,6.9,73,1,Partly cloudy
13868,5f986632c1c5e33be4283aee,2020-07-31 21:00:00,2020,7,31,21,0.0,0.0,79,66,6.9,73,1,Partly cloudy
13869,5f986632c1c5e33be4283aef,2020-07-31 22:00:00,2020,7,31,22,0.0,0.0,79,71,6.9,73,1,Partly cloudy


### Drop Columns
We'll be dropping the _id column because this is an artifact of the Mongodb storage and isn't a feature of the dataset.

In [33]:
solar_clean_df = solar_df.drop(['_id'], axis=1)

### Type Data

In [34]:
solar_clean_df.dtypes

Date_Time               object
Year                     int64
Month                    int64
Day                      int64
Hour                     int64
MWH                    float64
MWH_perPanel           float64
Temperature_F            int64
Humidity_percent         int64
Sunhour                float64
CloudCover_percent       int64
uvIndex                  int64
Weather_Description     object
dtype: object

In [35]:
solar_clean_df['Date_Time'] = pd.to_datetime(solar_clean_df['Date_Time'])
#solar_clean_df["MWH"] = solar_clean_df["MWH"].round(0).astype(int)
solar_clean_df.dtypes

Date_Time              datetime64[ns]
Year                            int64
Month                           int64
Day                             int64
Hour                            int64
MWH                           float64
MWH_perPanel                  float64
Temperature_F                   int64
Humidity_percent                int64
Sunhour                       float64
CloudCover_percent              int64
uvIndex                         int64
Weather_Description            object
dtype: object

# ML Models

## Multiple Linear Regression

Date time not supported in linear Regression.

### Preprocess Data
Split X and y, Test and Train,  and then scale

In [61]:
# Define the features set.
#wind_clean_df = wind_clean_df.reset_index()
X = solar_clean_df.drop(["MWH", 'Date_Time','MWH_perPanel','Year','Weather_Description'], axis=1)
y = solar_clean_df["MWH"].ravel()

#split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [62]:
from sklearn.linear_model import LinearRegression
# Train
regr = LinearRegression()
regr.fit(X_train_scaled,y_train)

# test
y_pred = regr.predict(X_test)

#accuracy
accuracy = regr.score(X_test_scaled,y_test)
print(accuracy*100,'%')
print(f"R^2 Value:{regr.score(X_test_scaled,y_test)}")

37.3181510528448 %
R^2 Value:0.37318151052844795


In [63]:
results = pd.DataFrame()
results['test'] = y_test
results['pred'] = y_pred
results[:10]

Unnamed: 0,test,pred
0,23.9544,-255.439446
1,0.3367,-441.635831
2,24.682875,-307.545858
3,18.4341,-222.784992
4,24.357375,-185.245924
5,0.0,-501.314523
6,0.0,-454.091729
7,0.03305,-459.19545
8,6.310775,-386.027285
9,9.00355,-303.071928


## Neural Network

In [64]:
# Define the features set.
#wind_clean_df = wind_clean_df.reset_index()
X = solar_clean_df.drop(["MWH", 'Year', 'Date_Time','MWH_perPanel', 'Weather_Description'], axis=1)
y = solar_clean_df["MWH"].ravel()

#split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [65]:
# Define the model - deep neural net
number_input_features = 8
hidden_nodes_layer1 =  number_input_features*3
hidden_nodes_layer2 =  number_input_features*3
hidden_nodes_layer3 =  number_input_features*2

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="relu"))

# Compile the Sequential model together and customize metrics
nn.compile(loss='mse', optimizer='sgd', metrics=['mse','mae'])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

y_pred = nn.predict(X_test_scaled)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [66]:
results = pd.DataFrame()
results['test'] = y_test
results['pred'] = y_pred
results[:10]

Unnamed: 0,test,pred
0,23.9544,22.663721
1,0.3367,0.706308
2,24.682875,24.912167
3,18.4341,20.201324
4,24.357375,25.05711
5,0.0,0.0
6,0.0,0.0
7,0.03305,0.0
8,6.310775,3.023397
9,9.00355,16.384338


In [60]:
# Example from https://machinelearningmastery.com/regression-tutorial-keras-deep-learning-library-python/
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score, KFold
from keras.models import Sequential
from sklearn.metrics import accuracy_score
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.metrics import r2_score
seed = 1

def baseline_model():
    model = Sequential()
    model.add(Dense(10, input_dim=8, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


estimator = KerasRegressor(build_fn=baseline_model)
estimator.fit(X_train_scaled, y_train, nb_epoch=100, batch_size=100, verbose=False, shuffle=False)
prediction = estimator.predict(X_test_scaled)
r2_score(y_test, prediction)


0.7186992306595457

## Random Forrest

In [None]:
# Resample the training data with the RandomOversampler
from imblearn.ensemble import BalancedRandomForestClassifier
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc.fit(X_train, y_train)
Counter(y_train)

In [None]:
# Calculated the balanced accuracy score
y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance
importances = brfc.feature_importances_
sorted(zip(brfc.feature_importances_, X.columns), reverse=True)