# Wind Machine Learning Model
This notebook creates a machine learning model 

In [25]:
# Initial Imports
from path import Path
import requests
import json

# Data manipulation
import pandas as pd
import numpy as np

# Database Connection
import config
import pymongo

# datetime
from datetime import datetime
from datetime import timedelta

# ML Libraries
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, balanced_accuracy_score

# don't show warnings
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler

# Import Data from Database

In [2]:
# set string variables
DEFAULT_DATABASE = 'wind_solar_data' 
USERNAME = config.USERNAME
PASSWORD = config.PASSWORD

#create connection to database
client = pymongo.MongoClient(f"mongodb+srv://{USERNAME}:{PASSWORD}@austin-green-energy.pwzpm.mongodb.net/{DEFAULT_DATABASE}?retryWrites=true&w=majority")
try:
    client.server_info()
    print("Mongodb connected")
except:
    print("The Mongodb failed to connect. Check username/password in connection string.")

Mongodb connected


In [3]:
# select database
db = client.get_database('wind_solar_data')
# select collection
collection = db.wind_data

# pull collection into dataframe
wind_df = pd.DataFrame(list(collection.find()))
wind_df

Unnamed: 0,_id,time,WindSpeed_mph,WindDirection_degrees,WindDirection_compass,WindGust_mph,MWH
0,5f946bc9c64c67a0641fc6f8,2019-01-01 01:00:00,13.0,89.0,E,23.0,110.487950
1,5f946bc9c64c67a0641fc6f9,2019-01-01 02:00:00,14.0,53.0,NE,23.0,72.020225
2,5f946bc9c64c67a0641fc6fa,2019-01-01 03:00:00,15.0,17.0,NNE,22.0,67.639475
3,5f946bc9c64c67a0641fc6fb,2019-01-01 04:00:00,14.0,18.0,NNE,21.0,63.718900
4,5f946bc9c64c67a0641fc6fc,2019-01-01 05:00:00,14.0,19.0,NNE,20.0,61.264250
...,...,...,...,...,...,...,...
8755,5f946bc9c64c67a0641fe92b,2019-12-31 19:00:00,6.0,175.0,S,12.0,15.506725
8756,5f946bc9c64c67a0641fe92c,2019-12-31 20:00:00,6.0,176.0,S,13.0,32.191125
8757,5f946bc9c64c67a0641fe92d,2019-12-31 21:00:00,7.0,176.0,S,14.0,40.677250
8758,5f946bc9c64c67a0641fe92e,2019-12-31 22:00:00,7.0,176.0,S,15.0,45.826475


### Drop Columns
The first cleaning is to drop the columns we dont't need. We'll be dropping the _id column because this is an artifact of the Mongodb storage and isn't a feature of the dataset. The time column will be dropped because there is not a linear relationship between time and wind power. The winddirection compas is dropped because this data is less granular than the winddirection degrees.

In [49]:
# Drop uneeded columns
wind_clean_df = wind_df.drop(['_id', "WindDirection_compass"], axis=1)

### Type Data

In [52]:
# Check for NaN values
wind_clean_df.isnull().sum()

time                     0
WindSpeed_mph            0
WindDirection_degrees    0
WindGust_mph             0
MWH                      0
dtype: int64

In [51]:
# Drop any NaN values
wind_clean_df = wind_clean_df.dropna()

In [53]:
wind_clean_df["WindSpeed_mph"] = wind_clean_df["WindSpeed_mph"].round(0).astype(int)
wind_clean_df["WindDirection_degrees"] = wind_clean_df["WindDirection_degrees"].round(0).astype(int)
wind_clean_df["WindGust_mph"] = wind_clean_df["WindGust_mph"].round(0).astype(int)
wind_clean_df['time'] = pd.to_datetime(wind_clean_df['time'])
wind_clean_df["MWH"] = wind_clean_df["MWH"].round(0).astype(int)
wind_clean_df.dtypes

time                     datetime64[ns]
WindSpeed_mph                     int32
WindDirection_degrees             int32
WindGust_mph                      int32
MWH                               int32
dtype: object

# ML Models

## Multiple Linear Regression

Date time not supported in linear Regression.

### Split Data

In [54]:
# Define the features set.
#wind_clean_df = wind_clean_df.reset_index()
X = wind_clean_df.drop(["MWH",'time'], axis=1)
y = wind_clean_df["MWH"].ravel()

#split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [56]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [57]:
from sklearn.linear_model import LinearRegression
# Train

regr = LinearRegression()
regr.fit(X_train_scaled,y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [58]:
# test
y_pred = regr.predict(X_test)
y_pred

array([ 250.91702586,  708.69483708,  754.48646983, ...,  577.67519832,
       1124.06354202, 1293.8023135 ])

In [60]:
accuracy = regr.score(X_test_scaled,y_test)
print(accuracy*100,'%')
print(f"R^2 Value:{regr.score(X_test_scaled,y_test)}")


38.07819369662388 %
R^2 Value:0.38078193696623874


## Neural Network

In [63]:
import tensorflow as tf
# Define the model - deep neural net
number_input_features = 3
hidden_nodes_layer1 =  number_input_features*3
hidden_nodes_layer2 =  number_input_features*3

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="linear")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="linear"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="linear"))

# Compile the Sequential model together and customize metrics
nn.compile(optimizer="sgd", loss=tf.keras.losses.MeanSquaredError(), metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
2190/2190 - 0s - loss: nan - acc: 0.0689
Loss: nan, Accuracy: 0.06894977390766144


In [68]:
# Example from https://machinelearningmastery.com/regression-tutorial-keras-deep-learning-library-python/
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score, KFold
from keras.models import Sequential
from sklearn.metrics import accuracy_score
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.metrics import r2_score
seed = 1

def baseline_model():
    model = Sequential()
    model.add(Dense(10, input_dim=3, activation='linear'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


# estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=100, batch_size=100, verbose=False)
# kfold = KFold(n_splits=10, random_state=seed)
# results = cross_val_score(estimator, X, y, cv=kfold)
# print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

# estimator.fit(X, y)
# prediction = estimator.predict(X)
# accuracy_score(y, prediction)

estimator = KerasRegressor(build_fn=baseline_model)
estimator.fit(X_train_scaled, y_train, nb_epoch=100, batch_size=100, verbose=False, shuffle=False)
prediction = estimator.predict(X_test_scaled)
r2_score(y_test, prediction)

# model = baseline_model()
# model.fit(X, y, nb_epoch=100, batch_size=100, verbose=False, shuffle=False)
# prediction = model.predict(X)

AttributeError: 'KerasRegressor' object has no attribute 'evaluate'

In [69]:
# https://nbviewer.jupyter.org/github/srnghn/ml_example_notebooks/blob/master/Predicting%20Wine%20Types%20with%20Neural%20Networks.ipynb
from sklearn.neural_network import MLPClassifier

model = MLPClassifier()
model.fit(X_train_scaled, y_train)
accuracy_score(y_train, model.predict(X_train_scaled))

0.09043848964677223

In [70]:
y_train

array([28, 49, 15, ..., 86, 28, 11])

In [71]:
model.predict(X_train_scaled)

array([ 14,   1,   0, ..., 115,  24,  14])

## Random Forrest

In [None]:
# Resample the training data with the RandomOversampler
from imblearn.ensemble import BalancedRandomForestClassifier
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc.fit(X_train, y_train)
Counter(y_train)

In [None]:
# Calculated the balanced accuracy score
y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance
importances = brfc.feature_importances_
sorted(zip(brfc.feature_importances_, X.columns), reverse=True)