# Wind Machine Learning Model
This notebook creates a machine learning model 

In [30]:
# Initial Imports
from path import Path
import requests
import json

# Data manipulation
import pandas as pd
import numpy as np

# Database Connection
import config
import pymongo

# datetime
from datetime import datetime
from datetime import timedelta

# ML Libraries
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, balanced_accuracy_score

# don't show warnings
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler

# Import Data from Database

In [31]:
# set string variables
DEFAULT_DATABASE = 'wind_solar_data' 
USERNAME = config.USERNAME
PASSWORD = config.PASSWORD

#create connection to database
client = pymongo.MongoClient(f"mongodb+srv://{USERNAME}:{PASSWORD}@austin-green-energy.pwzpm.mongodb.net/{DEFAULT_DATABASE}?retryWrites=true&w=majority")
try:
    client.server_info()
    print("Mongodb connected")
except:
    print("The Mongodb failed to connect. Check username/password in connection string.")

Mongodb connected


In [32]:
# select database
db = client.get_database('wind_solar_data')
# select collection
collection = db.solar_data

# pull collection into dataframe
solar_df = pd.DataFrame(list(collection.find()))
solar_df

Unnamed: 0,_id,Date_Time,Sunhour,Temp_F,cloudcover_percent,uvIndex,humidity,MWH,year,month,day
0,5f982b3cfa1733316b068bbd,2019-01-01 01:00:00,6.7,43.0,0.0,1.0,88.0,0.0,2019,1,1
1,5f982b3cfa1733316b068bbe,2019-01-01 02:00:00,6.7,43.0,0.0,1.0,88.0,0.0,2019,1,1
2,5f982b3cfa1733316b068bbf,2019-01-01 03:00:00,6.7,43.0,0.0,1.0,88.0,0.0,2019,1,1
3,5f982b3cfa1733316b068bc0,2019-01-01 04:00:00,6.7,43.0,0.0,1.0,88.0,0.0,2019,1,1
4,5f982b3cfa1733316b068bc1,2019-01-01 05:00:00,6.7,43.0,0.0,1.0,88.0,0.0,2019,1,1
...,...,...,...,...,...,...,...,...,...,...,...
13867,5f982b3cfa1733316b06c1e8,2020-07-31 19:00:00,6.9,79.0,73.0,1.0,89.0,0.0,2020,7,31
13868,5f982b3cfa1733316b06c1e9,2020-07-31 20:00:00,6.9,79.0,73.0,1.0,89.0,0.0,2020,7,31
13869,5f982b3cfa1733316b06c1ea,2020-07-31 21:00:00,6.9,79.0,73.0,1.0,89.0,0.0,2020,7,31
13870,5f982b3cfa1733316b06c1eb,2020-07-31 22:00:00,6.9,79.0,73.0,1.0,89.0,0.0,2020,7,31


### Drop Columns
The first cleaning is to drop the columns we dont't need. We'll be dropping the _id column because this is an artifact of the Mongodb storage and isn't a feature of the dataset. The time column will be dropped because there is not a linear relationship between time and wind power. The winddirection compas is dropped because this data is less granular than the winddirection degrees.

In [50]:
solar_clean_df = solar_df.drop(['_id'], axis=1)
solar_clean_df['MWH'] = solar_clean_df['MWH']*127278
solar_clean_df[:50]

Unnamed: 0,Date_Time,Sunhour,Temp_F,cloudcover_percent,uvIndex,humidity,MWH,year,month,day
0,2019-01-01 01:00:00,6.7,43.0,0.0,1.0,88.0,0.0,2019,1,1
1,2019-01-01 02:00:00,6.7,43.0,0.0,1.0,88.0,0.0,2019,1,1
2,2019-01-01 03:00:00,6.7,43.0,0.0,1.0,88.0,0.0,2019,1,1
3,2019-01-01 04:00:00,6.7,43.0,0.0,1.0,88.0,0.0,2019,1,1
4,2019-01-01 05:00:00,6.7,43.0,0.0,1.0,88.0,0.0,2019,1,1
5,2019-01-01 06:00:00,6.7,43.0,0.0,1.0,88.0,0.0,2019,1,1
6,2019-01-01 07:00:00,6.7,43.0,0.0,1.0,88.0,0.0,2019,1,1
7,2019-01-01 08:00:00,6.7,43.0,0.0,1.0,88.0,0.307071,2019,1,1
8,2019-01-01 09:00:00,6.7,43.0,0.0,1.0,88.0,1.506895,2019,1,1
9,2019-01-01 10:00:00,6.7,43.0,0.0,1.0,88.0,2.297075,2019,1,1


### Type Data

In [51]:
# Check for NaN values
solar_clean_df.isnull().sum()

Date_Time             0
Sunhour               0
Temp_F                0
cloudcover_percent    0
uvIndex               0
humidity              0
MWH                   2
year                  0
month                 0
day                   0
dtype: int64

In [52]:
# Drop any NaN values
solar_clean_df = solar_clean_df.dropna()

In [53]:
solar_clean_df.dtypes

Date_Time              object
Sunhour               float64
Temp_F                float64
cloudcover_percent    float64
uvIndex               float64
humidity              float64
MWH                   float64
year                    int64
month                   int64
day                     int64
dtype: object

In [54]:
solar_clean_df['Date_Time'] = pd.to_datetime(solar_clean_df['Date_Time'])
solar_clean_df["MWH"] = solar_clean_df["MWH"].round(0).astype(int)
solar_clean_df.dtypes

Date_Time             datetime64[ns]
Sunhour                      float64
Temp_F                       float64
cloudcover_percent           float64
uvIndex                      float64
humidity                     float64
MWH                            int32
year                           int64
month                          int64
day                            int64
dtype: object

# ML Models

## Multiple Linear Regression

Date time not supported in linear Regression.

In [55]:
solar_clean_df['hour'] = solar_clean_df['Date_Time'].dt.hour
solar_clean_df.dtypes

Date_Time             datetime64[ns]
Sunhour                      float64
Temp_F                       float64
cloudcover_percent           float64
uvIndex                      float64
humidity                     float64
MWH                            int32
year                           int64
month                          int64
day                            int64
hour                           int64
dtype: object

### Split Data

In [8]:
# Define the features set.
#wind_clean_df = wind_clean_df.reset_index()
X = solar_clean_df.drop(["MWH"], axis=1)
y = solar_clean_df["MWH"].ravel()

#split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [9]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
from sklearn.linear_model import LinearRegression
# Train

regr = LinearRegression()
regr.fit(X_train_scaled,y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
# test
y_pred = regr.predict(X_test)
y_pred

array([0.00326407, 0.00332644, 0.00326571, ..., 0.0032075 , 0.00332675,
       0.00337553])

In [12]:
accuracy = regr.score(X_test_scaled,y_test)
print(accuracy*100,'%')
print(f"R^2 Value:{regr.score(X_test_scaled,y_test)}")


6.629448783298142 %
R^2 Value:0.06629448783298142


## Neural Network

In [56]:
# Define the features set.
#wind_clean_df = wind_clean_df.reset_index()
X = solar_clean_df.drop(["MWH", 'year', 'Date_Time'], axis=1)
y = solar_clean_df["MWH"].ravel()

#split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [57]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [98]:
import tensorflow as tf
# Define the model - deep neural net
number_input_features = 8
hidden_nodes_layer1 =  number_input_features*3
hidden_nodes_layer2 =  number_input_features*3
hidden_nodes_layer3 =  number_input_features*2

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer2, input_dim=number_input_features, activation="linear")
)

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="relu"))

# Compile the Sequential model together and customize metrics
nn.compile(loss='mse', optimizer='sgd', metrics=['mse','mae'])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

y_pred = nn.predict(X_test_scaled)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, mse: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
3468/3468 - 0s - loss: 14.4205 - mean_squared_error: 14.4205 - mean_absolute_error: 2.0344


ValueError: too many values to unpack (expected 2)

In [99]:
results = pd.DataFrame()
results['test'] = y_test
results['pred'] = y_pred
results[:10]

Unnamed: 0,test,pred
0,26,19.629028
1,2,1.928335
2,21,25.303516
3,21,16.931591
4,23,21.369152
5,0,0.0
6,0,0.0
7,1,0.0
8,17,10.573897
9,18,20.448502


In [63]:
print(np.mean((y_test[:100]-y_pred[:100])**2))

159.9826706284362


In [68]:
# Example from https://machinelearningmastery.com/regression-tutorial-keras-deep-learning-library-python/
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score, KFold
from keras.models import Sequential
from sklearn.metrics import accuracy_score
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.metrics import r2_score
seed = 1

def baseline_model():
    model = Sequential()
    model.add(Dense(10, input_dim=3, activation='linear'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


# estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=100, batch_size=100, verbose=False)
# kfold = KFold(n_splits=10, random_state=seed)
# results = cross_val_score(estimator, X, y, cv=kfold)
# print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

# estimator.fit(X, y)
# prediction = estimator.predict(X)
# accuracy_score(y, prediction)

estimator = KerasRegressor(build_fn=baseline_model)
estimator.fit(X_train_scaled, y_train, nb_epoch=100, batch_size=100, verbose=False, shuffle=False)
prediction = estimator.predict(X_test_scaled)
r2_score(y_test, prediction)

# model = baseline_model()
# model.fit(X, y, nb_epoch=100, batch_size=100, verbose=False, shuffle=False)
# prediction = model.predict(X)

AttributeError: 'KerasRegressor' object has no attribute 'evaluate'

In [69]:
# https://nbviewer.jupyter.org/github/srnghn/ml_example_notebooks/blob/master/Predicting%20Wine%20Types%20with%20Neural%20Networks.ipynb
from sklearn.neural_network import MLPClassifier

model = MLPClassifier()
model.fit(X_train_scaled, y_train)
accuracy_score(y_train, model.predict(X_train_scaled))

0.09043848964677223

In [70]:
y_train

array([28, 49, 15, ..., 86, 28, 11])

In [71]:
model.predict(X_train_scaled)

array([ 14,   1,   0, ..., 115,  24,  14])

## Random Forrest

In [None]:
# Resample the training data with the RandomOversampler
from imblearn.ensemble import BalancedRandomForestClassifier
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc.fit(X_train, y_train)
Counter(y_train)

In [None]:
# Calculated the balanced accuracy score
y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance
importances = brfc.feature_importances_
sorted(zip(brfc.feature_importances_, X.columns), reverse=True)