In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import keras_tuner as kt
from pathlib import Path
import pandas as pd
import keras_tuner as kt
import os
import sqlite3


In [2]:
# Path to SQLite database file (imported in from pgAdmin)
database_path = ("../project4_bad_mother_clusters/wind_farm_a_data.sqlite")

In [3]:
# Connect to the SQLight database
conn = sqlite3.connect(database_path)

# Create a cursor object
cursor = conn.cursor()

In [4]:
#Execute a query to retrieve table name
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")


<sqlite3.Cursor at 0x297289275c0>

In [5]:
# Fetch all table name results
tables = cursor.fetchall()
for table in tables:
    print(table[0])


wind_farm_a_data


In [6]:
# Query to select all SQL data and use panda to read into a WindFarm A Dataframe
table_name = 'wind_farm_a_data'
query = f"SELECT * FROM {table_name}"
windfarma_df = pd.read_sql_query(query, conn)
windfarma_df.head()


Unnamed: 0,time_stamp,asset_id,status_type_id,sensor_0_avg,sensor_1_avg,sensor_2_avg,wind_speed_3_avg,wind_speed_4_avg,wind_speed_3_max,wind_speed_3_min,...,sensor_47,sensor_48,sensor_49,sensor_50,sensor_51,sensor_52_avg,sensor_52_max,sensor_52_min,sensor_52_std,sensor_53_avg
0,2014-07-29 13:20:00,11,5,31.0,152.0,48.7,3.9,3.9,8.0,0.6,...,-2090.0,0.0,0.0,-1185.0,-2090.0,0.4,2.6,0.0,0.8,34.0
1,2014-07-29 13:30:00,11,5,31.0,86.1,150.9,6.0,6.0,9.9,0.6,...,-1627.0,0.0,0.0,-1050.0,-1627.0,0.0,0.0,0.0,0.0,34.0
2,2014-07-29 13:40:00,11,5,31.0,115.2,69.6,6.3,6.3,10.6,0.8,...,-1624.0,0.0,0.0,-1043.0,-1624.0,0.0,0.0,0.0,0.0,34.0
3,2014-07-29 13:50:00,11,0,32.0,129.3,-29.1,6.0,5.9,12.4,1.7,...,-212.0,-9540.0,0.0,40124.0,-9753.0,9.5,14.0,0.0,4.8,34.0
4,2014-07-29 14:00:00,11,0,32.0,137.7,26.4,7.1,6.9,13.7,1.7,...,0.0,-25215.0,0.0,99360.0,-25215.0,13.1,14.9,10.8,1.3,35.0


In [7]:
#Look for unique values
num_unique_values = windfarma_df.nunique()
num_unique_values

time_stamp        399750
asset_id               5
status_type_id         4
sensor_0_avg          38
sensor_1_avg        3591
                   ...  
sensor_52_avg        150
sensor_52_max        181
sensor_52_min        135
sensor_52_std         75
sensor_53_avg         42
Length: 84, dtype: int64

In [8]:
# Explore data types
print(windfarma_df.dtypes)

time_stamp         object
asset_id            int64
status_type_id      int64
sensor_0_avg      float64
sensor_1_avg      float64
                   ...   
sensor_52_avg     float64
sensor_52_max     float64
sensor_52_min     float64
sensor_52_std     float64
sensor_53_avg     float64
Length: 84, dtype: object


In [9]:
# Drop non-parameter columns 
windfarm_a_cleaned_df = windfarma_df.drop(['time_stamp','asset_id'], axis=1)
windfarm_a_cleaned_df.head()

Unnamed: 0,status_type_id,sensor_0_avg,sensor_1_avg,sensor_2_avg,wind_speed_3_avg,wind_speed_4_avg,wind_speed_3_max,wind_speed_3_min,wind_speed_3_std,sensor_5_avg,...,sensor_47,sensor_48,sensor_49,sensor_50,sensor_51,sensor_52_avg,sensor_52_max,sensor_52_min,sensor_52_std,sensor_53_avg
0,5,31.0,152.0,48.7,3.9,3.9,8.0,0.6,0.9,70.5,...,-2090.0,0.0,0.0,-1185.0,-2090.0,0.4,2.6,0.0,0.8,34.0
1,5,31.0,86.1,150.9,6.0,6.0,9.9,0.6,1.4,86.0,...,-1627.0,0.0,0.0,-1050.0,-1627.0,0.0,0.0,0.0,0.0,34.0
2,5,31.0,115.2,69.6,6.3,6.3,10.6,0.8,1.3,86.0,...,-1624.0,0.0,0.0,-1043.0,-1624.0,0.0,0.0,0.0,0.0,34.0
3,0,32.0,129.3,-29.1,6.0,5.9,12.4,1.7,1.4,13.6,...,-212.0,-9540.0,0.0,40124.0,-9753.0,9.5,14.0,0.0,4.8,34.0
4,0,32.0,137.7,26.4,7.1,6.9,13.7,1.7,1.7,-1.9,...,0.0,-25215.0,0.0,99360.0,-25215.0,13.1,14.9,10.8,1.3,35.0


In [10]:
# Combine Y values with similar or the same outcome
windfarm_a_cleaned_df['status_type_id'] = windfarm_a_cleaned_df['status_type_id'].replace({3: 1, 4: 1, 5:2})
windfarm_a_cleaned_df

Unnamed: 0,status_type_id,sensor_0_avg,sensor_1_avg,sensor_2_avg,wind_speed_3_avg,wind_speed_4_avg,wind_speed_3_max,wind_speed_3_min,wind_speed_3_std,sensor_5_avg,...,sensor_47,sensor_48,sensor_49,sensor_50,sensor_51,sensor_52_avg,sensor_52_max,sensor_52_min,sensor_52_std,sensor_53_avg
0,2,31.0,152.0,48.7,3.9,3.9,8.0,0.6,0.9,70.5,...,-2090.0,0.0,0.0,-1185.0,-2090.0,0.4,2.6,0.0,0.8,34.0
1,2,31.0,86.1,150.9,6.0,6.0,9.9,0.6,1.4,86.0,...,-1627.0,0.0,0.0,-1050.0,-1627.0,0.0,0.0,0.0,0.0,34.0
2,2,31.0,115.2,69.6,6.3,6.3,10.6,0.8,1.3,86.0,...,-1624.0,0.0,0.0,-1043.0,-1624.0,0.0,0.0,0.0,0.0,34.0
3,0,32.0,129.3,-29.1,6.0,5.9,12.4,1.7,1.4,13.6,...,-212.0,-9540.0,0.0,40124.0,-9753.0,9.5,14.0,0.0,4.8,34.0
4,0,32.0,137.7,26.4,7.1,6.9,13.7,1.7,1.7,-1.9,...,0.0,-25215.0,0.0,99360.0,-25215.0,13.1,14.9,10.8,1.3,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1142287,2,12.0,314.6,-24.0,3.4,3.5,7.7,0.5,0.8,3.5,...,-639.0,-570.0,0.0,2299.0,-1209.0,10.3,11.2,1.9,2.5,15.0
1142288,0,12.0,345.5,7.6,4.0,4.0,17.2,1.4,0.8,4.1,...,-35.0,-7757.0,0.0,9430.0,-7792.0,9.3,12.1,2.5,3.4,15.0
1142289,0,13.0,343.1,-2.1,4.1,4.0,7.0,0.9,0.6,-0.3,...,0.0,-25888.0,0.0,10201.0,-25888.0,11.1,11.1,11.0,0.0,15.0
1142290,0,13.0,331.5,-13.7,3.5,3.6,7.3,0.4,0.7,0.4,...,-20.0,-13529.0,0.0,4231.0,-13549.0,11.1,11.1,11.0,0.0,15.0


In [11]:
# Verify datatypes after changes
print(windfarm_a_cleaned_df.dtypes)

status_type_id        int64
sensor_0_avg        float64
sensor_1_avg        float64
sensor_2_avg        float64
wind_speed_3_avg    float64
                     ...   
sensor_52_avg       float64
sensor_52_max       float64
sensor_52_min       float64
sensor_52_std       float64
sensor_53_avg       float64
Length: 82, dtype: object


In [12]:
# Split our data into our features and target arrays
y = windfarm_a_cleaned_df['status_type_id']
X = windfarm_a_cleaned_df.drop(columns="status_type_id")

#Split the data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(856719, 81)

In [13]:
# Create a StandardScaler instance
scaler = StandardScaler()

#Fit the StandardScaler
X_scaler = scaler.fit(X_train)

#Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [15]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = len(X_train.columns)


# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])
    
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim=input_features))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 10)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [22]:
# Initialize the Keras Tuner using RandomSearch

tuner = kt.RandomSearch(
    create_model,
    objective="val_accuracy",
    executions_per_trial=1,
    max_trials=10,
    project_name="Mother_Clusters_Project",
    )

# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=10,validation_data=(X_test_scaled,y_test))
best_hps = tuner.get_best_hyperparameters()[0]

# Access and print trial results
for trial in tuner.oracle.get_best_trials(num_trials=10):
    print(f"Trial summary for trial {trial.trial_id}:")
    print(f"Hyperparameters: {trial.hyperparameters.values}")
    print(f"Metrics: {trial.metrics}")
    print("=" * 50)



Trial 10 Complete [00h 07m 48s]
val_accuracy: 0.6571840047836304

Best val_accuracy So Far: 0.7502250075340271
Total elapsed time: 01h 43m 58s
Trial summary for trial 00:
Hyperparameters: {'activation': 'relu', 'first_units': 1, 'num_layers': 7, 'units_0': 9, 'units_1': 1, 'units_2': 1, 'units_3': 1, 'units_4': 1, 'units_5': 1, 'units_6': 1}
Metrics: <keras_tuner.src.engine.metrics_tracking.MetricsTracker object at 0x000002977F016890>
Trial summary for trial 05:
Hyperparameters: {'activation': 'relu', 'first_units': 5, 'num_layers': 7, 'units_0': 9, 'units_1': 7, 'units_2': 9, 'units_3': 7, 'units_4': 1, 'units_5': 5, 'units_6': 9}
Metrics: <keras_tuner.src.engine.metrics_tracking.MetricsTracker object at 0x000002972DE20F40>
Trial summary for trial 03:
Hyperparameters: {'activation': 'tanh', 'first_units': 9, 'num_layers': 7, 'units_0': 1, 'units_1': 9, 'units_2': 7, 'units_3': 5, 'units_4': 3, 'units_5': 9, 'units_6': 1}
Metrics: <keras_tuner.src.engine.metrics_tracking.MetricsTracker

In [20]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'relu',
 'first_units': 1,
 'num_layers': 10,
 'units_0': 5,
 'units_1': 1,
 'units_2': 3,
 'units_3': 9,
 'units_4': 5,
 'units_5': 3,
 'units_6': 1,
 'units_7': 1,
 'units_8': 1,
 'units_9': 1}

In [21]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

8925/8925 - 13s - 1ms/step - accuracy: 0.7502 - loss: 0.6911
Loss: 0.6910707354545593, Accuracy: 0.7502250075340271
