In [1]:
# Module Importations
import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures

# Report the scikit-learn, NumPy and pandas versions (in that order) so the
# recorded results can be tied to a specific environment.
print(*(library.__version__ for library in (sklearn, np, pd)))

0.22.2.post1 1.19.4 1.1.4


In [2]:
# Custom Module Imports
from Source.data import load_data
from Source.data import split_data
from Source.models import model_evaluation
from Source.models import sklearn_helpers

In [3]:
# Constants

In [4]:
# Load data from pickle
# The file name is handed to the project helper `load_data.load_pickled_data`;
# how it is resolved to a path is decided inside that helper.
# The result is used as a pandas DataFrame by the cells below (`.apply`, column assignment).
# NOTE(review): presumably this unpickles the file — unpickling can execute
# arbitrary code, so only load files from a trusted source. Confirm in the helper.
original_dataset_df = load_data.load_pickled_data('full_data_df.pkl')

Loaded pickled dataframe ...


In [5]:
# Data Munging - Convert time of day to float
def convert_time_to_float(time):
    """Return the time-of-day of ``time`` as a fraction of a 24-hour day.

    Only the ``hour``, ``minute``, ``second`` and ``microsecond`` attributes
    are read, so any object exposing them works (e.g. ``datetime.datetime``,
    ``datetime.time``). Midnight maps to 0.0 and noon to 0.5; the date part,
    if any, is ignored.
    """
    # Number of each unit in one day, built up from hours.
    hours_per_day = 24.0
    minutes_per_day = hours_per_day * 60.0
    seconds_per_day = minutes_per_day * 60.0
    microseconds_per_day = seconds_per_day * 1000000.0

    # Accumulate each component's share of the day, coarsest unit first.
    day_fraction = time.hour / hours_per_day
    day_fraction = day_fraction + time.minute / minutes_per_day
    day_fraction = day_fraction + time.second / seconds_per_day
    day_fraction = day_fraction + time.microsecond / microseconds_per_day
    return day_fraction

# Derive TIME_OF_DAY (fraction of the day elapsed) from each record's DATE_TIME.
# Only that one column is needed, so map over it directly rather than using
# DataFrame.apply(axis=1), which builds a full row object for every record.
original_dataset_df['TIME_OF_DAY'] = original_dataset_df['DATE_TIME'].apply(convert_time_to_float)

print(original_dataset_df)

                 DATE_TIME  PLANT_ID       SOURCE_KEY  DC_POWER  AC_POWER  \
0      2020-05-15 00:00:00   4135001  1BY6WEcLGh8j5v7       0.0       0.0   
1      2020-05-15 00:00:00   4135001  1IF53ai7Xc0U56Y       0.0       0.0   
2      2020-05-15 00:00:00   4135001  3PZuoBAID5Wc2HD       0.0       0.0   
3      2020-05-15 00:00:00   4135001  7JYdWkrLSPkdwr4       0.0       0.0   
4      2020-05-15 00:00:00   4135001  McdE0feGgRqW7Ca       0.0       0.0   
...                    ...       ...              ...       ...       ...   
137551 2020-06-17 23:45:00   4135001  uHbuxQJl8lW7ozc       0.0       0.0   
137552 2020-06-17 23:45:00   4135001  wCURE6d3bPkepu2       0.0       0.0   
137553 2020-06-17 23:45:00   4135001  z9Y9gH1T5YWrNuG       0.0       0.0   
137554 2020-06-17 23:45:00   4135001  zBIq5rxdHJRwDNY       0.0       0.0   
137555 2020-06-17 23:45:00   4135001  zVJPv84UY57bAof       0.0       0.0   

        DAILY_YIELD  TOTAL_YIELD CELL_NO  TIME_OF_DAY   AMB_TEMP   MOD_TEMP

In [6]:
# Data Munging - Convert Plant to Int

def convert_plant_to_int(plant):
    """Map a plant label to its integer code.

    The string ``"plant1"`` maps to 1 and every other string maps to 2.

    A value that is not a string is treated as an already-converted code:
    1 stays 1 and anything else becomes 2. This keeps the conversion
    idempotent, so re-running the notebook cell on a column that has already
    been converted does not collapse every plant to 2.

    Parameters
    ----------
    plant : str or int
        Plant label (``"plant1"`` or another string) or an integer code.

    Returns
    -------
    int
        1 for plant 1, otherwise 2.
    """
    if isinstance(plant, str):
        return 1 if plant == "plant1" else 2

    # Non-string input: an integer code produced by an earlier conversion.
    return 1 if plant == 1 else 2

# Replace the PLANT labels with their integer codes.
# Only the PLANT column is read, so map over it directly rather than using
# DataFrame.apply(axis=1), which builds a full row object for every record.
original_dataset_df['PLANT'] = original_dataset_df['PLANT'].apply(convert_plant_to_int)

print(original_dataset_df)

                 DATE_TIME  PLANT_ID       SOURCE_KEY  DC_POWER  AC_POWER  \
0      2020-05-15 00:00:00   4135001  1BY6WEcLGh8j5v7       0.0       0.0   
1      2020-05-15 00:00:00   4135001  1IF53ai7Xc0U56Y       0.0       0.0   
2      2020-05-15 00:00:00   4135001  3PZuoBAID5Wc2HD       0.0       0.0   
3      2020-05-15 00:00:00   4135001  7JYdWkrLSPkdwr4       0.0       0.0   
4      2020-05-15 00:00:00   4135001  McdE0feGgRqW7Ca       0.0       0.0   
...                    ...       ...              ...       ...       ...   
137551 2020-06-17 23:45:00   4135001  uHbuxQJl8lW7ozc       0.0       0.0   
137552 2020-06-17 23:45:00   4135001  wCURE6d3bPkepu2       0.0       0.0   
137553 2020-06-17 23:45:00   4135001  z9Y9gH1T5YWrNuG       0.0       0.0   
137554 2020-06-17 23:45:00   4135001  zBIq5rxdHJRwDNY       0.0       0.0   
137555 2020-06-17 23:45:00   4135001  zVJPv84UY57bAof       0.0       0.0   

        DAILY_YIELD  TOTAL_YIELD CELL_NO  TIME_OF_DAY   AMB_TEMP   MOD_TEMP

In [7]:
# Split data into training / evaluation sets
# The second argument (0.2) is presumably the fraction of rows held out for
# evaluation: the recorded output shows 27511 of 137556 rows (~20%) in the
# evaluation set — confirm against `split_data.split_train_eval`.
# NOTE(review): whether the split is shuffled and/or seeded is decided inside
# the helper; verify it is reproducible across kernel restarts.
training_set, evaluation_set = split_data.split_train_eval(original_dataset_df, 0.2)

Original Data Items: 137556
Training Data Items: 110045
Evaluation Data Items: 27511


In [8]:
# Remove the columns that the models below do not use

# Columns removed from both the training and the evaluation frame
columns_to_drop = [
    'DATE_TIME',
    'PLANT_ID',
    'SOURCE_KEY',
    'AC_POWER',
    'DAILY_YIELD',
    'TOTAL_YIELD',
]

# drop() returns a new frame; rebind both names to the reduced frames
training_set = training_set.drop(columns=columns_to_drop)
evaluation_set = evaluation_set.drop(columns=columns_to_drop)

# Preview the first five remaining rows of the evaluation frame
print(evaluation_set.head())

           DC_POWER CELL_NO  TIME_OF_DAY   AMB_TEMP   MOD_TEMP  IRRADIATION  \
23464      0.000000      05     0.041667  23.478941  22.007802     0.000000   
82416      0.000000      12     0.229167  23.216699  21.191993     0.000000   
131200     0.000000      03     0.968750  24.652915  23.913763     0.000000   
120917     0.000000      15     0.093750  24.696277  23.876865     0.000000   
98459   3486.857143      17     0.364583  25.788373  28.674120     0.215449   

        PLANT  
23464       1  
82416       2  
131200      2  
120917      2  
98459       2  


In [9]:
# Build feature / label pairs for the DC power target

# Training set: labels are the DC_POWER column, features are every other column
dc_label_data = training_set['DC_POWER'].copy()
dc_power_training_data = training_set.drop(columns=['DC_POWER'])

# Evaluation set: same separation
dc_eval_label_data = evaluation_set['DC_POWER'].copy()
dc_evaluation_data = evaluation_set.drop(columns=['DC_POWER'])

In [10]:
# Linear Regression Model (Target - DC Power)
lin_reg_dc = LinearRegression()
model_name = "LinReg_DC"

# Train model on the full training set
lin_reg_dc.fit(dc_power_training_data, dc_label_data)

# Save model; name_model() returns the name that is used for the saved file
# and as the label in the reports below
model_name = sklearn_helpers.name_model(model_name)
sklearn_helpers.save_model(lin_reg_dc, model_name)

# Compute RMSE via cross validation
# (the scorer returns negated MSE, so flip the sign before taking the root)
scores_mse = cross_val_score(lin_reg_dc, dc_power_training_data, dc_label_data, scoring = "neg_mean_squared_error", cv = 5)
scores_rmse = np.sqrt(-scores_mse)

# Compute MAE via cross validation (the scorer returns negated MAE)
scores_mae = cross_val_score(lin_reg_dc, dc_power_training_data, dc_label_data, scoring = "neg_mean_absolute_error", cv = 5)
scores_mae = -1 * scores_mae

# Visualise cross validation results
print(model_name, "rmse mean (cv):", scores_rmse.mean())
print(model_name, "rmse std (cv):", scores_rmse.std())

print(model_name, "mae mean (cv):", scores_mae.mean())
# This line reports the standard deviation, so label it as such
print(model_name, "mae std (cv):", scores_mae.std())

Save Path: C:\Developer\electric_motor_thermal_modelling\Models\WJ_LinReg_DC_2020_12_18-14_56_05.pkl
WJ_LinReg_DC_2020_12_18-14_56_05.pkl rmse mean (cv): 1322.4137001754764
WJ_LinReg_DC_2020_12_18-14_56_05.pkl rmse std (cv): 15.553634245007927
WJ_LinReg_DC_2020_12_18-14_56_05.pkl mae mean (cv): 691.8947488375384
WJ_LinReg_DC_2020_12_18-14_56_05.pkl mae (cv): 9.433653337987451


In [11]:
# Model Evaluation - Linear Regr (Target - DC Power)

# Describe model coefficients.
# Labels are matched to coefficients purely by position; this presumably
# mirrors the column order of dc_power_training_data (CELL_NO, TIME_OF_DAY,
# AMB_TEMP, MOD_TEMP, IRRADIATION, PLANT in the recorded preview) — TODO confirm
# if the feature columns ever change.
coefficient_labels = (
    "Cell_no Coef: ",
    "Time of Day Coef: ",
    "Amb Temp Coef: ",
    "Mod Temp Coef: ",
    "Irradiation Coef: ",
    "Plant Coef: ",
)
for coef_position, coef_label in enumerate(coefficient_labels):
    print(coef_label, lin_reg_dc.coef_[coef_position])

# Evaluate model on the held-out evaluation set
dc_power_pred_eval = lin_reg_dc.predict(dc_evaluation_data)
model_evaluation.evaluate_model(model_name, dc_eval_label_data, dc_power_pred_eval)

Cell_no Coef:  1.1808872444195757
Time of Day Coef:  80.95278604763084
Amb Temp Coef:  -102.71970025356467
Mod Temp Coef:  105.03360231841546
Irradiation Coef:  9363.511030818054
Plant Coef:  26.42004886454447
WJ_LinReg_DC_2020_12_18-14_56_05.pkl rmse (Eval): 1297.550312811934
WJ_LinReg_DC_2020_12_18-14_56_05.pkl mae (Eval): 676.8300452316188
WJ_LinReg_DC_2020_12_18-14_56_05.pkl r2 (Eval): 0.896825460687761


In [12]:
# Polynomial Regression (Target - DC Power)
lin_reg_poly_dc = LinearRegression()

# Create Polynomial Dataset: degree-2 expansion of the training features,
# without the constant column (LinearRegression fits its own intercept)
poly_features = PolynomialFeatures(degree = 2, include_bias = False)
dc_training_poly = poly_features.fit_transform(dc_power_training_data)

# Train Model on the expanded training features
lin_reg_poly_dc.fit(dc_training_poly, dc_label_data)

model_name = "LinRegPoly_DC"

# Compute RMSE via cross validation
# (the scorer returns negated MSE, so flip the sign before taking the root)
scores_mse = cross_val_score(lin_reg_poly_dc, dc_training_poly, dc_label_data, scoring = "neg_mean_squared_error", cv = 5)
scores_rmse = np.sqrt(-scores_mse)

# Compute MAE via cross validation (the scorer returns negated MAE)
scores_mae = cross_val_score(lin_reg_poly_dc, dc_training_poly, dc_label_data, scoring = "neg_mean_absolute_error", cv = 5)
scores_mae = -1 * scores_mae

# Visualise cross validation results
print(model_name, "rmse mean (cv):", scores_rmse.mean())
print(model_name, "rmse std (cv):", scores_rmse.std())

print(model_name, "mae mean (cv):", scores_mae.mean())
# This line reports the standard deviation, so label it as such
print(model_name, "mae std (cv):", scores_mae.std())

LinRegPoly_DC rmse mean (cv): 1197.7440312946387
LinRegPoly_DC rmse std (cv): 13.603250846581462
LinRegPoly_DC mae mean (cv): 656.1798886451742
LinRegPoly_DC mae (cv): 6.851284258821517


In [13]:
# Polynomial Regression - Model Evaluation (Target - DC Power)

# Describe model coefficients
print(lin_reg_poly_dc.coef_)

# Evaluate model (includes poly conversion of input data).
# The transformer was already fitted on the training features, so only
# transform() the evaluation features here: evaluation data must never be used
# to (re)fit a preprocessing step, and transform() also checks that the
# evaluation frame has the same number of features the transformer was fitted on.
dc_eval_poly = poly_features.transform(dc_evaluation_data)
dc_power_pred_eval = lin_reg_poly_dc.predict(dc_eval_poly)
model_evaluation.evaluate_model(model_name, dc_eval_label_data, dc_power_pred_eval)

[-1.70200321e+00  9.05123071e+03  8.28311591e+02 -2.01390393e+02
  2.03162007e+04 -1.73059965e+02 -9.54497658e-02  2.74757124e+00
  3.30993032e-01 -1.80754253e-01  7.61630781e+00 -9.53282158e-01
 -3.16995305e+03  1.03437939e+02 -4.39033826e+02  2.00541032e+04
  9.85043058e+02 -1.78352445e+01  8.05052310e+00 -2.62149468e+02
 -7.17846101e+01  2.90147144e-02  1.59026900e+01  1.39571163e+02
 -5.51626261e+03 -5.34751678e+03 -5.19179894e+02]
LinRegPoly_DC rmse (Eval): 1173.1783999940685
LinRegPoly_DC mae (Eval): 642.3607270320235
LinRegPoly_DC r2 (Eval): 0.915656379067132
