In [1]:
# Module Importations
import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Report the versions of the core libraries (sklearn, numpy, pandas) in use
print(*(library.__version__ for library in (sklearn, np, pd)))

0.22.2.post1 1.19.4 1.1.4


In [2]:
# Custom Module Imports
from Source.data import load_data
from Source.data import split_data

In [3]:
# Constants

In [4]:
# Load data from pickle
# The helper lives in the project's Source.data.load_data module and is given
# only a bare file name, so the directory it reads from is decided inside it.
# NOTE(review): presumably this unpickles the file; unpickling can execute
# arbitrary code, so confirm the file is only ever produced by this project.
original_dataset_df = load_data.load_pickled_data('full_data_df.pkl')

Loaded pickled dataframe ...


In [5]:
# Split data into training / evaluation sets
# The second argument (0.2) looks like the evaluation fraction: the helper's
# printed report shows 27511 of 137556 rows (~20%) going to the evaluation set.
# NOTE(review): whether the split is shuffled with a fixed seed is not visible
# here - confirm in split_data.split_train_eval so re-runs are reproducible.
training_set, evaluation_set = split_data.split_train_eval(original_dataset_df, 0.2)

Original Data Items: 137556
Training Data Items: 110045
Evaluation Data Items: 27511


In [6]:
# Drop unrequired data columns

# Columns removed from both the training and the evaluation split
columns_to_drop = [
    'DATE_TIME',
    'PLANT_ID',
    'SOURCE_KEY',
    'AC_POWER',
    'DAILY_YIELD',
    'TOTAL_YIELD',
]

# Both names are rebound to the reduced frames, so running this cell a second
# time without re-running the split raises a KeyError (columns already gone).
training_set = training_set.drop(columns = columns_to_drop)
evaluation_set = evaluation_set.drop(columns = columns_to_drop)

# Preview the first five rows of the reduced evaluation split
print(evaluation_set.head())

           DC_POWER CELL_NO TIME_OF_DAY   AMB_TEMP   MOD_TEMP  IRRADIATION  \
23464      0.000000       5    01:00:00  23.478941  22.007802     0.000000   
82416      0.000000      12    05:30:00  23.216699  21.191993     0.000000   
131200     0.000000       3    23:15:00  24.652915  23.913763     0.000000   
120917     0.000000      15    02:15:00  24.696277  23.876865     0.000000   
98459   3486.857143      17    08:45:00  25.788373  28.674120     0.215449   

         PLANT  
23464   plant1  
82416   plant2  
131200  plant2  
120917  plant2  
98459   plant2  


In [9]:
# Build feature / label pairs for the DC power target

# Training split: DC_POWER becomes the label, every other column is a feature
dc_label_data = training_set['DC_POWER'].copy()
dc_power_training_data = training_set.drop(columns = ['DC_POWER'])

# Evaluation split: same separation of label and features
dc_eval_label_data = evaluation_set['DC_POWER'].copy()
dc_evaluation_data = evaluation_set.drop(columns = ['DC_POWER'])

In [8]:
# Linear Regression Model (Target - DC Power)
lin_reg_dc = LinearRegression()
model_name = "LinReg_DC"

# NOTE(review): the evaluation-set preview printed earlier shows TIME_OF_DAY
# and PLANT as text-like values; LinearRegression needs numeric features, so
# confirm these columns are encoded before this cell runs.

# Train model on the whole training split. cross_val_score below fits its own
# clones per fold, so this is the fitted instance left available afterwards.
lin_reg_dc.fit(dc_power_training_data, dc_label_data)

# Compute RMSE via cross validation
# scikit-learn scorers follow "higher is better", so error metrics come back
# negated; flip the sign to get per-fold MSE, then take the root for RMSE.
scores_mse = -cross_val_score(lin_reg_dc, dc_power_training_data, dc_label_data, scoring = "neg_mean_squared_error", cv = 5)
scores_rmse = np.sqrt(scores_mse)

# Compute MAE via cross validation (sign flipped for the same reason)
scores_mae = -cross_val_score(lin_reg_dc, dc_power_training_data, dc_label_data, scoring = "neg_mean_absolute_error", cv = 5)

# Visualise cross validation results: mean and spread across the 5 folds
print(model_name, "rmse mean (cv):", scores_rmse.mean())
print(model_name, "rmse std (cv):", scores_rmse.std())

print(model_name, "mae mean (cv):", scores_mae.mean())
print(model_name, "mae std (cv):", scores_mae.std())

# Evaluate Model