In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

from tpot import TPOTRegressor



In [2]:
df = pd.read_csv("solar_PV_forecaster/solar_project_data.csv")

In [3]:
# Convert the object column to datetime
df['tstamp'] = pd.to_datetime(df['tstamp'])

# Set the timestamp column as the DataFrame index
df.set_index('tstamp', inplace=True)

In [4]:
column_indices_to_drop = [0, 1, 2, 3]
# Drop the specified columns by index from the DataFrame
X = df.drop(df.columns[column_indices_to_drop], axis=1)

y = df['ptot']

In [5]:
scaler = StandardScaler()
X[['Power Loss Event', 'AirTemp', 'Azimuth', 'CloudOpacity', 'DewpointTemp', 'Dhi', 'Dni', 'Ebh', 'Ghi', 'PrecipitableWater', 'RelativeHumidity', 'Zenith', 'AlbedoDaily']] = scaler.fit_transform(X[['Power Loss Event', 'AirTemp', 'Azimuth', 'CloudOpacity', 'DewpointTemp', 'Dhi', 'Dni', 'Ebh', 'Ghi', 'PrecipitableWater', 'RelativeHumidity', 'Zenith', 'AlbedoDaily']])

In [6]:
df_class = df['ptot'].values
X_train, X_test, y_train, y_test = train_test_split(df, df_class,
                                                    train_size=0.75, test_size=0.25)


In [7]:
tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2)

In [8]:
tpot.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -5.12078270567461e-06

Generation 2 - Current best internal CV score: -3.805013387398217e-24

Generation 3 - Current best internal CV score: -3.805013387398217e-24

Generation 4 - Current best internal CV score: -2.143490963820017e-26

Generation 5 - Current best internal CV score: -2.143490963820017e-26

Best pipeline: LassoLarsCV(RobustScaler(input_matrix), normalize=False)


In [10]:
print("TPOT cross-validation MSE")
print(tpot.score(X_test, y_test))

TPOT cross-validation MSE
-2.6690221615655106e-26


In [11]:
from sklearn.metrics import mean_squared_error


In [12]:
print('MSE:')
print(mean_squared_error(y_test, tpot.predict(X_test)))

MSE:
2.6690221615655106e-26


In [13]:
print('RMSE:')
print(np.sqrt(mean_squared_error(y_test, tpot.predict(X_test))))

RMSE:
1.633714222734659e-13


In [14]:
# Evaluate the TPOT model on the test set
score = tpot.score(X_test, y_test)
print(f'TPOT test set R2 score: {score}')

# Export the optimized pipeline code to a Python script (optional)
tpot.export('tpot_solar_pv_pipeline.py')

TPOT test set R2 score: -2.6690221615655106e-26
