In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


In [2]:
df = pd.read_csv("solar_PV_forecaster/solar_project_data.csv")

In [3]:
df

Unnamed: 0,tstamp,meter_id,impwh,expwh,ptot,Power Loss Event,AirTemp,Azimuth,CloudOpacity,DewpointTemp,Dhi,Dni,Ebh,Ghi,PrecipitableWater,RelativeHumidity,Zenith,AlbedoDaily
0,2020/11/29 05:00,5884,59.68,1.183848e+09,0.0,2,16.7,-105,0.0,11.9,58,679,190,248,14.8,73.5,74,0.10
1,2020/11/29 05:05,5884,59.68,1.183848e+09,0.0,2,16.7,-105,0.0,11.9,60,696,207,266,14.8,73.2,73,0.10
2,2020/11/29 05:10,5884,59.68,1.183848e+09,0.0,2,16.7,-104,0.0,11.9,61,713,223,285,14.8,73.1,72,0.10
3,2020/11/29 05:15,5884,59.68,1.183848e+09,0.0,2,16.8,-103,0.0,11.9,63,729,240,303,14.8,72.9,71,0.10
4,2020/11/29 05:20,5884,59.68,1.183848e+09,0.0,2,16.8,-103,0.0,11.9,65,744,258,322,14.8,72.8,70,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
626012,2023/11/27 19:40,7672,101261.92,2.211767e+09,0.0,2,17.7,129,27.3,14.1,13,28,13,26,23.6,79.5,103,0.09
626013,2023/11/27 19:45,7672,101261.92,2.211767e+09,0.0,2,17.7,130,27.1,14.1,12,27,12,25,23.6,79.6,104,0.09
626014,2023/11/27 19:50,7672,101261.92,2.211767e+09,0.0,2,17.7,130,27.1,14.1,11,25,12,23,23.6,79.8,104,0.09
626015,2023/11/27 19:55,7672,101261.92,2.211767e+09,0.0,2,17.6,131,27.8,14.1,11,24,11,22,23.5,79.9,105,0.09


In [4]:
# Convert the object column to datetime
df['tstamp'] = pd.to_datetime(df['tstamp'])

# Set the timestamp column as the DataFrame index
df.set_index('tstamp', inplace=True)

df

Unnamed: 0_level_0,meter_id,impwh,expwh,ptot,Power Loss Event,AirTemp,Azimuth,CloudOpacity,DewpointTemp,Dhi,Dni,Ebh,Ghi,PrecipitableWater,RelativeHumidity,Zenith,AlbedoDaily
tstamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-11-29 05:00:00,5884,59.68,1.183848e+09,0.0,2,16.7,-105,0.0,11.9,58,679,190,248,14.8,73.5,74,0.10
2020-11-29 05:05:00,5884,59.68,1.183848e+09,0.0,2,16.7,-105,0.0,11.9,60,696,207,266,14.8,73.2,73,0.10
2020-11-29 05:10:00,5884,59.68,1.183848e+09,0.0,2,16.7,-104,0.0,11.9,61,713,223,285,14.8,73.1,72,0.10
2020-11-29 05:15:00,5884,59.68,1.183848e+09,0.0,2,16.8,-103,0.0,11.9,63,729,240,303,14.8,72.9,71,0.10
2020-11-29 05:20:00,5884,59.68,1.183848e+09,0.0,2,16.8,-103,0.0,11.9,65,744,258,322,14.8,72.8,70,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-27 19:40:00,7672,101261.92,2.211767e+09,0.0,2,17.7,129,27.3,14.1,13,28,13,26,23.6,79.5,103,0.09
2023-11-27 19:45:00,7672,101261.92,2.211767e+09,0.0,2,17.7,130,27.1,14.1,12,27,12,25,23.6,79.6,104,0.09
2023-11-27 19:50:00,7672,101261.92,2.211767e+09,0.0,2,17.7,130,27.1,14.1,11,25,12,23,23.6,79.8,104,0.09
2023-11-27 19:55:00,7672,101261.92,2.211767e+09,0.0,2,17.6,131,27.8,14.1,11,24,11,22,23.5,79.9,105,0.09


In [5]:
unique_meter_ids = df['meter_id'].unique()
print(unique_meter_ids)

[5884 6508 7657 7672]


In [6]:
# Assuming 'meter_id' is the identifier for the meter you want to model
selected_meter_id = 5884
df_meter = df[df['meter_id'] == selected_meter_id]

# Select features and target variable
all_features = ['AirTemp', 'Azimuth', 'CloudOpacity', 'DewpointTemp', 'Dhi', 'Dni',	'Ebh', 'Ghi', 'PrecipitableWater', 'RelativeHumidity', 'Zenith', 'AlbedoDaily']
target = 'ptot'

# Create a new DataFrame with all features and the target for the selected meter
df_model = df_meter[all_features + [target]]

# # Handle missing values if necessary
# df_model.dropna(inplace=True)

# Split the data into training and testing sets
X = df_model[all_features]
y = df_model[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=42)

# Scale the data using MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature selection using RandomForestRegressor
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
selector = SelectFromModel(estimator=model_rf).fit(X_train_scaled, y_train)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

In [None]:
# Hyperparameter optimization using GridSearchCV with TimeSeriesSplit
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [9]:
model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=TimeSeriesSplit(n_splits=5), scoring='neg_mean_squared_error')
grid_search.fit(X_train_selected, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

In [10]:
# Train the final model with the best hyperparameters
final_model = RandomForestRegressor(random_state=42, **best_params)
final_model.fit(X_train_selected, y_train)

# Evaluate the model using cross-validation
cv_results = cross_val_score(final_model, X_test_selected, y_test, cv=TimeSeriesSplit(n_splits=5), scoring='neg_mean_squared_error')
mse_cv = -cv_results.mean()
print(f'Mean Squared Error using Cross-Validation: {mse_cv}')

Mean Squared Error using Cross-Validation: 9622.194330332099
