In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [2]:
file_path = '/home/chad/code/ChadReich/solar_PV_forecaster/solar_project_data.csv'

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

df

Unnamed: 0,tstamp,meter_id,impwh,expwh,ptot,Power Loss Event,AirTemp,Azimuth,CloudOpacity,DewpointTemp,Dhi,Dni,Ebh,Ghi,PrecipitableWater,RelativeHumidity,Zenith,AlbedoDaily
0,2020-11-29 05:00:00,5884,59.68,1.183848e+09,0.0,2,16.7,-105,0.0,11.9,58,679,190,248,14.8,73.5,74,0.10
1,2020-11-29 05:05:00,5884,59.68,1.183848e+09,0.0,2,16.7,-105,0.0,11.9,60,696,207,266,14.8,73.2,73,0.10
2,2020-11-29 05:10:00,5884,59.68,1.183848e+09,0.0,2,16.7,-104,0.0,11.9,61,713,223,285,14.8,73.1,72,0.10
3,2020-11-29 05:15:00,5884,59.68,1.183848e+09,0.0,2,16.8,-103,0.0,11.9,63,729,240,303,14.8,72.9,71,0.10
4,2020-11-29 05:20:00,5884,59.68,1.183848e+09,0.0,2,16.8,-103,0.0,11.9,65,744,258,322,14.8,72.8,70,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
626012,2023-11-27 19:40:00,7672,101261.92,2.211767e+09,0.0,2,17.7,129,27.3,14.1,13,28,13,26,23.6,79.5,103,0.09
626013,2023-11-27 19:45:00,7672,101261.92,2.211767e+09,0.0,2,17.7,130,27.1,14.1,12,27,12,25,23.6,79.6,104,0.09
626014,2023-11-27 19:50:00,7672,101261.92,2.211767e+09,0.0,2,17.7,130,27.1,14.1,11,25,12,23,23.6,79.8,104,0.09
626015,2023-11-27 19:55:00,7672,101261.92,2.211767e+09,0.0,2,17.6,131,27.8,14.1,11,24,11,22,23.5,79.9,105,0.09


In [3]:
# Convert the object column to datetime
df['tstamp'] = pd.to_datetime(df['tstamp'])

# Set the timestamp column as the DataFrame index
df.set_index('tstamp', inplace=True)

df

Unnamed: 0_level_0,meter_id,impwh,expwh,ptot,Power Loss Event,AirTemp,Azimuth,CloudOpacity,DewpointTemp,Dhi,Dni,Ebh,Ghi,PrecipitableWater,RelativeHumidity,Zenith,AlbedoDaily
tstamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-11-29 05:00:00,5884,59.68,1.183848e+09,0.0,2,16.7,-105,0.0,11.9,58,679,190,248,14.8,73.5,74,0.10
2020-11-29 05:05:00,5884,59.68,1.183848e+09,0.0,2,16.7,-105,0.0,11.9,60,696,207,266,14.8,73.2,73,0.10
2020-11-29 05:10:00,5884,59.68,1.183848e+09,0.0,2,16.7,-104,0.0,11.9,61,713,223,285,14.8,73.1,72,0.10
2020-11-29 05:15:00,5884,59.68,1.183848e+09,0.0,2,16.8,-103,0.0,11.9,63,729,240,303,14.8,72.9,71,0.10
2020-11-29 05:20:00,5884,59.68,1.183848e+09,0.0,2,16.8,-103,0.0,11.9,65,744,258,322,14.8,72.8,70,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-27 19:40:00,7672,101261.92,2.211767e+09,0.0,2,17.7,129,27.3,14.1,13,28,13,26,23.6,79.5,103,0.09
2023-11-27 19:45:00,7672,101261.92,2.211767e+09,0.0,2,17.7,130,27.1,14.1,12,27,12,25,23.6,79.6,104,0.09
2023-11-27 19:50:00,7672,101261.92,2.211767e+09,0.0,2,17.7,130,27.1,14.1,11,25,12,23,23.6,79.8,104,0.09
2023-11-27 19:55:00,7672,101261.92,2.211767e+09,0.0,2,17.6,131,27.8,14.1,11,24,11,22,23.5,79.9,105,0.09


In [4]:
# Get unique meters
unique_meters = [5884, 6508, 7657, 7672]

# Create a dictionary to store DataFrames for each meter
meter_dataframes = {}

# Split the DataFrame based on each meter
for meter in unique_meters:
    meter_dataframes[meter] = df[df['meter_id'] == meter].copy()

In [5]:
df_5884 = meter_dataframes[5884]
df_6508 = meter_dataframes[6508]
df_7657 = meter_dataframes[7657]
df_7672 = meter_dataframes[7672]

In [6]:
df_5884

Unnamed: 0_level_0,meter_id,impwh,expwh,ptot,Power Loss Event,AirTemp,Azimuth,CloudOpacity,DewpointTemp,Dhi,Dni,Ebh,Ghi,PrecipitableWater,RelativeHumidity,Zenith,AlbedoDaily
tstamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-11-29 05:00:00,5884,59.68,1.183848e+09,0.000,2,16.7,-105,0.0,11.9,58,679,190,248,14.8,73.5,74,0.1
2020-11-29 05:05:00,5884,59.68,1.183848e+09,0.000,2,16.7,-105,0.0,11.9,60,696,207,266,14.8,73.2,73,0.1
2020-11-29 05:10:00,5884,59.68,1.183848e+09,0.000,2,16.7,-104,0.0,11.9,61,713,223,285,14.8,73.1,72,0.1
2020-11-29 05:15:00,5884,59.68,1.183848e+09,0.000,2,16.8,-103,0.0,11.9,63,729,240,303,14.8,72.9,71,0.1
2020-11-29 05:20:00,5884,59.68,1.183848e+09,0.000,2,16.8,-103,0.0,11.9,65,744,258,322,14.8,72.8,70,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-28 09:45:00,5884,84.80,3.407400e+09,399.850,0,19.2,-66,17.0,14.8,268,556,483,751,22.7,77.0,30,0.1
2023-11-28 09:50:00,5884,84.80,3.407429e+09,347.072,0,19.3,-65,16.5,14.9,266,568,497,764,22.6,76.7,30,0.1
2023-11-28 09:55:00,5884,84.80,3.407443e+09,176.836,0,19.3,-63,15.9,14.9,264,580,512,776,22.5,76.5,29,0.1
2023-11-28 10:00:00,5884,84.80,3.407478e+09,412.825,0,19.4,-60,15.5,14.9,263,590,525,788,22.4,76.3,28,0.1


In [14]:
# Select features and target variable
features = ['AirTemp', 'CloudOpacity', 'Dhi', 'PrecipitableWater']
target = 'ptot'

# Create a new DataFrame with only the selected features and target
df_model = df[features + [target]]
df_model.head()

Unnamed: 0_level_0,AirTemp,CloudOpacity,Dhi,PrecipitableWater,ptot
tstamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-11-29 05:00:00,16.7,0.0,58,14.8,0.0
2020-11-29 05:05:00,16.7,0.0,60,14.8,0.0
2020-11-29 05:10:00,16.7,0.0,61,14.8,0.0
2020-11-29 05:15:00,16.8,0.0,63,14.8,0.0
2020-11-29 05:20:00,16.8,0.0,65,14.8,0.0


In [15]:
y = df_model["ptot"]
X = df_model.drop(columns="ptot")
X.head()

Unnamed: 0_level_0,AirTemp,CloudOpacity,Dhi,PrecipitableWater
tstamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-11-29 05:00:00,16.7,0.0,58,14.8
2020-11-29 05:05:00,16.7,0.0,60,14.8
2020-11-29 05:10:00,16.7,0.0,61,14.8
2020-11-29 05:15:00,16.8,0.0,63,14.8
2020-11-29 05:20:00,16.8,0.0,65,14.8


In [16]:
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
X.shape

(626017, 4)

In [11]:
# Handle missing values if necessary
df_model.dropna(inplace=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df_model[features], df_model[target], test_size=0.3, random_state=42
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model.dropna(inplace=True)


In [12]:
# Calculate the mean of the target variable in the training set
mean_ptot = y_train.mean()

# Create predictions using the mean for all instances
baseline_predictions = [mean_ptot] * len(y_test)

# Evaluate the baseline model
mse_baseline = mean_squared_error(y_test, baseline_predictions)

In [13]:
# Train a machine learning model (Random Forest Regressor in this example)
model = RandomForestRegressor(random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Now you can use the trained model for predictions on new data
# For example, if you have a new DataFrame 'new_data' with the same features:
# new_data_scaled = scaler.transform(new_data[features])
# new_predictions = model.predict(new_data_scaled)

Mean Squared Error: 6446.139394669184
