In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
%timeit

In [2]:
df_train_est_A = pd.read_csv('train_est_A.csv')
df_train_obs_A = pd.read_csv('train_obs_A.csv')
df_test_A = pd.read_csv('X_test_A.csv')
df_train_est_A = df_train_est_A.rename(columns={'Unnamed: 0': 'date_forecast'})

df_train_obs_A = df_train_obs_A.rename(columns={'Unnamed: 0': 'date_forecast'})

df_test_A = df_test_A.rename(columns={'Unnamed: 0': 'date_forecast'})

df_train_est_A.set_index('date_forecast', inplace=True)
df_train_obs_A.set_index('date_forecast', inplace=True)
df_test_A.set_index('date_forecast', inplace=True)
df_train_obs_A.index = pd.to_datetime(df_train_obs_A.index)
df_train_obs_A.index = pd.to_datetime(df_train_obs_A.index)
df_train_est_A=df_train_est_A.drop(columns=['date_calc'])
df_test_A=df_test_A.drop(columns=['date_calc'])
df = pd.concat([df_train_obs_A,df_train_est_A])

In [3]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
# Imputa i valori mancanti
df_knn =pd.DataFrame(imputer.fit_transform(df),columns = df.columns, index = df.index)

In [4]:
columns = df.drop(columns = 'pv_measurement').columns

In [5]:
from sklearn.preprocessing import RobustScaler
# Crea un oggetto MinMaaxScaler
scaler = RobustScaler()
# Normalizza il DataFrame
df_normalized = pd.DataFrame(scaler.fit_transform(df_knn[columns]),columns = df_knn[columns].columns, index = df_knn[columns].index)
df_normalized['pv_measurement'] = df[ 'pv_measurement']

In [6]:
df_normalized.index = pd.to_datetime(df_normalized.index)
df_normalized['hours'] = df_normalized.index.hour
df_normalized['day'] = df_normalized.index.day
df_normalized['month'] = df_normalized.index.month
df_test_A.index = pd.to_datetime(df_test_A.index)
df_test_A['hours'] = df_test_A.index.hour
df_test_A['day'] = df_test_A.index.day
df_test_A['month'] = df_test_A.index.month
df_normalized['hour_cos'] = np.cos(df_normalized['hours'] * (2. * np.pi / 24))
df_normalized['month_cos'] = np.cos(df_normalized['month'] * (2. * np.pi / 12))
df_normalized['hour_sin'] = np.sin(df_normalized['hours'] * (2. * np.pi / 24))
df_normalized['month_sin'] = np.sin(df_normalized['month'] * (2. * np.pi / 12))
df_test_A['hour_cos'] = np.cos(df_test_A['hours'] * (2. * np.pi / 24))
df_test_A['month_cos'] = np.cos(df_test_A['month'] * (2. * np.pi / 12))
df_test_A['hour_sin'] = np.sin(df_test_A['hours'] * (2. * np.pi / 24))
df_test_A['month_sin'] = np.sin(df_test_A['month'] * (2. * np.pi / 12))

In [7]:
import pandas as pd
import math

# Function to calculate solar zenith angle
def calculate_solar_zenith(row):
    t = row['hours']
    Az = row['sun_azimuth:d']
    
    # Calculate the solar zenith angle (θ) using a simplified model
    solar_zenith_angle = 90 - abs(t - 12) * (90 / 6)  # Assumes the sun is highest at noon
    
    # Ensure the solar zenith angle is between 0 and 90 degrees
    solar_zenith_angle = max(0, min(90, solar_zenith_angle))
    
    return solar_zenith_angle

# Apply the function to each row in the DataFrame
df_test_A['Solar Zenith Angle (degrees)'] = df_test_A.apply(calculate_solar_zenith, axis=1)

# Display the updated DataFrame
# Apply the function to each row in the DataFrame
df_normalized['Solar Zenith Angle (degrees)'] = df_normalized.apply(calculate_solar_zenith, axis=1)


In [8]:
def calculate_total_radiation(row):
    DNI = row['direct_rad:W']
    solar_zenith_angle = row['Solar Zenith Angle (degrees)']
    DHI = row['diffuse_rad:W']
    # Calculate total solar radiation (G) using the formula
    G = DNI * math.cos(math.radians(solar_zenith_angle)) + DHI
    
    return G

# Apply the function to each row in the DataFrame
df_test_A['Total Solar Radiation (W/m²)'] = df_test_A.apply(calculate_total_radiation, axis=1)
df_normalized['Total Solar Radiation (W/m²)'] = df_normalized.apply(calculate_total_radiation, axis=1)


In [9]:
df = df_normalized

In [10]:
param = {'n_jobs' : -1}

In [11]:
model = RandomForestRegressor(**param)

In [12]:
X_train = df.drop(columns = 'pv_measurement')
y_train = df['pv_measurement']

In [13]:
model.fit(X_train,y_train)

In [14]:
feature_importance = model.feature_importances_
# Ottieni i nomi delle colonne dalle feature
feature_names = X_train.columns

feature_df = pd.DataFrame({'Name' : feature_names, 'Importance':feature_importance})

In [22]:
feature_df = feature_df.sort_values(by='Importance', ascending=False)
feature_df = feature_df.reset_index()
feature_df = feature_df.drop(columns = ['index'])
feature_df['Name'][:10]

0                    direct_rad:W
1    Total Solar Radiation (W/m²)
2                   diffuse_rad:W
3                 clear_sky_rad:W
4                   sun_azimuth:d
5             wind_speed_u_10m:ms
6             wind_speed_v_10m:ms
7            ceiling_height_agl:m
8               wind_speed_10m:ms
9                cloud_base_agl:m
Name: Name, dtype: object

In [25]:
feature_test = feature_df['Name'][:10].tolist()
target = 'pv_measurement'


In [26]:
feature_test 

['direct_rad:W',
 'Total Solar Radiation (W/m²)',
 'diffuse_rad:W',
 'clear_sky_rad:W',
 'sun_azimuth:d',
 'wind_speed_u_10m:ms',
 'wind_speed_v_10m:ms',
 'ceiling_height_agl:m',
 'wind_speed_10m:ms',
 'cloud_base_agl:m']

In [27]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_absolute_error

# Sample DataFrame (replace this with your own dataset)
data = df[feature_test]
data[target] = df[target]
# Define the number of splits (e.g., 5 for 80-20 train-test splits)
n_splits = 5


# Initialize the TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=n_splits)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[target] = df[target]


In [28]:
def train(data):
    model = RandomForestRegressor(**param)
    maelist = []
    for train_index, test_index in tscv.split(data):
            train_data = data.iloc[train_index]
            test_data = data.iloc[test_index]

            # Extract target variable for training and testing data
            y_train = train_data['pv_measurement']
            y_test = test_data['pv_measurement']

            # Extract features for training and testing data
            X_train = train_data.drop(columns = 'pv_measurement')
            X_test = test_data.drop(columns = 'pv_measurement')

            # Train the XGBoost model
            model.fit(X_train, y_train)

            # Make predictions on the test data
            y_pred = model.predict(X_test)

            # Evaluate the model using Mean Absolute Error (MAE)
            mae = mean_absolute_error(y_test, y_pred)
            print("Mean Absolute Error:", mae)
            maelist.append(mae)
        # Return MAE
    mean_mae = np.mean(maelist)
    print(mean_mae)
    return mean_mae

In [29]:
train(df)
train(data)

Mean Absolute Error: 336.598494084507
Mean Absolute Error: 152.34903633802818
Mean Absolute Error: 224.81563848591546
Mean Absolute Error: 246.37173038732394
Mean Absolute Error: 137.67988117957745
219.56295609507043
Mean Absolute Error: 347.02409964788734
Mean Absolute Error: 152.3675511971831
Mean Absolute Error: 227.96196674295777
Mean Absolute Error: 255.6969683802817
Mean Absolute Error: 141.75956117957747
224.96202942957748


224.96202942957748

In [33]:
data

Unnamed: 0_level_0,direct_rad:W,Total Solar Radiation (W/m²),diffuse_rad:W,clear_sky_rad:W,sun_azimuth:d,wind_speed_u_10m:ms,wind_speed_v_10m:ms,ceiling_height_agl:m,wind_speed_10m:ms,cloud_base_agl:m,pv_measurement
date_forecast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-06-02 22:00:00,0.000000,-0.029016,-0.029016,-0.013915,0.983563,-0.951219,-0.457447,-0.179721,0.360825,0.298824,0.00
2019-06-02 23:00:00,0.000000,-0.029016,-0.029016,-0.013915,-0.515798,-0.896341,-0.127660,-0.191187,0.257732,0.275195,0.00
2019-06-03 00:00:00,0.000000,-0.029016,-0.029016,-0.013915,-0.966948,-0.798780,0.074468,-0.228290,0.134021,0.198738,0.00
2019-06-03 01:00:00,0.000000,-0.024770,-0.024770,-0.010933,-0.886753,-0.713415,0.127660,-0.310293,0.000000,0.029759,0.00
2019-06-03 02:00:00,0.004392,0.144874,0.140481,0.077925,-0.808480,-0.652439,0.148936,-0.392332,-0.072165,-0.139296,19.36
...,...,...,...,...,...,...,...,...,...,...,...
2023-04-30 19:00:00,0.002928,0.013898,0.010970,0.002882,0.731202,1.091463,0.574468,-0.195767,1.010309,-0.417586,9.02
2023-04-30 20:00:00,0.000000,-0.029016,-0.029016,-0.013915,0.811344,0.902439,0.734043,-0.169917,0.793814,-0.415352,0.00
2023-04-30 21:00:00,0.000000,-0.029016,-0.029016,-0.013915,0.894260,0.792683,0.861702,-0.189187,0.711340,-0.414174,0.00
2023-04-30 22:00:00,0.000000,-0.029016,-0.029016,-0.013915,0.979706,0.695122,0.840426,-0.289785,0.567010,-0.426600,0.00


In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import make_regression

# Genera un dataset di esempio (sostituisci con i tuoi dati)
X, y = data[feature_test], data[target]

# Definisci una lista di valori da testare per ciascun iperparametro
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'n_jobs' : [-1],
}

# Inizializza il modello Random Forest
rf = RandomForestRegressor()

# Inizializza GridSearchCV con il modello, il parametro da ottimizzare e la cross-validation
grid_search = RandomizedSearchCV(rf, param_grid, cv=tscv, scoring='neg_mean_absolute_error', n_jobs = -1, verbose = 3)

# Esegui la ricerca degli iperparametri ottimali
grid_search.fit(X, y)

# Ottieni i migliori iperparametri trovati
best_params = grid_search.best_params_
best_mae = -grid_search.best_score_  # Negativo MAE, quindi invertiamo il segno

print("Migliori iperparametri:", best_params)
print("Miglior MAE:", best_mae)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


KeyboardInterrupt: 

In [None]:
param = {'n_jobs':-1,'n_estimators':100, 'random_state':42}

In [None]:
model_A = RandomForestRegressor(**param)

In [None]:
X_train = data.drop(columns = 'pv_measurement')
y_train = data['pv_measurement']

In [None]:
model_A.fit(X_train,y_train)
df_test_A = df_test_A[feature_test]
y_pred_A = model_A.predict(df_test_A)

In [None]:
plt.plot(range(len(y_pred_A)), y_pred_A, label='Predizione', color='red', linestyle='-')

# Aggiungi una legenda
plt.legend()

# Aggiungi etichette agli assi
plt.ylabel('Valori y')

# Titolo del grafico
plt.title('Grafico di Predizione')

# Mostra il grafico
plt.show()

In [None]:
df = pd.DataFrame({'Prediction':y_pred_A})

In [None]:
threshold = 0.05
df['Prediction'] = df['Prediction'].apply(lambda x: 0 if x < threshold else x)

In [None]:
df.to_csv('A.csv')

In [None]:
df.head(50)

In [None]:
target = 'pv_measurement'