In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
# Sopprime tutti i FutureWarning
warnings.simplefilter(action='ignore', category=FutureWarning)

# ADD TIME FEATURES
def add_time_features(df, time_column):
    '''
        This function will add some time feature based on the param 'time_columns'

        Params:
            df-> Dataframe with the column contained in 'time_column'
            time_column -> the column that is a datetime object

        Returns:
            A dataframe with time features
    '''

    df[time_column] = pd.to_datetime(df[time_column])  # Make sure the time column is in datetime format
    df['hour'] = df[time_column].dt.hour
    df['day_of_week'] = df[time_column].dt.dayofweek
    df['month'] = df[time_column].dt.month
    df['day_of_year'] = df[time_column].dt.dayofyear
    df['week_of_year'] = df[time_column].dt.isocalendar().week
    df['year'] = df[time_column].dt.year
    #df['sin_hour'] = np.sin(np.pi * df[time_column].dt.hour/24.)
    #df['sin_month'] = np.sin(np.pi * df[time_column].dt.month/12.)
    #why these feature? Who knows
    return df

def plot_targets(targets):
    '''
        Plot the target, by a giving date

        Params:
            Targets-> A dataframe with the target value
            Start_date -> the start date
            End_date -> the end date

        Returns:
            Sto cazzo
    '''
    plt.figure(figsize=(15, 6))
    plt.plot(targets, label='PV Measurement', color='blue')
    plt.xlabel('Time')
    plt.ylabel('PV Measurement')
    plt.title('PV Measurement Over Time')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

def to_datetime(df,column):
    '''
        Make the column in datetime format
    '''
    return pd.to_datetime(df[column])

def resampling(df,column):
    '''
        Resample df to 1 hour using mean() as aggregator and drop rows where all columns are NaN

        Params :
            df -> the dataframe to be resampled
            column -> the time column
    '''
    return df.set_index(keys = column).resample('1H').mean().dropna(how='all').reset_index()

def filter_df(df,columnlist):
    return df.drop(columns = columnlist)

def extract_data_calc(df):
    '''
    This function create a dataframe with 'date_forecast' as index and the column 'date_calc' resampled by '1H'.
    If there's no data in a specific bin, the resulting value for that bin would be NaN (not a number).
    Params:
        df -> dataframe with 'date_forecast' and 'date_calc' columns.
            'date_calc' is expected to contain data that the user wants to resample or analyze.
    Returns:
        A dataframe with 'date_calc' resampled.
    '''
    return df.set_index('date_forecast')['date_calc'].resample('1H').first().to_frame()



def is_estimated_feature(df):
    '''
        This function will create some time feature and estimated information. It's need to let the model understand is
        estimated value.
        Params:
            df -> It MUST be an estimated dataframe, that contains 'data_forecast' as datetime type
        Returns:
            A dataframe with 'time_dummy', 'time_delta' and 'is_estimated'
    '''
    df['time_dummy'] = (df['date_forecast'] - df['date_forecast'].dt.normalize()).dt.total_seconds() / 3600
    df['time_delta'] = (df['date_calc'] - df['date_forecast']).dt.total_seconds() / 3600
    df['is_estimated'] = 1
    return df

def delete_stationarity(df):
    '''
    Removes constant stretches of data within a DataFrame where the 'pv_measurement' column does not change.
    The function identifies blocks of data where the 'pv_measurement' stays constant for more than two consecutive
    points and removes these blocks to address data stationarity.

    params:
        df -> DataFrame
              A pandas DataFrame with a 'pv_measurement' column which contains the data from which to remove stationarity.

    return:
        The DataFrame with constant stretches of data removed from the 'pv_measurement' column.
    '''

    #Calculate the difference, this need for check the constant
    df['diff'] = df['pv_measurement'].diff().fillna(0)

    # Create an indicator for constant stretches
    df['constant'] = (df['diff'] == 0).astype(int)

    # Use the indicator to mark stretches. The diff() function here identifies change-points.
    df['block'] = (df['constant'].diff() != 0).astype(int).cumsum()

    # Get the size of each constant block
    block_sizes = df.groupby('block')['constant'].sum()

    # Identify blocks that are constant for more than N consecutive time points (in this case 2)
    constant_blocks = block_sizes[block_sizes > 2].index

    # Remove the constant
    filtered_df = df[~df['block'].isin(constant_blocks)]

    return filtered_df.drop(columns=['diff', 'constant', 'block'])


def impute_nan(df):
    '''
        This function will impute the Nan in the give dataframe

        Params:
            df -> the dataframe to be imputed
        Return:
            The dataframe imputed
    '''

    for col in df.columns:
        df[col] = df[col].fillna(df[col].median())
    return df

import pandas as pd
from scipy import stats

def remove_outliers(df, column, method='IQR', **kwargs):
    '''
    Removes outliers from a specific column in a pandas DataFrame based on the selected method.
    Additionally, prints the number of outliers removed.

    Parameters:
    df : DataFrame
        The pandas DataFrame from which to remove outliers.
    column : str
        The name of the column from which to remove outliers.
    method : str, optional
        The method used to identify and remove outliers. Accepted values are 'IQR' for Interquartile Range or
        'Z-score'. The default is 'IQR'.
    **kwargs : additional keyword arguments
        Additional parameters required for the specified outlier removal method.

    Returns:
    DataFrame
        A new DataFrame with outliers removed from the specified column.
    '''

    if method == 'IQR':
        # Calculate the IQR (Interquartile Range) for the column
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1

        # Determine upper and lower bounds for outlier detection
        lower_bound = Q1 - (3 * IQR)
        upper_bound = Q3 + (3 * IQR)

        # Filtering before removing to determine the number of outliers
        outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
        non_outliers = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

    elif method == 'Z-score':
        # Calculate Z-scores for the column
        z_scores = stats.zscore(df[column].dropna())

        # Define a threshold for identifying outliers
        threshold = kwargs.get('threshold', 3) # Default threshold is 3

        # Create masks for outliers and non-outliers
        mask = (abs(z_scores) < threshold)
        non_outliers = df[mask]
        outliers = df[~mask]

    else:
        raise ValueError("Method not recognized. Use 'IQR' or 'Z-score'.")

    # Print the number of outliers removed
    num_outliers = len(outliers)
    #if(num_outliers > 0):
       # print(f'For {column} there was outliers {num_outliers}')

    # Return the DataFrame without outliers
    return non_outliers

# Example usage:
# Assuming `data` is your DataFrame and you want to remove outliers from the 'price' column using IQR method:
# cleaned_data = remove_outliers(data, 'price', method='IQR')

# Or using Z-score method with a specific threshold:
# cleaned_data = remove_outliers(data, 'price', method='Z-score', threshold=2.5)


def one_hot_encoding(df, columns_to_encode):
    '''
        Perform one-hot encoding on the selected columns
        Params:
            df: Dataframe
            columns_to_encode: List of columns
        Return
            Dataframe with columns encoded
    '''
    one_hot_encoded_df = pd.get_dummies(df[columns_to_encode], columns=columns_to_encode)
    return pd.concat([df, one_hot_encoded_df], axis=1)

def should_process_column(column_name):
    """
    Check if the column should be processed based on its name.

    Args:
    column_name (str): The name of the column to check.

    Returns:
    bool: True if the column should be processed, False otherwise.
    """
    excluded_substrings = ['idx', 'time', 'estimated']
    return not any(substring in column_name for substring in excluded_substrings)

def fix_encoding(df):
    df[['dew_or_rime:idx_-1.0', 'dew_or_rime:idx_-0.75', 'dew_or_rime:idx_-0.5', 'dew_or_rime:idx_-0.25', 'dew_or_rime:idx_0.75','is_estimated_0']] = 0
    return df

def preprocessing(targets, observed, estimated, test):
    '''
        This function makes all the preprocessing needed for the correct run of the model, it will perform:
            - Resampling
            - Filtering
            - Imputation
            - Outliers removal
            - Categorical Encoding

        Params:
            targets -> dataframe of the target parquet
            observed -> dataframe of observed train data
            estimated -> dataframe of estimated train data
            test -> dataframe of test data
        Returns:
            train_data -> dataframe of all data ready to train
            test_data -> dataframe of all data ready to test
            is_day -> dataframe of is_day categorical feature for post processing

    '''
    targets['time'] = to_datetime(targets,'time')
    estimated['date_forecast'] = to_datetime(estimated,'date_forecast')
    observed['date_forecast'] = to_datetime(observed,'date_forecast')
    test['date_forecast'] = to_datetime(test,'date_forecast')

    observed_resampled = resampling(observed,'date_forecast')
    estimated_resampled = resampling(estimated,'date_forecast')
    test_resampled = resampling(test,'date_forecast')

    date_calc_resampled_observed = extract_data_calc(estimated)
    date_calc_resampled_test = extract_data_calc(test)

    estimated_resampled = estimated_resampled.merge(date_calc_resampled_observed, left_on='date_forecast', right_index=True)
    test_resampled = test_resampled.merge(date_calc_resampled_test, left_on='date_forecast', right_index=True)

    is_day = test_resampled[['date_forecast', 'is_day:idx']]
    test_resampled = filter_df(test_resampled,['is_day:idx', 'snow_density:kgm3','elevation:m'])
    observed_resampled = filter_df(observed_resampled,['is_day:idx', 'snow_density:kgm3','elevation:m'])
    estimated_resampled = filter_df(estimated_resampled,[ 'is_day:idx', 'snow_density:kgm3','elevation:m'])

    #This MUST be zero because is not estimated.
    observed_resampled['time_dummy'] = 0
    observed_resampled['is_estimated'] = 0
    observed_resampled['time_delta'] = 0

    estimated_resampled = is_estimated_feature(estimated_resampled)
    test_resampled = is_estimated_feature(test_resampled)

    X = pd.concat([observed_resampled,estimated_resampled],axis = 0)
    train_data = pd.merge(targets, X, how='inner', left_on='time', right_on='date_forecast')

    train_data = add_time_features(train_data, 'time')
    test_data = add_time_features(test_resampled, 'date_forecast')

    train_data = delete_stationarity(train_data)

    train_data = filter_df(train_data, ['time','date_calc'])
    test_data = filter_df(test_resampled, ['date_calc'])

    train_data = impute_nan(train_data)
    test_data = impute_nan(test_data)

    train_data = one_hot_encoding(train_data, ['dew_or_rime:idx', 'is_in_shadow:idx','is_estimated'])
    test_data = one_hot_encoding(test_data, ['dew_or_rime:idx', 'is_in_shadow:idx','is_estimated'])
    test_data = fix_encoding(test_data)
    test_data = test_data[train_data.drop(columns = 'pv_measurement').columns]
    return train_data, test_data, is_day

In [2]:
from sklearn.model_selection import TimeSeriesSplit
def train_data_split(n_splits = 5):
    '''
        This function return the cross_validator split
    '''
    return TimeSeriesSplit(n_splits=n_splits)

import re
def regexdf(df):
    '''
        This function let lgbm work, this because it cannot accept ':'
    '''
    return df.rename(columns=lambda x: re.sub('[^A-Za-z0-9_-]+', '', x))



def final_model(df,model,param,X_test):
    '''
        This function will retrain on the bagged model so the model is trained on ALL TRAIN DATA.

        Params:
            df -> Train data
            model -> your model
            param -> your hyperparameter
            X_test -> Test Data
        Return:
            the prediction
    '''
    X_train = df.drop(columns = 'pv_measurement')
    y_train = df['pv_measurement']
    model.fit(X_train,y_train)
    return model.predict(X_test)

In [103]:
def post_process(prediction, is_day):
    '''
        This function will post_process the predicition, by multiplying the is_day on that prediction.
        I.E.
            This is for rescale the prediction that can be too high in a 0.5 day moment.
        Params:
            Prediction -> your prediction
            is_day -> the is_day dataframe
        Return:
            the clipped version of the adjusted prediction, it's clipped for any eventual negative prediction
    '''
    adjusted_predictions = prediction * is_day['is_day:idx']
    return np.clip(adjusted_predictions, 0, None)

def submission(predictions):
    '''
        This function will create a ready to deliver file

        Params:
            predictions -> the entire prediction for all the location

        Returns:
            stocazzo
        Save:
            Submission file

    '''
    all_predictions = np.array(predictions).flatten()
    # Save the final_predictions to CSV
    df = pd.DataFrame(all_predictions, columns=['prediction'])
    df['id'] = df.index
    df = df[['id', 'prediction']]
    df.to_csv('TestFECla3.csv', index=False)

def feature_engineering(df):
    '''
        This function will create new feature from the interaction of different feature of df dataframe.

        Params:
            df: dataframe
        Return:
            FE dataframe
    '''

    df['direct_diffuse_rad_interaction'] = df['direct_rad:W'] * df['diffuse_rad:W']
    df['raddir'] = (df['direct_rad:W'] ) * (df['absolute_humidity_2m:gm3'])
    df['effectivehum'] = (df['absolute_humidity_2m:gm3']) * (df['direct_rad:W'])
    df['Radiazione_solare_effettiva'] = (df['direct_rad:W'] + df['diffuse_rad:W'] ) * (df['effective_cloud_cover:p'])
    df['direct_radW_squared'] = df['direct_rad:W'] ** 2
    df['clearcloud'] =   df['clear_sky_rad:W']*df['total_cloud_cover:p']
    df['radiation_squared'] = df['clear_sky_rad:W'] ** 2
    df['effectivehum'] = (df['absolute_humidity_2m:gm3']) * (df['direct_rad:W'])

   # df = df[df.index.month.isin([4, 5, 6, 7, 8, 9])]

    return df

In [104]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.base import clone
import numpy as np

def stacking_ensemble_time_series(models, X, y, meta_learner=LinearRegression(), n_folds=5):
    # Create the time series cross-validator
    tscv = TimeSeriesSplit(n_splits=n_folds)
    
    # To store out-of-fold predictions
    meta_features = np.zeros((y.shape[0], len(models)))
    
    # Train and generate meta-features
    for idx, (name, model) in enumerate(models.items()):
        print(f"Training base model: {name}")
        oof_predictions = np.zeros(y.shape[0])
        
        for train_index, test_index in tscv.split(X):
            # Split data into folds
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            
            # Clone the model to ensure we have a fresh model
            cloned_model = clone(model)
            cloned_model.fit(X_train, y_train)
            
            # Generate out-of-fold predictions
            oof_predictions[test_index] = cloned_model.predict(X_test)
        
        # Store out-of-fold predictions as meta-features
        meta_features[:, idx] = oof_predictions
    
    # Retrieve the last fold for training the meta-learner
    _, meta_train_index = list(tscv.split(X))[-1]
    X_meta_train, y_meta_train = meta_features[meta_train_index, :], y[meta_train_index]
    
    # Train the meta-learner on the last fold's meta-features
    print("Training meta-learner...")
    meta_learner.fit(X_meta_train, y_meta_train)
    
    # Fit all the base models on the full training data
    fitted_models = {}
    for name, model in models.items():
        model.fit(X, y)
        fitted_models[name] = model
    
    # Function to make ensemble predictions
    def make_predictions(X_new):
        meta_features_new = np.column_stack([
            fitted_model.predict(X_new) for _, fitted_model in fitted_models.items()
        ])
        return meta_learner.predict(meta_features_new)
    
    return fitted_models, meta_learner, make_predictions


In [105]:
import pandas as pd
import lightgbm as lgb

def train_and_predict(train_df, test_df, window_size, num_predictions):
    """
    Addestra un modello LightGBM con il train_df e fa previsioni ricorsive utilizzando test_df.

    Parameters:
    train_df (pandas.DataFrame): DataFrame di training contenente la colonna 'pv_measurement'.
    test_df (pandas.DataFrame): DataFrame di test vuoto, utilizzato per accumulare le previsioni.
    window_size (int): Dimensione della finestra per la media mobile.
    num_predictions (int): Numero totale di valori da predire.

    Returns:
    pandas.DataFrame: DataFrame contenente le previsioni.
    """
    
    # Calcolo della rolling mean sul train_df
    train_df['rolling_mean'] = train_df['pv_measurement'].rolling(window=window_size).mean().shift(-window_size + 1)
    train_df = train_df.dropna().reset_index(drop=True)

    # Preparazione dei dati per LightGBM
    features = ['rolling_mean']
    target = 'pv_measurement'
    train_set = lgb.Dataset(train_df[features], label=train_df[target])
    
    # Parametri per LightGBM (da ottimizzare secondo il problema)
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'verbose': -1
    }
    
    # Addestramento del modello
    model = lgb.train(params, train_set, num_boost_round=100)
    
    # Inizializzazione di test_df con la rolling mean degli ultimi 24 valori di train_df
    last_values = train_df['pv_measurement'].tail(window_size).tolist()
    test_df = pd.DataFrame({'pv_measurement': last_values})
    test_df['rolling_mean'] = test_df['pv_measurement'].rolling(window=window_size).mean()
    print(test_df)
    # Eseguiamo le previsioni
    predictions = []
    for _ in range(num_predictions):
        # Preparazione dell'input per la previsione
        input_data = test_df['rolling_mean'].iloc[-1].reshape(1, -1)
        # Previsione
        prediction = model.predict(input_data)[0]
        predictions.append(prediction)
        # Aggiornamento del test_df con la nuova previsione
        new_row = pd.DataFrame({'pv_measurement': prediction}, index=[0])
        test_df = pd.concat([test_df, new_row], ignore_index=True)
        test_df['rolling_mean'] = test_df['pv_measurement'].rolling(window=window_size).mean()

    # Conversione della lista di previsioni in DataFrame
    predictions_df = pd.DataFrame(predictions, columns=['predicted_pv_measurement'])
    
    return predictions_df

# Uso della funzione
# Assicurati che 'train_df' sia il DataFrame con la tua serie temporale di training e 'test_df' sia un DataFrame vuoto
# predictions = train_and_predict(train_df, test_df, 24, 720)


In [107]:
locations = ['A','B','C']
all_predictions = []
params = {
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'verbose': -1
}
for loc in locations:
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

    train_data, test_data, is_day= preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated)
    train_data = feature_engineering(train_data)
    test_data = feature_engineering(test_data)
    train_data = regexdf(train_data)
    test_data = regexdf(test_data)
    
    train_data = add_temporal_aggregations_to_df(train_data)
    test_data = add_temporal_aggregations_to_df(test_data)
    train_data = train_data.drop(columns = ['date_forecast'])
    test_data = test_data.drop(columns = ['date_forecast'])
    
    train = train_data.iloc[:-720]
    test = train_data.iloc[-720:]
    target = 'pv_measurement'
    train_set = lgb.Dataset(train_data.drop(columns = target), label=train_data[target])
    model = lgb.train(params, train_set, num_boost_round=100)
    X_test = test.drop(columns = target)
    y_test = test[target]
    predictions = model.predict(test_data)
    
    predictions_fix = post_process(predictions,is_day)
    print(mean_absolute_error(y_test,predictions_fix))
    all_predictions.append(predictions_fix)

1677.1140005513764
284.633682228361
222.04759635842396


In [108]:
'''sample_submission = pd.read_csv('sample_submission.csv')
sample_submission
sample_submission = sample_submission[['id']].merge(final_df[['id', 'prediction']], on='id', how='left')
sample_submission.to_csv('my_first_submission.csv', index=False)'''

final_predictions = np.concatenate(all_predictions)

# Save the final_predictions to CSV
df = pd.DataFrame(final_predictions, columns=['prediction'])
df['id'] = df.index
df = df[['id', 'prediction']]
df.to_csv('TestAgg1.csv', index=False)

In [88]:
test

Unnamed: 0,pv_measurement,absolute_humidity_2mgm3,air_density_2mkgm3,ceiling_height_aglm,clear_sky_energy_1hJ,clear_sky_radW,cloud_base_aglm,dew_or_rimeidx,dew_point_2mK,diffuse_radW,...,sun_elevation_lag1,weekly_snow_sum,hourly_mean_wind_speed,daily_temp_change,is_summer,is_winter,is_autumn,is_spring,extreme_weather,trend_energy
1970-01-01 00:00:00.000000004,19.36,8.950,1.21800,1003.500000,3.246815e+04,23.100000,1003.500000,0.0,282.500000,11.975000,...,,2207.125,3.029467,,False,True,False,False,False,0.000000e+00
1970-01-01 00:00:00.000000005,251.02,9.250,1.21650,809.375000,1.794991e+05,84.375000,809.375000,0.0,283.049988,45.125000,...,3.051250,2207.125,3.029467,,False,True,False,False,False,1.794991e+05
1970-01-01 00:00:00.000000006,263.78,9.525,1.21300,757.775024,4.781178e+05,186.649994,757.775024,0.0,283.524994,89.525002,...,8.071000,2207.125,3.029467,,False,True,False,False,False,9.562356e+05
1970-01-01 00:00:00.000000007,522.72,9.700,1.20750,705.650024,8.926679e+05,311.525024,705.650024,0.0,283.799988,139.000000,...,13.956500,2207.125,3.029467,,False,True,False,False,False,2.678004e+06
1970-01-01 00:00:00.000000008,904.42,9.550,1.20500,669.650024,1.357902e+06,442.750000,669.650024,0.0,283.600006,167.100006,...,20.406250,2207.125,3.029467,,False,True,False,False,False,5.431608e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 00:00:00.000032965,2525.38,2.750,1.30900,2077.850098,6.655242e+05,237.574997,110.199997,0.0,265.899994,86.849998,...,10.605500,2207.125,3.029467,-4.524994,False,True,False,False,False,1.257641e+10
1970-01-01 00:00:00.000032966,3254.46,3.150,1.29375,2077.850098,1.024199e+06,327.200012,128.000000,0.0,267.299988,97.425003,...,16.151001,2207.125,3.029467,-3.524994,False,True,False,False,False,1.935531e+10
1970-01-01 00:00:00.000032967,3666.30,3.325,1.28300,2077.850098,1.295218e+06,386.600006,29.500000,0.0,268.149994,84.250000,...,20.556751,2207.125,3.029467,-1.524994,False,True,False,False,False,2.447832e+10
1970-01-01 00:00:00.000032968,3757.82,3.050,1.27450,2077.850098,1.443859e+06,409.100006,29.500000,0.0,267.200012,79.824997,...,23.451250,2207.125,3.029467,0.250000,False,True,False,False,False,2.728893e+10


In [57]:
test_data.shape

(720, 75)

In [97]:
def add_temporal_aggregations_to_df(df):
    # Assicurati che il DataFrame abbia un indice temporale
    if not isinstance(df.index, pd.DatetimeIndex):
        df.index = pd.to_datetime(df.index)

    # Aggregazioni giornaliere
   # df['daily_sum_energy'] = df['clear_sky_energy_1hJ'].resample('D').transform('sum')
    #df['daily_mean_humidity'] = df['absolute_humidity_2mgm3'].resample('D').transform('mean')
    #df['daily_max_temp'] = df['t_1000hPaK'].resample('D').transform('max')

    # Aggregazioni settimanali/mensili
    #df['weekly_precip'] = df['precip_5minmm'].resample('W').transform('sum')
    #df['monthly_pressure_mean'] = df['msl_pressurehPa'].resample('M').transform('mean')

    # Media mobile
    #df['rolling_7d_cloud_cover'] = df['total_cloud_coverp'].rolling(window=7).mean()

    # Trasformazioni cicliche per l'ora
    hour = df.index.hour
    df['hour_sin'] = np.sin(hour * (2. * np.pi / 24))
    df['hour_cos'] = np.cos(hour * (2. * np.pi / 24))

    # Differenze temporali
   # df['energy_diff'] = df['clear_sky_energy_1hJ'].diff()

    # Lag features
   # df['sun_elevation_lag1'] = df['sun_elevationd'].shift(1)

    ## Aggregazioni basate su condizioni meteorologiche
    #df['weekly_snow_sum'] = df['fresh_snow_24hcm'].resample('W').transform('sum')
#

        # Aggregazioni orarie
   # df['hourly_mean_wind_speed'] = df['wind_speed_10mms'].resample('H').transform('mean')

    # Variazioni rispetto al giorno precedente
    #df['daily_temp_change'] = df['t_1000hPaK'].diff(periods=24)

    # Indicatori stagionali
    df['is_summer'] = df.index.month.isin([6, 7, 8])
    df['is_winter'] = df.index.month.isin([12, 1, 2])
    df['is_autumn'] = df.index.month.isin([9,10,11])
    df['is_spring'] = df.index.month.isin([3, 4, 5])

    # Condizioni meteorologiche estreme
    #df['extreme_weather'] = (df['wind_speed_10mms'] > 50) | (df['precip_5minmm'] > 10)

    # Tendenze
    #df['trend_energy'] = np.arange(len(df)) * df['clear_sky_energy_1hJ'].values
    # Puoi continuare ad aggiungere altre aggregazioni qui...

    # Rimuovi le colonne con valori NaN dovuti al calcolo delle aggregazioni se necessario
    df.fillna(df.median())

    return df

# Utilizzo:
# df è il tuo DataFrame con una colonna 'timestamp'
# df = add_temporal_aggregations_to_df(df)


In [63]:
train_data.columns


Index(['pv_measurement', 'absolute_humidity_2mgm3', 'air_density_2mkgm3',
       'ceiling_height_aglm', 'clear_sky_energy_1hJ', 'clear_sky_radW',
       'cloud_base_aglm', 'dew_or_rimeidx', 'dew_point_2mK', 'diffuse_radW',
       'diffuse_rad_1hJ', 'direct_radW', 'direct_rad_1hJ',
       'effective_cloud_coverp', 'fresh_snow_12hcm', 'fresh_snow_1hcm',
       'fresh_snow_24hcm', 'fresh_snow_3hcm', 'fresh_snow_6hcm',
       'is_in_shadowidx', 'msl_pressurehPa', 'precip_5minmm',
       'precip_type_5minidx', 'pressure_100mhPa', 'pressure_50mhPa',
       'prob_rimep', 'rain_waterkgm2', 'relative_humidity_1000hPap',
       'sfc_pressurehPa', 'snow_depthcm', 'snow_driftidx', 'snow_melt_10minmm',
       'snow_waterkgm2', 'sun_azimuthd', 'sun_elevationd',
       'super_cooled_liquid_waterkgm2', 't_1000hPaK', 'total_cloud_coverp',
       'visibilitym', 'wind_speed_10mms', 'wind_speed_u_10mms',
       'wind_speed_v_10mms', 'wind_speed_w_1000hPams', 'time_dummy',
       'is_estimated', 'time_delt

In [66]:
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd

df = train_data

# Seleziona le colonne di interesse (sostituisci con le tue features reali)
features = df.drop(columns = target) # Aggiungi altre features se necessario

# Inizializza PolynomialFeatures con il grado desiderato
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)

# Adatta e trasforma le features selezionate
poly_features = poly.fit_transform(features)

# Ottieni i nomi delle nuove features polinomiali
feature_names = poly.get_feature_names_out(features.columns)

# Crea un nuovo DataFrame con le features polinomiali
poly_df = pd.DataFrame(poly_features, columns=feature_names)

# Stampa il nuovo DataFrame con le features polinomiali
print(poly_df)


       absolute_humidity_2mgm3  air_density_2mkgm3  ceiling_height_aglm  \
0                        6.625             1.22075          2287.250000   
1                        6.275             1.21425          2679.074951   
2                        8.350             1.22675           983.799988   
3                        8.175             1.22550          1195.349976   
4                        8.000             1.22600          1308.599976   
...                        ...                 ...                  ...   
11018                    4.400             1.27550          1456.574951   
11019                    4.400             1.27850          1476.349976   
11020                    4.400             1.27900          1516.300049   
11021                    4.400             1.27975          1240.599976   
11022                    4.400             1.27975          1484.500000   

       clear_sky_energy_1hJ  clear_sky_radW  cloud_base_aglm  dew_or_rimeidx  \
0              1.32