In [None]:
# Environment setup: install CatBoost into the runtime and unpack the dataset archive (data.zip).
!pip install catboost
!unzip data.zip

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2
Archive:  data.zip
   creating: data/
  inflating: __MACOSX/._data         
  inflating: data/.DS_Store          
  inflating: __MACOSX/data/._.DS_Store  
  inflating: data/test.csv           
  inflating: __MACOSX/data/._test.csv  
   creating: data/A/
  inflating: data/Readme.md          
  inflating: __MACOSX/data/._Readme.md  
   creating: data/C/
   creating: data/B/
  inflating: data/sample_submission.csv  
  inflating: __MACOSX/data/._sample_submission.csv  
  inflating: data/read_files.ipynb   
  inflating: data/A/X_train_observed.parquet  
  inflating: data/A/train_targets.parquet  
  inflating: data/A/X_train_estimated.parquet  
  inflating: data/A/X_test_estimated.parquet  
  inflating:

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
# Suppress all FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# ADD TIME FEATURES
def add_time_features(df, time_column):
    '''
        Derive calendar features from a timestamp column. The dataframe is modified in place.

        Params:
            df -> dataframe that contains `time_column`
            time_column -> name of the column to be interpreted as datetimes

        Returns:
            The same dataframe object, with `time_column` converted to datetime and the
            columns 'hour', 'day_of_week', 'month', 'day_of_year', 'week_of_year', 'year',
            'sin_hour' and 'sin_month' added (in that order).
    '''
    # Coerce first so that the .dt accessor used below is always available.
    df[time_column] = pd.to_datetime(df[time_column])
    stamps = df[time_column].dt

    # Plain calendar components: (new column name, attribute of the .dt accessor).
    for feature, attribute in (
        ('hour', 'hour'),
        ('day_of_week', 'dayofweek'),
        ('month', 'month'),
        ('day_of_year', 'dayofyear'),
    ):
        df[feature] = getattr(stamps, attribute)
    df['week_of_year'] = stamps.isocalendar().week
    df['year'] = stamps.year

    # Half-period sine encodings (pi, not 2*pi): the value reaches 1 at hour 12 and at
    # month 6 respectively, so these are "distance from midday / mid-year" style features
    # rather than full-cycle encodings.
    df['sin_hour'] = np.sin(np.pi * stamps.hour / 24.)
    df['sin_month'] = np.sin(np.pi * stamps.month / 12.)
    return df

def plot_targets(targets):
    '''
        Draw the target values as a single blue line on a 15x6 inch figure and show it.

        Params:
            targets -> the values to plot; handed to matplotlib's plot() unchanged

        Returns:
            None (the figure is displayed with plt.show())
    '''
    fig, ax = plt.subplots(figsize=(15, 6))
    ax.plot(targets, label='PV Measurement', color='blue')
    ax.set_xlabel('Time')
    ax.set_ylabel('PV Measurement')
    ax.set_title('PV Measurement Over Time')
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()

def to_datetime(df,column):
    '''
        Return the given column of df parsed as datetimes.
        The dataframe itself is not modified; the caller assigns the result back if wanted.
    '''
    raw_values = df[column]
    return pd.to_datetime(raw_values)

def resampling(df,column):
    '''
        Resample df to 1 hour bins using mean() as aggregator and drop rows where all
        columns are NaN (i.e. hours for which there was no data at all).

        Only numeric columns are aggregated. Non-numeric columns (for example a datetime
        column such as 'date_calc', or strings) are left out of the result instead of being
        averaged or raising; preprocessing() re-attaches 'date_calc' separately through
        extract_data_calc() and a merge.

        Params :
            df -> the dataframe to be resampled
            column -> the time column (must already be of datetime type)

        Returns :
            A new dataframe with one row per non-empty hour; `column` is a regular column
            again and the index is a fresh 0..n-1 range.
    '''
    # '1h' is the current spelling of the hourly alias ('1H' is deprecated in recent pandas).
    # numeric_only=True requires pandas >= 1.5.
    hourly = df.set_index(keys = column).resample('1h').mean(numeric_only=True)
    return hourly.dropna(how='all').reset_index()

def filter_df(df,columnlist):
    '''
        Return a new dataframe without the columns named in columnlist.
        The input dataframe is left untouched.
    '''
    trimmed = df.drop(labels=columnlist, axis=1)
    return trimmed

def extract_data_calc(df):
    '''
    Build an hourly 'date_calc' lookup indexed by 'date_forecast'.

    The frame is indexed by 'date_forecast', the 'date_calc' column is resampled to 1 hour
    bins and the first value of each bin is kept. Bins without any row yield a missing value.

    Params:
        df -> dataframe with 'date_forecast' (datetime type) and 'date_calc' columns.
    Returns:
        A single-column dataframe ('date_calc') indexed by the hourly 'date_forecast' bins.
    '''
    # '1h' is the current spelling of the hourly alias ('1H' is deprecated in recent pandas).
    by_forecast = df.set_index('date_forecast')
    return by_forecast['date_calc'].resample('1h').first().to_frame()



def is_estimated_feature(df):
    '''
        Tag a dataframe of estimated (forecast) weather rows so the model can tell them
        apart from observed ones. The dataframe is modified in place.

        Params:
            df -> an estimated dataframe with datetime columns 'date_calc' and 'date_forecast'
        Returns:
            The same dataframe with two added columns:
                'time_delta'   -> 'date_calc' minus 'date_forecast', expressed in hours
                'is_estimated' -> constant 1
    '''
    offset = df['date_calc'] - df['date_forecast']
    seconds_per_hour = 3600
    df['time_delta'] = offset.dt.total_seconds() / seconds_per_hour
    df['is_estimated'] = 1
    return df

def delete_stationarity(df, max_constant_steps=2):
    '''
    Removes constant stretches of data within a DataFrame where the 'pv_measurement' column does not change.

    A row counts as "constant" when its 'pv_measurement' equals the previous row's value
    (the very first row, whose difference is undefined, is treated as constant too).
    Consecutive constant rows form a block; every block with more than `max_constant_steps`
    rows is removed. The row that starts a flat stretch differs from its predecessor, so it
    is not part of the block and is kept.

    The input DataFrame is not modified: the bookkeeping series are kept local instead of
    being written into the caller's frame as temporary columns.

    params:
        df -> DataFrame
              A pandas DataFrame with a 'pv_measurement' column.
        max_constant_steps -> int, default 2
              Largest number of consecutive constant rows that is still tolerated.

    return:
        A new DataFrame (original index labels preserved) without the long constant blocks.
    '''
    # Step-to-step change; NaN (first row, or gaps in the data) counts as "no change".
    step_change = df['pv_measurement'].diff().fillna(0)
    is_constant = (step_change == 0).astype(int)

    # A new block id starts wherever the constant/non-constant flag flips.
    block_id = (is_constant.diff() != 0).astype(int).cumsum()

    # Number of constant rows in the block each row belongs to (0 for non-constant blocks).
    constant_run = is_constant.groupby(block_id).transform('sum')

    # .copy() so that callers can add columns to the result without chained-assignment warnings.
    return df[constant_run <= max_constant_steps].copy()

def preprocessing(targets, observed, estimated, test):
    '''
        Turn the raw frames of one location into model-ready train and test tables.
        Steps performed:
            - parse the time columns ('time' for targets, 'date_forecast' for the rest)
            - resample the weather frames to 1 hour bins (mean)
            - re-attach an hourly 'date_calc' to the estimated and test frames
            - drop 'is_day:idx', 'snow_density:kgm3' and 'elevation:m'
            - add 'is_estimated' / 'time_delta' (both 0 for observed rows)
            - inner-join targets with the weather rows on the timestamp
            - add calendar features
            - remove long constant stretches of 'pv_measurement' from the train table

        The frames passed in are not modified.

        Params:
            targets -> dataframe of the target parquet ('time', 'pv_measurement')
            observed -> dataframe of observed train data
            estimated -> dataframe of estimated train data (has 'date_calc')
            test -> dataframe of test data (has 'date_calc')
        Returns:
            train_data -> training table; still contains 'date_forecast' and 'pv_measurement'
            test_data -> test table; still contains 'date_forecast'
            is_day -> hourly 'date_forecast' / 'is_day:idx' of the test set, for post processing

    '''
    dropped_columns = ['is_day:idx', 'snow_density:kgm3', 'elevation:m']

    # Parse timestamps on new frames so the caller's objects stay untouched.
    targets = targets.assign(time=to_datetime(targets, 'time'))
    estimated = estimated.assign(date_forecast=to_datetime(estimated, 'date_forecast'))
    observed = observed.assign(date_forecast=to_datetime(observed, 'date_forecast'))
    test = test.assign(date_forecast=to_datetime(test, 'date_forecast'))

    observed_resampled = resampling(observed, 'date_forecast')
    estimated_resampled = resampling(estimated, 'date_forecast')
    test_resampled = resampling(test, 'date_forecast')

    # Hourly 'date_calc' (first value per hour), attached via an inner merge on the hour.
    # NOTE(review): this assumes resampling() did not itself keep a 'date_calc' column;
    # otherwise the merge would produce suffixed names - confirm for the pandas version in use.
    date_calc_estimated = extract_data_calc(estimated)
    date_calc_test = extract_data_calc(test)
    estimated_resampled = estimated_resampled.merge(date_calc_estimated, left_on='date_forecast', right_index=True)
    test_resampled = test_resampled.merge(date_calc_test, left_on='date_forecast', right_index=True)

    # Keep the day indicator of the test set before it is dropped from the features.
    is_day = test_resampled[['date_forecast', 'is_day:idx']]
    test_resampled = filter_df(test_resampled, dropped_columns)
    observed_resampled = filter_df(observed_resampled, dropped_columns)
    estimated_resampled = filter_df(estimated_resampled, dropped_columns)

    #This MUST be zero because is not estimated.
    observed_resampled['is_estimated'] = 0
    observed_resampled['time_delta'] = 0

    estimated_resampled = is_estimated_feature(estimated_resampled)
    test_resampled = is_estimated_feature(test_resampled)

    X = pd.concat([observed_resampled, estimated_resampled], axis = 0)
    train_data = pd.merge(targets, X, how='inner', left_on='time', right_on='date_forecast')

    train_data = add_time_features(train_data, 'time')
    test_data = add_time_features(test_resampled, 'date_forecast')

    train_data = delete_stationarity(train_data)

    train_data = filter_df(train_data, ['time', 'date_calc'])
    # Filter the frame returned by add_time_features rather than relying on it having
    # modified test_resampled in place.
    test_data = filter_df(test_data, ['date_calc'])

    return train_data, test_data, is_day

In [None]:
from sklearn.model_selection import TimeSeriesSplit
def train_data_split(n_splits = 5):
    '''
        Build the cross-validation splitter: a TimeSeriesSplit with `n_splits` folds.
    '''
    splitter = TimeSeriesSplit(n_splits=n_splits)
    return splitter

import re
def regexdf(df):
    '''
        Return df with sanitized column names: every character that is not a letter,
        a digit, '_' or '-' is stripped (so 'direct_rad:W' becomes 'direct_radW').
        Needed because lgbm does not accept ':' in feature names.
    '''
    disallowed = re.compile('[^A-Za-z0-9_-]+')

    def _sanitize(label):
        return disallowed.sub('', label)

    return df.rename(columns=_sanitize)



def final_model(df,model,param,X_test):
    '''
        Fit the given model on ALL the training rows and predict the test set.

        Params:
            df -> train data; must contain the target column 'pv_measurement'
            model -> estimator exposing fit(X, y) and predict(X)
            param -> hyperparameters (accepted for interface compatibility; not used here)
            X_test -> test features
        Return:
            whatever model.predict(X_test) returns
    '''
    target_name = 'pv_measurement'
    features = df.drop(columns = target_name)
    model.fit(features, df[target_name])
    return model.predict(X_test)

In [None]:
def post_process(prediction, is_day):
    '''
        Scale each prediction by the matching 'is_day:idx' value and clip at zero.

        Multiplying by the day indicator damps predictions for hours that are only partly
        (or not at all) daytime; the clip removes any negative prediction.

        Params:
            prediction -> the raw predictions, one per row of is_day
            is_day -> dataframe holding the 'is_day:idx' column
        Return:
            the scaled predictions with every value below 0 replaced by 0
    '''
    day_weight = is_day['is_day:idx']
    scaled = prediction * day_weight
    return np.clip(scaled, 0, None)

def submission(predictions):
    '''
        Write the predictions to the submission file 'TestFECla3.csv'.

        Params:
            predictions -> the predictions for all the locations; flattened into one
                           sequence in the order given

        Returns:
            None
        Save:
            A CSV with the columns 'id' (0, 1, 2, ... in flattened order) and 'prediction'

    '''
    flat = np.array(predictions).flatten()
    frame = pd.DataFrame({'prediction': flat})
    # Put the running row number in front as the 'id' column.
    frame.insert(0, 'id', frame.index)
    frame.to_csv('TestFECla3.csv', index=False)

def feature_engineering(df):
    '''
        Add interaction features to df. The dataframe is modified in place.

        Currently a single feature is produced:
            'refined_global_rad' -> 'direct_rad:W' + 'diffuse_rad:W'

        Params:
            df: dataframe containing 'direct_rad:W' and 'diffuse_rad:W'
        Return:
            The same dataframe with the new column added
    '''
    # Earlier experiments with further interaction features (products and squares of the
    # radiation, humidity and cloud-cover columns, and a month filter) are disabled.
    direct = df['direct_rad:W']
    diffuse = df['diffuse_rad:W']
    df['refined_global_rad'] = direct.add(diffuse)
    return df

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import make_scorer, mean_absolute_error

def tune_model(X, y, model_params, n_iter=200, cv=2, verbose=1, random_state=42, n_jobs=-1):
    '''
        Run a randomized hyperparameter search for every known model that has a search space.

        The only model defined here is 'catboost'. Candidates are scored with the mean
        absolute error (wrapped with greater_is_better=False so that lower error wins).

        Params:
            X, y -> training features and target
            model_params -> dict mapping a model name to its parameter distributions
            n_iter -> number of sampled parameter settings per model
            cv -> cross-validation setting handed to RandomizedSearchCV
            verbose -> verbosity of the search
            random_state -> seed of the parameter sampling
            n_jobs -> passed to RandomizedSearchCV and used as CatBoost thread_count
        Returns:
            dict mapping each tuned model name to the search's best estimator
    '''
    # Model shorthand -> estimator instance to tune.
    candidates = {
        'catboost': CatBoostRegressor(verbose=0, thread_count=n_jobs),
    }
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    best_estimators = {}
    for name, estimator in candidates.items():
        # Nothing to search over when the caller gave no distributions for this model.
        if name not in model_params:
            print(f"No parameters provided for {name}, skipping...")
            continue

        print(f"Tuning hyperparameters for {name}...")
        search = RandomizedSearchCV(
            estimator=estimator,
            param_distributions=model_params[name],
            n_iter=n_iter,
            cv=cv,
            verbose=verbose,
            random_state=random_state,
            scoring=mae_scorer,
            n_jobs=n_jobs,
        )
        search.fit(X, y)
        best_estimators[name] = search.best_estimator_
        print(f"Best parameters for {name}: {search.best_params_}")

    return best_estimators

In [None]:
from scipy.stats import randint, uniform

# Search spaces for the randomized hyperparameter search.
# scipy.stats conventions used below:
#   randint(low, high)  -> integers in [low, high)  (high is exclusive)
#   uniform(loc, scale) -> floats in [loc, loc + scale], e.g. uniform(0.01, 0.3) spans 0.01 to 0.31
#                          and uniform(0.5, 0.5) spans 0.5 to 1.0

# Parameters for Random Forest
rf_params = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 20),
    'min_samples_split': uniform(0.01, 0.1),
    'min_samples_leaf': uniform(0.01, 0.1),
    'bootstrap': [True, False]
}

# Parameters for Extra Trees
et_params = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 20),
    'min_samples_split': uniform(0.01, 0.1),
    'min_samples_leaf': uniform(0.01, 0.1),
    'bootstrap': [True, False]
}

# Parameters for XGBoost
xgb_params = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'min_child_weight': randint(1, 10)
}

# Parameters for LightGBM
lgbm_params = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(-1, 20),
    'learning_rate': uniform(0.01, 0.3),
    'num_leaves': randint(20, 300),
    'min_child_samples': randint(10, 100),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'verbose' : [-1]
}

# Parameters for CatBoost
cat_params = {
    'iterations': randint(100, 500),
    'depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'l2_leaf_reg': randint(1, 10),
    'border_count': randint(1, 255),
    'subsample': uniform(0.5, 0.5)
}

# Search spaces handed to tune_model, keyed by model name.
# Only the CatBoost space is registered; the rf/et/xgb/lgbm dictionaries above are
# defined but not included here.
model_params = {
    'catboost': cat_params
}

In [24]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

def pipeline(df):
    '''
        Impute missing values and one-hot encode the categorical columns of df.

        - Numerical columns (everything except 'dew_or_rime:idx', 'is_in_shadow:idx' and
          'is_estimated') are imputed with the column mean.
        - 'dew_or_rime:idx' and 'is_in_shadow:idx' are imputed with the most frequent value
          and then one-hot encoded.
        - 'is_estimated' is removed and is NOT part of the output.

        The imputers and the encoder are fitted on the dataframe passed in, so calling this
        separately on two frames can yield different fill values and different encoded columns.

        Params:
            df -> dataframe containing the three columns named above
        Returns:
            A new dataframe (fresh 0..n-1 index) with the imputed numerical columns followed
            by the one-hot encoded categorical columns.
    '''
    categorical_columns = ['dew_or_rime:idx', 'is_in_shadow:idx']

    numerical_df = df.drop(columns = categorical_columns + ['is_estimated'])
    categorical_df = df[categorical_columns]

    # Fit the mean imputer and fill the missing numerical values.
    numerical_imputer = SimpleImputer(strategy='mean')
    transformed_data = pd.DataFrame(
        numerical_imputer.fit_transform(numerical_df),
        columns = numerical_df.columns,
    )

    # Fit the most-frequent imputer and fill the missing categorical values.
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    transformed_data_categorical = pd.DataFrame(
        categorical_imputer.fit_transform(categorical_df),
        columns = categorical_df.columns,
    )

    # Dense one-hot encoder. The keyword was renamed from `sparse` to `sparse_output` in
    # scikit-learn 1.2 and the old name was removed in 1.4, so try the new name first.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)

    # The encoder returns a NumPy array; wrap it in a DataFrame with readable column names.
    encoded_data = encoder.fit_transform(transformed_data_categorical)
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(transformed_data_categorical.columns),
    )

    return pd.concat([transformed_data, encoded_df], axis = 1)


In [25]:
# Train one tuned CatBoost model per location and collect the post-processed test predictions.
locations = ['A','B','C']
all_predictions = []

for loc in locations:
    # Raw inputs of this location; missing target values are replaced by 0.
    train = pd.read_parquet(f'data/{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'data/{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'data/{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'data/{loc}/X_test_estimated.parquet')

    # preprocessing keeps 'date_forecast' in both tables; it is not used as a model feature.
    train_data, test_data, is_day= preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated)
    train_data = train_data.drop(columns = ['date_forecast'])
    test_data = test_data.drop(columns = 'date_forecast')

    train_data = feature_engineering(train_data)
    test_data = feature_engineering(test_data)
    # Strip characters such as ':' from the column names.
    train_data = regexdf(train_data)
    test_data = regexdf(test_data)
    print(train_data.shape, test_data.shape)
    # Features and target with a fresh 0..n-1 index (y ends up as a one-column DataFrame).
    X = train_data.drop(columns = ['pv_measurement']).reset_index().drop(columns = 'index')
    y = train_data['pv_measurement'].reset_index().drop(columns = 'index')

    print(X.shape)
    print('__________')
    # tune_model returns a dict of best estimators keyed by model name.
    model = tune_model(X,y,model_params)
    print('__________')
    # NOTE(review): the best estimator is presumably already refit on all of X, y by the
    # search (RandomizedSearchCV refit default) - confirm whether this extra fit is needed.
    model['catboost'].fit(X,y)
    predictions =  model['catboost'].predict(test_data)
    # Scale by the day indicator and clip negatives before storing.
    predictions_fix = post_process(predictions.flatten(),is_day)
    all_predictions.append(predictions_fix)

(19622, 60) (720, 59)
(19622, 59)
__________
Tuning hyperparameters for xgboost...
Fitting 2 folds for each of 200 candidates, totalling 400 fits


KeyboardInterrupt: ignored

In [None]:
# Inspect the feature columns of test_data as left by the last iteration of the location loop.
test_data.columns

In [None]:
import json

def save(best_models, loc):
    '''
        Store the parameters of the given models in 'best_params{loc}.json'.

        Entries already present in the file are kept and only the models passed in are
        added or overwritten. Recognised model names are 'rf', 'et', 'lightgbm' and
        'catboost' (saved via get_params()) and 'xgboost' (saved via get_xgb_params());
        any other name is ignored.

        The JSON text is built completely before the file is opened for writing, so a
        serialisation error leaves an existing file intact. NumPy scalars and arrays inside
        the parameters are converted to plain Python values.

        Params:
            best_models -> dict mapping model name to a fitted/tuned estimator
            loc -> location identifier used in the file name
        Raises:
            TypeError if a parameter value cannot be represented in JSON
    '''
    filename = f'best_params{loc}.json'

    # Load the existing file, if any, so previously saved models are preserved.
    best_params = {}
    try:
        with open(filename, 'r') as file:
            best_params = json.load(file)
    except FileNotFoundError:
        print(f"Il file '{filename}' non esiste. Sarà creato uno nuovo.")

    # Collect the parameters of each model passed in.
    for model_name, model in best_models.items():
        if model_name == 'xgboost':
            best_params[model_name] = model.get_xgb_params()
        elif model_name in ('rf', 'et', 'lightgbm', 'catboost'):
            best_params[model_name] = model.get_params()

    def _jsonable(value):
        # json cannot encode NumPy scalars or arrays; convert them to Python objects.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')

    # Serialise first: opening with 'w' truncates the file, so nothing may fail after that.
    payload = json.dumps(best_params, indent=4, default=_jsonable)
    with open(filename, 'w') as file:
        file.write(payload)

    print(f"I parametri dei modelli sono stati salvati o aggiornati in '{filename}'")


In [None]:
# The string literal below is a disabled, earlier submission path based on
# sample_submission.csv; as a bare string expression it has no effect.
'''sample_submission = pd.read_csv('sample_submission.csv')
sample_submission
sample_submission = sample_submission[['id']].merge(final_df[['id', 'prediction']], on='id', how='left')
sample_submission.to_csv('my_first_submission.csv', index=False)'''

# Join the per-location prediction vectors collected in all_predictions into one 1-D array,
# in the order in which the locations were processed.
final_predictions = np.concatenate(all_predictions)

# Save the final_predictions to CSV
# 'id' is the running row number. Note this cell writes 'TestFE+Ens.csv', whereas the
# submission() helper writes 'TestFECla3.csv'.
df = pd.DataFrame(final_predictions, columns=['prediction'])
df['id'] = df.index
df = df[['id', 'prediction']]
df.to_csv('TestFE+Ens.csv', index=False)