In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from pycaret.regression import *


In [None]:
# Show up to 200 rows and 200 columns when printing a DataFrame
pd.set_option('display.max_rows', 200, 'display.max_columns', 200)

In [None]:
# Time features nr 1
def add_time_features(df, time_column):
    """Add integer calendar features derived from ``time_column``.

    ``time_column`` is parsed with ``pd.to_datetime`` and written back, then
    the columns ``hour``, ``day_of_week``, ``month``, ``day_of_year``,
    ``week_of_year`` (ISO week number) and ``year`` are added, in that order.

    ``df`` is modified in place and is also the return value.
    """
    stamps = pd.to_datetime(df[time_column])
    df[time_column] = stamps

    # Features that map one-to-one onto a ``.dt`` attribute
    for feature, attribute in (('hour', 'hour'),
                               ('day_of_week', 'dayofweek'),
                               ('month', 'month'),
                               ('day_of_year', 'dayofyear')):
        df[feature] = getattr(stamps.dt, attribute)

    # The ISO week comes from the isocalendar() table rather than an attribute
    df['week_of_year'] = stamps.dt.isocalendar().week
    df['year'] = stamps.dt.year

    return df

In [None]:
# Time features nr 2
def add_time_features_cat(df, time_column):
    """Add smooth hour-of-day and month-of-year features to ``df``.

    ``time_column`` is parsed with ``pd.to_datetime`` and written back, then
    two columns are added:

    * ``sin_hour``  = sin(pi * hour / 23)
    * ``sin_month`` = sin(pi * month / 12)

    Each is half a sine wave: 0 at hour 0 and (numerically almost) 0 at
    hour 23 / month 12, with the maximum in between.

    ``df`` is modified in place and is also the return value.
    """
    stamps = pd.to_datetime(df[time_column])
    df[time_column] = stamps

    for feature, values, scale in (('sin_hour', stamps.dt.hour, 23.),
                                   ('sin_month', stamps.dt.month, 12.)):
        df[feature] = np.sin(np.pi * values / scale)

    return df

In [None]:
def is_est(observed, estimated, test):
    """Tag observed/estimated/test weather rows and stack the training part.

    Three columns are added to each input frame (in place):

    * ``time_dummy``   - hours since midnight of ``date_forecast`` for the
      forecast frames (``estimated`` and ``test``); 0 for ``observed``.
    * ``time_delta``   - ``date_calc - date_forecast`` in hours for the
      forecast frames; 0 for ``observed``.
    * ``is_estimated`` - 1 for the forecast frames, 0 for ``observed``.

    Returns ``(df, test)`` where ``df`` is ``observed`` and ``estimated``
    concatenated and sorted by ``date_forecast``, and ``test`` is the same
    (now extended) object that was passed in.
    """
    def _as_hours(delta):
        # Timedelta series -> float hours
        return delta.dt.total_seconds() / 3600

    # Both forecast frames carry a ``date_calc`` column and get real values
    for forecast in (estimated, test):
        midnight = forecast['date_forecast'].dt.normalize()
        forecast['time_dummy'] = _as_hours(forecast['date_forecast'] - midnight)
        forecast['time_delta'] = _as_hours(forecast['date_calc'] - forecast['date_forecast'])
        forecast['is_estimated'] = 1

    # Observed data is not a forecast, so all three columns are zero
    observed['time_dummy'] = 0
    observed['time_delta'] = 0
    observed['is_estimated'] = 0

    df = pd.concat([observed, estimated], axis=0).sort_values(by='date_forecast')

    return df, test

In [None]:
def delete_stationary(df):
    """Drop rows that belong to long runs of an unchanged ``pv_measurement``.

    A row is flagged "constant" when its value equals the previous row's
    value (the first row has no predecessor and is flagged too). Consecutive
    flagged rows form a block; every block with more than 2 flagged rows is
    removed.

    The helper columns ``diff``, ``constant`` and ``block`` are written onto
    ``df`` itself and stay there; they are removed only from the returned
    frame.

    Returns ``(filtered_data, targets_ny)``: the surviving rows without the
    helper columns, and their ``time`` / ``pv_measurement`` columns.
    """
    # Row-to-row change; NaN for the first row becomes 0 ("no change")
    df['diff'] = df['pv_measurement'].diff().fillna(0)
    df['constant'] = df['diff'].eq(0).astype(int)

    # A new block starts wherever the flag flips (the first row's NaN diff
    # also counts as a flip), so the running sum labels each run.
    flag_flipped = df['constant'].diff().ne(0)
    df['block'] = flag_flipped.astype(int).cumsum()

    # Number of flagged rows per block; unflagged blocks sum to 0
    flagged_per_block = df.groupby('block')['constant'].sum()
    stationary_blocks = flagged_per_block.loc[flagged_per_block > 2].index

    survivors = df.loc[~df['block'].isin(stationary_blocks)]

    targets_ny = survivors[['time', 'pv_measurement']]
    filtered_data = survivors.drop(columns=['diff', 'constant', 'block'])
    return filtered_data, targets_ny

In [None]:
# Preprocessing LightGBM + CatBoost
def preprocessing(targets, observed, estimated, test):
    """Build hourly training and test feature tables for one location.

    Steps:
      1. Parse the time columns and resample the three weather frames to
         hourly means; the first ``date_calc`` of every hour is carried along
         for the forecast frames (``estimated`` and ``test``).
      2. Save the test set's hourly ``is_day:idx`` and drop the columns
         treated as noise.
      3. Add ``time_dummy`` / ``time_delta`` / ``is_estimated`` (``is_est``),
         join the weather data onto the targets, add calendar features
         (``add_time_features``) and remove stationary target runs
         (``delete_stationary``).

    The input frames are not modified; all work is done on copies.

    Parameters
    ----------
    targets : DataFrame with ``time`` and ``pv_measurement`` columns.
    observed : DataFrame of weather rows with a ``date_forecast`` column.
    estimated, test : DataFrames of forecast rows with ``date_forecast`` and
        ``date_calc`` columns.

    Returns
    -------
    filtered_data : training features (still containing ``date_forecast``).
    test_resampled : test features.
    is_day_feature : ``date_forecast`` and hourly mean ``is_day:idx`` of the
        test period.
    targets_ny : ``time`` and ``pv_measurement`` of the rows kept in
        ``filtered_data``.
    """
    def _hourly_mean(frame):
        # ``date_calc`` is a timestamp, not a weather feature, so it is taken
        # out before averaging and re-attached separately below. Relying on
        # ``mean()`` to drop it silently is fragile: when the column survives
        # the mean, the merge below produces ``date_calc_x``/``date_calc_y``
        # and the later ``date_calc`` lookups fail.
        features = frame.set_index('date_forecast').drop(columns=['date_calc'], errors='ignore')
        return features.resample('1H').mean().dropna(how='all').reset_index()

    def _hourly_date_calc(frame):
        # First forecast-issue time seen within each hour
        return frame.set_index('date_forecast')['date_calc'].resample('1H').first().to_frame()

    # Ensure the datetime columns are in datetime format. ``assign`` returns
    # new frames, so the caller's DataFrames are left untouched.
    targets = targets.assign(time=pd.to_datetime(targets['time']))
    observed = observed.assign(date_forecast=pd.to_datetime(observed['date_forecast']))
    estimated = estimated.assign(date_forecast=pd.to_datetime(estimated['date_forecast']))
    test = test.assign(date_forecast=pd.to_datetime(test['date_forecast']))

    date_calc_estimated = _hourly_date_calc(estimated)
    date_calc_test = _hourly_date_calc(test)

    observed_resampled = _hourly_mean(observed)
    estimated_resampled = _hourly_mean(estimated).merge(
        date_calc_estimated, left_on='date_forecast', right_index=True)
    test_resampled = _hourly_mean(test).merge(
        date_calc_test, left_on='date_forecast', right_index=True)

    # Save the is_day feature: the callers multiply their predictions by it
    is_day_feature = test_resampled[['date_forecast', 'is_day:idx']]

    # Drop the features that are treated as noise
    noise_columns = ['is_day:idx', 'snow_density:kgm3', 'elevation:m']
    test_resampled = test_resampled.drop(columns=noise_columns)
    observed_resampled = observed_resampled.drop(columns=noise_columns)
    estimated_resampled = estimated_resampled.drop(columns=noise_columns)

    # Add the observed/estimated indicator and forecast lead-time features
    weather_data, test_resampled = is_est(observed_resampled, estimated_resampled, test_resampled)

    # Merge with target values (only hours present in both are kept)
    merged_data = pd.merge(targets, weather_data, how='inner', left_on='time', right_on='date_forecast')

    # Add the time-based features
    merged_data = add_time_features(merged_data, 'time')
    test_resampled = add_time_features(test_resampled, 'date_forecast')

    filtered_data, targets_ny = delete_stationary(merged_data)

    # Drop the target and raw timestamps; ``date_forecast`` is deliberately
    # left in the training frame for the callers to filter on and drop.
    filtered_data = filtered_data.drop(columns=['time', 'pv_measurement', 'date_calc'])
    test_resampled = test_resampled.drop(columns=['date_forecast', 'date_calc'])

    return filtered_data, test_resampled, is_day_feature, targets_ny

In [None]:
def is_est_cat(observed, estimated, test):
    """Flag observed vs. forecast rows and stack the training weather data.

    An ``is_estimated`` column is added in place to every input: 1 for
    ``estimated`` and ``test``, 0 for ``observed``.

    Returns ``(df, test)`` where ``df`` is ``observed`` and ``estimated``
    concatenated and sorted by ``date_forecast``.
    """
    for frame, flag in ((estimated, 1), (observed, 0), (test, 1)):
        frame['is_estimated'] = flag

    stacked = pd.concat([observed, estimated], axis=0)
    df = stacked.sort_values(by='date_forecast')

    return df, test

In [None]:
# Preprocessing nr 2 for CatBoost models
def preprocessing_cat(targets, observed, estimated, test):
    """Build hourly April-August training features and full test features.

    Steps:
      1. Parse the time columns and resample the three weather frames from
         15 min to hourly means; the first ``date_calc`` of every hour is
         carried along for the forecast frames (``estimated`` and ``test``).
      2. Save the test set's hourly ``is_day:idx`` and drop the columns
         treated as noise.
      3. Keep only April-August rows of the training data (the test data is
         not filtered).
      4. Add ``is_estimated`` (``is_est_cat``), join the weather data onto the
         targets, add the sine time features (``add_time_features_cat``) and
         remove stationary target runs (``delete_stationary``).

    The input frames are not modified; all work is done on copies.

    Parameters
    ----------
    targets : DataFrame with ``time`` and ``pv_measurement`` columns.
    observed : DataFrame of weather rows with a ``date_forecast`` column.
    estimated, test : DataFrames of forecast rows with ``date_forecast`` and
        ``date_calc`` columns.

    Returns
    -------
    filtered_data : training features (no time columns left).
    test_resampled : test features.
    is_day_feature : ``date_forecast`` and hourly mean ``is_day:idx`` of the
        test period.
    targets_ny : ``time`` and ``pv_measurement`` of the rows kept in
        ``filtered_data``.
    """
    # Months whose data is used for training
    train_months = [4, 5, 6, 7, 8]

    def _hourly_mean(frame):
        # ``date_calc`` is a timestamp, not a weather feature, so it is taken
        # out before averaging and re-attached separately below. Relying on
        # ``mean()`` to drop it silently is fragile: when the column survives
        # the mean, the merge below produces ``date_calc_x``/``date_calc_y``
        # and the final ``date_calc`` drop fails.
        features = frame.set_index('date_forecast').drop(columns=['date_calc'], errors='ignore')
        return features.resample('1H').mean().dropna(how='all').reset_index()

    def _hourly_date_calc(frame):
        # First forecast-issue time seen within each hour
        return frame.set_index('date_forecast')['date_calc'].resample('1H').first().to_frame()

    # Ensure the datetime columns are in datetime format. ``assign`` returns
    # new frames, so the caller's DataFrames are left untouched.
    targets = targets.assign(time=pd.to_datetime(targets['time']))
    observed = observed.assign(date_forecast=pd.to_datetime(observed['date_forecast']))
    estimated = estimated.assign(date_forecast=pd.to_datetime(estimated['date_forecast']))
    test = test.assign(date_forecast=pd.to_datetime(test['date_forecast']))

    # Resample from 15 min to 1 hour
    date_calc_estimated = _hourly_date_calc(estimated)
    date_calc_test = _hourly_date_calc(test)

    observed_resampled = _hourly_mean(observed)
    estimated_resampled = _hourly_mean(estimated).merge(
        date_calc_estimated, left_on='date_forecast', right_index=True)
    test_resampled = _hourly_mean(test).merge(
        date_calc_test, left_on='date_forecast', right_index=True)

    # Save the is_day feature: the callers multiply their predictions by it
    is_day_feature = test_resampled[['date_forecast', 'is_day:idx']]

    # Drop the features that are treated as noise
    noise_columns = ['is_day:idx', 'snow_density:kgm3', 'elevation:m']
    test_resampled = test_resampled.drop(columns=noise_columns)
    observed_resampled = observed_resampled.drop(columns=noise_columns)
    estimated_resampled = estimated_resampled.drop(columns=noise_columns)

    # Keep April to August only. ``.copy()`` makes the filtered frames
    # independent, because ``is_est_cat`` assigns a new column onto them.
    observed_resampled = observed_resampled[
        observed_resampled['date_forecast'].dt.month.isin(train_months)].copy()
    estimated_resampled = estimated_resampled[
        estimated_resampled['date_forecast'].dt.month.isin(train_months)].copy()
    targets = targets[targets['time'].dt.month.isin(train_months)]

    # Stack the observed and estimated data
    weather_data, test_resampled = is_est_cat(observed_resampled, estimated_resampled, test_resampled)

    # Merge with the (equally month-filtered) target values
    merged_data = pd.merge(targets, weather_data, how='inner', left_on='time', right_on='date_forecast')
    merged_data = add_time_features_cat(merged_data, 'time')
    test_resampled = add_time_features_cat(test_resampled, 'date_forecast')

    filtered_data, targets_ny = delete_stationary(merged_data)

    # Drop the target and the raw timestamps
    filtered_data = filtered_data.drop(columns=['time', 'date_forecast', 'pv_measurement', 'date_calc'])
    test_resampled = test_resampled.drop(columns=['date_forecast', 'date_calc'])

    return filtered_data, test_resampled, is_day_feature, targets_ny

In [None]:
def process_location_ex(X, y, location_name, seeds):
    """Train a tuned, bagged LightGBM model with PyCaret for one location.

    Parameters
    ----------
    X : feature DataFrame; it is copied, not modified.
    y : DataFrame whose ``pv_measurement`` column is the regression target.
    location_name : used in the PyCaret experiment name ``exp_<location_name>``.
    seeds : value passed to PyCaret as ``session_id``.

    Returns
    -------
    The model returned by ``finalize_model`` for the bagged, tuned LightGBM.
    """
    data = X.copy()
    data['target'] = y['pv_measurement']

    # Two extra features for this model only; they are built here so that the
    # shared preprocessing function can be reused by the other models.
    # 1) Direct and diffuse radiation weighted by the cloud-cover fraction
    cloud_fraction = data['total_cloud_cover:p']/100
    data['weighted_rad'] = ((data['direct_rad:W'] * (1 - cloud_fraction)) +
                            (data['diffuse_rad:W'] * cloud_fraction))

    # 2) Clear-sky radiation adjusted for humidity and air density
    humidity_factor = np.exp(-0.0001 * data['absolute_humidity_2m:gm3'])
    density_factor = 1 - 0.1 * (data['air_density_2m:kgm3'] - 1.225)
    data['adjusted_clear_sky_rad'] = data['clear_sky_rad:W'] * humidity_factor * density_factor

    # Set up the PyCaret environment.
    # (remove_outliers=True is left disabled: results got worse with it.)
    setup(
        data=data,
        target='target',
        session_id=seeds,
        imputation_type="iterative",
        categorical_iterative_imputer="lightgbm",
        numeric_iterative_imputer="lightgbm",
        iterative_imputation_iters=5,
        categorical_features=['dew_or_rime:idx', 'is_in_shadow:idx', 'is_estimated'],
        html=False,
        experiment_name=f'exp_{location_name}',
    )

    # Base model -> tuned -> bagged -> finalized (trained on the whole dataset)
    base_model = create_model('lightgbm')
    tuned_model = tune_model(base_model)
    bagged_model = ensemble_model(tuned_model, method='Bagging')

    return finalize_model(bagged_model)

In [None]:
def process_location_cat_2(X, y, location_name, seeds):
    """Train a tuned, bagged CatBoost model with PyCaret for one location.

    A fixed list of columns is removed from ``X`` before training (``X``
    itself is not modified).

    Parameters
    ----------
    X : feature DataFrame containing at least the columns listed below.
    y : DataFrame whose ``pv_measurement`` column is the regression target.
    location_name : used in the PyCaret experiment name ``exp_<location_name>``.
    seeds : value passed to PyCaret as ``session_id``.

    Returns
    -------
    The model returned by ``finalize_model`` for the bagged, tuned CatBoost.
    """
    # Features left out for this one model
    features_to_drop = [
        'dew_or_rime:idx',
        'fresh_snow_3h:cm',
        'fresh_snow_1h:cm',
        'snow_drift:idx',
        'snow_depth:cm',
        'wind_speed_w_1000hPa:ms',
        'prob_rime:p',
        'fresh_snow_6h:cm',
        'snow_melt_10min:mm',
        'fresh_snow_12h:cm',
        'rain_water:kgm2',
        'super_cooled_liquid_water:kgm2',
    ]

    # ``drop`` returns a new frame, so adding the target leaves ``X`` alone
    data = X.drop(columns=features_to_drop)
    data['target'] = y['pv_measurement']

    # Set up the PyCaret environment. No categorical features are declared
    # for this model, and remove_outliers=True is left disabled (results got
    # worse with it).
    setup(
        data=data,
        target='target',
        session_id=seeds,
        imputation_type="iterative",
        categorical_iterative_imputer="lightgbm",
        numeric_iterative_imputer="lightgbm",
        iterative_imputation_iters=5,
        html=False,
        experiment_name=f'exp_{location_name}',
    )

    # Base model -> tuned -> bagged -> finalized (trained on the whole dataset)
    base_model = create_model('catboost')
    tuned_model = tune_model(base_model)
    bagged_model = ensemble_model(tuned_model, method='Bagging')

    return finalize_model(bagged_model)

In [None]:
# Locations to train on and predict for; each name is also the directory
# prefix of that location's parquet files (e.g. 'A/train_targets.parquet').
locations = ['A']

# Training and predictions

### LightGBM

In [None]:
# LightGBM training and predictions: one tuned, bagged PyCaret LightGBM model
# per location, averaged over the seeds listed below.
all_predictions_lGBM_e = []
for loc in locations:

    # Load the raw data for this location (missing targets become 0)
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')
    
    # Shared preprocessing: hourly features, time features, stationary runs removed
    X_train_1, X_test_1, is_day_feature_1, targets_1 = preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated)
    
    # The training frame still carries date_forecast; it is not a model feature
    X_train_1 = X_train_1.drop(columns=['date_forecast'])
    # process_location_ex adds these two engineered features to the training
    # data, so the test set needs exactly the same columns.
    X_test_1['weighted_rad'] = ((X_test_1['direct_rad:W'] * (1 - X_test_1['total_cloud_cover:p']/100)) +
                        (X_test_1['diffuse_rad:W'] * (X_test_1['total_cloud_cover:p']/100)))

    X_test_1['adjusted_clear_sky_rad'] = (X_test_1['clear_sky_rad:W'] *
                                  np.exp(-0.0001 * X_test_1['absolute_humidity_2m:gm3']) *
                                  (1 - 0.1 * (X_test_1['air_density_2m:kgm3'] - 1.225)))
    
    # Train and predict once per seed, summing the predictions
    total_predictions_light = None
    seeds = [42]
    for seed in seeds: 
        final_model_lGBM_e = process_location_ex(X_train_1, targets_1, loc, seed)
        predictions_lGBM_e = predict_model(final_model_lGBM_e, data=X_test_1)
        final_predictions_lGBM_e = predictions_lGBM_e['prediction_label']
        # The accumulator is created lazily so it matches the prediction's shape and dtype
        if total_predictions_light is None:
            total_predictions_light = np.zeros_like(final_predictions_lGBM_e)
        total_predictions_light += final_predictions_lGBM_e

    # Average over the seeds
    mean_pred_light = total_predictions_light/len(seeds)

    # Scale by is_day:idx so night-time predictions become zero. The value is
    # an hourly mean, so hours that are only partly daylight are scaled down.
    adjusted_final_predictions_lGBM_e = mean_pred_light * is_day_feature_1['is_day:idx']

    # Set negative predictions to zero
    adjusted_final_predictions_lGBM_e = np.clip(adjusted_final_predictions_lGBM_e, 0, None)

    # Append this location's predictions (wrapped in a list; flattened when the models are combined)
    all_predictions_lGBM_e.append([adjusted_final_predictions_lGBM_e])

# Convert the list of per-location predictions to an array
all_predictions_lGBM_e = np.array(all_predictions_lGBM_e)

### CatBoost

In [None]:
all_predictions_cat = []
# CatBoost model nr 1: a plain CatBoostRegressor trained on the April-August
# data produced by preprocessing_cat.
for loc in locations:
    
    # Load the raw data for this location (missing targets become 0)
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

    # CatBoost-specific preprocessing (month filter, sine time features)
    X_train_cat, X_test_cat, is_day_feature1, targets_cat = preprocessing_cat(train, X_train_observed, X_train_estimated, X_test_estimated)

    # Categorical features are cast to int before being handed to CatBoost.
    # NOTE(review): these columns are hourly means after preprocessing, so the
    # cast truncates fractional values and would fail on NaN - confirm intended.
    cat_features = ['dew_or_rime:idx' ,'is_in_shadow:idx']
    X_train_cat['dew_or_rime:idx'] = X_train_cat['dew_or_rime:idx'].astype(int)
    X_train_cat['is_in_shadow:idx'] = X_train_cat['is_in_shadow:idx'].astype(int)
    X_test_cat['dew_or_rime:idx'] = X_test_cat['dew_or_rime:idx'].astype(int)
    X_test_cat['is_in_shadow:idx'] = X_test_cat['is_in_shadow:idx'].astype(int)

    # Model definition: MAE loss, fixed seed, progress logged every 200 iterations
    model_cat = CatBoostRegressor(
        loss_function='MAE', 
        learning_rate=0.1, 
        verbose=200,
        cat_features=cat_features,
        random_state=42) 
        # Disabled options: n_estimators=20000, early_stopping_rounds=50

    # A train/validation split with an eval_set is disabled here; the model is
    # fitted on all of the training rows.
    
    # Training
    model_cat.fit(X_train_cat,targets_cat['pv_measurement'])

    # Prediction
    predictions_cat = model_cat.predict(X_test_cat)
    # Kept for inspection; not used further in this cell
    feature_importances = model_cat.get_feature_importance()
    # Scale by is_day:idx so night-time predictions become zero
    adjusted_final_predictions_cat = predictions_cat * is_day_feature1['is_day:idx']

    # Set negative predictions to zero
    adjusted_final_predictions_cat = np.clip(adjusted_final_predictions_cat, 0, None)

    # Append this location's predictions to the final list
    all_predictions_cat.append(adjusted_final_predictions_cat)

# Convert the list of per-location predictions to an array
all_predictions_cat = np.array(all_predictions_cat)

In [None]:
# CatBoost model nr 2: tuned, bagged PyCaret CatBoost, averaged over seeds
all_predictions_cat_2 = []
for loc in locations:

    # Load the raw data for this location (missing targets become 0)
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

    # Shared preprocessing
    X_train, X_test, is_day_feature, targets = preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated)

    # All months are kept (a winter-month filter on date_forecast is not
    # applied); only the date column itself is removed.
    X_train_cat_2 = X_train.drop(columns=['date_forecast'])

    # Train and predict once per seed, summing the predictions
    seeds = [42]
    total_predictions_cat_2 = None
    for seed in seeds:
        final_model_cat_2 = process_location_cat_2(X_train_cat_2, targets, loc, seed)
        # NOTE(review): X_test still contains the columns that
        # process_location_cat_2 drops from the training data - confirm that
        # predict_model tolerates the extra columns.
        predictions_cat_2 = predict_model(final_model_cat_2, data=X_test)
        final_predictions_cat_2 = predictions_cat_2['prediction_label']
        if total_predictions_cat_2 is None:
            total_predictions_cat_2 = np.zeros_like(final_predictions_cat_2)
        # Must run for every seed: when this line sits inside the ``if``
        # above, only the first seed is summed while the total is still
        # divided by len(seeds).
        total_predictions_cat_2 += final_predictions_cat_2

    # Average over the seeds
    mean_pred_cat_2 = total_predictions_cat_2/len(seeds)

    # Scale by is_day:idx so night-time predictions become zero, then clip negatives
    adjusted_final_predictions_cat_2 = mean_pred_cat_2 * is_day_feature['is_day:idx']
    adjusted_final_predictions_cat_2 = np.clip(adjusted_final_predictions_cat_2, 0, None)
    all_predictions_cat_2.append([adjusted_final_predictions_cat_2])

# Convert the list of per-location predictions to an array
all_predictions_cat_2 = np.array(all_predictions_cat_2)

In [None]:
# CatBoost model nr 3: deep CatBoostRegressor with early stopping on a
# held-out 20% of the training rows.
all_predictions_cat_3 = []
for loc in locations:

    # Load the raw data for this location (missing targets become 0)
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

    # Shared preprocessing
    X_train_3, X_test_3, is_day_feature_3, targets_3 = preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated)

    # The training frame still carries date_forecast; it is not a model feature
    X_train_3 = X_train_3.drop(columns=['date_forecast'])

    model_cat_3 = CatBoostRegressor(
        verbose=200,
        learning_rate=0.03,
        depth=10,
        l2_leaf_reg=5,
        random_state=42,
        n_estimators=20000,
        loss_function='MAE',
        early_stopping_rounds=100,
    )

    # Replace the sun azimuth angle (degrees) by its sine and cosine
    X_train_3['sin_sun_azimuth'] = np.sin(np.radians(X_train_3['sun_azimuth:d']))
    X_train_3['cos_sun_azimuth'] = np.cos(np.radians(X_train_3['sun_azimuth:d']))
    X_test_3['sin_sun_azimuth'] = np.sin(np.radians(X_test_3['sun_azimuth:d']))
    X_test_3['cos_sun_azimuth'] = np.cos(np.radians(X_test_3['sun_azimuth:d']))
    X_train_3 = X_train_3.drop(columns=['sun_azimuth:d'])
    X_test_3 = X_test_3.drop(columns=['sun_azimuth:d'])

    # Hold out 20% of the training rows as the validation set for early
    # stopping. The seeded shuffle split is done with NumPy so that this cell
    # does not depend on a `train_test_split` name, which this notebook never
    # imports (presumably scikit-learn's) and which therefore fails on a
    # fresh kernel.
    n_rows = len(X_train_3)
    n_val = int(np.ceil(0.2 * n_rows))
    shuffled_positions = np.random.RandomState(42).permutation(n_rows)
    val_positions = shuffled_positions[:n_val]
    train_positions = shuffled_positions[n_val:]
    X_train_cat_3 = X_train_3.iloc[train_positions]
    X_test_cat_3 = X_train_3.iloc[val_positions]
    y_train_cat_3 = targets_3.iloc[train_positions]
    y_test_cat_3 = targets_3.iloc[val_positions]

    # Train on the 80% part; the held-out part is the eval_set
    model_cat_3.fit(
        X_train_cat_3, y_train_cat_3['pv_measurement'],
        eval_set=(X_test_cat_3, y_test_cat_3['pv_measurement']),
    )

    # Predict on the real test set
    pred_cat_3 = model_cat_3.predict(X_test_3)

    # Scale by is_day:idx so night-time predictions become zero
    adjusted_final_predictions_cat_3 = pred_cat_3 * is_day_feature_3['is_day:idx']

    # Set negative predictions to zero
    adjusted_final_predictions_cat_3 = np.clip(adjusted_final_predictions_cat_3, 0, None)

    # Append this location's predictions to the final list
    all_predictions_cat_3.append(adjusted_final_predictions_cat_3)

# Convert the list of per-location predictions to an array
all_predictions_cat_3 = np.array(all_predictions_cat_3)

## XGBoost

In [None]:
# XGBoost with some extra features
def train_xgb(X, y, location_name, seeds):
    """Train a tuned, bagged XGBoost model with PyCaret for one location.

    Parameters
    ----------
    X : feature DataFrame; it is copied, not modified.
    y : DataFrame whose ``pv_measurement`` column is the regression target.
    location_name : used in the PyCaret experiment name ``exp_<location_name>``.
    seeds : value passed to PyCaret as ``session_id``.

    Returns
    -------
    The model returned by ``finalize_model`` for the bagged, tuned XGBoost.
    """
    data = X.copy()
    data['target'] = y['pv_measurement']

    # Two extra features for this model; they are built here so that the
    # shared preprocessing function can be reused by the other models.
    # 1) Direct and diffuse radiation weighted by the cloud-cover fraction
    cloud_fraction = data['total_cloud_cover:p']/100
    data['weighted_rad'] = ((data['direct_rad:W'] * (1 - cloud_fraction)) +
                            (data['diffuse_rad:W'] * cloud_fraction))

    # 2) Clear-sky radiation adjusted for humidity and air density
    humidity_factor = np.exp(-0.0001 * data['absolute_humidity_2m:gm3'])
    density_factor = 1 - 0.1 * (data['air_density_2m:kgm3'] - 1.225)
    data['adjusted_clear_sky_rad'] = data['clear_sky_rad:W'] * humidity_factor * density_factor

    # Set up the PyCaret environment
    setup(
        data=data,
        target='target',
        session_id=seeds,
        categorical_features=['dew_or_rime:idx', 'is_in_shadow:idx', 'is_estimated'],
        imputation_type="iterative",
        categorical_iterative_imputer="lightgbm",
        numeric_iterative_imputer="lightgbm",
        iterative_imputation_iters=5,
        html=False,
        experiment_name=f'exp_{location_name}',
    )

    # Base XGBoost model -> tuned for MAE -> bagged -> finalized (trained on
    # the whole dataset)
    base_model = create_model('xgboost')
    tuned_model = tune_model(base_model, optimize='MAE')
    bagged_model = ensemble_model(tuned_model, method='Bagging')

    return finalize_model(bagged_model)

In [None]:
xb = []  # XGBoost predictions, one entry per location

for loc in locations:
    # Load the raw data for this location. The reads are spelled out, exactly
    # as in the other training cells, because no `load` helper is defined in
    # this notebook and calling one fails on a fresh kernel.
    # NOTE(review): the .fillna(0) on the targets mirrors the other cells -
    # confirm the original helper did the same.
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

    # Shared preprocessing
    X_train, X_test, is_day_feature, targets = preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated)

    # The training frame still carries date_forecast; it is not a model feature
    X_train = X_train.drop(columns=['date_forecast'])

    # train_xgb adds these two engineered features to the training data, so
    # the test set needs exactly the same columns.
    X_test['weighted_rad'] = ((X_test['direct_rad:W'] * (1 - X_test['total_cloud_cover:p']/100)) +
                              (X_test['diffuse_rad:W'] * (X_test['total_cloud_cover:p']/100)))

    X_test['adjusted_clear_sky_rad'] = (X_test['clear_sky_rad:W'] *
                                        np.exp(-0.0001 * X_test['absolute_humidity_2m:gm3']) *
                                        (1 - 0.1 * (X_test['air_density_2m:kgm3'] - 1.225)))

    # Train the XGBoost model (single seed)
    final_model_xgb = train_xgb(X_train, targets, loc, 42)

    # Predict on the test set
    predictions_xgb = predict_model(final_model_xgb, data=X_test)
    final_predictions_xgb = predictions_xgb['prediction_label']

    # Scale by is_day:idx so night-time predictions become zero
    final_predictions_xgb_filter = final_predictions_xgb * is_day_feature['is_day:idx']

    # Set negative predictions to zero
    final_predictions_xgb_filter = np.clip(final_predictions_xgb_filter, 0, None)

    # Append this location's predictions (wrapped in a list; flattened when the models are combined)
    xb.append([final_predictions_xgb_filter])

# Convert the list of per-location predictions to an array
xb = np.array(xb)


## Combining Model Predictions

In [None]:
# Flatten every model's per-location predictions into one 1-D vector
all_predictions_lGBM_e = np.array(all_predictions_lGBM_e).flatten()
all_predictions_xb = np.array(xb).flatten()
all_predictions_cat = np.array(all_predictions_cat).flatten()
all_predictions_cat_2 = np.array(all_predictions_cat_2).flatten()
all_predictions_cat_3 = np.array(all_predictions_cat_3).flatten()

# Equal-weight blend of the five models
MODEL_WEIGHT = 0.2
all_pred = (
    MODEL_WEIGHT * all_predictions_cat
    + MODEL_WEIGHT * all_predictions_lGBM_e
    + MODEL_WEIGHT * all_predictions_cat_2
    + MODEL_WEIGHT * all_predictions_cat_3
    + MODEL_WEIGHT * all_predictions_xb
)

# Blended predictions below this threshold are set to zero
MIN_PREDICTION = 6
all_pred[all_pred < MIN_PREDICTION] = 0
print(all_pred.shape)

#### Saving the final predictions to CSV

In [None]:
final_predictions = all_pred

# Write the predictions to CSV with the running row number as the ``id`` column
df = (
    pd.DataFrame(final_predictions, columns=['prediction'])
    .rename_axis('id')
    .reset_index()
)
df.to_csv('final_predictions.csv', index=False)

In [None]:
# Plot the difference between the first 720 rows of a previously saved
# submission and the predictions that were just written.
df1 = pd.read_csv('final_predictions_ok.csv')
df_diff = df1.iloc[:720].sub(df)
df_diff.plot()