In [1]:
from pathlib import Path

import pandas as pd
import xarray as xr

import data_preprocessing as dp

# Open the raw NetCDF dataset and echo its summary so the available
# dimensions and variables are visible in the cell output.
input_data = xr.open_dataset('./raw_data/UEPS_v1.nc')
print("Input Data (Brazil Windfarm):\n{}".format(input_data))

# Name of the variable to be predicted.
target_variable = "active_power_total"

# NaN-filling strategy; one of "interpolate_spline", "ffill", "bfill".
# It is only defined here -- nothing in this cell reads it.
fill_nan_method = "interpolate_spline"

# Switch for min-max normalization of the data.
min_max_normalization = False

# Collect the data variables of the dataset and convert them to a
# dataframe via the project helpers. From here on `input_data` no longer
# refers to the xarray Dataset.
variable_list = dp.get_data_variables(input_data)
input_data = dp.dictionary_to_dataframe(variable_list)

# Pull the target column out before any rescaling takes place.
target_data = input_data[target_variable]

if min_max_normalization:
    # Min-max scaling, (x - min) / (max - min), applied to the inputs and,
    # independently, to the target.
    input_low, input_high = input_data.min(), input_data.max()
    input_data = (input_data - input_low) / (input_high - input_low)

    target_low, target_high = target_data.min(), target_data.max()
    target_data = (target_data - target_low) / (target_high - target_low)

# Drop NaN columns with the project helper. The next cell reads the result
# as `input_data[0]`, so the helper presumably returns a sequence whose
# first element is the cleaned dataframe -- TODO confirm in data_preprocessing.
input_data = dp.remove_nan_columns(input_data, target_data)

# Optional export of the data at this stage (disabled).
# NOTE(review): if `input_data` is a sequence at this point, this line would
# need `input_data[0]` before it can be re-enabled -- confirm.
# input_data.to_csv("brazil_data_unprocessed.csv", encoding='utf-8', index=False)

Input Data (Brazil Windfarm):
<xarray.Dataset>
Dimensions:                 (Time: 52560, Height: 6, Turbine: 20, Range: 26)
Coordinates:
  * Time                    (Time) datetime64[ns] 2013-08-01 ... 2014-07-31T2...
  * Height                  (Height) float64 10.0 20.0 40.0 60.0 80.0 100.0
  * Turbine                 (Turbine) float64 1.0 2.0 3.0 4.0 ... 18.0 19.0 20.0
Dimensions without coordinates: Range
Data variables: (12/48)
    range                   (Range) float64 ...
    wind_speed              (Height, Time) float64 ...
    wind_direction          (Height, Time) float64 ...
    wind_speed_std          (Height, Time) float64 ...
    wind_direction_std      (Height, Time) float64 ...
    wind_speed_max          (Height, Time) float64 ...
    ...                      ...
    lidar_wind_direction    (Range, Time) float64 ...
    lidar_wind_speed_std    (Range, Time) float64 ...
    lidar_ws_u              (Range, Time) float64 ...
    lidar_ws_v              (Range, Time) flo

In [2]:
# Feature-selection algorithm; one of "pearson", "spearman", "xgboost",
# "time_lag_corr".
feature_selection_type = "xgboost"

# Number of top-ranked features to keep.
num_features = 11

# Prediction horizon in time steps: 6 = 1 hour, 36 = 6 hours, 144 = 24 hours.
# NOTE(review): the value is passed as a negative number; the sign convention
# is defined by dp.feature_selection_func -- confirm there.
time_lag = -144

# The previous cell leaves the result of dp.remove_nan_columns in
# `input_data`; its first element is the frame used for feature selection.
# Bind it once instead of indexing repeatedly.
cleaned_data = input_data[0]

# Rank the features with the chosen algorithm.
top_features = dp.feature_selection_func(
    cleaned_data,
    target_data,
    target_variable,
    feature_selection_type,
    num_features,
    time_lag,
)

# Keep only the first `num_features` selected columns. The target column is
# part of the result only if the helper returns it among the top features.
filtered_data = cleaned_data[top_features[:num_features]]

# Write the filtered data to CSV. The output directory is created first,
# because to_csv raises an error when the parent directory does not exist
# (e.g. on a fresh checkout).
output_path = Path(f'./preprocessed_data/filtered_dataset_brazil2_{feature_selection_type}.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
filtered_data.to_csv(output_path, index=False)

Top features based on XGBoost feature importance:
['active_power_total', 'active_power_total_min', 'wind_speed_max', 'air_density', 'rotor_rpm_max', 'wind_speed_nacelle', 'rotor_rpm', 'tilt_Y_std', 'UST', 'lidar_wind_speed_std', 'act_position']
