In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import tqdm
from pandas_ods_reader import read_ods
from sklearn.feature_selection import RFECV
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold
# Suppress every warning for the rest of the session.
# NOTE(review): this hides all warning categories, including ones that may
# point at real problems — confirm a blanket filter is intended.
warnings.filterwarnings('ignore')

### A Recursive Feature Elimination (RFE) model with automatic tuning of the number of features selected with cross-validation.

In [2]:
# Base names (without the '.csv' extension) of the per-site daily output files.
file_names = [
    'flux_soc_out',
    'atmosphere_out',
    'n_flux_out',
    'p_flux_out',
    'soil_c_out',
    'soil_water_out',
    'surf_water_out',
    'temp_out',
]

# Site directories to load.
# NOTE(review): the second entry uses 'maize-soybean' (hyphen) while the other
# two use 'maize_soybean' — confirm it matches the directory name on disk.
dirs = [
    'warm_temp_maize_soybean_irrigated',
    'warm_temp_maize-soybean_dryland',
    'cool_temp_maize_soybean',
]

# Candidate target column names, grouped by kind.
target_flux = [
    'CO2_FLUX',
    'NBP',
]
target_soil = [
    'ECO_CO2_FLUX',
    'SOIL_CO2_FLUX',
]

In [3]:
def load_datasets(dirs: list, base_dir: str = 'datasets', names=None) -> dict:
    """Read every per-site CSV into a nested dict of DataFrames.

    For each directory in ``dirs`` the files
    ``<base_dir>/<dir>/daily_data_merged/<name>.csv`` are read, one per entry
    in ``names``, and the first column of each file is dropped.

    Parameters
    ----------
    dirs : list
        Site directory names located under ``base_dir``.
    base_dir : str, default 'datasets'
        Root folder that contains the site directories.
    names : list of str, optional
        File base names (without '.csv') to read. When omitted, the
        module-level ``file_names`` list is used.

    Returns
    -------
    dict
        ``{dir_name: {file_name: DataFrame}}``, in the order of ``dirs`` and
        ``names``. Empty when ``dirs`` is empty.
    """
    if names is None:
        # Resolved at call time so the default follows the module-level list.
        names = file_names

    loaded = {}
    for dr in dirs:
        path = os.path.join(base_dir, dr, 'daily_data_merged')
        tables = {}
        for name in names:
            frame = pd.read_csv(os.path.join(path, name + '.csv'))
            # The first column is discarded for every file.
            # NOTE(review): presumably a saved index column — confirm.
            tables[name] = frame.drop(frame.columns[0], axis=1)
        loaded[dr] = tables

    return loaded

# Load the CSV tables for every directory listed in `dirs`; the result is
# keyed by directory name, then by file name.
datasets = load_datasets(dirs)

In [None]:
# Tables for the irrigated site, keyed by file name.
irrigated_df = datasets['warm_temp_maize_soybean_irrigated']

# Join all tables column-wise in one concat call. Growing a frame with
# pd.concat inside a loop re-copies every accumulated column on each pass.
x = pd.concat(list(irrigated_df.values()), axis=1)

# Target: the CO2_FLUX column of the flux/SOC table.
y = irrigated_df['flux_soc_out']['CO2_FLUX']

# Remove the target and the non-feature columns from the predictors.
# NOTE(review): other candidate targets (e.g. those listed in target_flux /
# target_soil) are still present in x — confirm they should stay as features.
x = x.drop(['CO2_FLUX', 'DATE', 'unnamed.1'], axis=1)

In [None]:
min_features_to_select = 1  # Minimum number of features to consider

# Candidate ridge penalties: 0.5, 1.0, ..., 25.0
lmbdas = [i/2 for i in range(1,51)]
# Seeded splitter, shared by the inner alpha search and the outer RFECV loop.
cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
clf = RidgeCV(alphas=lmbdas, cv=cv, scoring='neg_mean_squared_error')

rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    # RidgeCV is a regressor, so score with a regression metric (the same one
    # RidgeCV uses above); "accuracy" is a classification metric.
    scoring="neg_mean_squared_error",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
)
# The feature frame built earlier is named lowercase `x`.
rfecv.fit(x, y)

print(f"Optimal number of features: {rfecv.n_features_}")