In [1]:
import pandas as pd
from pandas_ods_reader import read_ods
import seaborn as sn
import matplotlib.pyplot as plt
import tqdm
import os
import warnings
warnings.filterwarnings('ignore')

### This notebook finds the correlations between the target variables (net biome productivity, soil organic carbon, ecosystem CO2) and the rest of the properties


#### Correlation is measured using Pearson's correlation coefficient

#### The `target_flux` and `target_soil` lists contain the target variables taken from their respective files

In [2]:
# Stems of the CSV files read (with a '.csv' suffix) for every scenario directory.
file_names = [
    'flux_soc_out',
    'atmosphere_out',
    'n_flux_out',
    'p_flux_out',
    'soil_c_out',
    'soil_water_out',
    'surf_water_out',
    'temp_out',
]

# Scenario sub-directories located under 'datasets/'.
dirs = [
    'warm_temp_maize_soybean_irrigated',
    'warm_temp_maize-soybean_dryland',
    'cool_temp_maize_soybean',
]

# Target columns correlated against everything else:
# target_flux is looked up in 'flux_soc_out', target_soil in 'soil_c_out'.
target_flux = ['CO2_FLUX', 'NBP']
target_soil = ['ECO_CO2_FLUX', 'SOIL_CO2_FLUX']

In [3]:
def load_datasets(dirs: list) -> dict:
    """Read the daily CSV files of every scenario directory.

    For each entry ``dr`` of ``dirs`` the files named in the module-level
    ``file_names`` list are read from ``datasets/<dr>/daily_data_merged`` and
    the first column of each file is discarded.

    Parameters
    ----------
    dirs : list
        Names of the scenario directories under ``datasets/``.

    Returns
    -------
    dict
        Maps each directory name to a dict that maps a file name (without the
        ``.csv`` suffix) to its ``pandas.DataFrame``.
    """
    def _read_without_first_column(folder, stem):
        # Load one CSV and drop its leading column.
        frame = pd.read_csv(os.path.join(folder, stem + '.csv'))
        return frame.drop(columns=frame.columns[0])

    datasets = {}
    for dr in dirs:
        folder = 'datasets/' + dr + '/daily_data_merged'
        datasets[dr] = {
            stem: _read_without_first_column(folder, stem)
            for stem in file_names
        }

    return datasets

datasets = load_datasets(dirs)

In [4]:
# no GPT / copilot baby
def get_correlations(data: dict, file_name: str, target_vars: list, threshold: float) -> dict:
    """Collect the columns that correlate strongly with each target variable.

    For every target in ``target_vars`` the Pearson correlation between the
    target and each numeric column of every frame in ``data`` is computed, and
    the coefficients whose absolute value exceeds ``threshold`` are kept.
    The frames in ``data`` are never modified.

    Parameters
    ----------
    data : dict
        Maps a file name to its ``pandas.DataFrame``.
    file_name : str
        Key of the frame in ``data`` that holds the target columns.
    target_vars : list
        Column names of ``data[file_name]`` to correlate against.
    threshold : float
        Only coefficients with ``abs(r) > threshold`` are returned.

    Returns
    -------
    dict
        Maps each target name to a ``pandas.Series`` of coefficients indexed by
        column name and sorted from most positive to most negative. A column
        name that occurs in several frames appears once per frame.

    Raises
    ------
    KeyError
        If ``file_name`` is not a key of ``data`` or a target is not a column
        of ``data[file_name]``.
    """
    corr_dict = {}
    for target_var in target_vars:
        target_data = data[file_name][target_var]
        pieces = []

        for df in data.values():
            # A frame that has its own column named like the target is
            # correlated against that column; any other frame is correlated
            # against the target series (aligned on the index). Nothing is
            # written back into ``df``, so later targets and later calls never
            # see an injected target column as if it were a feature.
            reference = df[target_var] if target_var in df.columns else target_data

            # corrwith computes one coefficient per column instead of the full
            # column-by-column matrix that df.corr() would build.
            c = df.select_dtypes(include='number').corrwith(reference)

            # The target always correlates perfectly with itself; leave it out.
            c = c.drop(labels=target_var, errors='ignore')

            # NaN coefficients (e.g. constant columns) fail the comparison and
            # are dropped here as well.
            pieces.append(c[c.abs() > threshold])

        # ``data[file_name]`` exists, so ``pieces`` holds at least one Series.
        corr_dict[target_var] = pd.concat(pieces).sort_values(ascending=False)

    return corr_dict

In [5]:
# For every loaded scenario, gather the strong correlations (|r| > 0.6) of the
# flux targets and of the soil targets, keyed as 'flux' and 'soc'.
corrs_per_dataset = {}
for idx, frames in datasets.items():
    # Flux targets are evaluated before the soil targets.
    flux_corr = get_correlations(frames, 'flux_soc_out', target_flux, 0.6)
    soil_corr = get_correlations(frames, 'soil_c_out', target_soil, 0.6)

    targets = {'flux': flux_corr, 'soc': soil_corr}
    corrs_per_dataset[idx] = targets

In [10]:
# Show the correlations with CO2_FLUX whose absolute value exceeds 0.6 for the
# 'warm_temp_maize_soybean_irrigated' dataset, sorted in descending order.
corrs_per_dataset['warm_temp_maize_soybean_irrigated']['flux']['CO2_FLUX']

ECO_RH             0.990075
AUTO_RESP          0.936333
ECO_RA             0.854971
NET_PL_EXCH_P      0.733536
NET_PL_EXCH_N      0.717263
N2O_FLUX           0.642676
O2_13              0.603034
TMIN_SOIL_11      -0.605539
TEMP_11           -0.605933
TMAX_SOIL_11      -0.606578
TTL_DIC           -0.613154
SUR_DON+SED_FLX   -0.615236
CO2_12            -0.624699
LITTER_C          -0.626094
ECO_HVST_N        -0.663872
SUR_DOC+SED_FLX   -0.664400
CO2_13            -0.665710
TMIN_SOIL_12      -0.666647
TEMP_12           -0.666994
TMAX_SOIL_12      -0.667543
NET_N_MIN         -0.685966
ECO_HVST_P        -0.687861
CH4_FLUX          -0.703826
TMIN_SOIL_13      -0.707148
TEMP_13           -0.707399
TMAX_SOIL_13      -0.707812
TMIN_SOIL_14      -0.724191
TEMP_14           -0.724379
TMAX_SOIL_14      -0.724703
SUR_DIC_FLX       -0.727757
NET_P_MIN         -0.732090
ECO_NPP           -0.736765
SUR_DIN_FLX       -0.787998
ECO_GPP           -0.793190
RUNOFF            -0.835934
PRECN             -0

In [11]:
# Show the correlations with CO2_FLUX whose absolute value exceeds 0.6 for the
# 'warm_temp_maize-soybean_dryland' dataset, sorted in descending order.
corrs_per_dataset['warm_temp_maize-soybean_dryland']['flux']['CO2_FLUX']

ECO_RH           0.949529
ECO_RA           0.749474
AUTO_RESP        0.673349
O2_13            0.651048
NET_PL_EXCH_N    0.626926
O2_12            0.618405
N2O_FLUX         0.601077
RUNOFF          -0.622614
TMIN_SOIL_11    -0.654392
TEMP_11         -0.654804
TMAX_SOIL_11    -0.655432
TMIN_SOIL_12    -0.706792
CH4_FLUX        -0.706902
TEMP_12         -0.707151
TMAX_SOIL_12    -0.707677
SUR_DIN_FLX     -0.735948
TMIN_SOIL_13    -0.739534
TEMP_13         -0.739793
TMAX_SOIL_13    -0.740175
TMIN_SOIL_14    -0.752385
TEMP_14         -0.752579
TMAX_SOIL_14    -0.752874
ECO_GPP         -0.761284
ECO_NPP         -0.762566
ET              -0.869423
PRECN           -0.909101
O2_FLUX         -0.997614
dtype: float64

In [248]:
# Show the correlations with CO2_FLUX whose absolute value exceeds 0.6 for the
# 'cool_temp_maize_soybean' dataset, sorted in descending order.
corrs_per_dataset['cool_temp_maize_soybean']['flux']['CO2_FLUX']

CH4_FLUX         0.657553
LITTER_C         0.619137
AUTO_RESP       -0.677937
SUB_DIC_FLX     -0.735793
NET_PL_EXCH_P   -0.812025
NET_PL_EXCH_N   -0.813142
dtype: float64