In [1]:
import pandas as pd

# Load the preprocessed table from the local data directory into `df`,
# the frame the cells below operate on.
df = pd.read_csv(
    filepath_or_buffer='./data/lucas_preprocessed_v20251125.csv',
)


In [12]:
# Column names from position 17 up to (but excluding) the last 6, in
# ascending order.
# NOTE(review): presumably these are the covariate columns; the 17 / -6
# offsets are positional, so confirm them against the CSV layout.
cov = sorted(df.columns[17:-6].tolist())

In [15]:
# Number of column names collected in `cov`.
len(cov)

362

In [14]:
# Write every name in `cov` to a text file, one name per line.
# The encoding is pinned to UTF-8 so the bytes on disk do not depend on the
# platform's locale default.
with open("./data/covariate_full_list.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{item}\n" for item in cov)

In [15]:
cols = ['cf', 'ocd', 'bd', 'soc']

# 1) Boolean availability table (True = value present, False = NaN)
avail = df[cols].notna()

# 2) Count how many rows share each availability pattern.
#    `reset_index(name='count')` turns the index (the booleans) back into
#    columns and names the tally column explicitly. The label that
#    `value_counts()` gives the tally differs between pandas versions
#    (0 before 2.0, 'count' from 2.0 on), so renaming column 0 afterwards is
#    a silent no-op on newer versions.
pattern_counts = avail.value_counts().reset_index(name='count')

print(pattern_counts)




      cf    ocd     bd   soc  count
0  False  False  False  True  50923
1   True   True   True  True   5194


In [17]:
# Columns of `df` for which per-group mean and standard deviation are reported.
props = [
    'cf',
    'ocd',
    'bd',
    'soc',
]

# Function: safe mean and std that always returns float (even when all NaN)
def safe_mean(x):
    """Return the arithmetic mean of ``x`` with missing values skipped.

    Parameters
    ----------
    x : pandas.Series
        Values to average (here, one column of one group).

    Returns
    -------
    float
        Mean of the non-missing values; NaN when no value is present.
    """
    average = x.mean(skipna=True)
    return average

def safe_std(x):
    """Return the standard deviation of ``x`` with missing values skipped.

    Uses the pandas default for ``Series.std`` (sample standard deviation).

    Parameters
    ----------
    x : pandas.Series
        Values to summarise (here, one column of one group).

    Returns
    -------
    float
        Standard deviation of the non-missing values; NaN when fewer than
        two values are present.
    """
    spread = x.std(skipna=True)
    return spread

# Group by `time` and summarise: number of distinct ids, plus the mean and
# standard deviation of every column listed in `props`.
year_summary = (
    df.groupby('time')
      .agg(
          N_sites=('id', 'nunique'),
          **{f'{col}_mean': (col, safe_mean) for col in props},
          **{f'{col}_std': (col, safe_std) for col in props},
      )
)


def _format_3dp(values):
    """Render each number with exactly three decimals; missing stays <NA>.

    `round(3)` followed by a string cast drops trailing zeros ("32.44"),
    which makes the printed table mix precisions. Formatting with a fixed
    width keeps every cell at three decimals.
    """
    return values.map('{:.3f}'.format, na_action='ignore').astype('string')


# Build formatted "mean ± sd" columns. A missing mean or std makes the whole
# cell <NA>, because string concatenation propagates missing values.
for col in props:
    mean_col = f'{col}_mean'
    std_col = f'{col}_std'
    out_col = f'{col}_mean_std'

    year_summary[out_col] = (
        _format_3dp(year_summary[mean_col])
        + " ± "
        + _format_3dp(year_summary[std_col])
    )

# Final formatted table: site count followed by one "mean ± sd" column per
# property.
formatted_cols = ['N_sites'] + [f'{c}_mean_std' for c in props]
year_summary_formatted = year_summary[formatted_cols]

print(year_summary_formatted)


      N_sites    cf_mean_std     ocd_mean_std    bd_mean_std     soc_mean_std
time                                                                         
2009    17817           <NA>             <NA>           <NA>  38.266 ± 72.672
2012     1777           <NA>             <NA>           <NA>   19.517 ± 9.528
2015    19780           <NA>             <NA>           <NA>   32.44 ± 53.596
2018    16743  0.061 ± 0.055  23.411 ± 17.764  1.032 ± 0.302  34.836 ± 57.591


In [11]:
# Tally how many entries of `points_per_id` take each distinct value, ordered
# by that value.
# NOTE(review): `points_per_id` is not assigned in any cell shown here, so on a
# fresh kernel this cell raises NameError unless an earlier cell defines it.
# Presumably it holds the number of time points per id; confirm and restore
# the defining cell.
series_length_counts = (
    points_per_id
    .value_counts()
    .sort_index()
)
print(series_length_counts)


time
1     6429
2     7309
3    11690
Name: count, dtype: int64


In [27]:
# Show the column labels from position 17 up to (but excluding) the last 6,
# the same positional slice used to build `cov`, here in original file order.
df.columns[17:-6]

Index(['cropland_extent_glad_interpolate_p_30m_s_YYYY0101_YYYY1231_eu_epsg_3035_v20240604',
       'ndvi_glad_landast_ard2_seasconv_m_yearly_p25_30m_s_YYYY0101_YYYY1231_eu_epsg_3035_v20231127',
       'evi_glad_landsat_ard2_seasconv_m_30m_s_YYYY0901_YYYY1031_eu_epsg_3035_v20231127',
       'pft_grass_nat_esa_cci_lc_pc_300m_s_YYYY0101_YYYY1231_go_epsg_4326_v20230616',
       'pft_trees_bd_esa_cci_lc_pc_300m_s_YYYY0101_YYYY1231_go_epsg_4326_v20230616',
       'pft_bare_esa_cci_lc_pc_300m_s_YYYY0101_YYYY1231_go_epsg_4326_v20230616',
       'clm_lst_mod11a2_nighttime_p05_1km_s0_0cm_YYYY_v1_2',
       'ndvi_glad_landast_ard2_seasconv_m_yearly_p50_30m_s_YYYY0101_YYYY1231_eu_epsg_3035_v20231127',
       'evi_glad_landsat_ard2_seasconv_m_30m_s_YYYY0301_YYYY0430_eu_epsg_3035_v20231127',
       'wv_mcd19a2v061_seasconv_m_yearly_p25_1km_s_YYYY0101_YYYY1231_go_epsg_4326_v20230619',
       ...
       'lithology_67_lithology_egdi_1m_c_250m_s_20000101_20221231_eu_epsg_3035_v20240530',
       'litholo