### organize the training dataset for lulc

In [2]:
import joblib
import pandas as pd

# SOC table; this is the left side of the merge below.
dff = pd.read_csv('/mnt/primus/xuemeng_tmp_harbour/tillage_index/data_and_feature_set/003_data_soc.txt', low_memory=False)
# dff = dff.drop(columns=['hzn_top', 'hzn_btm', 'ref', 'nuts0', 'oc'])
# dff = dff.add_suffix('_datacube2024')

# Joblib dump that is turned into the right side of the merge.
file_path = '/mnt/primus/xuemeng_tmp_harbour/tillage_index/lucas_ls_preprocessed.joblib'
edc = pd.DataFrame(joblib.load(file_path))

# Build the join keys: survey year as float in `time`, integer `point_id`.
edc = edc.assign(
    time=lambda frame: frame['survey_date'].dt.year.astype(float),
    point_id=lambda frame: frame['point_id'].astype(int),
)

# Inner join on (id, time) == (point_id, time); keep only the left table's
# tile_id and give it back its unsuffixed name.
result = (
    dff.merge(edc, left_on=['id', 'time'], right_on=['point_id', 'time'], how='inner')
    .drop(columns='tile_id_y')
    .rename(columns={'tile_id_x': 'tile_id'})
)

In [6]:
## soc
# Metadata columns that accompany the feature columns in the SOC table.
oc_meta = ['id', 'lat', 'lon', 'time', 'hzn_top', 'hzn_btm', 'ref', 'nuts0', 'oc', 'tile_id']

# One feature (column) name per line.
with open('/mnt/primus/xuemeng_tmp_harbour/tillage_index/data_and_feature_set/000_feature_all.txt') as f:
    oc_all = f.read().splitlines()

# NOTE(review): the output path is the same file that is read into `dff`
# earlier in this notebook, so running this cell overwrites that input.
oc = result.loc[:, oc_meta + oc_all]
oc.to_csv(
    '/mnt/primus/xuemeng_tmp_harbour/tillage_index/data_and_feature_set/003_data_soc.txt',
    index=False,
)

In [13]:
## lc
# Land-cover table: everything in `result` except the SOC-specific columns.
lc = result.drop(['time', 'hzn_top', 'hzn_btm', 'ref', 'nuts0', 'oc'], axis=1)
lc.to_csv(
    '/mnt/primus/xuemeng_tmp_harbour/tillage_index/data_and_feature_set/004_data_lc.txt',
    index=False,
)

### separate training and testing dataset

In [22]:
import numpy as np

# Seeded generator: without a fixed random state every run draws a different
# test set, so the train/test files written from it could not be regenerated.
# One shared generator (instead of the same integer seed per group) avoids
# picking identical row positions in tiles that happen to have the same size.
_split_rng = np.random.RandomState(42)

# Per tile, hold out ceil(10 %) of the rows and never fewer than one row.
test = result.groupby('tile_id', group_keys=False).apply(
    lambda x: x.sample(
        n=max(1, int(np.ceil(0.1 * len(x)))),
        random_state=_split_rng,
    )
)

# Training set = every row of `result` whose index label was not drawn for
# the test set (this relies on `result` having unique index labels).
train = result.loc[~result.index.isin(test.index)]


In [26]:
# SOC training table: metadata columns followed by the feature columns.
oc_train = train.loc[:, oc_meta + oc_all]
oc_train.to_csv(
    '/mnt/primus/xuemeng_tmp_harbour/tillage_index/data_and_feature_set/003.1_data.train_soc.txt',
    index=False,
)

# Land-cover training table: the training rows minus the SOC-specific columns.
lc_train = train.drop(['time', 'hzn_top', 'hzn_btm', 'ref', 'nuts0', 'oc'], axis=1)
lc_train.to_csv(
    '/mnt/primus/xuemeng_tmp_harbour/tillage_index/data_and_feature_set/004.1_data.train_lc.txt',
    index=False,
)

In [27]:
# SOC test table: metadata columns followed by the feature columns.
oc_test = test.loc[:, oc_meta + oc_all]
oc_test.to_csv(
    '/mnt/primus/xuemeng_tmp_harbour/tillage_index/data_and_feature_set/003.2_data.test_soc.txt',
    index=False,
)

# Land-cover test table: the test rows minus the SOC-specific columns.
lc_test = test.drop(['time', 'hzn_top', 'hzn_btm', 'ref', 'nuts0', 'oc'], axis=1)
lc_test.to_csv(
    '/mnt/primus/xuemeng_tmp_harbour/tillage_index/data_and_feature_set/004.2_data.test_lc.txt',
    index=False,
)

In [33]:
# Number of rows in the training split.
len(train)

52306

In [34]:
# Number of rows in the test split.
len(test)

8417

In [35]:
# Number of rows in the merged table before splitting.
len(result)

60723

### organize the training set for soc regression

In [4]:
import pandas as pd
from eumap.misc import find_files, nan_percentile, GoogleSheet, ttprint

# Overlay table that the SOC regression set is cut from.
df = pd.read_csv('/mnt/primus/xuemeng_tmp_harbour/soc/data/test_covar_overlayed.csv', low_memory=False)

# Feature (column) names, one per line.
with open('/mnt/primus/xuemeng_tmp_harbour/tillage_index/data_and_feature_set/000_feature_all.txt') as f:
    name = f.read().splitlines()

meta = ['id', 'lat', 'lon', 'time', 'hzn_top', 'hzn_btm', 'ref', 'nuts0', 'oc', 'tile_id']
dff = df[meta + name]

# Keep rows with ref == 'LUCAS' and the 0-20 cm horizon only, then drop every
# row that has a missing feature value or a missing `oc`.
is_lucas = dff['ref'] == 'LUCAS'
is_topsoil = (dff['hzn_btm'] == 20) & (dff['hzn_top'] == 0)
dff = dff.loc[is_lucas & is_topsoil].dropna(subset=name + ['oc'])

dff.to_csv(
    '/mnt/primus/xuemeng_tmp_harbour/tillage_index/data_and_feature_set/003_data_soc.txt',
    index=False,
)

### organize the feature sets

In [2]:

# Read the table of potentially usable overlay files from the Google Sheet.
key_file = '/mnt/inca/soc_eu_model/gaia-319808-913d36b5fca4.json'
url = 'https://docs.google.com/spreadsheets/d/1eIoPAvWM5jrhLrr25jwguAIR0YxOh3f5-CdXwpcOIz8/edit#gid=0'

gsheet = GoogleSheet(key_file, url)
# Only the first 190 rows of the `covar` sheet are used.
covar = gsheet.covar.iloc[:190]

In [3]:
# function to generate file paths by year, and check if the urls are valid
def generate_overlay_path(row, year, filt=None):
    """Expand one covariate-table row into its overlay paths for ``year``.

    Parameters
    ----------
    row : mapping
        Needs the keys 'temporal resolution', 'path', 'landsat' and 'theme'.
        Depending on the row it is also read for 'perc' (comma-separated
        values for the ``{perc}`` placeholder), 'start_m' / 'end_m'
        (comma-separated values for ``{start_m}`` / ``{end_m}``), 'end year'
        and 'leap year'.
    year : int or float
        Requested year; values beyond the row's 'end year' are clamped to it.
    filt : optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    tuple of four lists of equal length
        ``(paths, names, tiers, themes)``: the concrete paths, the same
        paths with ``{year}`` left unexpanded, and the row's 'landsat' and
        'theme' values repeated once per path.
    """

    def _tokens(cell):
        # Accept "a,b" as well as "a, b" so the result does not depend on
        # how the sheet happens to be spaced.
        return [token.strip() for token in cell.split(',')]

    template = row['path']
    resolution = row['temporal resolution']

    # Static layers have exactly one file and no placeholders to fill.
    if resolution == 'static':
        return [template], [template], [row['landsat']], [row['theme']]

    # Long-term layers vary only by percentile, never by year.
    if resolution == 'long term':
        expanded = [template.replace('{perc}', perc) for perc in _tokens(row['perc'])]
        return (
            expanded,
            list(expanded),
            [row['landsat']] * len(expanded),
            [row['theme']] * len(expanded),
        )

    # Clamp to the last year for which the variable is available.
    year = min(int(year), int(row['end year']))

    # Annual variables keep a single template; (bi)monthly ones get one
    # template per start/end pair.
    if '{start_m}' in template:
        starts = _tokens(row['start_m'])
        ends = _tokens(row['end_m'])
        templates = [
            template.replace('{start_m}', starts[i]).replace('{end_m}', end)
            for i, end in enumerate(ends)
        ]
    else:
        templates = [template]

    if '{perc}' in template:
        percs = _tokens(row['perc'])
        templates = [t.replace('{perc}', perc) for t in templates for perc in percs]

    # Full Gregorian rule, so century years such as 2100 are not treated as leap.
    is_leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    if row['leap year'] == '1' and is_leap:
        templates = [t.replace('0228', '0229') for t in templates]

    # `{year}` is substituted last and the names are taken from the
    # templates themselves. Replacing the year digits back in the finished
    # path would also rewrite unrelated occurrences of those digits
    # (e.g. a "v2020" directory).
    paths = [t.replace('{year}', str(year)) for t in templates]
    return (
        paths,
        templates,
        [row['landsat']] * len(paths),
        [row['theme']] * len(paths),
    )
    
import requests
def check_path(url):
    """Send a HEAD request to ``url`` and report whether it is reachable.

    Redirects are followed and the request times out after 5 seconds.

    Returns
    -------
    None when the final response has status 200; otherwise ``url`` itself,
    after printing either the HTTP status or the request error.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
    except requests.RequestException as e:
        print(f"Failed to retrieve {url}: {str(e)}")
        return url

    # Anything other than 200 counts as a failure; 404 gets its own message.
    if response.status_code == 200:
        return None
    if response.status_code == 404:
        print(f"{url} returned HTTP 404 Not Found")
    else:
        print(f"{url} returned HTTP {response.status_code}")
    return url
    
# # check function validity
# # generate paths
# paths = []
# for index,row in covar.iterrows():
#     paths.extend(generate_overlay_path(row,2000))
    
# Parallel lists: entry k of each one describes the same overlay file.
pathl, namel, tier, theme = [], [], [], []
year = 2000

for _, row in covar.iterrows():
    # Rows with an empty 'landsat' cell are left out.
    if row['landsat'] != '':
        paths, names, tiers, themes = generate_overlay_path(row, year)
        pathl += paths
        namel += names
        tier += tiers
        theme += themes

# Probe every generated path; check_path prints the ones that fail.
for candidate in pathl:
    check_path(candidate)

In [28]:
# NOTE(review): `name` is indexed in parallel with `theme` and `tier` below.
# Those two lists are filled together with `namel`, so this only lines up if
# `name` has the same length and order as `namel` - confirm.

# One feature file per distinct theme.
atheme = list(set(theme))
for itheme in atheme:
    members = [feature for pos, feature in enumerate(name) if theme[pos] == itheme]
    with open(f'/mnt/primus/xuemeng_tmp_harbour/tillage_index/data_and_feature_set/001_feature_theme.{itheme}.txt', 'w') as file:
        file.writelines(f"{item}\n" for item in members)

# One feature file per distinct tier (the 'landsat' column value).
atier = list(set(tier))
for itier in atier:
    members = [feature for pos, feature in enumerate(name) if tier[pos] == itier]
    with open(f'/mnt/primus/xuemeng_tmp_harbour/tillage_index/data_and_feature_set/002_feature_tier.{itier}.txt', 'w') as file:
        file.writelines(f"{item}\n" for item in members)

# Keep only the last path component of every entry and cut its final four
# characters.
name = [entry.rsplit('/', 1)[-1][:-4] for entry in name]

# NOTE(review): `name` was just recomputed, yet the list written here is
# `namel` - confirm which of the two is meant to go into this file.
with open(f'/mnt/primus/xuemeng_tmp_harbour/tillage_index/data_and_feature_set/000_feature_all.txt', 'w') as file:
    file.writelines(f"{item}\n" for item in namel)

In [9]:
# Write the entries of `name` to the complete feature list, one per line.
with open(f'/mnt/primus/xuemeng_tmp_harbour/tillage_index/data_and_feature_set/000_feature_all.txt', 'w') as file:
    file.writelines(f"{item}\n" for item in name)