In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from eumap.misc import find_files, ttprint, nan_percentile, GoogleSheet
from eumap.raster import read_rasters, save_rasters
import warnings
import multiprocess as mp
import time
from scipy.special import expit, logit
import warnings
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, cross_val_score, HalvingGridSearchCV, KFold, GroupKFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
import joblib
import pickle
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from cubist import Cubist
from sklearn.base import BaseEstimator, TransformerMixin
from pathlib import Path
import os
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.model_selection import cross_val_predict
import math
from quantile_forest import RandomForestQuantileRegressor

# df = pd.read_csv(f'/mnt/primus/xuemeng_tmp_harbour/soc/data/002_data_whole.csv',low_memory=False)
output_folder = '/mnt/primus/xuemeng_tmp_harbour/soc/results'

In [6]:
def calc_picp(lower_bounds, upper_bounds, true_values):
    within_bounds = np.sum((true_values >= lower_bounds) & (true_values <= upper_bounds))
    picp = within_bounds / len(true_values)
    return picp

def calc_qcp(predictions, true_values, quantile):
    return np.mean(true_values <= predictions)

def quantile_cross_val_predict(estimator, X, y, cv, quantiles, groups=None):
    """ Custom cross-validation to handle quantile predictions with group support. """
    predictions = {q: [] for q in quantiles}
    if groups is None:
        cv_split = cv.split(X)
    else:
        cv_split = cv.split(X, y, groups)
    
    for train_idx, test_idx in cv_split:
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train = y.iloc[train_idx]
        
        # Fit the model on the training data
        estimator.fit(X_train, y_train)
        
        # Predict each quantile and store results
        for q in quantiles:
            pred = estimator.predict(X_test, quantiles=q)
            predictions[q].extend(pred)
    
    return predictions

In [7]:
# a = find_files('/mnt/primus/xuemeng_tmp_harbour/soc/results/', 'model_rf.*.ccc.joblib')
# a = [str(i) for i in a]

prop_list = ['oc','ph_h2o','ph_cacl2','bulk_density','caco3','CEC','EC','P','K','N']
rf_list = ['/mnt/primus/xuemeng_tmp_harbour/soc/results/CEC/model_rf.CEC_log1p.ccc.joblib',
 '/mnt/primus/xuemeng_tmp_harbour/soc/results/EC/model_rf.EC_log1p.ccc.joblib',
 '/mnt/primus/xuemeng_tmp_harbour/soc/results/K/model_rf.K_log1p.ccc.joblib',
 '/mnt/primus/xuemeng_tmp_harbour/soc/results/N/model_rf.N_log1p.ccc.joblib',
 '/mnt/primus/xuemeng_tmp_harbour/soc/results/P/model_rf.P_log1p.ccc.joblib',
 '/mnt/primus/xuemeng_tmp_harbour/soc/results/bulk_density/model_rf.bulk_density_normal.ccc.joblib',
 '/mnt/primus/xuemeng_tmp_harbour/soc/results/caco3/model_rf.caco3_log1p.ccc.joblib',
 '/mnt/primus/xuemeng_tmp_harbour/soc/results/oc/model_rf.oc_log1p.ccc.joblib',
 '/mnt/primus/xuemeng_tmp_harbour/soc/results/ph_cacl2/model_rf.ph_cacl2_normal.ccc.joblib',
 '/mnt/primus/xuemeng_tmp_harbour/soc/results/ph_h2o/model_rf.ph_h2o_normal.ccc.joblib']

In [8]:
quantiles = [0.05,0.95]
results = []
cv = GroupKFold(n_splits=5)  
spatial_cv_column = 'tile_id'

for prop in prop_list:
    test = pd.read_csv(f'{output_folder}/{prop}/benchmark_test.pnts_{prop}.csv',low_memory=False)
    train = pd.read_csv(f'{output_folder}/{prop}/benchmark_train.pnts_{prop}.csv',low_memory=False)

    covs_a = pd.read_csv(f'{output_folder}/{prop}/benchmark_selected.covs_{prop}.txt').values.tolist()
    covs = [item for sublist in covs_a for item in sublist]
    
    train = train.dropna(subset=covs,how='any')
    test = test.dropna(subset=covs,how='any')
    
    for i in rf_list:
        if prop in i:
            file_rf = i
            
    rf_model = joblib.load(file_rf)
    params = rf_model.get_params()
    params['n_jobs'] = 90
    qrf = RandomForestQuantileRegressor(**params)
    
    ttprint(f'quantile, cv, {prop}')
    # cv valiadate
    # y_pred_cv_rf = cross_val_predict(rf_model, train[covs], train[prop], cv=cv, n_jobs=-1)
    predictions = quantile_cross_val_predict(qrf, train[covs], train[prop], cv, quantiles, groups = train[spatial_cv_column])
    picp_cv = calc_picp(np.array(predictions[quantiles[0]]), np.array(predictions[quantiles[1]]), train[prop])
    
    qcp_cv_lower = calc_qcp(np.array(predictions[quantiles[0]]), train[prop], quantiles[0])
    qcp_cv_upper = calc_qcp(np.array(predictions[quantiles[1]]), train[prop], quantiles[1])
    
    piw_cv = np.mean(np.array(predictions[quantiles[1]])-np.array(predictions[quantiles[0]]))
    
    
    # individual test data
    ttprint(f'quantile, individual test, {prop}')
    qrf.fit(train[covs], train[prop])
    y_pred_test = qrf.predict(test[covs], quantiles=quantiles)
    picp_test = calc_picp(y_pred_test[:,0], y_pred_test[:,1], test[prop])
    
    qcp_test_lower = calc_qcp(y_pred_test[:,0], test[prop], quantiles[0])
    qcp_test_upper = calc_qcp(y_pred_test[:,1], test[prop], quantiles[1])
    
    piw_test = np.mean(y_pred_test[:,1]-y_pred_test[:,0])
    
    results.append({
            'property': prop,
            'picp_val': picp_test,
            'piw_val': piw_test,
            'qcp05_val': qcp_test_lower,
            'qcp95_val': qcp_test_upper,
            'picp_cv': picp_cv,
            'piw_cv': piw_cv,
            'qcp05_cv': qcp_cv_lower,
            'qcp95_cv': qcp_cv_upper
        })
    
results = pd.DataFrame(results)
results.to_csv(f'{output_folder}/benchmark_metrics_quantiles.csv',index=False)

[14:35:45] quantile, cv, oc




[14:39:04] quantile, individual test, oc




[14:39:58] quantile, cv, ph_h2o




[14:42:54] quantile, individual test, ph_h2o




[14:43:45] quantile, cv, ph_cacl2




[14:45:23] quantile, individual test, ph_cacl2




[14:45:50] quantile, cv, bulk_density




[14:46:23] quantile, individual test, bulk_density




[14:46:35] quantile, cv, caco3




[14:47:49] quantile, individual test, caco3




[14:48:09] quantile, cv, CEC




[14:48:40] quantile, individual test, CEC




[14:48:52] quantile, cv, EC




[14:49:55] quantile, individual test, EC




[14:50:18] quantile, cv, P




[14:52:22] quantile, individual test, P




[14:52:58] quantile, cv, K




[14:54:40] quantile, individual test, K




[14:55:12] quantile, cv, N




[14:57:08] quantile, individual test, N




In [9]:
results

Unnamed: 0,property,picp_val,piw_val,qcp05_val,qcp95_val,picp_cv,piw_cv,qcp05_cv,qcp95_cv
0,oc,0.915508,113.885168,0.0439,0.958747,0.558249,118.193348,0.218569,0.776642
1,ph_h2o,0.911986,2.197275,0.042017,0.953118,0.54647,2.418144,0.222335,0.768477
2,ph_cacl2,0.923706,2.212017,0.036709,0.956877,0.544872,2.46521,0.235988,0.778521
3,bulk_density,0.902604,0.851748,0.043394,0.945998,0.593735,0.766303,0.210814,0.804134
4,caco3,0.958155,200.866792,0.319889,0.96118,0.778169,192.609343,0.379088,0.825687
5,CEC,0.930325,30.865272,0.039151,0.957532,0.750118,34.048257,0.133494,0.880614
6,EC,0.912077,52.928952,0.040146,0.952223,0.705949,49.26226,0.149725,0.855653
7,P,0.935855,70.587758,0.093785,0.948905,0.780411,69.264361,0.145537,0.894015
8,K,0.897238,425.267293,0.047293,0.94453,0.746413,425.263079,0.148428,0.894841
9,N,0.928998,8.397541,0.042911,0.961734,0.556914,11.916887,0.222115,0.771792
