# Modeling of relative Yield, P-Uptake and P-Balance

Marc Perez  
February 13, 2025

In [None]:
# Load the stored results object and use its `nlme.coef.mrg` element as the
# modelling data set.
RES <- readRDS("data/RES.rds")
Dmlr <- RES$nlme.coef.mrg

# Recode infinite entries of `soil_0_20_P_CO2_log` as missing
# (presumably -Inf from log(0) -- TODO confirm).
Dmlr$soil_0_20_P_CO2_log <- replace(
  Dmlr$soil_0_20_P_CO2_log,
  is.infinite(Dmlr$soil_0_20_P_CO2_log),
  NA
)

# Print the `soil_0_20_P_AAE10_log` column for inspection.
Dmlr$soil_0_20_P_AAE10_log


  [1]        NA 3.0773123        NA 2.9069011        NA 2.9601051        NA
  [8] 2.9069011        NA 3.9815491        NA 3.7954892 3.5973123        NA
 [15]        NA 3.5890591        NA 4.4485164        NA 4.2541933 3.9926809
 [22]        NA        NA 4.4578296 1.8870696 2.1162555 2.3321439 2.2300144
 [29] 2.4069451 2.3321439 2.3887628        NA 2.1972246 1.8245493        NA
 [36] 2.3795461 2.4849066 2.3223877 2.3418058 2.3418058 2.1400662 2.2925348
 [43] 0.8329091 1.9878743 1.7047481 2.9069011 1.7047481 1.1939225 1.7917595
 [50]        NA 1.6292405 1.7749524 2.6461748 1.4586150        NA 1.9878743
 [57] 2.1282317 2.1162555 1.8870696 2.1162555 3.1441523 3.4111477 3.2425924
 [64]        NA 3.1863526 3.4045252 3.1945831 3.1863526 3.1045867        NA
 [71] 2.7850112 2.7972813 2.9014216 2.8449094 2.8094027 2.7850112 3.2580965
 [78] 3.0007198 2.4069451 2.6026897 2.8033604 2.6026897 2.5726122 2.7663191
 [85] 2.7343675        NA 2.9069011 2.8903718        NA 3.0587071 2.9069011
 [92] 2.9069

## Setup

In [None]:
library(mlr3verse, quietly = TRUE)

# Measure list (mean squared error) used when aggregating benchmark results.
mse <- msrs("regr.mse")

# In non-interactive runs, set the mlr3 logger threshold to "warn".
if (!interactive()) {
  lgr::get_logger("mlr3")$set_threshold("warn")
}

#' Benchmark regression tasks and tabulate skill relative to the baseline
#'
#' Runs the featureless, linear-model, XGBoost and ranger learners on every
#' task with k-fold cross-validation, aggregates the measures in the global
#' `mse` per task and learner, and converts the mean squared error to
#' `1 - MSE / MSE_featureless`.
#'
#' @param tasks List of mlr3 regression tasks; task ids become row names.
#' @param nfolds Number of cross-validation folds.
#' @return A data.frame with one row per task and one column per learner
#'   (the featureless baseline itself is dropped), rounded to three decimals.
get_benchi_table <- function(tasks, nfolds = 5) {
  set.seed(123)
  learners <- lrns(c("regr.featureless", "regr.lm", "regr.xgboost", "regr.ranger"))
  learners$regr.xgboost$param_set$set_values(
    eta = 0.04,
    nrounds = 300,
    max_depth = 2
  )

  # The cache key covers the tasks, the fold count and the learner
  # hyperparameters, so editing the XGBoost settings above cannot silently
  # return a stale benchmark.
  # clean = FALSE: every call shares the base name "benchmark.rds"; with the
  # default clean = TRUE each call would delete the cache files written by
  # calls with a different hash (other responses / variable sets).
  benchi <- xfun::cache_rds({
    benchmark(benchmark_grid(
      tasks,
      learners,
      rsmp("cv", folds = nfolds)
    ))
  },
  file = "benchmark.rds",
  dir = "cache/",
  hash = list(tasks, nfolds, lapply(learners, \(l) l$param_set$values)),
  clean = FALSE
  )

  # One row per task, one column per learner.
  res <- tidyr::pivot_wider(benchi$aggregate(mse),
    id_cols = task_id,
    names_from = learner_id,
    values_from = regr.mse
  ) |> as.data.frame()

  rownames(res) <- res$task_id
  # Select columns by name instead of relying on their position.
  res <- res[, colnames(res) != "task_id", drop = FALSE]
  # Strip only the literal "regr." prefix of the learner ids.
  colnames(res) <- sub("^regr\\.", "", colnames(res))
  stopifnot("featureless" %in% colnames(res))
  # Skill score: share of the baseline MSE removed by each learner.
  res <- 1 - res / res$featureless
  res[, colnames(res) != "featureless", drop = FALSE] |> round(3)
}


Testing prediction quality using

-   Linear models
-   Random forests (default parameters)
-   XGBoost (with parameter tuning)

**Weather Variables:**

In [None]:
# Names of the weather covariates used as predictors.
Weather_vars <- c(
  "anavg_temp",
  "ansum_prec",
  "juvdev_prec",
  "juvdev_sun",
  "ansum_sun",
  "juvdev_temp"
)
# Abort early if any of these columns is absent from the data.
stopifnot(!anyNA(match(Weather_vars, names(Dmlr))))
Weather_vars


[1] "anavg_temp"  "ansum_prec"  "juvdev_prec" "juvdev_sun"  "ansum_sun"  
[6] "juvdev_temp"

**Phosphorus variable sets:**

In [None]:
# Named collection of phosphorus predictor sets; each element lists the
# columns of that set (`onlyweather` contributes no P columns).
P_var_sets <- local({
  # Building blocks shared by several sets.
  aae10 <- "soil_0_20_P_AAE10_log"
  co2 <- "soil_0_20_P_CO2_log"
  kps <- c("PS_log", "k", "kPS_log")

  list(
    onlyweather = NULL,
    k = "k",
    PS = "PS_log",
    kPS = kps,
    AAE10 = aae10,
    CO2 = co2,
    AAE10_CO2 = c(aae10, co2),
    AAE10_CO2_kPS = c(aae10, co2, kps),
    CO2_kPS = c(co2, kps)
  )
})


**Response Variables**

In [None]:
# Names of the response variables; each one is analysed in its own section.
Y_vars <- c(
  "Ymain_rel",
  "annual_P_uptake",
  "annual_P_balance"
)


## With Weather data

### TODO: Group - cross validation

    \(nam){
      mytsk <- as_task_regr(
        Dmlr[complete.cases(Dmlr$Ymain_rel), c(y, Weather_vars, P_var_sets[[nam]], "Site")],
        target = y,
        id = nam)
      mytsk$set_col_roles("Site", "group")
      mytsk
    }

The algorithms can learn to identify the location from the weather variables, because we do not yet use grouped cross-validation (leaving out whole locations).

### Ymain_rel

In [None]:
y <- "Ymain_rel"
# One regression task per P variable set, with the weather covariates plus the
# set's P variables as features. Every task uses the same rows: those complete
# for the response, the weather covariates and the largest P variable set.
lapply(names(P_var_sets), function(nam) {
  complete_rows <- complete.cases(
    Dmlr[, c(y, Weather_vars, P_var_sets$AAE10_CO2_kPS)]
  )
  task_cols <- c(y, Weather_vars, P_var_sets[[nam]])
  as_task_regr(Dmlr[complete_rows, task_cols], target = y, id = nam)
}) |>
  get_benchi_table() |>
  knitr::kable()


                       lm   xgboost   ranger
  --------------- ------- --------- --------
  onlyweather       0.138     0.544    0.542
  k                 0.107     0.553    0.505
  PS                0.195     0.572    0.596
  kPS               0.141     0.570    0.618
  AAE10             0.255     0.605    0.598
  CO2               0.200     0.645    0.638
  AAE10_CO2         0.222     0.651    0.647
  AAE10_CO2_kPS     0.197     0.663    0.625
  CO2_kPS           0.183     0.663    0.627


|               |    lm | xgboost | ranger |
|:--------------|------:|--------:|-------:|
| onlyweather   | 0.201 |   0.518 |  0.520 |
| k             | 0.186 |   0.476 |  0.487 |
| PS            | 0.234 |   0.603 |  0.585 |
| kPS           | 0.265 |   0.569 |  0.555 |
| AAE10         | 0.294 |   0.530 |  0.582 |
| CO2           | 0.259 |   0.569 |  0.578 |
| AAE10_CO2     | 0.288 |   0.515 |  0.594 |
| AAE10_CO2_kPS | 0.347 |   0.596 |  0.607 |
| CO2_kPS       | 0.275 |   0.530 |  0.575 |

### annual_P_uptake

In [None]:
y <- "annual_P_uptake"
# One regression task per P variable set, with the weather covariates plus the
# set's P variables as features.
# Row filter: keeps the Ymain_rel-based row set shared with the other response
# variables and additionally requires the target `y` itself to be observed, so
# that no missing target values reach the learners.
lapply(names(P_var_sets), function(nam) {
  complete_rows <- complete.cases(
    Dmlr[, c("Ymain_rel", y, Weather_vars, P_var_sets$AAE10_CO2_kPS)]
  )
  task_cols <- c(y, Weather_vars, P_var_sets[[nam]])
  as_task_regr(Dmlr[complete_rows, task_cols], target = y, id = nam)
}) |>
  get_benchi_table() |>
  knitr::kable()


                       lm   xgboost   ranger
  --------------- ------- --------- --------
  onlyweather       0.414     0.784    0.782
  k                 0.380     0.763    0.766
  PS                0.397     0.757    0.816
  kPS               0.434     0.777    0.814
  AAE10             0.489     0.822    0.834
  CO2               0.472     0.797    0.846
  AAE10_CO2         0.468     0.852    0.855
  AAE10_CO2_kPS     0.488     0.796    0.819
  CO2_kPS           0.457     0.808    0.840


|               |    lm | xgboost | ranger |
|:--------------|------:|--------:|-------:|
| onlyweather   | 0.468 |   0.649 |  0.648 |
| k             | 0.456 |   0.564 |  0.609 |
| PS            | 0.477 |   0.594 |  0.617 |
| kPS           | 0.459 |   0.560 |  0.474 |
| AAE10         | 0.503 |   0.619 |  0.621 |
| CO2           | 0.481 |   0.615 |  0.642 |
| AAE10_CO2     | 0.484 |   0.566 |  0.580 |
| AAE10_CO2_kPS | 0.465 |   0.528 |  0.423 |
| CO2_kPS       | 0.487 |   0.536 |  0.428 |

### annual_P_balance

In [None]:
y <- "annual_P_balance"
# One regression task per P variable set, with the weather covariates plus the
# set's P variables as features.
# Row filter: keeps the Ymain_rel-based row set shared with the other response
# variables and additionally requires the target `y` itself to be observed, so
# that no missing target values reach the learners.
lapply(names(P_var_sets), function(nam) {
  complete_rows <- complete.cases(
    Dmlr[, c("Ymain_rel", y, Weather_vars, P_var_sets$AAE10_CO2_kPS)]
  )
  task_cols <- c(y, Weather_vars, P_var_sets[[nam]])
  as_task_regr(Dmlr[complete_rows, task_cols], target = y, id = nam)
}) |>
  get_benchi_table() |>
  knitr::kable()


                       lm   xgboost   ranger
  --------------- ------- --------- --------
  onlyweather       0.027     0.103    0.102
  k                 0.045     0.565    0.243
  PS                0.627     0.864    0.774
  kPS               0.639     0.837    0.860
  AAE10             0.392     0.597    0.561
  CO2               0.490     0.707    0.695
  AAE10_CO2         0.471     0.760    0.732
  AAE10_CO2_kPS     0.630     0.852    0.827
  CO2_kPS           0.645     0.881    0.866


|               |     lm | xgboost | ranger |
|:--------------|-------:|--------:|-------:|
| onlyweather   |  0.011 |   0.120 |  0.121 |
| k             | -0.008 |   0.313 |  0.127 |
| PS            |  0.310 |   0.647 |  0.608 |
| kPS           |  0.282 |   0.609 |  0.617 |
| AAE10         |  0.225 |   0.510 |  0.524 |
| CO2           |  0.225 |   0.532 |  0.591 |
| AAE10_CO2     |  0.225 |   0.546 |  0.585 |
| AAE10_CO2_kPS |  0.335 |   0.617 |  0.596 |
| CO2_kPS       |  0.302 |   0.626 |  0.631 |

## Without Weather data

In [None]:
# Remove the weather-only set for the analyses without weather covariates.
# The element is dropped by name, so the result stays correct even if
# "onlyweather" is not the first entry of the list.
if ("onlyweather" %in% names(P_var_sets)) {
  P_var_sets <- P_var_sets[names(P_var_sets) != "onlyweather"]
}


XGBoost and ranger perform poorly in this setting, since only very few predictor variables are available.

### Ymain_rel

In [None]:
y <- "Ymain_rel"
# One regression task per P variable set, using only the set's P variables as
# features. The row filter still includes the weather covariates, so the rows
# are the ones complete for the response, the weather covariates and the
# largest P variable set.
lapply(names(P_var_sets), function(nam) {
  complete_rows <- complete.cases(
    Dmlr[, c(y, Weather_vars, P_var_sets$AAE10_CO2_kPS)]
  )
  task_cols <- c(y, P_var_sets[[nam]])
  as_task_regr(Dmlr[complete_rows, task_cols], target = y, id = nam)
}) |>
  get_benchi_table() |>
  knitr::kable()


                        lm   xgboost   ranger
  --------------- -------- --------- --------
  k                 -0.010     0.058   -0.008
  PS                 0.035    -0.018   -0.124
  kPS                0.024     0.001   -0.060
  AAE10              0.124     0.064   -0.107
  CO2                0.092    -0.009   -0.137
  AAE10_CO2          0.116     0.082    0.095
  AAE10_CO2_kPS      0.117     0.109    0.069
  CO2_kPS            0.051    -0.032   -0.044


|               |     lm | xgboost | ranger |
|:--------------|-------:|--------:|-------:|
| k             | -0.005 |  -0.200 | -0.312 |
| PS            |  0.060 |  -0.141 | -0.221 |
| kPS           |  0.054 |  -0.343 | -0.353 |
| AAE10         |  0.099 |   0.104 | -0.009 |
| CO2           |  0.068 |  -0.086 | -0.226 |
| AAE10_CO2     |  0.073 |   0.067 |  0.064 |
| AAE10_CO2_kPS |  0.072 |  -0.045 | -0.059 |
| CO2_kPS       |  0.023 |  -0.168 | -0.196 |

### annual_P_uptake

In [None]:
y <- "annual_P_uptake"
# One regression task per P variable set, using only the set's P variables as
# features.
# Row filter: keeps the row set based on Ymain_rel, the weather covariates and
# the largest P variable set, and additionally requires the target `y` itself
# to be observed, so that no missing target values reach the learners.
lapply(names(P_var_sets), function(nam) {
  complete_rows <- complete.cases(
    Dmlr[, c("Ymain_rel", y, Weather_vars, P_var_sets$AAE10_CO2_kPS)]
  )
  task_cols <- c(y, P_var_sets[[nam]])
  as_task_regr(Dmlr[complete_rows, task_cols], target = y, id = nam)
}) |>
  get_benchi_table() |>
  knitr::kable()


                        lm   xgboost   ranger
  --------------- -------- --------- --------
  k                 -0.028    -0.103   -0.170
  PS                 0.012    -0.024   -0.123
  kPS                0.064    -0.152   -0.199
  AAE10              0.065    -0.129   -0.254
  CO2                0.058    -0.078   -0.176
  AAE10_CO2          0.070    -0.012   -0.058
  AAE10_CO2_kPS      0.099    -0.066   -0.008
  CO2_kPS            0.095    -0.059   -0.083


|               |     lm | xgboost | ranger |
|:--------------|-------:|--------:|-------:|
| k             | -0.006 |  -0.265 | -0.434 |
| PS            | -0.006 |  -0.278 | -0.485 |
| kPS           | -0.006 |  -0.225 | -0.317 |
| AAE10         |  0.021 |  -0.139 | -0.398 |
| CO2           |  0.005 |  -0.126 | -0.276 |
| AAE10_CO2     |  0.017 |  -0.194 | -0.310 |
| AAE10_CO2_kPS |  0.020 |  -0.229 | -0.363 |
| CO2_kPS       | -0.054 |  -0.206 | -0.295 |

### annual_P_balance

In [None]:
y <- "annual_P_balance"
# One regression task per P variable set, using only the set's P variables as
# features.
# Row filter: keeps the row set based on Ymain_rel, the weather covariates and
# the largest P variable set, and additionally requires the target `y` itself
# to be observed, so that no missing target values reach the learners.
lapply(names(P_var_sets), function(nam) {
  complete_rows <- complete.cases(
    Dmlr[, c("Ymain_rel", y, Weather_vars, P_var_sets$AAE10_CO2_kPS)]
  )
  task_cols <- c(y, P_var_sets[[nam]])
  as_task_regr(Dmlr[complete_rows, task_cols], target = y, id = nam)
}) |>
  get_benchi_table() |>
  knitr::kable()


                       lm   xgboost   ranger
  --------------- ------- --------- --------
  k                 0.016     0.499    0.629
  PS                0.598     0.695    0.666
  kPS               0.598     0.681    0.667
  AAE10             0.338     0.288    0.224
  CO2               0.438     0.436    0.401
  AAE10_CO2         0.447     0.495    0.498
  AAE10_CO2_kPS     0.594     0.703    0.700
  CO2_kPS           0.589     0.686    0.680


|               |    lm | xgboost | ranger |
|:--------------|------:|--------:|-------:|
| k             | 0.011 |   0.175 |  0.160 |
| PS            | 0.309 |   0.274 |  0.178 |
| kPS           | 0.305 |   0.279 |  0.244 |
| AAE10         | 0.147 |   0.062 | -0.102 |
| CO2           | 0.189 |   0.170 |  0.085 |
| AAE10_CO2     | 0.192 |   0.062 |  0.015 |
| AAE10_CO2_kPS | 0.313 |   0.235 |  0.201 |
| CO2_kPS       | 0.313 |   0.274 |  0.243 |

In [None]:
# Pearson correlation between the annual P balance and PS, computed on the rows
# where both values are observed (without `use`, cor() returns NA as soon as
# either column contains a missing value).
# Previously recorded value: 0.54389
cor(Dmlr$annual_P_balance, Dmlr$PS, use = "complete.obs")


[1] 0.5611764

[1] 0.5455537

[1] 0.09920453

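We did achieve high predictive power when the weather variables were included. This could also be due to our regression models recovering location and year from the weather data, in which case they would still be overfitting, because the test folds contain the same locations and years as the training folds.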

Without weather data, we only obtained some predictive power for the annual P balance (about 30%). Since the balance is uptake - fert_P, this means that we mostly predicted fert_P. Interestingly, PS is the best predictor of this quantity.

### Legacy Code

In [None]:

# Get parameter estimates for XGBoost
# Task: predict annual_P_balance from the largest P variable set only (the
# weather covariates are deliberately commented out). Rows with a missing
# target are dropped.
t <- as_task_regr(
  subset(Dmlr[complete.cases(Dmlr$annual_P_balance),], 
    select = c("annual_P_balance", P_var_sets$AAE10_CO2_kPS#, Weather_vars
    )),
  target = "annual_P_balance"
)

l <- lrn("regr.xgboost",
  nrounds = 500  # More iterations due to lower learning rate
)

# Create search space
# `logscale = TRUE` makes the tuner search eta on the log scale. The previous
# `tags = "logscale"` only attached a label to the parameter and left the
# search on the linear scale.
ps <- ps(
  max_depth = p_int(2, 4),
  eta = p_dbl(0.001, 0.3, logscale = TRUE)
)

# Setup tuning
# 3-fold cross-validation scored by mean squared error; no extra terminator,
# since the grid search ends once the grid has been evaluated.
instance <- ti(
  task = t,
  learner = l,
  resampling = rsmp("cv", folds = 3),
  measure = msr("regr.mse"),
  terminator = trm("none"),
  search_space = ps
)

# Grid search
tuner <- mlr3tuning::tnr("grid_search")
tuner$optimize(instance)
instance$result


Ymain_rel max_depth eta learner_param_vals x_domain regr.mse <int> <num> <list> <list> <num> 1: 2 0.067444 \<list\[5\]\> \<list\[2\]\> 177.18

P uptake max_depth eta learner_param_vals x_domain regr.mse <int> <num> <list> <list> <num> 1: 2 0.034222 \<list\[5\]\> \<list\[2\]\> 137.41

annual_P_balance max_depth eta learner_param_vals x_domain regr.mse <int> <num> <list> <list> <num> 1: 2 0.034222 \<list\[5\]\> \<list\[2\]\> 145.21

In [None]:
# Commented-out derivation steps, kept for reference (presumably these now run
# upstream, before data/RES.rds is written -- TODO confirm):
# nlme.coef$kPS <- nlme.coef$k * nlme.coef$PS
# 
# 
# nlme.coef.mrg <- merge(nlme.coef,allP[allP$year>=2017,],by = "uid")
# # add log-transformed versions
# Dmlr$kPS_log <- log(Dmlr$kPS)
# Dmlr$PS_log <- log(Dmlr$PS)
# Dmlr$soil_0_20_P_AAE10_log <- log(Dmlr$soil_0_20_P_AAE10)
# Dmlr$soil_0_20_P_CO2_log <- log(Dmlr$soil_0_20_P_CO2)
# 
# Dmlr$k



# Print the response together with the largest P variable set and the weather
# covariates.
Dmlr[, c("Ymain_rel", P_var_sets$AAE10_CO2_kPS, Weather_vars), drop = FALSE]


    Ymain_rel soil_0_20_P_AAE10_log soil_0_20_P_CO2_log     PS_log          k
1      178.42                    NA                  NA -2.7715938 0.10467464
2          NA             3.0773123        -1.386294361 -2.7715938 0.10467464
3      179.72                    NA                  NA -3.0042091 0.12262173
4          NA             2.9069011        -1.771956842 -3.0042091 0.12262173
5      190.64                    NA                  NA -2.7693913 0.10448023
6          NA             2.9601051        -1.469675970 -2.7693913 0.10448023
7      178.51                    NA                  NA -2.9077724 0.11568444
8          NA             2.9069011        -1.514127733 -2.9077724 0.11568444
9      210.05                    NA                  NA -2.3473051 0.12628437
10         NA             3.9815491        -0.843970070 -2.3473051 0.12628437
11     209.45                    NA                  NA -2.1364490 0.11502207
12         NA             3.7954892        -0.941608540 -2.13644

# Methods

We used machine learning methods to assess how much information different sets of variables (cf. `P_var_sets`) carry about each dependent variable (P uptake, relative yield, P balance) and how redundant this information is. The machine learning methods used to quantify the predictive power of the different variable sets are: i) ordinary least squares (OLS) as a baseline; ii) XGBoost (gradient boosting with tree-based models and hyperparameter tuning for learning rate and tree depth) (arxiv:1603.02754); iii) Random Forests (with default parameters) (doi:10.1023/A:1010933404324). Computations were performed using the mlr3 framework (doi:10.21105/joss.01903). Performance was measured as the proportion of explained variance on hold-out data via 5-fold cross-validation, calculated as $1 - \mathrm{MSE} / \mathrm{Var}(y)$, where MSE is the mean squared error.

We tried adjusting for weather variables, but it seems that the ML methods instead reconstruct the site-specific patterns.