## Reading

In [22]:
# load cov list
using JSON
using ArchGDAL
using Proj
using DataFrames
using Rasters
using Base.Threads

# Parse the covariate catalogue from JSON. Presumably it maps covariate names to
# raster paths (the two vectors below are named accordingly) — confirm against the file.
cov = JSON.parsefile("./cov_path_full.json");
# Both vectors are collected from the same unmodified container, so name[i] is
# expected to pair with path[i] (assumes `parsefile` returned a Dict-like mapping).
name = collect(keys(cov))
path = collect(values(cov))

using Proj, ArchGDAL

"""
    make_grid_3035(bbox, res_m)

Build the two coordinate axes of a regular grid covering `bbox`.

`bbox` unpacks as `(xmin, ymin, xmax, ymax)` (EPSG:3035 per the original comment)
and `res_m` is the spacing between neighbouring nodes.

Returns `(xs, ys)`: `xs` ascends from `xmin` towards `xmax`, `ys` descends from
`ymax` towards `ymin` (north → south). The far edge is only included when the
extent is an exact multiple of `res_m`.
"""
function make_grid_3035(bbox, res_m)
    west, south, east, north = bbox

    eastings = collect(west:res_m:east)
    northings = collect(north:(-res_m):south)  # descending: first row is the northernmost

    return eastings, northings
end

"""
    sample_tiff_onto_grid(tif_path, xs, ys)

Sample band 1 of the raster at `tif_path` at every node of the EPSG:3035 grid
spanned by `xs` (columns) and `ys` (rows).

For each node the raster pixel that contains the node is read. Nodes that fall
outside the raster, or whose coordinates cannot be transformed, are `NaN`.

Returns a `Vector{Float32}` of length `length(xs) * length(ys)`: the column-major
flattening of an `(ny, nx)` matrix, i.e. node `(ys[i], xs[j])` is stored at index
`i + (j - 1) * ny`.
"""
function sample_tiff_onto_grid(tif_path, xs, ys)
    ArchGDAL.read(tif_path) do ds
        # 1) detect CRS
        # NOTE(review): substring heuristic — any WKT that merely contains "3035"
        # is treated as EPSG:3035. Confirm this is acceptable for all inputs.
        tiff_wkt = ArchGDAL.getproj(ds)
        same_crs = occursin("3035", tiff_wkt)

        # 2) transformation: grid nodes are in EPSG:3035 and must be mapped INTO
        # the raster CRS, so 3035 is the source. `always_xy=true` pins the axis
        # order to (x, y) regardless of the authority-defined order.
        tf = same_crs ? nothing :
             Proj.Transformation("EPSG:3035", tiff_wkt; always_xy=true)

        # 3) raster geometry (rotation terms are ignored: a north-up raster is assumed)
        band = ArchGDAL.getband(ds, 1)
        x0, dx, _, y0, _, dy = ArchGDAL.getgeotransform(ds)
        width = ArchGDAL.width(ds)
        height = ArchGDAL.height(ds)

        # Pre-filled with NaN so that only in-range nodes need to be written.
        arr = fill(NaN32, length(ys), length(xs))

        # Inner loop runs over rows (first index) to follow column-major layout.
        for (j, x3035) in enumerate(xs), (i, y3035) in enumerate(ys)
            xr, yr = same_crs ? (x3035, y3035) : tf(x3035, y3035)

            # (x0, y0) is the outer corner of the first pixel, so the containing
            # pixel is found with floor, not round. Kept as floats so that
            # NaN/Inf from a failed transformation fail the range test below
            # instead of throwing on integer conversion.
            col = floor((xr - x0) / dx) + 1
            row = floor((yr - y0) / dy) + 1

            if 1 <= col <= width && 1 <= row <= height
                # Raster bands index as (column, row), 1-based.
                arr[i, j] = band[Int(col), Int(row)]
            end
        end

        return vec(arr)
    end
end

"""
    extract_rasters(paths, names, bbox3035, xs, ys)

Sample every raster in `paths` onto the grid `(xs, ys)` with
`sample_tiff_onto_grid` and collect the results in a `DataFrame`.

Column `k` of the result is named `Symbol(names[k])` and holds the samples of
`paths[k]`. The rasters are processed in a threaded loop. `bbox3035` is accepted
but not read by this function.
"""
function extract_rasters(paths, names, bbox3035, xs, ys)

    sampled = Vector{Vector{Float32}}(undef, length(paths))

    # Every iteration writes only its own slot of `sampled`.
    @threads for k in eachindex(paths)
        sampled[k] = sample_tiff_onto_grid(paths[k], xs, ys)
    end

    table = DataFrame()
    for k in eachindex(names)
        column = Symbol(names[k])
        table[!, column] = sampled[k]
    end

    return table
end



In [29]:
# Grid spacing passed to make_grid_3035 as `res_m`.
res_m = 30
# NOTE(review): `bbox`, `bbox3035`, `paths` and `names` are not assigned in any
# visible cell; the loading cell assigns `name` and `path` (singular). Confirm
# these variables exist in the session before running this cell.
xs, ys = make_grid_3035(bbox, res_m)
@time df = extract_rasters(paths, names, bbox3035, xs, ys)


"http://192.168.49.34:8333/ai4sh/indicator/lithology.66.lithology_egdi.1m_c_250m_s_20000101_20221231_eu_epsg.3035_v20240530.tif"

## Load model and predict

In [None]:
version = "v20251125"
results_dir = joinpath(@__DIR__, "map");
# data_full = CSV.read(joinpath(@__DIR__, "data/lucas_preprocessed_$version.csv"), DataFrame; normalizenames=true)

# load model
# NOTE(review): `model_path` is empty, so opening it cannot succeed until a real
# path is filled in.
model_path = ""
# The do-block form closes the file even when one of the lookups throws.
best_hm, best_ps, best_st = jldopen(model_path, "r") do jld
    (jld["hybridModel"], jld["ps"], jld["st"])
end

# make prediction
# NOTE(review): `df(names)` calls the DataFrame like a function, which is not a
# supported operation; the intended column selection depends on the input format
# the model expects — confirm and replace.
xx = df(names)
yy, st_pred = best_hm(xx, best_ps, LuxCore.testmode(best_st))

# Copy each available model output into a `pred_<name>` column of `df`.
for var in [:BD, :SOCconc, :CF, :SOCdensity, :oBD, :mBD]
    hasproperty(yy, var) || continue
    val = getproperty(yy, var)
    colname = Symbol("pred_", var)

    if val isa AbstractVector && length(val) == nrow(df)
        df[!, colname] = val # per row

    elseif (val isa Number) || (val isa AbstractVector && length(val) == 1)
        # A single value is repeated for every row.
        scalar = val isa AbstractVector ? first(val) : val
        df[!, colname] = fill(Float32(scalar), nrow(df))
    end
    # Outputs of any other shape are skipped.
end


## Save as GeoTIFF

In [None]:

"""
    save_as_geotiff(output_path, xs, ys, arr; crs="EPSG:3035")

Write `arr` as a single-band Float32 GeoTIFF at `output_path`.

# Arguments
- `xs`, `ys`: grid axes; `xs[1]`/`ys[1]` become the geotransform origin and the
  spacing is taken from the first two entries of each axis.
- `arr`: `length(xs) * length(ys)` values laid out as the column-major flattening
  of an `(ny, nx)` matrix (row index = position in `ys`).
- `crs`: projection definition handed to `ArchGDAL.setproj!`.

# Throws
- `ArgumentError` if either axis has fewer than two nodes (no spacing can be derived).
- `DimensionMismatch` if `arr` does not have `length(xs) * length(ys)` elements.
"""
function save_as_geotiff(output_path, xs, ys, arr; crs="EPSG:3035")
    nx = length(xs)
    ny = length(ys)

    (nx >= 2 && ny >= 2) || throw(ArgumentError(
        "xs and ys need at least two nodes each, got $nx and $ny"))
    length(arr) == nx * ny || throw(DimensionMismatch(
        "arr has $(length(arr)) elements, expected $(nx * ny)"))

    # geotransform
    # NOTE(review): the first node is used as the raster origin as-is. If the
    # nodes are meant to be pixel centres the origin needs a half-pixel shift — confirm.
    x0 = xs[1]          # top-left X
    y0 = ys[1]          # top-left Y
    dx = xs[2] - xs[1]  # resolution (positive)
    dy = ys[2] - ys[1]  # negative

    # `arr` unflattens to (ny, nx), but band buffers are (width, height) = (nx, ny),
    # so the matrix has to be transposed before writing.
    buffer = Matrix{Float32}(permutedims(reshape(arr, ny, nx)))

    ArchGDAL.create(
        output_path;
        driver=ArchGDAL.getdriver("GTiff"),  # a driver object, not its name
        width=nx,
        height=ny,
        nbands=1,
        dtype=Float32,
    ) do ds
        ArchGDAL.setgeotransform!(ds, Float64[x0, dx, 0.0, y0, 0.0, dy])
        ArchGDAL.setproj!(ds, crs)

        band = ArchGDAL.getband(ds, 1)
        ArchGDAL.write!(band, buffer)
    end
end

# Write one GeoTIFF per prediction column under ./map.
# NOTE(review): the prediction cell only adds `pred_oBD` / `pred_mBD` when the
# model output has those properties; `df[!, var]` fails otherwise — confirm.
for var in [:pred_oBD, :pred_mBD]
    save_as_geotiff("./map/$(var)_2018.tif", xs, ys, df[!, var]; crs="EPSG:3035")
end


In [26]:
# Build a (pixels × rasters) matrix with one rescaled (nir - red) / (nir + red)
# column per entry of `red_urls`.
# NOTE(review): neither `cov_paths` nor `window` is read in the body. The function
# works from `red_urls` and `nir_urls`, which are not parameters and are not
# assigned in any visible cell — confirm they exist as globals.
function read_cov(cov_paths,window)
    
    # Opened lazily; only its element count is used, to size the output.
    baseraster = Raster(red_urls[1], lazy=true)
    
    dim = (length(baseraster), length(red_urls))
    array = Matrix{F32_M}(undef, dim);

    # One iteration per raster pair; each iteration writes its own column.
    @inbounds @batch per=thread for i in 1:length(red_urls)
        npix = dim[1]  # not used below
        
        #red_data = Vector{BTE_M}(undef, npix)
        #nir_data = Vector{BTE_M}(undef, npix)
        #ndvi_data = Vector{F32_M}(undef, npix)
        
        red_data = vec(Raster(red_urls[i]).data)
        nir_data = vec(Raster(nir_urls[i]).data)
        
        # Ratio clamped to [-1, 1], then mapped to [0, 250] via * 125 + 125.
        # NOTE(review): if the rasters hold unsigned integers (the commented-out
        # buffers use BTE_M), `nir - red` would wrap around — confirm element type.
        array[:, i] .= @. clamp(
            ((nir_data - red_data) / (nir_data + red_data)),
            -1, 1) * 125 + 125
    end

    return array
    
end

362
362


In [None]:
using Random

#using Base.Threads
using Polyester
using MKL

using Statistics
using Base.Filesystem
using Missings

# Element-type aliases for arrays that may hold `missing` next to Float32 / UInt8 values.
const global F32_M = Union{Float32, Missing}
const global BTE_M = Union{UInt8, Missing}

"""
    landsat_urls(base_url, band, tile, ys, m1s, m2s)

Expand the template `base_url` into one URL per (year, period) combination.

For every `y` in `ys` and every pair `(m1, m2)` from `zip(m1s, m2s)` the
placeholders are substituted as follows:

- `{IP}`   → a random integer drawn from `30:46` (a fresh draw per URL)
- `{TILE}` → `tile`
- `{BAND}` → `band`
- `{DT1}`  → `string(y) * m1`
- `{DT2}`  → `string(y) * m2`

Returns a `Vector{String}` ordered by year, then by period.
"""
function landsat_urls(base_url, band, tile, ys, m1s, m2s)
    # Concretely typed so that callers do not inherit a Vector{Any}.
    urls = String[]
    sizehint!(urls, length(ys) * min(length(m1s), length(m2s)))

    for y in ys
        year = string(y)
        for (m1, m2) in zip(m1s, m2s)
            url = replace(base_url,
                "{IP}" => string(rand(30:46)),
                "{TILE}" => tile,
                "{BAND}" => band,
                "{DT1}" => (year * m1),
                "{DT2}" => (year * m2)
            )
            push!(urls, url)
        end
    end

    return urls
end



"""
    nan_mean(slice)

Return the mean of the non-`missing` entries of `slice`, or `NaN` when there
are none.
"""
function nan_mean(slice)
    present = slice |> skipmissing |> collect
    if isempty(present)
        return NaN
    else
        return mean(present)
    end
end

"""
    compute_bsf(array, agg_step, margin; threshold=th, n_chunks=96)

For every pixel and every aggregation window, compute the fraction of
non-`missing` samples whose value is `<= threshold`.

# Arguments
- `array`: matrix indexed as `[pixel, time]`.
- `agg_step`: number of time steps per window; windows start at
  `1, 1 + agg_step, …`.
- `margin`: extra time steps added on both sides of each window; windows are
  clipped to `1:n_times`.

# Keywords
- `threshold`: comparison value; defaults to the global `th`.
- `n_chunks`: approximate number of pixel chunks handed to the threaded loop.

# Returns
A `Matrix{F32_M}` of size `(n_windows, n_pix)`. An entry is `NaN` when every
sample of that pixel in that window is `missing` (see `nan_mean`).
"""
function compute_bsf(array, agg_step, margin; threshold=th, n_chunks=96)

    n_pix = size(array, 1)
    n_times = size(array, 2)

    # Inclusive (first, last) time index of every window, clipped to the data.
    windows = Tuple{Int,Int}[]
    for i in 1:agg_step:n_times
        push!(windows, (max(i - margin, 1), min(i + agg_step + margin - 1, n_times)))
    end

    array_y = Matrix{F32_M}(undef, length(windows), n_pix)

    # Disjoint, inclusive pixel ranges. The upper bound is `j + chunk_len - 1`:
    # using `j + chunk_len` would make neighbouring chunks share one pixel, so two
    # threads would write the same cells. `max(1, …)` keeps the step non-zero
    # when there are no pixels.
    chunk_len = max(1, cld(n_pix, n_chunks))
    chunks = [(j, min(j + chunk_len - 1, n_pix)) for j in 1:chunk_len:n_pix]

    @inbounds @batch per=thread for c in 1:length(chunks)
        p0, p1 = chunks[c]
        for w in 1:length(windows)
            t0, t1 = windows[w]
            # Comparison yields Bool/missing per sample; the row-wise mean of
            # that is the fraction of valid samples at or below the threshold.
            array_y[w, p0:p1] .= mapslices(nan_mean, (array[p0:p1, t0:t1] .<= threshold), dims=2)
        end
    end

    return array_y
end

# Write each row of `data` as a UInt8 raster to `raster_files[i]`, then copy the
# file to `s3_paths[i]` with the `mc` command-line client.
#
# baseraster   : path of a raster whose dimensions are reused for every output
# raster_files : output file paths, one per row of `data`
# data         : matrix indexed as [raster, pixel]
# nodata       : value substituted for NaN and `missing`, also stored as `missingval`
# s3_paths     : copy destinations, parallel to `raster_files`
function save_rasters(baseraster, raster_files, data, nodata, s3_paths)
    
    # The argument is rebound from a path to the lazily opened raster.
    baseraster =  Raster(baseraster, lazy=true)
    
    @inbounds @batch per=thread for i in 1:length(raster_files)
    
        outfile = raster_files[i]
        
        # Round, replace NaN/missing by `nodata`, then convert to UInt8; the
        # conversion throws for values that do not fit into 0–255.
        # NOTE(review): `data[i,:]` is one-dimensional while `dims(baseraster)`
        # presumably describes a 2-D raster — confirm `Raster` accepts this.
        new_raster = Raster(
            UInt8.(replace(round.(data[i,:]), NaN => nodata, missing => nodata)),  
            dims(baseraster), 
            missingval=UInt8(nodata),
        )
        
        # `force=true` is passed so that an existing `outfile` is overwritten.
        # NOTE(review): NUM_THREADS=8 is requested inside an already threaded
        # loop — confirm the combined thread count is intended.
        Rasters.write(
            outfile, 
            new_raster,
            options=(
                "COMPRESS" => "DEFLATE",
                "TILED" => "YES",
                "NUM_THREADS" => "8"
            ); force=true
        )
    
        s3_path = s3_paths[i]
        # Output of `mc` is discarded; `run` still throws if the command exits non-zero.
        run(pipeline(`mc cp -q $outfile $s3_path`, stdout=devnull, stderr=devnull))
    end
end

# URL template; the {IP}, {TILE}, {BAND}, {DT1} and {DT2} placeholders are the
# ones substituted by `landsat_urls`.
const base_url = "/vsicurl/http://192.168.49.{IP}:8333/prod-landsat-ard2/{TILE}/v1_masked/{BAND}_glad.swa.ard2_m_30m_s_{DT1}_{DT2}_go_epsg.4326_v1.tif"

# First and last year (not referenced by any visible cell).
const y1 = 1997
const y2 = 2024
# Paired period start / end suffixes (MMDD); `landsat_urls` zips them and
# prefixes each with the year.
const m1s = ["0101", "0301", "0501", "0701", "0901", "1101"]
const m2s = ["0228", "0430", "0630", "0831", "1031", "1231"]

out_bucket = "tmp-julia"
tiles = readlines("/mnt/tupi/JULIA_BIDS_2025/tiles.csv")

# `th` is the comparison value read by `compute_bsf`; `agg_step` and `margin`
# match the names of its window parameters (margin evaluates to 3 here).
const th = 156
const agg_step = 6
const margin = Int(agg_step / 2)

# Single tile selected for this session.
tile="024W_75N"

In [18]:
import Pkg
# Pkg.add("JSON")


[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `/mnt/tupi/HybridModeling/EasyDensity.jl-main/Project.toml`
[32m[1m  No Changes[22m[39m to `/mnt/tupi/HybridModeling/EasyDensity.jl-main/Manifest.toml`
[92m[1mPrecompiling[22m[39m project...
         [91m  ✗ [39mGLMakie
  0 dependencies successfully precompiled in 10 seconds. 692 already precompiled.
  [91m1[39m dependency errored.
  For a report of the errors see `julia> err`. To retry use `pkg> precompile`


# Reading using Rasters

In [4]:
using Rasters
using ArchGDAL

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mPrecompiling Rasters [a3a2b9e3-a471-40c9-b274-f788e487c689] (cache misses: wrong dep version loaded (4), mismatched flags (4))
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mPrecompiling RastersNCDatasetsExt [bba50282-38eb-5d15-bdf0-02a8d9bd9f97] 
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mPrecompiling RastersMakieExt [dcb859c5-a41b-597c-97ab-c4a11b2128cf] (cache misses: wrong dep version loaded (2), mismatched flags (2))
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mPrecompiling RastersStatsBaseExt [0bd19630-9938-51cf-9b6f-d8cb9aa5cba5] (cache misses: wrong dep version loaded (2), mismatched flags (2))
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mPrecompiling ArchGDAL [c9ce4bd3-c3d5-55b8-8973-c0e20141b8c3] (cache misses: wrong dep version loaded (2), mismatched flags (2))
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mPrecompiling RastersArchGDALExt [003ceca8-0bef-59e8-b8ec-a536193683ee] (cache misses: wrong dep version loaded (4), misma

In [5]:
?Raster

search: [0m[1mR[22m[0m[1ma[22m[0m[1ms[22m[0m[1mt[22m[0m[1me[22m[0m[1mr[22m [0m[1mR[22m[0m[1ma[22m[0m[1ms[22m[0m[1mt[22m[0m[1me[22m[0m[1mr[22ms @asse[0m[1mr[22mt Patte[0m[1mr[22mn Scatte[0m[1mr[22m scatte[0m[1mr[22m [0m[1mr[22m[0m[1ma[22m[0m[1ms[22m[0m[1mt[22m[0m[1me[22m[0m[1mr[22mize [0m[1mR[22m[0m[1ma[22m[0m[1ms[22m[0m[1mt[22m[0m[1me[22m[0m[1mr[22mStack



```
Raster <: AbstractRaster

Raster(filepath::String; kw...)
Raster(A::AbstractDimArray; kw...)
Raster(A::AbstractArray, dims; kw...)
```

A generic [`AbstractRaster`](@ref) for spatial/raster array data. It can hold either memory-backed arrays or, if `lazy=true`, a [`FileArray`](@ref), which stores the `String` path to an unopened file.

If `lazy=true`, the file will only be opened lazily when it is indexed with `getindex` or when `read(A)` is called. Broadcasting, taking a view, reversing, and most other methods will *not* load data from disk; they will be applied later, lazily.

# Arguments

  * `dims`: `Tuple` of `Dimension`s needed when an `AbstractArray` is used.

# Keywords

  * `name`: a `Symbol` name for a Raster, which will also retrieve the named layer if `Raster` is used on a multi-layered file like a NetCDF.
  * `group`: the group in the dataset where `name` can be found. Only needed for nested datasets. A `String` or `Symbol` will select a single group. Pairs can also be used to access groups at any nested depth, i.e. `group=:group1 => :group2 => :group3`.
  * `missingval`: value representing missing data, normally detected from the file and automatically converted to `missing`. Setting to an alternate value, such as `0` or `NaN`, may be desirable for improved performance. `nothing` specifies no missing value. Using the same `missingval` the file already has removes the overhead of replacing it; this can be done by passing the `missingval` function as `missingval`. If the file has an incorrect value, we can manually define the transformation as a pair like `correct_value => missing` or `correct_value => NaN`. `correct_value => correct_value` will remove the overhead of changing it. Note: When `raw=true` is set, `missingval` is not changed from the value specified in the file.
  * `metadata`: `Dict` or `Metadata` object for the array, or `NoMetadata()`.
  * `crs`: the coordinate reference system of  the objects `XDim`/`YDim` dimensions.   Only set this if you know the detected crs is incorrect, or it is not present in   the file. The `crs` is expected to be a GeoFormatTypes.jl `CRS` or `Mixed` mode `GeoFormat` object,   like `EPSG(4326)`.
  * `mappedcrs`: the mapped coordinate reference system of the objects `XDim`/`YDim` dimensions.   for `Mapped` lookups these are the actual values of the index. For `Projected` lookups   this can be used to index in eg. `EPSG(4326)` lat/lon values, having it converted automatically.   Only set this if the detected `mappedcrs` in incorrect, or the file does not have a `mappedcrs`,   e.g. a tiff. The `mappedcrs` is expected to be a GeoFormatTypes.jl `CRS` or `Mixed` mode `GeoFormat` type.
  * `refdims`: `Tuple of` position `Dimension`s the array was sliced from, defaulting to `()`.   Usually not needed.

When a filepath `String` is used:

  * `dropband`: drop single band dimensions when creating stacks from filenames. `true` by default.
  * `lazy`: A `Bool` specifying if to load data lazily from disk. `false` by default.
  * `source`: Usually automatically detected from filepath extension.    To manually force, a `Symbol` can be passed `:gdal`, `:netcdf`, `:grd`, `:grib`.   The internal [`Rasters.Source`](@ref) objects, such as `Rasters.GDALsource()`,    `Rasters.GRIBsource()` or `Rasters.NCDsource()` can also be used.
  * `scaled`: apply scale and offset as `x * scale + offset` where    `scale` and/or `offset` are found in file metadata. `true` by default.   This is common where data has been convert to e.g. UInt8 to save disk space.   To ignore `scale` and `offset` metadata, use `scaled=false`.    Note 1: If `scale` and `offset` are `1.0` and `0.0` they will be ignored and the    original type will be used even when `scaled=true`. This is because these values    may be fallback defaults and we do not want to convert every `Real` array to larger   `Float64` values.    Note 2: `raw=true` will ignore `scaled` and `missingval` and return   the raw values.
  * `raw`: turn off all scaling and masking and load the raw values from disk. `false` by default. If `true`, `scaled` will be set to `false` and `missingval` will be set to the existing missing value in the file. A warning will be printed if `scaled` or `missingval` are manually set to another value.

When A is an `AbstractDimArray`:

  * `data`: can replace the data in an existing `AbstractRaster`


In [6]:
using CSV, DataFrames
# Read the list of raster layers from `raster_files.csv` in the working directory;
# the table printed below has a single `layers` column of URLs.
layers = CSV.read(joinpath("raster_files.csv"), DataFrame; normalizenames=true)
layers

Row,layers
Unnamed: 0_level_1,String
1,http://192.168.49.30:8333/ai4sh-landmasked/bsf/bsf_glad.landsat.ard2.seasconv.longterm_p50_30m_s_20000101_20221231_eu_epsg.3035_v20231218.tif
2,http://192.168.49.30:8333/ai4sh-landmasked/longterm_slopes/bsf_glad.landsat.ard2.seasconv.yearly.m.theilslopes_m_30m_s_20000101_20221231_eu_epsg.3035_v20231218.tif
3,http://192.168.49.30:8333/ai4sh-landmasked/fapar/fapar_glad.landsat.ard2.seasconv.longterm_p50_30m_s_20000101_20221231_eu_epsg.3035_v20231218.tif
4,http://192.168.49.30:8333/ai4sh-landmasked/longterm_slopes/ndti.min.slopes_glad.landsat.ard2.seasconv.yearly.min.theilslopes_m_30m_s_20000101_20221231_eu_epsg.3035_v20231218.tif
5,http://192.168.49.30:8333/ai4sh-landmasked/longterm_slopes/ndvi_glad.landsat.ard2.seasconv.yearly.m.theilslopes_m_30m_s_20000101_20221231_eu_epsg.3035_v20231218.tif
6,http://192.168.49.30:8333/ai4sh-landmasked/ndwi.gao/ndwi.gao_glad.landsat.ard2.seasconv.longterm_p50_30m_s_20000101_20221231_eu_epsg.3035_v20231218.tif
7,http://192.168.49.30:8333/ai4sh-landmasked/longterm_slopes/ndwi_glad.landsat.ard2.seasconv.yearly.m.theilslopes_m_30m_s_20000101_20221231_eu_epsg.3035_v20231218.tif
8,http://192.168.49.30:8333/ai4sh-landmasked/sar/backscatter.vh_s1gbm_m_30m_s_20160101_20171231_eu_epsg.3035.v20240613.tif
9,http://192.168.49.30:8333/ai4sh-landmasked/sar/backscatter.vv_s1gbm_m_30m_s_20160101_20171231_eu_epsg.3035.v20240613.tif
10,http://192.168.49.30:8333/ai4sh/dtm/dtm.bareearth_ensemble_p10_120m_s_20000101_20221231_eu_epsg.3035_v20240424.tif


In [14]:
# First entry of the `layers` column (a plain http:// URL in the table above).
raster_fn = layers[!,"layers"][1]
# NOTE(review): with `lazy=false` the raster data is loaded when it is opened, and
# the recorded run of this cell ended in an InterruptException. Per the `Raster`
# help text above, `lazy=true` defers reading until the data is indexed — confirm
# which behaviour is intended here.
raster_ds = Raster(raster_fn, lazy=false)
raster_ds



LoadError: InterruptException:

## Hybrid modeling

In [1]:
using Pkg
#Pkg.activate(".")
#Pkg.instantiate()
using Revise
using EasyHybrid
using Lux
using Optimisers
using WGLMakie
using Random
using LuxCore
using CSV, DataFrames
using EasyHybrid.MLUtils
using Statistics
using Plots
using JLD2
# using CairoMakie

In [19]:
# d = load("best_model_fold3.jld2")

# hm = d["hm"]
# ps = d["ps"]   # previous weights
# st = d["st"]   # previous states
# rlt = train(hm, new_data; init_params = ps, init_state = st, ...)
### warm start###

In [2]:
# Run identifiers, output directory and target variables for this experiment.
testid = "03_hybridNN";
version = "v20251125"
results_dir = joinpath(@__DIR__, "eval");
target_names = [:BD, :SOCconc, :CF, :SOCdensity];

# input
# NOTE(review): the recorded run failed because this CSV did not exist at the
# resolved path — confirm the data location before running.
df = CSV.read(joinpath(@__DIR__, "data/lucas_preprocessed_$version.csv"), DataFrame; normalizenames=true)
println(size(df))

LoadError: ArgumentError: "/mnt/tupi/HybridModeling/EasyDensity.jl-main/data/lucas_preprocessed_v20251125.csv" is not a valid file or doesn't exist