In [10]:
using CSV, DataFrames
# Version tag interpolated into the output CSV file names written by this notebook.
version = "v20251219"

"v20251219"

## Overlay

In [1]:
# load cov list
using JSON
using ArchGDAL
using Proj
using DataFrames
using Rasters
using Base.Threads

# Parse the covariate catalogue from JSON.
# The keys are later used as DataFrame column names and the values are passed
# to `sample_tiff_onto_grid` as raster paths.
cov = JSON.parsefile("./cov_path_full.json");
# Both vectors are collected from the same unmodified container, so
# pnames[i] and paths[i] refer to the same entry.
pnames = collect(keys(cov))
paths = collect(values(cov))

using Proj, ArchGDAL

"""
    make_grid_3035(bbox, res_m)

Return the coordinate axes `(xs, ys)` of a regular grid covering `bbox`.

`bbox` is `(xmin, ymin, xmax, ymax)` in EPSG:3035 and `res_m` is the node spacing.
`xs` starts at `xmin` and increases; `ys` starts at `ymax` and decreases
(north → south). The far edge is included only when the extent is an exact
multiple of `res_m`.
"""
function make_grid_3035(bbox, res_m)
    west, south, east, north = bbox

    eastings = collect(west:res_m:east)
    northings = collect(north:-res_m:south)  # descending: first row is the northernmost

    return eastings, northings
end

"""
    sample_tiff_onto_grid(tif_path, xs, ys, tf)

Sample band 1 of a remote raster at every node of the grid `xs` × `ys`.

# Arguments
- `tif_path`: raster location; it is opened through GDAL's `/vsicurl/` prefix.
- `xs`, `ys`: grid node coordinates in EPSG:3035 (grid columns and rows).
- `tf`: callable used as `yr, xr = tf(y3035, x3035)` to convert a grid node into
  the raster CRS. It is only called when the raster's projection WKT does not
  contain the text `"3035"`.

# Returns
A `Vector{Float32}` of length `length(xs) * length(ys)`: the `ny × nx` sample
matrix flattened column-major, so the `ys` index varies fastest. Nodes that fall
outside the raster, or whose transformed coordinates are not finite, are `NaN32`.

# Notes
- The rotation terms of the geotransform are ignored (north-up rasters assumed).
- Raster nodata values are returned as stored; they are not converted to `NaN32`.
"""
function sample_tiff_onto_grid(tif_path, xs, ys, tf)
    ArchGDAL.read("/vsicurl/" * tif_path) do ds

        # 1. Check the raster CRS (WKT): any WKT mentioning "3035" is treated as EPSG:3035.
        tiff_wkt = ArchGDAL.getproj(ds)
        same_crs = occursin("3035", lowercase(tiff_wkt))

        # 2. GeoTransform: (x0, y0) is the outer corner of the first pixel.
        band = ArchGDAL.getband(ds, 1)
        gt = ArchGDAL.getgeotransform(ds)
        x0, dx, _, y0, _, dy = gt  # note: dx, dy may be negative

        # Raster size is loop-invariant; query it once.
        width = ArchGDAL.width(ds)
        height = ArchGDAL.height(ds)

        nx = length(xs)
        ny = length(ys)
        arr = Matrix{Float32}(undef, ny, nx)

        @inbounds for i in 1:ny
            for j in 1:nx
                x3035 = xs[j]
                y3035 = ys[i]

                # 3. If the raster is not in 3035, convert (x3035, y3035) to raster CRS.
                if same_crs
                    xr = x3035
                    yr = y3035
                else
                    # Key point: tf maps 3035 -> raster CRS, called and answering as (y, x).
                    yr, xr = tf(y3035, x3035)
                end

                # 4. Raster CRS coordinate -> 0-based pixel offset. The geotransform
                #    origin is a pixel *corner*, so the containing pixel is found with
                #    floor; round would shift every sample by half a pixel.
                col = floor((xr - x0) / dx)
                row = floor((yr - y0) / dy)

                # Comparisons with NaN are false, so non-finite coordinates (failed
                # transformation) land here too instead of throwing on Int conversion.
                if !(0 <= col < width && 0 <= row < height)
                    arr[i, j] = NaN32
                    continue
                end

                # 1-based pixel index expected by ArchGDAL.read.
                px = Int(col) + 1
                py = Int(row) + 1

                val = ArchGDAL.read(band, py:py, px:px)
                arr[i, j] = Float32(val[1])
            end
        end

        return vec(arr)
    end
end


"""
    convert_bbox_wgs84_to_3035(bbox_wgs84)

Project a WGS84 bounding box `(xmin_lon, ymin_lat, xmax_lon, ymax_lat)` to
EPSG:3035 and return the axis-aligned envelope `(xmin, ymin, xmax, ymax)` of its
four projected corners.

NOTE(review): only the corners are projected; edges that curve in the target
projection may extend slightly beyond the returned envelope.
"""
function convert_bbox_wgs84_to_3035(bbox_wgs84)
    west, south, east, north = bbox_wgs84

    to_3035 = Proj.Transformation("EPSG:4326", "EPSG:3035")

    # The transformation is called as (lat, lon) and answers (y, x).
    corners = [to_3035(lat, lon) for lon in (west, east) for lat in (south, north)]
    northings = [corner[1] for corner in corners]
    eastings = [corner[2] for corner in corners]

    return (
        minimum(eastings),
        minimum(northings),
        maximum(eastings),
        maximum(northings),
    )
end


# Grid spacing passed to make_grid_3035.
res_m = 1000 # meters
# Study area as (lon_min, lat_min, lon_max, lat_max) in WGS84.
bboxmine = (8.956051,51.815757,10.450192,53.154421) # examine area in northern DE, suggested by Bernhard
# Envelope of the study area in EPSG:3035, then the grid axes derived from it.
bbox3035 = convert_bbox_wgs84_to_3035(bboxmine);
xs, ys = make_grid_3035(bbox3035, res_m);

# Transformation handed to sample_tiff_onto_grid for rasters that are not in EPSG:3035.
# NOTE(review): this hard-codes EPSG:4326 as the only other raster CRS — confirm for new layers.
tf = Proj.Transformation("EPSG:3035", "EPSG:4326") # non-3035 rasters are assumed to always be 4326, so one shared transform is reused

Transformation unknown
    source: ETRS89-extended / LAEA Europe
    target: WGS 84
    direction: forward


In [2]:
# Layers to process. For a partial run, index both vectors with the same range
# (e.g. [270:360]) so names and paths stay aligned.
tnames, tpaths = pnames, paths
println("Julia threads: $(Threads.nthreads())")
println("length: $(length(tpaths))")

Julia threads: 96
length: 362


In [3]:
# Sample every raster onto the grid, `chunk_size` layers per timed batch.
# Each batch is spread over the available threads; a failing layer does not stop
# the run: its index is recorded in `m` and its slot in `out` stays unassigned.
chunk_size = 40
out = Vector{Any}(undef, length(tpaths))  # out[i]: samples of layer i
m = Int[]                                 # indices of layers that failed
lock_m = ReentrantLock()                  # guards `m`, which is shared across threads
iii = 0                                   # batch counter, used for progress output only
for chunk in Iterators.partition(eachindex(tpaths), chunk_size)
    println("chunk - $(iii)")

    @time @threads for i in chunk
        try
            out[i] = sample_tiff_onto_grid(tpaths[i], xs, ys, tf)
        catch err
            # Keep going, but report why the layer failed instead of discarding the cause.
            @warn "sampling failed" index = i path = tpaths[i] exception =
                (err, catch_backtrace())
            lock(lock_m) do
                push!(m, i)
            end
        end
    end

    # `global` makes the update valid outside notebook/REPL soft scope as well.
    global iii = iii + 1
end

# Summarise failures once all batches are done.
isempty(m) || @warn "some layers could not be sampled" failed = sort(m)

chunk - 0
  2.562889 seconds (4.68 M allocations: 171.700 MiB, 4.54% gc time, 1725.23% compilation time)
chunk - 1
  2.451529 seconds (3.41 M allocations: 104.082 MiB, 2.21% gc time, 18.39% compilation time)
chunk - 2
  2.466750 seconds (3.34 M allocations: 103.009 MiB, 2.00% gc time, 11.94% compilation time)
chunk - 3
  2.309540 seconds (3.27 M allocations: 100.052 MiB, 1.69% gc time)
chunk - 4
  1.775924 seconds (3.27 M allocations: 100.009 MiB, 2.15% gc time)
chunk - 5
  2.300649 seconds (3.24 M allocations: 99.640 MiB, 1.63% gc time)
chunk - 6
  1.649341 seconds (3.27 M allocations: 100.000 MiB, 2.03% gc time)
chunk - 7
  1.860362 seconds (3.22 M allocations: 99.354 MiB, 3.91% gc time, 0.88% compilation time)
chunk - 8
  1.002840 seconds (3.26 M allocations: 99.890 MiB, 3.53% gc time)
chunk - 9
  0.071829 seconds (156.58 k allocations: 4.937 MiB, 5.07% gc time)


In [4]:
# Assemble the sampled layers into one table with one row per grid node.
df = DataFrame()
nx = length(xs)
ny = length(ys)
# sample_tiff_onto_grid flattens an ny × nx matrix column-major, so within the
# sample vectors y varies fastest: each x is repeated ny times, the ys cycle nx times.
df.x3035 = repeat(xs, inner=ny)
df.y3035 = repeat(ys, outer=nx)
for i in eachindex(tnames)
    if isassigned(out, i)
        df[!, Symbol(tnames[i])] = out[i]
    else
        # The layer failed during sampling and left its slot unassigned; reading it
        # would throw UndefRefError. Keep the column so downstream code still finds
        # it, filled with NaN32 like any other unsampled node.
        @warn "layer has no samples; filling column with NaN" layer = tnames[i]
        df[!, Symbol(tnames[i])] = fill(NaN32, nx * ny)
    end
end

In [36]:

# Persist the overlaid grid table; the file name carries the `version` tag.
CSV.write("overlaid_$(version).csv", df)

"overlaid_v20251219.csv"

## Prepare the data the same way it was processed before training

In [2]:
using CSV, DataFrames
# prepare data
# load in preprocessed data to get predictors
datafile = "/mnt/tupi/HybridModeling/EasyDensity.jl/data/lucas_preprocessed_v20251125.csv"
# normalizenames=true makes the column names valid Julia identifiers.
oridf = CSV.read(datafile, DataFrame; normalizenames=true)
# Predictor columns are selected by position: column 18 through the 7th-from-last.
# NOTE(review): this slice depends on the exact column layout of `datafile` and
# silently picks the wrong columns if that layout changes.
predictors = Symbol.(names(oridf))[18:end-6]; # CHECK EVERY TIME

In [4]:
using Statistics
# Rebuild the cleaned training table from the raw overlay so that the
# normalisation statistics (`means`, `stds`) can be recomputed for `predictors`.
# ? move the `csv` file into the `BulkDSOC/data` folder (create folder)
df_o = CSV.read("/mnt/tupi/HybridModeling/EasyDensity.jl/data/lucas_overlaid.csv", DataFrame, normalizenames=true);
println(size(df_o));

############################
###### clean targets #######
############################

# filter horizon depth = 10 cm
df_o = df_o[df_o.hzn_dep .== 10, :];
select!(df_o, Not(:hzn_dep));
println(size(df_o))

# Flag noisy repeated measurements: per id, the largest gap between
# consecutive sorted `soc` values.
df_o.maxdiff = fill(0.0, nrow(df_o));  # initialize noise column
gdf = groupby(df_o, :id);              # group once, after the column exists
for sub in gdf
    soc = sort(sub.soc)

    # -1 marks ids with a single record; they pass the <= 50 filter below.
    maxdiff = length(soc) < 2 ? -1.0 : maximum(abs.(diff(soc)))

    # `sub.maxdiff` is a view into the parent column, so this writes straight into
    # df_o without scanning the whole table for every id.
    fill!(sub.maxdiff, maxdiff)
end
println(size(df_o))
df_o = df_o[df_o.maxdiff .<= 50, :];
println(size(df_o))

# coords = collect(zip(df_o.lat, df_o.lon));

########################
###### clean cov #######
########################
# Covariate columns are taken by position: column 18 up to, but excluding, the
# trailing `maxdiff` column.
# NOTE(review): depends on the column layout of the input file.
names_cov = Symbol.(names(df_o))[18:end-1];

# Fix soilsuite and cropland extent columns
for col in names_cov
    if occursin("_soilsuite_", String(col))
        df_o[!, col] = replace(df_o[!, col], missing => 0)
    elseif occursin("cropland_extent_", String(col))
        df_o[!, col] = replace(df_o[!, col], missing => 0)
        df_o[!, col] .= ifelse.(df_o[!, col] .> 0, 1, 0)
    end
end

# rm missing values: 1. >5%, drop col; 2. <=5%, drop row
cols_to_drop_row = Symbol[];
cols_to_drop_col = Symbol[];
for col in names_cov
    n_missing = count(ismissing, df_o[!, col])
    frac_missing = n_missing / nrow(df_o)
    too_sparse = frac_missing > 0.05
    is_kg = occursin("CHELSA_kg", String(col))  # kg categorical columns are always removed

    if too_sparse
        println(n_missing, " ", col)
    end

    if too_sparse || is_kg
        # Drop the column exactly once: a second select!(…, Not(col)) on an already
        # removed column throws, and a removed column must not be queued for
        # row-wise filtering below.
        select!(df_o, Not(col))
        push!(cols_to_drop_col, col)
    elseif n_missing > 0
        # println(n_missing, " ", col)
        push!(cols_to_drop_row, col)  # collect column name
    end
end

names_cov = filter(x -> !(x in cols_to_drop_col), names_cov) # remove cols-to-drop from names_cov
if !isempty(cols_to_drop_row)
    df_o = subset(df_o, cols_to_drop_row .=> ByRow(!ismissing)) # drop rows with missing values in cols_to_drop_row
end
println(size(df_o))

cols_to_drop_col = Symbol[]
for col in names_cov
    if std(df_o[!, col]) == 0
        push!(cols_to_drop_col, col)  # rm constant col (std==0)
        select!(df_o, Not(col))
    end
end
names_cov = filter(x -> !(x in cols_to_drop_col), names_cov) # remove cols-to-drop from names_cov
println(size(df_o))

# for col in names_cov # to check covariate distribution
#     println(string(col)[1:10], ' ', round(std(df[:, col]); digits=2), ' ', round(mean(df[:, col]); digits=2))
# end

# Normalisation statistics, (x - mean) / std, one entry per predictor.
# NOTE(review): this indexes df_o by `predictors`, which comes from a different
# file than `names_cov`; every predictor must have survived the cleaning above.
means = map(c -> mean(skipmissing(df_o[!, c])), predictors)
stds  = map(c -> std(skipmissing(df_o[!, c])), predictors)


(62577, 422)
(62199, 421)
(62199, 422)
(57343, 422)
33487 CHELSA_swe_1981_2010_V_2_1
(56117, 415)
(56117, 380)


In [6]:
## Apply the training-time fixes and normalisation to the new (production) grid data.
# Read back the overlaid grid table written earlier.
version = "v20251219"
df = CSV.read("overlaid_$(version).csv", DataFrame)

# Mend soilsuite and cropland-extent layers: missing becomes 0, and cropland
# extent is additionally binarised (> 0 becomes 1, everything else 0).
# NOTE(review): only `missing` is replaced here; NaN entries are left as they are
# in soilsuite columns — confirm that is intended.
for col in predictors
    colname = String(col)
    is_soilsuite = occursin("_soilsuite_", colname)
    is_cropland = !is_soilsuite && occursin("cropland_extent_", colname)
    (is_soilsuite || is_cropland) || continue

    df[!, col] = replace(df[!, col], missing => 0)
    if is_cropland
        df[!, col] .= ifelse.(df[!, col] .> 0, 1, 0)
    end
end

# Standardise each predictor with the mean/std computed on the training table.
for (col, mu, sigma) in zip(predictors, means, stds)
    df[!, col] = (Float64.(df[!, col]) .- mu) ./ sigma
end


In [8]:
# Keep only usable numbers: drop `missing` first, then NaN.
clean(x) = filter(!isnan, skipmissing(x))

# One summary row per predictor, comparing the 5%, 50% and 95% quantiles of the
# reference table (`oridf`) with those of the newly prepared table (`df`).
rows = Vector{NamedTuple}()
probs = (0.05, 0.50, 0.95)

for col in predictors
    reference = clean(oridf[!, col])
    current = clean(df[!, col])

    ref_q = map(p -> quantile(reference, p), probs)
    cur_q = map(p -> quantile(current, p), probs)

    summary_row = (
        variable   = String(col),
        q05_oridf  = ref_q[1],
        q05_df     = cur_q[1],
        q50_oridf  = ref_q[2],
        q50_df     = cur_q[2],
        q95_oridf  = ref_q[3],
        q95_df     = cur_q[3],
    )
    push!(rows, summary_row)
end

qt = DataFrame(rows)
CSV.write("predictor_quantiles_check.csv", qt)




"predictor_quantiles_check.csv"

In [11]:
# Persist the mended and normalised grid table; the file name carries the `version` tag.
CSV.write("production_preprocessed_$(version).csv", df)

"production_preprocessed_v20251219.csv"