# Data preparation for habitat distribution learning (Output: presence_species_name.csv and background_species_name.csv)

## Load libraries + Select species name

In [4]:
import numpy as np
import pandas as pd
import xarray as xr
from configparser import ConfigParser
import sqlalchemy as sa # conection to the database
from sqlalchemy import create_engine, text
from datetime import datetime, timedelta
import os
import rioxarray
# Target species for this notebook: compared against the `species_name` column
# of the occurrence table and reused as the label column / output file suffix.
species = 'Heracleum Mantegazzianum'
from src import db_connect, measurer

## Load occurrence data from database (output = species_occ_df dataframe)

In [6]:
def config(filename, section='postgresql'):
    """Read one section of an INI file into a plain dictionary.

    Parameters
    ----------
    filename : str or path-like
        Location of the INI file to parse.
    section : str, default 'postgresql'
        Name of the section whose key/value pairs are returned.

    Returns
    -------
    dict
        Option names mapped to their (string) values for ``section``.

    Raises
    ------
    FileNotFoundError
        If ``filename`` cannot be read at all.
    Exception
        If the file was read but does not contain ``section``.
    """
    parser = ConfigParser()

    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed; an empty list means nothing was read.
    # Without this check a wrong path is reported as a missing section.
    if not parser.read(filename):
        raise FileNotFoundError(
            'Config file {0} could not be read'.format(filename))

    if not parser.has_section(section):
        raise Exception(
            'Section {0} not found in the {1} file'.format(section, filename))

    return dict(parser.items(section))
# Path of the database credentials file under the user's home directory.
# os.path.expanduser("~") resolves to $HOME when it is set and still yields a
# usable path when it is not, whereas os.environ.get("HOME") + "..." raises
# TypeError because None cannot be concatenated with a string.
config_path = os.path.expanduser("~") + "/uc1-urban-climate/database.ini"
# NOTE(review): db_connect is a project helper; presumably it builds a
# SQLAlchemy engine from the ini file - confirm against src/db_connect.
engine_postgresql = db_connect.create_engine(config_path)

# Read the complete neophyte table; the connection only lives inside the block.
with engine_postgresql.begin() as conn:
    query = text(
        """    
                SELECT *
FROM luxembourg_species.neophytes_geometry
    """
    )
    species_occ_df = pd.read_sql_query(query, conn)

# Keep only the records of the target species (original row labels are kept).
species_occ_df = species_occ_df[species_occ_df['species_name']==species]

# 10 m grid coordinates of every occurrence record, as plain numpy arrays.
x_coords = species_occ_df["gridnum2169_10m_x"].values
y_coords = species_occ_df["gridnum2169_10m_y"].values

In [7]:
# Display the occurrence records of the target species. species_occ_df was
# already restricted to `species` when it was loaded, so this filter keeps
# every row and only serves as a visual check.
species_occ_df[species_occ_df['species_name']==species]

Unnamed: 0,gbif_key,species,family,species_name,species_name_lower,observation_key,date_start,date_end,sample_date,taxon_kingdom,...,gridnum2169_100m_y,gridnum2169_100m,gridnum2169_1km_x,gridnum2169_1km_y,gridnum2169_1km,gridnum2169_10m_x,gridnum2169_10m_y,wkt_string,geometry,grid10mid
37,3034824,Heracleum mantegazzianum Somm. et Lev.,Apiaceae,Heracleum Mantegazzianum,Heracleum mantegazzianum,MNHNL00000002135,2014-06-06,2014-06-06,2014-06-06,Plantae,...,74800,100m_x77000_y74800,77000,74000,1km_x77000_y74000,77050,74890,"POLYGON((77050 74890 , 77050 74900 , 77060 749...",010300002079080000010000000500000000000000A0CF...,EPSG2169_GRID_10m_E77050N74890
55,3034824,Heracleum mantegazzianum Somm. et Lev.,Apiaceae,Heracleum Mantegazzianum,Heracleum mantegazzianum,DSS00291000004ZU,2007-01-01,2007-12-31,2007-12-31,Plantae,...,75700,100m_x77300_y75700,77000,75000,1km_x77000_y75000,77370,75790,"POLYGON((77370 75790 , 77370 75800 , 77380 758...",010300002079080000010000000500000000000000A0E3...,EPSG2169_GRID_10m_E77370N75790
56,3034824,Heracleum mantegazzianum Somm. et Lev.,Apiaceae,Heracleum Mantegazzianum,Heracleum mantegazzianum,DSS0029100000509,2007-01-01,2007-12-31,2007-12-31,Plantae,...,77100,100m_x77500_y77100,77000,77000,1km_x77000_y77000,77590,77180,"POLYGON((77590 77180 , 77590 77190 , 77600 771...",01030000207908000001000000050000000000000060F1...,EPSG2169_GRID_10m_E77590N77180
57,3034824,Heracleum mantegazzianum Somm. et Lev.,Apiaceae,Heracleum Mantegazzianum,Heracleum mantegazzianum,DSS00291000004ZR,2007-01-01,2007-12-31,2007-12-31,Plantae,...,75500,100m_x77400_y75500,77000,75000,1km_x77000_y75000,77480,75560,"POLYGON((77480 75560 , 77480 75570 , 77490 755...",01030000207908000001000000050000000000000080EA...,EPSG2169_GRID_10m_E77480N75560
58,3034824,Heracleum mantegazzianum Somm. et Lev.,Apiaceae,Heracleum Mantegazzianum,Heracleum mantegazzianum,DSS00291000004ZQ,2007-01-01,2007-12-31,2007-12-31,Plantae,...,75500,100m_x77500_y75500,77000,75000,1km_x77000_y75000,77500,75530,"POLYGON((77500 75530 , 77500 75540 , 77510 755...",010300002079080000010000000500000000000000C0EB...,EPSG2169_GRID_10m_E77500N75530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2823,3034824,Heracleum mantegazzianum Somm. et Lev.,Apiaceae,Heracleum Mantegazzianum,Heracleum mantegazzianum,DSS00291000000O3,2001-01-01,2001-12-31,2001-12-31,Plantae,...,73900,100m_x74700_y73900,74000,73000,1km_x74000_y73000,74720,73980,"POLYGON((74720 73980 , 74720 73990 , 74730 739...",010300002079080000010000000500000000000000003E...,EPSG2169_GRID_10m_E74720N73980
2824,3034824,Heracleum mantegazzianum Somm. et Lev.,Apiaceae,Heracleum Mantegazzianum,Heracleum mantegazzianum,DSS00291000000O3,2001-01-01,2001-12-31,2001-12-31,Plantae,...,73900,100m_x74700_y73900,74000,73000,1km_x74000_y73000,74720,73980,"POLYGON((74720 73980 , 74720 73990 , 74730 739...",010300002079080000010000000500000000000000003E...,EPSG2169_GRID_10m_E74720N73980
2825,3034824,Heracleum mantegazzianum Somm. et Lev.,Apiaceae,Heracleum Mantegazzianum,Heracleum mantegazzianum,DSS00291000000NU,2001-01-01,2001-12-31,2001-12-31,Plantae,...,74800,100m_x77700_y74800,77000,74000,1km_x77000_y74000,77710,74840,"POLYGON((77710 74840 , 77710 74850 , 77720 748...",010300002079080000010000000500000000000000E0F8...,EPSG2169_GRID_10m_E77710N74840
2827,3034824,Heracleum mantegazzianum Somm. et Lev.,Apiaceae,Heracleum Mantegazzianum,Heracleum mantegazzianum,DSS00291000000NY,2000-01-01,2000-12-31,2000-12-31,Plantae,...,75100,100m_x77700_y75100,77000,75000,1km_x77000_y75000,77700,75150,"POLYGON((77700 75150 , 77700 75160 , 77710 751...",01030000207908000001000000050000000000000040F8...,EPSG2169_GRID_10m_E77700N75150


## Load data cube from tif files (output = xds_merged)

In [28]:
# Folder that holds one GeoTIFF per environmental variable.
tif_dir = os.environ.get("HOME") + "/s3/data/d012_luxembourg/"

# Rasters that make up the data cube, in merge order.
variable_list = [
    'air_temperature_2017_month_mean_10m_b12.tif',
    'dem_2019_10m_b1.tif',
    'dem_aspect_2019_10m_b1.tif',
    'dem_slope_2019_10m_b1.tif',
    'dem_surface_model_2019_10m_b1.tif',
    'hrl_treecover_2018_10m_b1.tif',
    'pH_CaCl_10m_b1.tif',
    'shadow_2019_10m_b1.tif',
    'soil_nitrat_10m_b1.tif',
    'twi_2019_10m_b1.tif',
]

# The temperature raster is the only multi-band file; a single band is used.
temperature_file = "air_temperature_2017_month_mean_10m_b12.tif"

# Absolute path of every listed raster (non-.tif entries would be ignored).
tif_files = []
for file_name in variable_list:
    if file_name.endswith('.tif'):
        tif_files.append(os.path.join(tif_dir, file_name))

# Open each raster lazily and register it under its path.
datasets = {}
for tif_file in tif_files:
    xds = rioxarray.open_rasterio(tif_file, cache=False, chunks=True, lock=False)

    # Temperature: keep band 6 only (positional index 5); others: keep as read.
    layer = xds.isel(band=5) if temperature_file in tif_file else xds

    # Unnamed arrays take the file name (text before the first dot) as their
    # variable name, so every layer is identifiable after merging.
    if layer.name is None:
        layer = layer.rename(os.path.basename(tif_file).split('.')[0])

    datasets[tif_file] = layer

# Combine all layers into a single Dataset and show it.
xds_merged = xr.merge(list(datasets.values()))
xds_merged

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1048, 980)","(1048, 980)"
Dask graph,1 chunks in 12 graph layers,1 chunks in 12 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.92 MiB 3.92 MiB Shape (1048, 980) (1048, 980) Dask graph 1 chunks in 12 graph layers Data type float32 numpy.ndarray",980  1048,

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1048, 980)","(1048, 980)"
Dask graph,1 chunks in 12 graph layers,1 chunks in 12 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 14 graph layers,1 chunks in 14 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.92 MiB 3.92 MiB Shape (1, 1048, 980) (1, 1048, 980) Dask graph 1 chunks in 14 graph layers Data type float32 numpy.ndarray",980  1048  1,

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 14 graph layers,1 chunks in 14 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 14 graph layers,1 chunks in 14 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.92 MiB 3.92 MiB Shape (1, 1048, 980) (1, 1048, 980) Dask graph 1 chunks in 14 graph layers Data type float32 numpy.ndarray",980  1048  1,

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 14 graph layers,1 chunks in 14 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 14 graph layers,1 chunks in 14 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.92 MiB 3.92 MiB Shape (1, 1048, 980) (1, 1048, 980) Dask graph 1 chunks in 14 graph layers Data type float32 numpy.ndarray",980  1048  1,

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 14 graph layers,1 chunks in 14 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 14 graph layers,1 chunks in 14 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.92 MiB 3.92 MiB Shape (1, 1048, 980) (1, 1048, 980) Dask graph 1 chunks in 14 graph layers Data type float32 numpy.ndarray",980  1048  1,

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 14 graph layers,1 chunks in 14 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.84 MiB,7.84 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 15 graph layers,1 chunks in 15 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 7.84 MiB 7.84 MiB Shape (1, 1048, 980) (1, 1048, 980) Dask graph 1 chunks in 15 graph layers Data type float64 numpy.ndarray",980  1048  1,

Unnamed: 0,Array,Chunk
Bytes,7.84 MiB,7.84 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 15 graph layers,1 chunks in 15 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 14 graph layers,1 chunks in 14 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.92 MiB 3.92 MiB Shape (1, 1048, 980) (1, 1048, 980) Dask graph 1 chunks in 14 graph layers Data type float32 numpy.ndarray",980  1048  1,

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 14 graph layers,1 chunks in 14 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 14 graph layers,1 chunks in 14 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.92 MiB 3.92 MiB Shape (1, 1048, 980) (1, 1048, 980) Dask graph 1 chunks in 14 graph layers Data type float32 numpy.ndarray",980  1048  1,

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 14 graph layers,1 chunks in 14 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 14 graph layers,1 chunks in 14 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.92 MiB 3.92 MiB Shape (1, 1048, 980) (1, 1048, 980) Dask graph 1 chunks in 14 graph layers Data type float32 numpy.ndarray",980  1048  1,

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 14 graph layers,1 chunks in 14 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 14 graph layers,1 chunks in 14 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.92 MiB 3.92 MiB Shape (1, 1048, 980) (1, 1048, 980) Dask graph 1 chunks in 14 graph layers Data type float32 numpy.ndarray",980  1048  1,

Unnamed: 0,Array,Chunk
Bytes,3.92 MiB,3.92 MiB
Shape,"(1, 1048, 980)","(1, 1048, 980)"
Dask graph,1 chunks in 14 graph layers,1 chunks in 14 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## Extract data-cube values at the occurrence locations (where the species is present)

In [13]:
# Wrap both coordinate arrays in DataArrays. They receive the same default
# dimension name, so using them together as indexers selects one grid cell per
# (x, y) pair instead of the full x-by-y cross product.
x_coords_da, y_coords_da = (xr.DataArray(values) for values in (x_coords, y_coords))

In [29]:
# Look up, for every occurrence record, the grid cell whose centre is closest
# to the record's coordinates (point-wise, because both indexers are DataArrays
# that share one dimension).
nearest_habitat_values = xds_merged.sel(
    {"x": x_coords_da, "y": y_coords_da},
    method="nearest",
)

# Flatten to a table (index levels become ordinary columns) and add the label
# column, named after the species, set to True for every presence row.
nearest_habitat_df = (
    nearest_habitat_values
    .to_dataframe()
    .reset_index()
    .assign(**{species: True})
)

# Export of the presence table is currently disabled:
# nearest_habitat_df.to_csv('presence_' + species + '.csv', index=False)

In [31]:
# Tabular view of the cube values sampled at the occurrence locations.
nearest_habitat_values.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,x,y,spatial_ref,air_temperature_2017_month_mean_10m_b12,dem_2019_10m_b1,dem_aspect_2019_10m_b1,dem_slope_2019_10m_b1,dem_surface_model_2019_10m_b1,hrl_treecover_2018_10m_b1,pH_CaCl_10m_b1,shadow_2019_10m_b1,soil_nitrat_10m_b1,twi_2019_10m_b1
dim_0,band,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1,77055.0,74895.0,0,289.899200,253.0,167.471191,12.979350,273.108307,62.0,5.251987,0.709180,2.030321,0.198561
1,1,77375.0,75795.0,0,290.497772,236.0,74.744881,8.111279,253.421173,54.0,5.278076,0.713783,2.269577,0.163474
2,1,77595.0,77185.0,0,289.913239,241.0,310.914398,13.934706,242.149597,0.0,5.457896,0.460301,2.061956,0.188581
3,1,77485.0,75565.0,0,290.693359,241.0,225.000000,4.044692,242.928757,0.0,5.242150,0.293810,2.336421,0.199506
4,1,77505.0,75535.0,0,290.693359,241.0,225.000000,5.051152,244.383347,0.0,5.245652,0.389986,2.336421,0.330567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,1,74725.0,73985.0,0,291.060974,267.0,45.000000,1.012751,269.585327,0.0,5.389778,0.401498,2.905029,0.208080
122,1,74725.0,73985.0,0,291.060974,267.0,45.000000,1.012751,269.585327,0.0,5.389778,0.401498,2.905029,0.208080
123,1,77715.0,74845.0,0,290.863098,243.0,225.000000,2.024868,258.211456,0.0,5.262441,0.607841,2.572939,0.264998
124,1,77705.0,75155.0,0,290.608978,243.0,144.462326,6.137380,243.027802,0.0,5.288374,0.134151,2.467590,0.341153


## Generate background (pseudo-absence) data 

### Approach 1: select records from the SQL table whose `species_name` differs from the selected species

In [7]:
# SQL query to select points where the target species does NOT occur.
# The species name is passed as a bound parameter instead of being formatted
# into the SQL text, so a name containing a quote character (e.g. an
# apostrophe) can neither break nor alter the statement.
query_non_occ = text(
    """
    SELECT *
    FROM luxembourg_species.neophytes_geometry
    WHERE species_name != :species
    """
)

# Fetch the non-occurrence data into a Pandas DataFrame
with engine_postgresql.begin() as conn:
    non_occ_df = pd.read_sql(query_non_occ, conn, params={"species": species})


In [15]:
# Fixed seed so that re-running the cell reproduces the same background set.
BACKGROUND_SEED = 42
# Size of the candidate pool drawn from the non-occurrence records.
MAX_BACKGROUND_CANDIDATES = 100

rng = np.random.default_rng(BACKGROUND_SEED)

# Draw the candidate pool into a NEW frame: non_occ_df stays untouched, so the
# cell can be re-run safely, and the pool is capped at the rows available
# (DataFrame.sample raises when asked for more rows than exist).
non_occ_candidates_df = non_occ_df.sample(
    n=min(MAX_BACKGROUND_CANDIDATES, len(non_occ_df)),
    random_state=BACKGROUND_SEED,
)

# Extract x and y coordinates of the candidate records
x_non_occ_coords = non_occ_candidates_df['gridnum2169_10m_x'].to_numpy()
y_non_occ_coords = non_occ_candidates_df['gridnum2169_10m_y'].to_numpy()

# Step 1: Select at most as many background points as there are presence rows
num_points_to_select = len(nearest_habitat_df)
total_points = len(x_non_occ_coords)
print(num_points_to_select)
if total_points > num_points_to_select:
    selected_indices = rng.choice(total_points, num_points_to_select, replace=False)
else:
    # Fewer candidates than requested: use every candidate
    selected_indices = np.arange(total_points)

# Step 2: Apply the selection
x_selected = x_non_occ_coords[selected_indices]
y_selected = y_non_occ_coords[selected_indices]

# Step 3: Extract habitat values for the selected non-occurrence coordinates.
# The coordinates are wrapped in DataArrays that share one dimension so the
# lookup is point-wise (one cell per (x, y) pair), exactly like the presence
# extraction. Plain numpy arrays would be treated as independent indexers and
# return the whole len(x) * len(y) grid spanned by the coordinates.
non_occ_habitat_values = xds_merged.sel(
    x=xr.DataArray(x_selected),
    y=xr.DataArray(y_selected),
    method="nearest",
)

# Step 4: Convert the non-occurrence habitat data to a DataFrame
non_occ_habitat_df = non_occ_habitat_values.to_dataframe().reset_index()

# Step 5: Mark these samples as "False" for species presence
non_occ_habitat_df[species] = False

# Step 6: Save the DataFrame to CSV
non_occ_habitat_df.to_csv('background_' + species + '.csv', index=False)

63504


### Approach 2: exclude the areas where the species is known to occur and select background points from the remaining areas

In [None]:
# Fixed seed so that re-running the cell reproduces the same background set.
BACKGROUND_SEED = 42
rng = np.random.default_rng(BACKGROUND_SEED)
n_occurrences = len(species_occ_df)

# Step 1: Map every occurrence record to the grid cell it falls in.
# The occurrence coordinates are cell corners while the cube coordinates are
# cell centres (e.g. 77050 vs 77055.0 in the presence table), so an exact
# match never succeeds; a nearest-label lookup is used instead, as in the
# presence extraction. Records further than one cell away from the grid get
# position -1 and are ignored.
x_index = xds_merged.indexes["x"]
y_index = xds_merged.indexes["y"]
cell_size_x = abs(float(x_index[1] - x_index[0]))
cell_size_y = abs(float(y_index[1] - y_index[0]))
occ_col = x_index.get_indexer(
    species_occ_df['gridnum2169_10m_x'].to_numpy(),
    method="nearest",
    tolerance=cell_size_x,
)
occ_row = y_index.get_indexer(
    species_occ_df['gridnum2169_10m_y'].to_numpy(),
    method="nearest",
    tolerance=cell_size_y,
)
inside_grid = (occ_col >= 0) & (occ_row >= 0)

# Step 2: Flag the occupied cells only. Dropping the x and y labels separately
# would remove every full row and column that contains an occurrence.
occupied = np.zeros((len(y_index), len(x_index)), dtype=bool)
occupied[occ_row[inside_grid], occ_col[inside_grid]] = True
free_cells = np.flatnonzero(~occupied)  # flat (row-major) positions of free cells

# Step 3: Randomly draw free cells without replacement (12 per occurrence, so
# that some can be discarded by the filter below).
n_draw = min(12 * n_occurrences, free_cells.size)
drawn_cells = np.sort(rng.choice(free_cells, size=n_draw, replace=False))
sample_rows, sample_cols = np.unravel_index(drawn_cells, occupied.shape)

# Step 4: Read the cube values at the drawn cells (point-wise selection along
# a new "sample" dimension).
xds_sample = (
    xds_merged
    .drop_vars(['spatial_ref'])
    .isel(
        x=xr.DataArray(sample_cols, dims="sample"),
        y=xr.DataArray(sample_rows, dims="sample"),
    )
    .compute()
)

# Step 5: Filter out invalid values (areas where wetness == 0). The cube built
# in this notebook has no 'wetness' layer, so the filter only runs when such a
# layer is present instead of failing on the missing variable.
# NOTE(review): decide which layer/no-data value should mark invalid cells here.
if "wetness" in xds_sample.data_vars:
    xds_sample = xds_sample.where(xds_sample["wetness"] != 0, drop=True)

# Step 6: Reduce to at most 10 background samples per occurrence (no duplicates)
n_keep = min(10 * n_occurrences, xds_sample.sizes["sample"])
kept_samples = np.sort(rng.choice(xds_sample.sizes["sample"], size=n_keep, replace=False))
xds_sample = xds_sample.isel(sample=kept_samples)

# Step 7: Create a DataFrame from the sampled non-occurrence data (the x and y
# coordinates of each sample are kept as columns)
non_occ_habitat_df = xds_sample.to_dataframe().reset_index(drop=True)

# Mark the non-occurrence samples as "False" for species presence
non_occ_habitat_df[species] = False

# Print a few rows to check
print(non_occ_habitat_df.head())
