# Species distribution modeling for Luxembourg 

## Load libraries + Select species name

In [1]:
import numpy as np
import pandas as pd
import xarray as xr
from configparser import ConfigParser
import sqlalchemy as sa # conection to the database
from sqlalchemy import create_engine, text
from datetime import datetime, timedelta
import os
import rioxarray
import xgboost as xgb
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
from sklearn.metrics import mean_squared_error, r2_score
from pyproj import Transformer
from scipy.ndimage import gaussian_filter
import elapid
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from pyproj import Transformer
import folium
from branca.colormap import linear
from folium.plugins import HeatMap



species = 'Heracleum Mantegazzianum'

## Data Preparation

### Load occurrence data from database )

In [2]:
def config(filename, section='postgresql'):
    # create a parser
    parser = ConfigParser()
    # read config file
    parser.read(filename)

    # get section, default to postgresql
    db = {}
    if parser.has_section(section):
        params = parser.items(section)
        for param in params:
            db[param[0]] = param[1]
    else:
        raise Exception(
            'Section {0} not found in the {1} file'.format(section, filename))

    return db
keys = config(filename='database_nilu.ini')
POSTGRESQL_SERVER_NAME=keys['host']
PORT=                  keys['port']
Database_name =        keys['database']
USER =                 keys['user']
PSW =                  keys['password']
##################################################

engine_postgresql = sa.create_engine('postgresql://'+USER+':'+PSW+ '@'+POSTGRESQL_SERVER_NAME+':'+str(PORT)+ '/' + Database_name)
print (engine_postgresql)
connection = engine_postgresql.raw_connection()
cursor = connection.cursor()
connection.commit()
print ("done")

query = """
SELECT *
FROM luxembourg_species.neophytes_geometry
"""
species_occ_df = pd.read_sql(query, engine_postgresql)

species_occ_df = species_occ_df[species_occ_df['species_name']==species]

x_coords = species_occ_df["gridnum2169_10m_x"].values
y_coords = species_occ_df["gridnum2169_10m_y"].values

Engine(postgresql://nilu_bachir:***@5.75.190.71:6947/fairicube)
done


In [3]:
species_occ_df[species_occ_df['species_name']==species]
species_occ_df.head()

Unnamed: 0,gbif_key,species,family,species_name,species_name_lower,observation_key,date_start,date_end,sample_date,taxon_kingdom,...,gridnum2169_100m_y,gridnum2169_100m,gridnum2169_1km_x,gridnum2169_1km_y,gridnum2169_1km,gridnum2169_10m_x,gridnum2169_10m_y,wkt_string,geometry,grid10mid
37,3034824,Heracleum mantegazzianum Somm. et Lev.,Apiaceae,Heracleum Mantegazzianum,Heracleum mantegazzianum,MNHNL00000002135,2014-06-06,2014-06-06,2014-06-06,Plantae,...,74800,100m_x77000_y74800,77000,74000,1km_x77000_y74000,77050,74890,"POLYGON((77050 74890 , 77050 74900 , 77060 749...",010300002079080000010000000500000000000000A0CF...,EPSG2169_GRID_10m_E77050N74890
55,3034824,Heracleum mantegazzianum Somm. et Lev.,Apiaceae,Heracleum Mantegazzianum,Heracleum mantegazzianum,DSS00291000004ZU,2007-01-01,2007-12-31,2007-12-31,Plantae,...,75700,100m_x77300_y75700,77000,75000,1km_x77000_y75000,77370,75790,"POLYGON((77370 75790 , 77370 75800 , 77380 758...",010300002079080000010000000500000000000000A0E3...,EPSG2169_GRID_10m_E77370N75790
56,3034824,Heracleum mantegazzianum Somm. et Lev.,Apiaceae,Heracleum Mantegazzianum,Heracleum mantegazzianum,DSS0029100000509,2007-01-01,2007-12-31,2007-12-31,Plantae,...,77100,100m_x77500_y77100,77000,77000,1km_x77000_y77000,77590,77180,"POLYGON((77590 77180 , 77590 77190 , 77600 771...",01030000207908000001000000050000000000000060F1...,EPSG2169_GRID_10m_E77590N77180
57,3034824,Heracleum mantegazzianum Somm. et Lev.,Apiaceae,Heracleum Mantegazzianum,Heracleum mantegazzianum,DSS00291000004ZR,2007-01-01,2007-12-31,2007-12-31,Plantae,...,75500,100m_x77400_y75500,77000,75000,1km_x77000_y75000,77480,75560,"POLYGON((77480 75560 , 77480 75570 , 77490 755...",01030000207908000001000000050000000000000080EA...,EPSG2169_GRID_10m_E77480N75560
58,3034824,Heracleum mantegazzianum Somm. et Lev.,Apiaceae,Heracleum Mantegazzianum,Heracleum mantegazzianum,DSS00291000004ZQ,2007-01-01,2007-12-31,2007-12-31,Plantae,...,75500,100m_x77500_y75500,77000,75000,1km_x77000_y75000,77500,75530,"POLYGON((77500 75530 , 77500 75540 , 77510 755...",010300002079080000010000000500000000000000C0EB...,EPSG2169_GRID_10m_E77500N75530


### Load data cube from tif files (output = xds_merged)

In [4]:
tif_dir = os.environ.get("HOME") +"/s3/data/d012_luxembourg/"
variable_list= [
        'air_temperature_2017_month_mean_10m_b12.tif',
        'dem_2019_10m_b1.tif',
        'dem_aspect_2019_10m_b1.tif',
        'dem_slope_2019_10m_b1.tif',
        'dem_surface_model_2019_10m_b1.tif',
        'hrl_treecover_2018_10m_b1.tif',
        'pH_CaCl_10m_b1.tif',
        'shadow_2019_10m_b1.tif',
        'soil_nitrat_10m_b1.tif',
        'twi_2019_10m_b1.tif'
]
temperature_file = "air_temperature_2017_month_mean_10m_b12.tif"
tif_files = [os.path.join(tif_dir, f) for f in variable_list if f.endswith('.tif')]
datasets = {}

for tif_file in tif_files:
    xds = rioxarray.open_rasterio(tif_file, cache=False, chunks=True, lock=False)
    if temperature_file in tif_file:
        # Select band 6 (index 5)
        band_6 = xds.isel(band=5)
        datasets[tif_file] = band_6
    else:
         datasets[tif_file] = xds

# Step 2: Rename DataArrays in the merged dataset
for name, ds in datasets.items():
    if ds.name is None:
        variable_name = os.path.basename(name).split('.')[0]
        datasets[name] = ds.rename(variable_name)

xds_merged = xr.merge(datasets.values())

### Associate data frame to the cubes (where 'species exist')

In [5]:
x_coords_da = xr.DataArray(x_coords)
y_coords_da = xr.DataArray(y_coords)

In [6]:
nearest_habitat_values = xds_merged.sel(
    x=x_coords_da,
    y=y_coords_da,
    method="nearest"
)

# Convert to DataFrame and merge with occurrence data
nearest_habitat_df = nearest_habitat_values.to_dataframe().reset_index()
nearest_habitat_df[species] = True

print(len(nearest_habitat_df))

126


In [7]:
nearest_habitat_df.head()

Unnamed: 0,dim_0,band,x,y,spatial_ref,air_temperature_2017_month_mean_10m_b12,dem_2019_10m_b1,dem_aspect_2019_10m_b1,dem_slope_2019_10m_b1,dem_surface_model_2019_10m_b1,hrl_treecover_2018_10m_b1,pH_CaCl_10m_b1,shadow_2019_10m_b1,soil_nitrat_10m_b1,twi_2019_10m_b1,Heracleum Mantegazzianum
0,0,1,77055.0,74895.0,0,289.8992,253.0,167.471191,12.97935,273.108307,62.0,5.251987,0.70918,2.030321,0.198561,True
1,1,1,77375.0,75795.0,0,290.497772,236.0,74.744881,8.111279,253.421173,54.0,5.278076,0.713783,2.269577,0.163474,True
2,2,1,77595.0,77185.0,0,289.913239,241.0,310.914398,13.934706,242.149597,0.0,5.457896,0.460301,2.061956,0.188581,True
3,3,1,77485.0,75565.0,0,290.693359,241.0,225.0,4.044692,242.928757,0.0,5.24215,0.29381,2.336421,0.199506,True
4,4,1,77505.0,75535.0,0,290.693359,241.0,225.0,5.051152,244.383347,0.0,5.245652,0.389986,2.336421,0.330567,True


### Generate background (pseudo-absence) data 

Select data from the SQL table (with species different than the selected one + different from any location in the species data)

In [8]:
# SQL query to select points where species does NOT occur but share the same grid coordinates
query_non_occ = f"""
SELECT *
FROM luxembourg_species.neophytes_geometry
WHERE species_name != '{species}'
  AND (gridnum2169_10m_x, gridnum2169_10m_y) NOT IN (
    SELECT gridnum2169_10m_x, gridnum2169_10m_y
    FROM luxembourg_species.neophytes_geometry
    WHERE species_name = '{species}'
  );
"""

# Fetch the non-occurrence data into a Pandas DataFrame
non_occ_df_all = pd.read_sql(query_non_occ, engine_postgresql)
len(non_occ_df_all)

2746

In [9]:
#Specify number of background points 
nb_background = len(nearest_habitat_df)*3
non_occ_df = non_occ_df_all.sample(n=nb_background)

x_non_occ_coords = non_occ_df['gridnum2169_10m_x'].values
y_non_occ_coords = non_occ_df['gridnum2169_10m_y'].values


x_selected = x_non_occ_coords
y_selected = y_non_occ_coords

x_selected_da = xr.DataArray(x_selected)
y_selected_da = xr.DataArray(y_selected)

# Step 3: Extract habitat values for the selected non-occurrence coordinates
non_occ_habitat_values = xds_merged.sel(
    x=x_selected_da,
    y=y_selected_da,
    method="nearest"
)

# Step 4: Convert the non-occurrence habitat data to a DataFrame
non_occ_habitat_df = non_occ_habitat_values.to_dataframe().reset_index()

# Step 5: Mark these samples as "False" for species presence
non_occ_habitat_df[species] = False


print(len(non_occ_habitat_df))
#non_occ_habitat_df.to_csv('background_' + species + '.csv', index=False)

378


In [10]:
non_occ_habitat_df.head()

Unnamed: 0,dim_0,band,x,y,spatial_ref,air_temperature_2017_month_mean_10m_b12,dem_2019_10m_b1,dem_aspect_2019_10m_b1,dem_slope_2019_10m_b1,dem_surface_model_2019_10m_b1,hrl_treecover_2018_10m_b1,pH_CaCl_10m_b1,shadow_2019_10m_b1,soil_nitrat_10m_b1,twi_2019_10m_b1,Heracleum Mantegazzianum
0,0,1,77005.0,78525.0,0,289.512939,253.0,78.690063,14.302914,261.669189,68.0,5.462172,0.754171,2.369296,0.259701,False
1,1,1,75925.0,76895.0,0,289.728851,287.0,345.465546,19.221525,285.865387,61.0,5.300293,0.814873,2.433146,0.154411,False
2,2,1,75935.0,74045.0,0,291.422089,275.0,135.0,3.035724,273.817535,0.0,5.819727,0.08878,2.208288,0.227748,False
3,3,1,77705.0,77555.0,0,290.436005,235.0,85.601295,9.256681,248.299973,44.0,5.62371,0.663398,2.118402,0.216308,False
4,4,1,77845.0,75665.0,0,290.969055,305.0,225.0,3.035724,305.472443,0.0,5.332519,0.337024,2.24423,0.182143,False


## Machine Learning for Modeling species distribution 

### Preparation of the data

In [11]:
# Rename columns for consistency
background_data = non_occ_habitat_df.rename(columns={'x': 'longitude', 'y': 'latitude'})
presence_data = nearest_habitat_df.rename(columns={'x': 'longitude', 'y': 'latitude'})
background_data = background_data.replace(-9999, np.nan)
presence_data = presence_data.replace(-9999, np.nan)

# Drop rows with NaN values
background_data = background_data.dropna()
presence_data = presence_data.dropna()

# Combine presence and background data
presence_labels = np.ones(len(presence_data))  # 1 for presence
background_labels = np.zeros(len(background_data))  # 0 for background

# Combine the data into one dataset
combined_data = pd.concat([presence_data, background_data], ignore_index=True)
labels = np.concatenate([presence_labels, background_labels])

# Select environmental variables (excluding species and coordinates)
features = combined_data.drop(columns=[species, 'longitude', 'latitude'])


coords = combined_data[['longitude', 'latitude']].to_numpy()


# Create a transformer to convert EPSG:2169 to EPSG:4326
transformer = Transformer.from_crs("EPSG:2169", "EPSG:4326", always_xy=True)

# Transform coordinates (EPSG:2169 to EPSG:4326)
lon, lat = transformer.transform(coords[:, 0], coords[:, 1])
coords_transformed = np.column_stack([lat, lon])  # Reprojected coordinates

# Filter transformed coordinates within Luxembourg's bounding box
lux_mask = (
    (coords_transformed[:, 0] >= 49.4) & (coords_transformed[:, 0] <= 50.2) &
    (coords_transformed[:, 1] >= 5.7) & (coords_transformed[:, 1] <= 6.5)
)

coords_lux = coords_transformed[lux_mask]

### 1- XGboost (considers Background data as real absence!) --> not recommended!

In [12]:
# Step 1: Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Step 2: Initialize and train the XGBoost regression model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')
xgb_model.fit(X_train, y_train)

# Step 3: Make predictions on the test set
y_pred_XG = xgb_model.predict(X_test)

# Step 4: Evaluate the model using Mean Squared Error (MSE) and R-squared (R2)
mse = mean_squared_error(y_test, y_pred_XG)
r2 = r2_score(y_test, y_pred_XG)
auc_XG = roc_auc_score(y_test, y_pred_XG)
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-Squared (R2): {r2}")
print(f"AUC: {auc_XG}")

Mean Squared Error (MSE): 0.05066502766259542
R-Squared (R2): 0.7562623269430043
AUC: 0.9749466950959489


In [13]:
# Plot
lux_center = [49.8153, 6.1296]  # Approximate center of Luxembourg
m = folium.Map(location=lux_center, zoom_start=10, tiles="CartoDB.Positron")  # Light background map

# Define a colormap for XGBoost predictions
colormap = linear.viridis.scale(min(y_pred_XG), max(y_pred_XG))
colormap.caption = "Predicted Suitability (XGBoost)"

# Add XGBoost predictions as points
for lat, lon, suitability in zip(coords_lux[:, 0], coords_lux[:, 1], y_pred_XG):
    folium.CircleMarker(
        location=[lat, lon],
        radius=5,  # Adjust point size as needed
        color=colormap(suitability),  # Color based on suitability
        fill=True,
        fill_color=colormap(suitability),
        fill_opacity=0.8,
        popup=f"Predicted Suitability: {suitability:.2f}"  # Display suitability on click
    ).add_to(m)

# Add colormap legend to the map
colormap.add_to(m)

# Save the map
#m.save("luxembourg_xgboost_points_map.html")
m


### 2- Maxent (Elapid)

2.1 Split the data into training and testing (positive or negative)

In [14]:
# Step 2: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)


# Step 3: Train the Maxent model
maxent = elapid.MaxentModel()
maxent.fit(X_train, y_train)

# Step 4: Make predictions
y_pred_MX1 = maxent.predict(X_test)

# Step 5: Evaluate the model
auc_score = roc_auc_score(y_test, y_pred_MX1)
print(f"Maxent AUC Score: {auc_score}")


Maxent AUC Score: 0.939498933901919


In [15]:
# Center the map on Luxembourg
lux_center = [49.8153, 6.1296]  # Approximate center of Luxembourg
m = folium.Map(location=lux_center, zoom_start=10, tiles="CartoDB.Positron")  # White/light background map

# Define a colormap for suitability
colormap = linear.viridis.scale(min(y_pred_MX1), max(y_pred_MX1))
colormap.caption = "Predicted Suitability"

# Add points to the map
for lat, lon, suitability in zip(coords_lux[:, 0], coords_lux[:, 1], y_pred_MX1):
    folium.CircleMarker(
        location=[lat, lon],
        radius=5,  # Size of the points
        color=colormap(suitability),  # Color determined by suitability
        fill=True,
        fill_color=colormap(suitability),
        fill_opacity=0.8,
        popup=f"Suitability: {suitability:.2f}"  # Display suitability on click
    ).add_to(m)

# Add colormap legend to the map
colormap.add_to(m)

# Save the map
#m.save("luxembourg_maxent_points_map.html")
m


#### 2.1- Train on all data

In [16]:
maxent = elapid.MaxentModel()
maxent.fit(features, labels)

# Step 4: Make predictions
pred_prob = maxent.predict(features)

# Step 5: Evaluate the model
auc_score = roc_auc_score(labels, pred_prob)
print(f"Maxent AUC Score: {auc_score}")


Maxent AUC Score: 0.9844672780525279


In [17]:
# Center the map on Luxembourg
lux_center = [49.8153, 6.1296]  # Approximate center of Luxembourg
m = folium.Map(location=lux_center, zoom_start=10, tiles="CartoDB.Positron")  # White/light background map

# Define a colormap for suitability
colormap = linear.viridis.scale(min(pred_prob), max(pred_prob))
colormap.caption = "Predicted Suitability"

# Add points to the map
for lat, lon, suitability in zip(coords_lux[:, 0], coords_lux[:, 1], pred_prob):
    folium.CircleMarker(
        location=[lat, lon],
        radius=5,  # Size of the points
        color=colormap(suitability),  # Color determined by suitability
        fill=True,
        fill_color=colormap(suitability),
        fill_opacity=0.8,
        popup=f"Suitability: {suitability:.2f}"  # Display suitability on click
    ).add_to(m)

# Add colormap legend to the map
colormap.add_to(m)

# Save the map
#m.save("luxembourg_maxent_points_map.html")
m


#### 2.3 Maxent that trains on: Part of the positives + all background data and tests on: the second part of positives + all background data

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np

# Step 1: Split presence data into training and testing sets
presence_indices = np.where(labels == 1)[0]  # Indices for presence points

# Split presence data
presence_train_idx, presence_test_idx = train_test_split(
    presence_indices, test_size=0.2, random_state=42
)

# Combine presence training data with all background data for training
train_indices = np.concatenate([presence_train_idx, np.where(labels == 0)[0]])
X_train = features.iloc[train_indices]
y_train = labels[train_indices]

# Testing data: Combine test presence points with all background points
test_indices = np.concatenate([presence_test_idx, np.where(labels == 0)[0]])
X_test = features.iloc[test_indices]
y_test = np.concatenate([np.ones(len(presence_test_idx)), np.zeros(np.where(labels == 0)[0].shape[0])])

# Train the Maxent model
maxent = elapid.MaxentModel()
maxent.fit(X_train, y_train)

# Predict suitability scores for the test set
y_pred_prob = maxent.predict(X_test)

# Evaluate using AUC
auc_score = roc_auc_score(y_test, y_pred_prob)
print(f"Maxent AUC Score: {auc_score}")


Maxent AUC Score: 0.9395656279508972


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)


In [19]:
# Center the map on Luxembourg
lux_center = [49.8153, 6.1296]  # Approximate center of Luxembourg
m = folium.Map(location=lux_center, zoom_start=10, tiles="CartoDB.Positron")  # White/light background map

# Define a colormap for suitability
colormap = linear.viridis.scale(min(y_pred_prob), max(y_pred_prob))
colormap.caption = "Predicted Suitability"

# Add points to the map
for lat, lon, suitability in zip(coords_lux[:, 0], coords_lux[:, 1], y_pred_prob):
    folium.CircleMarker(
        location=[lat, lon],
        radius=5,  # Size of the points
        color=colormap(suitability),  # Color determined by suitability
        fill=True,
        fill_color=colormap(suitability),
        fill_opacity=0.8,
        popup=f"Suitability: {suitability:.2f}"  # Display suitability on click
    ).add_to(m)

# Add colormap legend to the map
colormap.add_to(m)

# Save the map
#m.save("luxembourg_maxent_points_map.html")
m
