In [33]:
# --- imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [34]:
from pathlib import Path
from xgboost.callback import EarlyStopping
import xgboost as xgb

In [35]:
import seaborn as sns
import matplotlib.pyplot as plt
import torch

In [36]:
from sklearn.utils.class_weight import compute_class_weight

In [37]:
# ============================
# Custom transformers
# ============================
class SBERTEncoder(BaseEstimator, TransformerMixin):
    """
    Encodes a single text column into dense embeddings using a SentenceTransformer.
    Produces a 2D array [n_samples, emb_dim].

    Parameters
    ----------
    model_name : str
        Model identifier handed to ``SentenceTransformer``.
    device : str or None
        Device handed to ``SentenceTransformer``. When None, ``fit`` selects
        "cuda" if ``torch.cuda.is_available()`` is true and "cpu" otherwise.
    batch_size : int
        Forwarded to ``encode`` as ``batch_size``.
    normalize : bool
        Forwarded to ``encode`` as ``normalize_embeddings``.
    """
    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2",
                 device=None, batch_size=64, normalize=True):
        self.model_name = model_name
        self.device = device
        self.batch_size = batch_size
        self.normalize = normalize
        # Populated by fit(); None means "not fitted yet".
        self._model = None

    def fit(self, X, y=None):
        """Instantiate the SentenceTransformer. ``X`` and ``y`` are not used."""
        dev = self.device
        if dev is None:
            # Use CUDA if available; any failure while probing falls back to CPU
            # (deliberately best-effort).
            try:
                import torch
                dev = "cuda" if torch.cuda.is_available() else "cpu"
            except Exception:
                dev = "cpu"
        self._model = SentenceTransformer(self.model_name, device=dev)
        return self

    @staticmethod
    def _as_text_list(X):
        """Coerce a Series / single-column DataFrame / array / list to a list of str."""
        if isinstance(X, pd.DataFrame):
            # If a DataFrame arrives (single column), flatten it
            column = X.iloc[:, 0]
        elif isinstance(X, pd.Series):
            column = X
        else:
            # pd.Series rejects 2D input, so flatten an (n, 1) array first.
            if isinstance(X, np.ndarray) and X.ndim == 2 and X.shape[1] == 1:
                X = X[:, 0]
            column = pd.Series(X)
        # Missing values become empty strings; everything else is cast to str.
        return column.fillna("").astype(str).tolist()

    def transform(self, X):
        """
        Embed the texts in ``X``.

        Returns the array produced by ``SentenceTransformer.encode`` with
        ``convert_to_numpy=True``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``. This exception subclasses both
            ``ValueError`` and ``AttributeError``.
        """
        if getattr(self, "_model", None) is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This SBERTEncoder instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )
        texts = self._as_text_list(X)
        emb = self._model.encode(
            texts,
            batch_size=self.batch_size,
            convert_to_numpy=True,
            normalize_embeddings=self.normalize
        )
        return emb


class GeoDateEncoder(BaseEstimator, TransformerMixin):
    """
    Encodes latitude, longitude, and a date column into numeric features.

    Feature groups are emitted in this order when enabled:
    - ``include_geo``: raw lat, raw lon, sin/cos(lat), sin/cos(lon)   (6 columns)
    - ``include_dow``: day-of-week, cyclical sin/cos                  (2 columns)
    - ``include_doy``: day-of-year, cyclical sin/cos                  (2 columns)

    With the default flags only the day-of-year pair is produced, i.e. the
    output has shape [n_samples, 2].

    Parameters
    ----------
    lat_col, lon_col, date_col : str
        Names of the latitude, longitude and date columns in the input frame.
    include_geo : bool, default False
        Emit the latitude/longitude features. Values that cannot be parsed as
        numbers are coerced to NaN and propagate to the output.
    include_dow : bool, default False
        Emit the day-of-week sin/cos pair.
    include_doy : bool, default True
        Emit the day-of-year sin/cos pair.
    """
    def __init__(self, lat_col="latitude", lon_col="longitude", date_col="date",
                 include_geo=False, include_dow=False, include_doy=True):
        self.lat_col = lat_col
        self.lon_col = lon_col
        self.date_col = date_col
        self.include_geo = include_geo
        self.include_dow = include_dow
        self.include_doy = include_doy

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """
        Build the enabled feature columns for ``X``.

        ``X`` is a DataFrame containing the configured columns, or an array-like
        with exactly three columns in (lat, lon, date) order. Returns a 2D numpy
        array with one row per input row; if every feature group is disabled the
        result has shape [n_samples, 0].
        """
        # Expect a pandas DataFrame with the three columns
        if not isinstance(X, pd.DataFrame):
            # If we get a numpy array, rebuild a DataFrame with known col order
            X = pd.DataFrame(X, columns=[self.lat_col, self.lon_col, self.date_col])

        features = []

        if self.include_geo:
            lat = pd.to_numeric(X[self.lat_col], errors="coerce").astype(float)
            lon = pd.to_numeric(X[self.lon_col], errors="coerce").astype(float)
            lat_rad = np.radians(lat)
            lon_rad = np.radians(lon)
            features.extend([
                lat.values, lon.values,
                np.sin(lat_rad), np.cos(lat_rad),
                np.sin(lon_rad), np.cos(lon_rad),
            ])

        if self.include_dow or self.include_doy:
            # Unparseable dates become NaT and are imputed below.
            dt = pd.to_datetime(X[self.date_col], errors="coerce")

            if self.include_dow:
                # Missing dates are imputed as day 0.
                dayofweek = dt.dt.dayofweek.fillna(0).astype(int)       # 0..6
                features.extend([
                    np.sin(2 * np.pi * dayofweek / 7.0),
                    np.cos(2 * np.pi * dayofweek / 7.0),
                ])

            if self.include_doy:
                # Missing dates are imputed as day 1.
                dayofyear = dt.dt.dayofyear.fillna(1).astype(int)       # 1..366
                features.extend([
                    np.sin(2 * np.pi * dayofyear / 365.25),
                    np.cos(2 * np.pi * dayofyear / 365.25),
                ])

        if not features:
            # np.column_stack([]) raises; return an explicit zero-width block.
            return np.empty((len(X), 0), dtype=float)

        return np.column_stack(features)

In [38]:
# Dataset locations on disk
folder = Path("/user/enjomgue/home/Thesis/Summer_school/DTU")
img_folder = folder.joinpath("FungiImages")
meta_folder = folder.joinpath("Metadata")
meta_file = "metadata_2025-08-14_01.csv"

# Read the metadata CSV and show its (rows, columns) shape
metadata_path = meta_folder.joinpath(meta_file)
full_meta_data = pd.read_csv(metadata_path)
full_meta_data.shape

(36015, 7)

In [39]:
# Column names of the metadata features and of the target, then a preview
# of the first rows of the full table.
meta_features = [
    "Habitat",
    "Latitude",
    "Longitude",
    "Substrate",
    "eventDate",
]
target_feature = "taxonID_index"
full_meta_data.head()

Unnamed: 0,filename_index,Habitat,Latitude,Longitude,Substrate,eventDate,taxonID_index
0,fungi_final000000.jpg,Mixed woodland (with coniferous and deciduous ...,54.994548,11.958746,soil,2018-09-01,
1,fungi_final000001.jpg,Unmanaged deciduous woodland,54.67314,11.46406,soil,2013-07-14,
2,fungi_final000002.jpg,Forest bog,56.12362,9.49204,leaf or needle litter,2011-05-29,
3,fungi_final000003.jpg,coniferous woodland/plantation,55.69515,12.3386,soil,2013-11-06,
4,fungi_final000004.jpg,Deciduous woodland,56.148944,9.961819,soil,2020-10-12,


In [40]:
# Training rows are the ones whose image filename starts with 'fungi_train'
is_train_row = full_meta_data['filename_index'].str.startswith('fungi_train')
train_meta_data = full_meta_data[is_train_row]

# Report the fraction of the full table that is training data, and its shape
train_fraction = len(train_meta_data) / len(full_meta_data)
print(f"Train data shape {train_fraction:.2f}:  {train_meta_data.shape}")

Train data shape 0.72:  (25863, 7)


In [41]:
# Keep only training rows where the text and date features are all present.
# Latitude/Longitude are not part of the filter, so they may still be NaN.
required_cols = ["Habitat", "Substrate", "eventDate"]
df_train_clean = train_meta_data.dropna(subset=required_cols)
print("Training sample size: ", df_train_clean.shape)
df_train_clean.head()

Training sample size:  (14070, 7)


Unnamed: 0,filename_index,Habitat,Latitude,Longitude,Substrate,eventDate,taxonID_index
10153,fungi_train000001.jpg,Forest bog,,,wood,2013-09-10,174.0
10154,fungi_train000002.jpg,park/churchyard,55.644964,12.047071,soil,2018-09-08,50.0
10156,fungi_train000004.jpg,Deciduous woodland,,,wood,2013-09-14,30.0
10158,fungi_train000006.jpg,Mixed woodland (with coniferous and deciduous ...,,,leaf or needle litter,2017-09-27,70.0
10161,fungi_train000009.jpg,coniferous woodland/plantation,,,soil,2018-09-22,180.0


In [42]:
# ============================
# Feature / target selection
# ============================
# Single seed shared by both splits and the classifier.
RANDOM_STATE = 42

target_col = "taxonID_index"

# These two lists are the single source of truth for the feature columns:
# they select the columns of X AND drive the ColumnTransformer below, so
# editing them keeps both in sync.
text_cols = ["Habitat", "Substrate"]
# Order matters: (latitude, longitude, date).
geo_date_cols = ["Latitude", "Longitude", "eventDate"]

X = df_train_clean[text_cols + geo_date_cols]
y = df_train_clean[target_col]

# stratified split to keep class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# ============================
# Build the preprocessing
# ============================
# One SBERT encoder per text column. The column is given as a scalar name so
# the encoder receives a 1D Series; the transformer is named after the column.
text_transformers = [(col, SBERTEncoder(), col) for col in text_cols]

lat_name, lon_name, date_name = geo_date_cols
geo_date_transformer = (
    "geo_date",
    GeoDateEncoder(lat_col=lat_name, lon_col=lon_name, date_col=date_name),
    geo_date_cols,
)

preprocess = ColumnTransformer(
    transformers=text_transformers + [geo_date_transformer],
    remainder="drop",
    verbose_feature_names_out=False,
)

# ============================
# Training: (uses transformed arrays)
# ============================
# Fit/transform once
# Z (train) and Zt (test) are the numeric arrays produced by the encoders
Z = preprocess.fit_transform(X_train)
Zt = preprocess.transform(X_test)

clf_es = XGBClassifier(
    n_estimators=5000,          # upper bound; early stopping decides the real count
    max_depth=6,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    tree_method="hist",
    random_state=RANDOM_STATE,
    n_jobs=-1,
    eval_metric="mlogloss",
    # Stop after 100 rounds without improvement on the eval set and keep the best model.
    callbacks=[xgb.callback.EarlyStopping(rounds=100, save_best=True)],
)

# Make a validation split (carved out of the training part only) for early stopping
Z_tr, Z_val, y_tr, y_val = train_test_split(
    Z, y_train, test_size=0.2, stratify=y_train, random_state=RANDOM_STATE
)

clf_es.fit(
    Z_tr, y_tr,
    eval_set=[(Z_val, y_val)],
    verbose=False
)

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,[<xgboost.call...x7f91db0442d0>]
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [43]:
# Predict on the held-out test set and print the per-class report.
# zero_division=0 reports 0.0 for classes that are never predicted without
# emitting a warning per metric, consistent with the report built in the
# following cell.
pred_es = clf_es.predict(Zt)
print("\nWith early stopping:")
print(classification_report(y_test, pred_es, zero_division=0))


With early stopping:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         2
         1.0       1.00      0.33      0.50         3
         2.0       0.00      0.00      0.00         2
         3.0       0.00      0.00      0.00         2
         4.0       0.00      0.00      0.00         2
         5.0       0.00      0.00      0.00         2
         6.0       0.00      0.00      0.00         3
         7.0       0.11      0.07      0.09        27
         8.0       0.44      0.31      0.36        13
         9.0       0.00      0.00      0.00         2
        10.0       0.00      0.00      0.00         2
        11.0       0.37      0.33      0.35        33
        12.0       0.20      0.18      0.19        28
        13.0       0.00      0.00      0.00         2
        14.0       0.00      0.00      0.00         4
        15.0       0.25      0.30      0.27        70
        16.0       0.00      0.00      0.00         5
     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [44]:
# Classification report as a dict, then as a table with one row per label
# (each class + 'accuracy' + 'macro avg' + 'weighted avg').
rep = classification_report(y_test, pred_es, output_dict=True, zero_division=0)
metric_cols = ["precision", "recall", "f1-score", "support"]
df = pd.DataFrame(rep).T.loc[:, metric_cols]

# Per-class rows only (summary rows removed), best F1 first
summary_rows = ["accuracy", "macro avg", "weighted avg"]
classes_df = (
    df.drop(index=summary_rows, errors="ignore")
      .sort_values("f1-score", ascending=False)
      .round(3)
)

# Top classes by F1, followed by the averaged metrics
print(classes_df.head(10))
print("\nMacro/Weighted averages:")
avg_rows = ["macro avg", "weighted avg"]
print(df.loc[avg_rows].round(3))

       precision  recall  f1-score  support
158.0      0.600   1.000     0.750      9.0
30.0       0.800   0.667     0.727     12.0
151.0      0.714   0.714     0.714      7.0
177.0      0.667   0.750     0.706      8.0
32.0       0.588   0.800     0.678     25.0
62.0       1.000   0.500     0.667      2.0
41.0       0.625   0.714     0.667      7.0
149.0      0.750   0.600     0.667      5.0
128.0      0.714   0.625     0.667      8.0
90.0       0.600   0.667     0.632      9.0

Macro/Weighted averages:
              precision  recall  f1-score  support
macro avg         0.187   0.186     0.177   2814.0
weighted avg      0.233   0.243     0.230   2814.0


In [45]:
# 
# Bottom of the F1 ranking: the ten lowest-scoring classes, followed by the
# same averaged metrics for reference.
worst_classes = classes_df.tail(10)
print(worst_classes)
print("\nMacro/Weighted averages:")
averaged_metrics = df.loc[["macro avg", "weighted avg"]].round(3)
print(averaged_metrics)

       precision  recall  f1-score  support
153.0        0.0     0.0       0.0      5.0
155.0        0.0     0.0       0.0      6.0
152.0        0.0     0.0       0.0      2.0
167.0        0.0     0.0       0.0      7.0
168.0        0.0     0.0       0.0      2.0
169.0        0.0     0.0       0.0      9.0
172.0        0.0     0.0       0.0      2.0
175.0        0.0     0.0       0.0      6.0
178.0        0.0     0.0       0.0      6.0
180.0        0.0     0.0       0.0     11.0

Macro/Weighted averages:
              precision  recall  f1-score  support
macro avg         0.187   0.186     0.177   2814.0
weighted avg      0.233   0.243     0.230   2814.0
