# pipeline to train CatBoost classification models for usage in openEO inference pipelines
This pipeline consists of two steps. <br> <br>
First, the CatBoost model training in an offline pipeline documented here: https://confluence.vito.be/x/LhN1Ew <br>
This approach covers the training data preparation for the hierarchical classification, in which models for levels 1–3 of the EUNIS2012 typology are created. For each model, a best-feature selection and a training data outlier detection are carried out before the 5-fold cross-validated model training is done. <br> <br>
Second, the models are converted to the ONNX file format and saved on the CreoDIAS S3 storage for usage in the openEO inference pipelines. Important: the metadata of each hierarchical ONNX model stores the selected feature bands for the model, the band encoder and additional information.

### Catboost models
The offline generated CatBoost models are saved in the following location: \\netapp03.vgt.vito.be\habitat\slovakia\openEO_tests\alpha-1\2_model_training <br>
The models_v1.json provides an overview of the generated models and the storage locations.

In [None]:
import os
import sys
import json
import boto3
from botocore.exceptions import NoCredentialsError

sys.path.append(os.path.abspath('C:/Git_projects/eo_processing/src'))
from eo_processing.utils.onnx_model_utilities import convert_catboost_model_to_onnx_with_metadata

# Path to the model overview JSON on the netapp share, normalised for the current OS
model_overview = os.path.normpath(r'\\netapp03.vgt.vito.be\habitat\slovakia\openEO_tests\alpha-1\2_model_training\models_v1.json')
# Bare expression: echoes the resolved path as the cell output
model_overview

'\\\\netapp03.vgt.vito.be\\habitat\\slovakia\\openEO_tests\\alpha-1\\2_model_training\\models_v1.json'

In [None]:
# Each model consists of a CBM file (the CatBoost model), an encoder file mapping raster values to clear-text names, and the list of selected features (predictors).

### ONNX model generation incl metadata

In [4]:
# filled by Hans
import os
import json
from catboost import CatBoostClassifier
import onnx

def load_input_features(json_path: str) -> list:
    """Load the list of input features from a JSON file.

    Args:
        json_path: Path to a JSON file whose top-level value is a list.

    Returns:
        The decoded list, exactly as stored in the file.

    Raises:
        FileNotFoundError: If ``json_path`` does not exist.
        ValueError: If the top-level JSON value is not a list. Malformed
            JSON surfaces as ``json.JSONDecodeError`` (a ``ValueError``
            subclass) raised by ``json.load``.
    """
    if not os.path.exists(json_path):
        raise FileNotFoundError(f"JSON file not found at: {json_path}")

    # JSON is UTF-8 by specification; being explicit keeps decoding independent
    # of the platform's locale encoding (e.g. cp1252 on Windows).
    with open(json_path, 'r', encoding='utf-8') as f:
        input_features = json.load(f)

    if not isinstance(input_features, list):
        raise ValueError(f"Expected a list of input features in the JSON file, but got {type(input_features)}.")

    return input_features

# Helper that reads the class labels stored on a loaded CatBoost model
def get_output_features_from_catboost(model: CatBoostClassifier):
    """Return the model's class labels as a list of strings.

    Args:
        model: Object exposing a ``classes_`` attribute.

    Returns:
        One ``str`` per entry of ``model.classes_``, in the same order.

    Raises:
        ValueError: If ``model`` has no ``classes_`` attribute or it is ``None``.
    """
    class_labels = getattr(model, 'classes_', None)
    if class_labels is None:
        raise ValueError("CatBoost model does not have class information or is not a classification model.")

    # Labels may be numeric; downstream metadata expects plain strings
    return [str(label) for label in class_labels]

def convert_models_from_json(models_json_path, output_dir):
    """
    Convert CatBoost models to ONNX format based on a JSON file containing model paths and metadata.

    The JSON file must hold an object mapping each model name to the directory
    of that model. A model directory is expected to contain ``catboost_v1.cbm``
    and, optionally, ``predictors.json`` (the list of input features written
    into the ONNX metadata). Each model is written to
    ``<output_dir>/<model_name>.onnx``.

    Models are processed independently: a model whose ONNX file already exists
    or whose CBM file is missing is skipped, and any error while preparing or
    converting a single model is printed without aborting the remaining models.

    Args:
        models_json_path (str): Path to the JSON file containing model information.
        output_dir (str): Directory where ONNX models will be saved.

    Raises:
        FileNotFoundError: If ``models_json_path`` does not exist.
        ValueError: If the top-level JSON value is not an object.
    """
    if not os.path.exists(models_json_path):
        raise FileNotFoundError(f"Models JSON file not found: {models_json_path}")

    # Load model paths from JSON (explicit UTF-8: independent of the OS locale)
    with open(models_json_path, 'r', encoding='utf-8') as f:
        models_info = json.load(f)

    # Fail with a clear message instead of an AttributeError on `.items()`
    if not isinstance(models_info, dict):
        raise ValueError(
            f"Expected a JSON object mapping model names to model directories, but got {type(models_info)}."
        )

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    for model_name, model_path in models_info.items():
        # Normalize model path for the current operating system
        model_path = os.path.normpath(model_path)

        # Build the ONNX file path
        onnx_path = os.path.join(output_dir, f"{model_name}.onnx")

        # Decide whether to skip *before* touching the model directory, so that
        # skipped models cause no extra file reads and no misleading warnings.
        if os.path.exists(onnx_path):
            print(f"ONNX model already exists: {onnx_path}. Skipping.")
            continue

        cbm_path = os.path.join(model_path, "catboost_v1.cbm")
        if not os.path.exists(cbm_path):
            print(f"Error: CatBoost model not found at {cbm_path}. Skipping.")
            continue

        # Path to predictors.json (if exists)
        predictors_json_path = os.path.join(model_path, "predictors.json")

        try:
            # Reading the predictors happens inside the try block so that a
            # malformed predictors.json only fails this model, not the batch.
            input_features = []
            if os.path.exists(predictors_json_path):
                input_features = load_input_features(predictors_json_path)
            else:
                print(f"Warning: 'predictors.json' not found in {model_path}. Metadata will be empty.")

            # Fetch the output features from the CatBoost model
            catboost_model = CatBoostClassifier()
            catboost_model.load_model(cbm_path, format="cbm")
            output_features = get_output_features_from_catboost(catboost_model)

            convert_catboost_model_to_onnx_with_metadata(
                catboost_model_path=cbm_path,
                input_features=input_features,
                output_features=output_features,
                output_onnx_path=onnx_path
            )
            print(f"Successfully converted: {onnx_path}")
        except Exception as e:
            # Deliberate best-effort: report and carry on with the next model
            print(f"Error converting {cbm_path}: {e}")


# TODO: use the netapp UNC path directly instead of the H: drive path
# NOTE(review): H: presumably maps to the \\netapp03.vgt.vito.be\habitat share - confirm
models_json = "H:/slovakia/openEO_tests/alpha-1/2_model_training/updated_models_v1.json"
output_directory = "H:/slovakia/openEO_tests/onnx_test"
# Convert every model listed in the JSON; ONNX files that already exist are skipped
convert_models_from_json(models_json, output_directory)


Successfully converted: H:/slovakia/openEO_tests/onnx_test\Level1_class-0_129predictors_v1.onnx
Successfully converted: H:/slovakia/openEO_tests/onnx_test\Level2_class-C_71predictors_v1.onnx
Successfully converted: H:/slovakia/openEO_tests/onnx_test\Level2_class-D_68predictors_v1.onnx
Successfully converted: H:/slovakia/openEO_tests/onnx_test\Level2_class-G_164predictors_v1.onnx
Successfully converted: H:/slovakia/openEO_tests/onnx_test\Level2_class-F_90predictors_v1.onnx
Successfully converted: H:/slovakia/openEO_tests/onnx_test\Level2_class-X_54predictors_v1.onnx
Successfully converted: H:/slovakia/openEO_tests/onnx_test\Level2_class-I_50predictors_v1.onnx
Successfully converted: H:/slovakia/openEO_tests/onnx_test\Level2_class-H_65predictors_v1.onnx
Successfully converted: H:/slovakia/openEO_tests/onnx_test\Level2_class-E_85predictors_v1.onnx
Successfully converted: H:/slovakia/openEO_tests/onnx_test\Level2_class-J_62predictors_v1.onnx
Successfully converted: H:/slovakia/openEO_tests

### transfer of ONNX model to S3 storage

In [None]:

def upload_files_to_s3(local_directory, s3_directory, bucket_name, s3_client):
    """Upload every regular file directly inside ``local_directory`` to S3.

    Sub-directories are not walked and are skipped. Each file is stored under
    the key ``<s3_directory>/<filename>``. A failed upload is reported on
    stdout and does not stop the remaining uploads.

    Args:
        local_directory: Local directory whose files are uploaded.
        s3_directory: Key prefix ("folder") inside the bucket; an empty string
            places the files at the bucket root.
        bucket_name: Name of the target bucket.
        s3_client: Client object providing
            ``upload_file(local_path, bucket, key)``.
    """
    # S3 keys always use '/', whatever the local path separator is. Build the
    # prefix once; dropping a trailing '/' avoids keys containing '//'.
    key_prefix = s3_directory.replace("\\", "/").rstrip("/")

    for filename in os.listdir(local_directory):
        local_file_path = os.path.join(local_directory, filename)

        # os.listdir also returns sub-directories, which upload_file cannot send
        if not os.path.isfile(local_file_path):
            continue

        # Ensure the file path includes the target directory within the bucket
        s3_file_path = f"{key_prefix}/{filename}" if key_prefix else filename

        try:
            print(f"Uploading {local_file_path} to s3://{bucket_name}/{s3_file_path}...")
            s3_client.upload_file(local_file_path, bucket_name, s3_file_path)
            print(f"Successfully uploaded {filename} to s3://{bucket_name}/{s3_file_path}")
        except FileNotFoundError:
            print(f"The file {local_file_path} was not found.")
        except NoCredentialsError:
            print("Credentials not available.")
        except Exception as e:
            # Best-effort batch upload: report the failure and continue
            print(f"An error occurred: {e}")


# Replace with your CloudFerro S3 endpoint and credentials
# ('xx' / 'yy' are placeholders - do not commit real keys to the notebook)
s3_endpoint = 'https://s3.waw3-1.cloudferro.com'
access_key = 'xx'
secret_key = 'yy'

# Target bucket on the S3 endpoint above
bucket_name = 'ecdc-waw3-1-ekqouvq3otv8hmw0njzuvo0g4dy0ys8r985n7dggjis3erkpn5o' 

# S3 client bound to the custom endpoint and the credentials given above
session = boto3.session.Session()
s3_client = session.client('s3',
                            endpoint_url=s3_endpoint,
                            aws_access_key_id=access_key,
                            aws_secret_access_key=secret_key)


# Specify your local directory containing the ONNX models and the target S3 directory
# (the local path is the output directory of the ONNX conversion step)
local_cog_directory = "H:/slovakia/openEO_tests/onnx_test"  # Adjust this path
s3_directory = 'models'  # Path in the S3 bucket

# Use the function with your setup: uploads the content of the local directory
upload_files_to_s3(
    local_directory=local_cog_directory, 
    s3_directory=s3_directory, 
    bucket_name=bucket_name, 
    s3_client=s3_client
)

Uploading H:/slovakia/openEO_tests/onnx_test\Level1_class-0_129predictors_v1.onnx to s3://ecdc-waw3-1-ekqouvq3otv8hmw0njzuvo0g4dy0ys8r985n7dggjis3erkpn5o/models/Level1_class-0_129predictors_v1.onnx...
Successfully uploaded Level1_class-0_129predictors_v1.onnx to s3://ecdc-waw3-1-ekqouvq3otv8hmw0njzuvo0g4dy0ys8r985n7dggjis3erkpn5o/models/Level1_class-0_129predictors_v1.onnx
Uploading H:/slovakia/openEO_tests/onnx_test\Level2_class-C_71predictors_v1.onnx to s3://ecdc-waw3-1-ekqouvq3otv8hmw0njzuvo0g4dy0ys8r985n7dggjis3erkpn5o/models/Level2_class-C_71predictors_v1.onnx...
Successfully uploaded Level2_class-C_71predictors_v1.onnx to s3://ecdc-waw3-1-ekqouvq3otv8hmw0njzuvo0g4dy0ys8r985n7dggjis3erkpn5o/models/Level2_class-C_71predictors_v1.onnx
Uploading H:/slovakia/openEO_tests/onnx_test\Level2_class-D_68predictors_v1.onnx to s3://ecdc-waw3-1-ekqouvq3otv8hmw0njzuvo0g4dy0ys8r985n7dggjis3erkpn5o/models/Level2_class-D_68predictors_v1.onnx...
Successfully uploaded Level2_class-D_68predictors_v1.