# Data Extractions from OpenEO

To run the extractions, you need an account in the [Copernicus Data Space Ecosystem (CDSE)](https://dataspace.copernicus.eu/).

In [None]:
!pip install git+https://github.com/WorldCereal/prometheo.git@scaleag_augmentations
!pip install git+https://github.com/ScaleAGData/scaleag-vito.git@prometheo-integration

In [None]:
from loguru import logger
import geopandas as gpd
from pathlib import Path
from scaleagdata_vito.openeo.extract_sample_scaleag import (
    generate_input_for_extractions,
    extract
)
from scaleagdata_vito.presto.presto_df import load_dataset

### Assess data correctness before launching the OpenEO jobs 
You can run some checks on your input file to make sure it is suitable for running the extractions successfully. In particular, it is important to check the validity of the geometries and, ideally, to have a column containing a unique ID for each sample.


In [None]:
def check_unique_id(df_path, unique_id):
    """Check that every row of a vector file has its own, non-missing ID.

    Parameters
    ----------
    df_path : str or Path
        Path of the file to load with ``gpd.read_file``.
    unique_id : str
        Name of the column expected to hold one distinct ID per row.

    Returns
    -------
    DataFrame or None
        The rows whose ID is duplicated or missing, or ``None`` when all
        rows carry a distinct ID.

    Raises
    ------
    KeyError
        If ``unique_id`` is not a column of the loaded file.
    """
    df = gpd.read_file(df_path)
    ids = df[unique_id]
    # Counting distinct values ignores missing IDs, which would flag the file
    # as non-unique without identifying the offending rows. An explicit mask
    # drives both the decision and the rows handed back to the caller.
    offending = ids.duplicated(keep=False) | ids.isna()
    if offending.any():
        logger.info("IDs are not unique!")
        return df[offending]
    logger.info("IDs are unique")
    return None


def check_valid_geometry(df_path, save_to=""):
    """Report the rows of a vector file whose geometry is not valid.

    Parameters
    ----------
    df_path : str or Path
        Path of the file to load with ``gpd.read_file``.
    save_to : str or Path, optional
        Folder in which to store the invalid rows as
        ``<input stem>_invalid.geojson``. The folder is created if needed.
        When empty (the default) nothing is written.

    Returns
    -------
    GeoDataFrame or None
        The rows with invalid geometries, or ``None`` when all geometries
        are valid.
    """
    df = gpd.read_file(df_path)
    # Validity checks can be costly on large files, so evaluate them once and
    # reuse the mask for both the test and the selection.
    valid_mask = df.geometry.is_valid
    if valid_mask.all():
        logger.info("All geometries are valid")
        return None

    logger.info("Invalid geometries found! Returning invalid geometries")
    df_invalid = df[~valid_mask]
    if save_to:
        out_dir = Path(save_to)
        filename = out_dir / f"{Path(df_path).stem}_invalid.geojson"
        logger.info(f"Saving invalid geometries to {filename}")
        out_dir.mkdir(parents=True, exist_ok=True)
        # Name the driver explicitly instead of relying on it being inferred
        # from the file suffix, so the output really is GeoJSON.
        df_invalid.to_file(filename, driver="GeoJSON")
    return df_invalid

In [None]:
# File whose geometries and sample IDs are checked before any job is launched.
input_file = "/projects/TAP/HEScaleAgData/data/GEOMaize/Maize_Yield_Polygon_North_Ghana/Polygon_North/Maize_2021_valid.geojson"

# An empty `save_to` means the invalid geometries are returned but not saved.
invalid_geom = check_valid_geometry(df_path=input_file, save_to="")
# "Field_ID" is the column expected to identify each sample.
non_unique_ids = check_unique_id(df_path=input_file, unique_id="Field_ID")

#### Provide job instructions and start OpenEO extractions

1) Indicate the following fields in order to guide the extraction
2) In the cell below you will be asked for authentication and be provided with a link. Click on the link and log in with your CDSE credentials.  
3) Once the extraction process is over, you will find your extracted dataset in the output folder you indicated. You can load it by running `load_dataset` as shown below

    ```python
    job_params = dict(
        output_folder=..., # where to save the extracted dataset
        input_df=..., # input georeferenced dataset to run the extractions for 
        start_date=..., # string indicating from which date to extract data  
        end_date=..., # string indicating until which date to extract the data 
        unique_id_column=..., # name of the column in the input_df containing the unique ID of the samples  
        composite_window=..., # "month" or "dekad" are supported. Default is "dekad"
    )

    ```

In [None]:
# Settings that drive the extraction.
job_params = {
    # Folder where the extracted dataset is saved.
    "output_folder": "/home/giorgia/Private/data/geomaize/Maize_yield_2021/",
    # Georeferenced dataset to run the extractions for.
    "input_df": "/home/giorgia/Private/data/geomaize/Maize_2021.shp",
    # First and last date of the period to extract.
    "start_date": "2021-07-01",
    "end_date": "2021-10-31",
    # Column of the input dataset holding the unique sample IDs.
    "unique_id_column": "Field_ID",
    # Compositing window: "month" or "dekad".
    "composite_window": "dekad",
}

# Build the extraction inputs from the settings and launch the extraction.
extract(
    generate_input_for_extractions(job_params)
)

In [None]:
# Load the extracted dataset from the output folder given in `job_params`.
dataset_df = load_dataset(job_params["output_folder"])