## Geometry data preparation

In [19]:
import geopandas as gpd 
import pandas as pd
import os
from shapely.geometry import Polygon


# convert 3D geometries to 2D
def convert_to_2d(geom):
    """Return ``geom`` with its Z coordinate dropped.

    Works for every shapely geometry type (Polygon, MultiPolygon, ...) and
    keeps interior rings, so holes/enclaves in a boundary are preserved.

    Parameters
    ----------
    geom : shapely geometry or None
        Geometry to flatten; ``None`` marks a missing geometry.

    Returns
    -------
    shapely geometry or None
        ``None`` for ``None`` input, the *same* object when ``geom.has_z`` is
        false, otherwise a new geometry of the same type with X/Y only.
    """
    if geom is None:
        return None
    if not geom.has_z:
        return geom  # nothing to strip: hand back the original object

    # Local import keeps this helper self-contained; it is only needed when a
    # 3D geometry is actually encountered.
    from shapely.ops import transform

    # transform() rebuilds the geometry from the mapped coordinates, so the
    # geometry type, multi-part structure and interior rings are retained.
    return transform(lambda x, y, z=None: (x, y), geom)

### load shapefiles

In [36]:
# Every source shapefile lives under this directory; edit this single line
# when the data is stored somewhere else.
shapefile_root = '/app/dev/AgML/location_encoding'

# One GeoDataFrame per country/region source. The variable names are used by
# the geometry export cell below.
argentina = gpd.read_file(os.path.join(shapefile_root, 'argentina/arg_adm_unhcr2017_shp/arg_admbnda_adm2_unhcr2017.shp'))
australia = gpd.read_file(os.path.join(shapefile_root, 'australia/Shapefiles/ABARES_regions_boundaries.shp'))
brazil = gpd.read_file(os.path.join(shapefile_root, 'brazil/bra_admbnda_adm2_ibge_2020.shp'))
china = gpd.read_file(os.path.join(shapefile_root, 'china/chn_admbnda_adm1_ocha_2020.shp'))
eu = gpd.read_file(os.path.join(shapefile_root, 'eu/NUTS_RG_03M_2016_4326.shp'))
india = gpd.read_file(os.path.join(shapefile_root, 'india/India_585districts_adm2.shp'))
mali = gpd.read_file(os.path.join(shapefile_root, 'mali/cmdt_boundaries/cmdt_boundary.shp'))
mexico = gpd.read_file(os.path.join(shapefile_root, 'mexico/INEGI Census/geometries/updated/00ent_edited.shp'))
us = gpd.read_file(os.path.join(shapefile_root, 'us/cb_2018_us_county_500k.shp'))

fewsnet = gpd.read_file(os.path.join(shapefile_root, 'fewsnet/adm_shapefile_AgML_v0.1.shp'))
# Country code = first two characters of adm_id. The vectorised .str accessor
# leaves missing adm_id values as NaN instead of raising on them.
fewsnet['country_iso'] = fewsnet['adm_id'].str[:2]

### create geometries: set `root_path` to the main cybench data path

- geometries are created from shapefile and attributed with adm_id
- choose crop to execute 

In [39]:
crop = 'wheat'  #maize, wheat
root_path = '/app/dev/AgML/cy_bench/cybench-data'
d = 'GeoJSON'  #geojson driver


# Sorted, de-duplicated list of the country folders found under either crop.
iso = sorted(set(os.listdir(os.path.join(root_path, 'maize')) +
                 os.listdir(os.path.join(root_path, 'wheat'))))

# Build the membership lookups once instead of on every loop iteration.
fewsnet_codes = set(fewsnet['country_iso'])
eu_codes = set(eu['CNTR_CODE'])


def _country_geometries(country):
    """Build the ``adm_id``/``geometry`` frame for one country code.

    The source GeoDataFrames are never modified: derived columns are added
    with ``assign`` on a copy. This keeps the cell safe to re-run (e.g. once
    per crop); in particular the Mexico ``adm_id`` hyphen is not inserted a
    second time on a re-run.

    Parameters
    ----------
    country : str
        Folder name taken from ``iso``.

    Returns
    -------
    GeoDataFrame or None
        Frame restricted to ``['adm_id', 'geometry']``, or ``None`` when no
        shapefile source matches ``country``.
    """
    if country == 'AR':
        gdf = argentina.assign(
            adm_id=argentina['ADM2_PCODE'],
            geometry=argentina['geometry'].apply(convert_to_2d),
        )

    elif country == 'AU':
        gdf = australia

    elif country == 'BR':
        # Declare the CRS without reprojecting; allow_override mirrors the
        # unconditional overwrite of the former `brazil.crs = ...` assignment.
        gdf = (brazil
               .set_crs("EPSG:4326", allow_override=True)
               .assign(adm_id=brazil['ADM2_PCODE']))

    elif country == 'CN':
        gdf = china.assign(adm_id=china['ADM1_PCODE'])

    elif country == 'IN':
        gdf = india

    elif country == 'ML':
        gdf = mali

    elif country == 'MX':
        # Insert a hyphen after the two-character prefix; missing ids stay missing.
        gdf = mexico.assign(
            adm_id=mexico['adm_id'].apply(
                lambda x: x[:2] + '-' + x[2:] if pd.notnull(x) else x))

    elif country == 'US':
        gdf = us.assign(
            adm_id='US-' + us['STATEFP'].astype(str) + '-' + us['COUNTYFP'].astype(str))

    elif country in fewsnet_codes:
        gdf = fewsnet[fewsnet['country_iso'] == country]

    elif country in eu_codes:
        in_country = eu[eu['CNTR_CODE'].str.startswith(country)]
        # Keep only the rows with the largest LEVL_CODE present for this country.
        highest_level = in_country['LEVL_CODE'].max()
        # assign() works on a copy, so no column is set on a filtered slice.
        gdf = (in_country[in_country['LEVL_CODE'] == highest_level]
               .assign(adm_id=lambda df: df['NUTS_ID']))

    else:
        return None

    return gdf[['adm_id', 'geometry']]


# iterate country and save geometries
for country in iso:
    country_path = os.path.join(root_path, crop, country)

    # `iso` is the union over both crops, so a country may have no folder for
    # the selected crop; report that clearly instead of failing inside to_file.
    if not os.path.isdir(country_path):
        print(f"{country} skipped: no '{crop}' folder at {country_path}")
        continue

    try:
        country_gdf = _country_geometries(country)
        if country_gdf is None:
            print(f"{country} skipped: no shapefile source configured")
            continue
        country_gdf.to_file(os.path.join(country_path, 'geometry.geojson'), driver=d)

    except Exception as e:
        # Best effort: one failing country must not stop the remaining exports.
        print(f"{country} skipped due to {e}")
        continue


## checklist

In [126]:
## checkers
# adm_id, country, geometry
# crs should be consistent e.g 4326
# missing shapefiles for independent DE
# geodataframe or pandas df
# fewsnet countries

# gdown https://drive.google.com/uc?id=1KzgECw0xac04Mbdkaq5uMNuTvT4sOlt2
# for eu, fewsnet, usa, the shapefiles are downloaded from google drive: agml activities/snyf/predictor data preparation/shapefiles
# currently only MALI from fewsnet is used

## 