In [None]:
import pandas as pd
import geopandas as gpd
import os
from pathlib import Path

# This code is responsible for merging the cy-bench yield data with its centroid data as well as aggregating it

### The cy-bench data is not included here because it is too large; you can download it from https://zenodo.org/records/17279151

In [None]:
def _load_centroid_coords(centroid_file):
    """Read a centroid shapefile and return one ``adm_id``/``lat``/``lon`` row per region."""
    centroids = gpd.read_file(centroid_file)

    # The point coordinates are only latitude/longitude in a geographic CRS, so
    # reproject layers stored in a projected CRS to WGS84 first.
    if centroids.crs is not None and not centroids.crs.is_geographic:
        centroids = centroids.to_crs(epsg=4326)

    coords = pd.DataFrame({
        'adm_id': centroids['adm_id'],
        'lat': centroids.geometry.y,
        'lon': centroids.geometry.x,
    })
    # A repeated adm_id would multiply the yield rows in the left merge.
    return coords.drop_duplicates(subset='adm_id')


def process_all_countries(base_path='cybench-data', centroids_path='centroids',
                          crop_types=('wheat',)):
    """Aggregate cy-bench yield data per region and attach centroid coordinates.

    For every ``<base_path>/<crop>/<country>/yield_<crop>_<country>.csv`` the
    ``yield``, ``harvest_area`` and ``production`` columns are averaged over all
    records of each ``adm_id``. Coordinates are then left-joined from
    ``<centroids_path>/<country>/<country>.shp``.

    Parameters
    ----------
    base_path : str or Path
        Root directory of the cy-bench data (one sub-directory per crop).
    centroids_path : str or Path
        Root directory of the centroid shapefiles (one sub-directory per country).
    crop_types : str or iterable of str, default ('wheat',)
        Crops to process, e.g. ``('wheat', 'maize')``.

    Returns
    -------
    pandas.DataFrame
        Columns ``country_code, crop_type, adm_id, yield, lat, lon,
        harvest_area, production``. ``lat``/``lon`` are NaN for regions without
        a centroid. When nothing could be processed, an empty frame with these
        same columns is returned so downstream column access keeps working.

    Notes
    -----
    Processing is best-effort: a missing crop directory, yield file or centroid
    file, or an error while handling one country, is reported with ``print``
    and the remaining countries are still processed.
    """
    output_columns = ['country_code', 'crop_type', 'adm_id', 'yield', 'lat', 'lon',
                      'harvest_area', 'production']

    # Accept a single crop name as well as a collection of names.
    if isinstance(crop_types, str):
        crop_types = [crop_types]

    all_data = []

    for crop_type in crop_types:
        crop_path = Path(base_path) / crop_type

        if not crop_path.is_dir():
            print(f"Warning: {crop_path} not found, skipping {crop_type}")
            continue

        # iterdir() order depends on the filesystem; sort so that the row order
        # of the result is reproducible between runs and machines.
        for country_dir in sorted(crop_path.iterdir()):
            if not country_dir.is_dir():
                continue

            country_code = country_dir.name
            yield_file = country_dir / f"yield_{crop_type}_{country_code}.csv"

            if not yield_file.exists():
                print(f"Warning: {yield_file} not found, skipping")
                continue

            try:
                yield_df = pd.read_csv(yield_file)

                # One row per region: mean over every record of that adm_id.
                yield_agg = yield_df.groupby('adm_id').agg({
                    'yield': 'mean',
                    'harvest_area': 'mean',
                    'production': 'mean'
                }).reset_index()

                centroid_file = Path(centroids_path) / country_code / f"{country_code}.shp"

                if centroid_file.exists():
                    merged = yield_agg.merge(
                        _load_centroid_coords(centroid_file),
                        on='adm_id',
                        how='left'
                    )
                else:
                    print(f"Warning: Centroids not found for {country_code}")
                    # Float NaN (not None) keeps lat/lon numeric after concat.
                    merged = yield_agg.assign(lat=float('nan'), lon=float('nan'))

                merged['country_code'] = country_code
                merged['crop_type'] = crop_type

                all_data.append(merged)
                print(f"Processed {crop_type} - {country_code}: {len(merged)} regions")

            except Exception as e:
                # Deliberately broad: one malformed country must not abort the run.
                print(f"Error processing {crop_type}/{country_code}: {e}")
                continue

    if not all_data:
        # Keep the schema even when empty so callers can still index columns.
        return pd.DataFrame(columns=output_columns)

    final_df = pd.concat(all_data, ignore_index=True)
    return final_df[[c for c in output_columns if c in final_df.columns]]

### Pass the path to the main cy-bench data as `base_path` and the path to the centroid data as `centroids_path`

In [None]:
# Execute
# Location of the downloaded cy-bench data. Set the CYBENCH_DATA_DIR environment
# variable to point somewhere else without editing this cell.
base_path = os.environ.get('CYBENCH_DATA_DIR', '/Users/artur/Downloads/cybench-data')

# Built with Path so the separator is correct on every OS; the previous literal
# 'combined data\yield_data.csv' only worked as a directory path on Windows.
output_path = Path('combined data') / 'yield_data.csv'

df = process_all_countries(base_path=base_path, centroids_path='centroids')

# Display summary
print(f"\n{'='*60}")
print(f"Total regions: {len(df)}")

if df.empty:
    # An empty result may not carry the summary columns, so report and stop
    # instead of failing on df['country_code'] or writing an empty file.
    print(f"No yield data found under {base_path}; nothing written.")
else:
    print(f"Countries: {df['country_code'].nunique()}")
    print(f"Crops: {df['crop_type'].unique()}")
    print("\nSample data:")
    print(df.head(10))

    # to_csv does not create missing directories.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)

✓ Processed wheat - SK: 6 regions
✓ Processed wheat - SE: 17 regions
✓ Processed wheat - PL: 17 regions
✓ Processed wheat - US: 2458 regions
✓ Processed wheat - BE: 11 regions
✓ Processed wheat - CN: 30 regions
✓ Processed wheat - IN: 520 regions
✓ Processed wheat - EE: 5 regions
✓ Processed wheat - EL: 48 regions
✓ Processed wheat - LV: 5 regions
✓ Processed wheat - IT: 97 regions
✓ Processed wheat - CZ: 14 regions
✓ Processed wheat - RO: 40 regions
✓ Processed wheat - PT: 7 regions
✓ Processed wheat - AR: 380 regions
✓ Processed wheat - AU: 22 regions
✓ Processed wheat - HR: 2 regions
✓ Processed wheat - HU: 20 regions
✓ Processed wheat - NL: 12 regions
✓ Processed wheat - BG: 6 regions
✓ Processed wheat - AT: 9 regions
✓ Processed wheat - DE: 397 regions
✓ Processed wheat - DK: 11 regions
✓ Processed wheat - BR: 1579 regions
✓ Processed wheat - FI: 18 regions
✓ Processed wheat - FR: 95 regions
✓ Processed wheat - ES: 46 regions
✓ Processed wheat - IE: 3 regions
✓ Processed wheat - L