In [1]:
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt

# This notebook combines our Agri4Cast + SoilGrids grid data with the cy_bench yield data

### This code uses the haversine distance, which gives the great-circle distance between two points on a sphere (in our case, the Earth)

In [2]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Haversine distance in kilometres, using an Earth radius of 6371 km.
    """
    # Convert to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Mathematically a <= 1, but floating-point rounding can push it a hair
    # above 1 for (near-)antipodal points, which would make asin() raise
    # "math domain error". Clamp before taking the inverse sine.
    a = min(1.0, a)
    c = 2 * asin(sqrt(a))
    r = 6371  # Radius of earth in kilometers
    return c * r

In [3]:
def merge_nearest_location_haversine(
    target_df, source_df, source_lat='lat', source_lon='lon',
    target_lat='lat', target_lon='lon', columns_to_add=None
):
    """Attach to each target row the columns of its nearest source row.

    "Nearest" is measured with the haversine (great-circle) distance, using
    an Earth radius of 6371 km.

    Parameters
    ----------
    target_df : pandas.DataFrame
        Rows that receive the extra columns. It is not modified.
    source_df : pandas.DataFrame
        Rows that are searched for the nearest location.
    source_lat, source_lon : str
        Names of the latitude / longitude columns (decimal degrees) in
        ``source_df``.
    target_lat, target_lon : str
        Names of the latitude / longitude columns (decimal degrees) in
        ``target_df``.
    columns_to_add : list of str, optional
        Columns copied from the nearest source row. Defaults to every
        ``source_df`` column except the two source coordinate columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``target_df`` with ``columns_to_add`` and a
        ``nearest_distance_km`` column appended (existing columns with the
        same name are overwritten). Ties go to the first source row. Target
        rows for which no distance can be computed (e.g. NaN coordinates)
        get missing values in all added columns.

    Raises
    ------
    ValueError
        If ``target_df`` has rows but ``source_df`` is empty.
    """
    earth_radius_km = 6371

    if columns_to_add is None:
        columns_to_add = [
            col for col in source_df.columns
            if col not in (source_lat, source_lon)
        ]

    result_df = target_df.copy()
    n_targets = len(target_df)

    if n_targets and len(source_df) == 0:
        raise ValueError(
            "source_df is empty: there is no location to match against"
        )

    # Loop-invariant work: convert the source coordinates once, not once per
    # target row.
    src_lat = np.radians(source_df[source_lat].to_numpy(dtype=float))
    src_lon = np.radians(source_df[source_lon].to_numpy(dtype=float))
    cos_src_lat = np.cos(src_lat)

    tgt_lat = np.radians(target_df[target_lat].to_numpy(dtype=float))
    tgt_lon = np.radians(target_df[target_lon].to_numpy(dtype=float))

    # Everything is tracked by *position*, so duplicate or non-default index
    # labels in either frame cannot mix rows up. -1 means "no match".
    nearest_pos = np.full(n_targets, -1, dtype=np.intp)
    nearest_km = np.full(n_targets, np.nan)

    # One target row at a time keeps memory at O(len(source_df)) while the
    # distance to every source row is computed in a single NumPy pass.
    for i in range(n_targets):
        a = (
            np.sin((src_lat - tgt_lat[i]) / 2) ** 2
            + np.cos(tgt_lat[i]) * cos_src_lat
            * np.sin((src_lon - tgt_lon[i]) / 2) ** 2
        )
        # Rounding can push `a` marginally above 1 for antipodal points.
        distances = 2 * earth_radius_km * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))

        if np.isnan(distances).all():
            # Nothing comparable (NaN coordinates): leave this row unmatched.
            continue

        pos = np.nanargmin(distances)  # ignores NaN, first minimum wins
        nearest_pos[i] = pos
        nearest_km[i] = distances[pos]

    for col in columns_to_add:
        # Reindexing a positionally indexed copy maps -1 to a missing value
        # and keeps the source dtype whenever every row found a match.
        result_df[col] = (
            source_df[col].reset_index(drop=True).reindex(nearest_pos).to_numpy()
        )
    result_df['nearest_distance_km'] = nearest_km

    return result_df

### Paths to the SoilGrids and Agri4Cast data and to the yield data with centroids

In [4]:
# df_data: combined SoilGrids + Agri4Cast table; df_yield: yield table with centroids.
df_data, df_yield = (
    pd.read_csv('combined data/soil_grids_puls_agricast.csv'),
    pd.read_csv('combined data/yield_data.csv'),
)

### Combine them. This can take 20 minutes; if you don't want to run it, the result is already provided in "combined data/soilgrids_agri4cast_yield.csv"

In [None]:
# Give every row of df_data the yield columns of its nearest df_yield location,
# then write the merged table to disk.
df_data = merge_nearest_location_haversine(
    df_data,
    df_yield,
    columns_to_add=['yield', 'harvest_area', 'production', 'adm_id'],
)
df_data.to_csv(path_or_buf='combined data/soilgrids_agri4cast_yield.csv', index=False)