This notebook imports crop yield data for Germany from 1979 to 2021 at district level and transforms it into the standardized format used for sub-national yield forecasts within AgML.

Here you can find more information on the data set: https://www.openagrar.de/receive/openagrar_mods_00092044 and on the preparation of the data set: https://doi.org/10.1038/s41597-024-02951-8

Author: Rahel Laudien

Contact: laudien@pik-potsdam.de

Date: 21/02/2024

# Preparations

In [4]:
# Import libraries
import os

import pandas as pd  # Version 2.0.2
import numpy as np  # Version 1.25.0

# Directory that contains the yield data set 'Final_data.csv'
import_path = "your/path/"

# Directory the final dataset is written to
export_path = "your/path/"

# Crops to keep in the final dataset (rice is not available in the dataset).
# Crops available in the dataset:
#   'spring barley', 'winter barley', 'grain maize', 'silage maize', 'oats', 'potatoes',
#   'winter rape', 'rye', 'sugarbeet', 'triticale', 'winter wheat'
crops = ["winter wheat", "grain maize"]

# Original crop identifiers -> standard crop names suggested by AgML
crops_names = {
    "sb": "spring barley",
    "wb": "winter barley",
    "grain_maize": "grain maize",
    "silage_maize": "silage maize",
    "oats": "oats",
    "potat_tot": "potatoes",
    "wrape": "winter rape",
    "rye": "rye",
    "sugarbeet": "sugarbeet",
    "triticale": "triticale",
    "ww": "winter wheat",
}

# Original column names -> standard column names suggested by AgML
column_names = {
    "district_no": "adm_id",
    "year": "harvest_year",
    "area": "harvest_area",
}

# Column order of the final dataset
column_order = [
    "crop_name",
    "country_code",
    "adm_id",
    "season_name",
    "planting_year",
    "planting_date",
    "harvest_year",
    "harvest_date",
    "yield",
    "production",
    "planted_area",
    "harvest_area",
    "source",
]

# Country code written to every row
country_code = "DEU"

# Read data

In [5]:
# Read the long-format yield table into a pandas DataFrame.
# os.path.join adds a path separator only when import_path lacks one, so the
# directory can be given with or without a trailing slash.
data = pd.read_csv(os.path.join(import_path, "Final_data.csv"), sep=",")

data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'your/path/Final_data.csv'

# Reformat and pre-process the data

In [3]:
# Build the standardized table in a single chain:
#   1. pivot from long to wide (one row per district/year/crop, one column per measure)
#   2. rename columns and crop labels to the AgML conventions
#   3. sort by crop name, admin unit and year
#   4. keep only the crops of interest
#   5. add the country code and arrange the columns in the standard order
#      (columns listed in column_order but absent from the data are filled with NaN)
#   6. remove observations with missing yield values
data_formatted = (
    data
    .pivot(index=["district_no", "year", "var"], columns="measure", values="value")
    .reset_index()
    .rename(columns={"var": "crop_name"})
    .rename(columns=column_names)
    .replace({"crop_name": crops_names})
    .sort_values(by=["crop_name", "adm_id", "harvest_year"])
    .loc[lambda frame: frame["crop_name"].isin(crops)]
    .assign(country_code=country_code)
    .reindex(columns=column_order)
    .dropna(subset=["yield"])
)

data_formatted.head()

measure,crop_name,country_code,adm_id,season_name,planting_year,planting_date,harvest_year,harvest_date,yield,production,planted_area,harvest_area,source
4901,grain maize,DEU,1057,,,,1983,,5.29,,,7.0,
7548,grain maize,DEU,1062,,,,1979,,5.81,,,150.0,
7560,grain maize,DEU,1062,,,,1980,,5.74,,,,
7572,grain maize,DEU,1062,,,,1981,,6.32,,,,
7596,grain maize,DEU,1062,,,,1983,,5.29,,,50.0,


# Export the data

In [4]:
# Write the standardized table to CSV without the DataFrame index.
# os.path.join adds a path separator only when export_path lacks one, so no
# doubled "/" ends up in the output path.
data_formatted.to_csv(os.path.join(export_path, "DEU_crop_yields.csv"), index=False)