# Harmonized subnational crop statistics of the EU

## Data access

The data is available from [Agri4Cast Data Portal](https://agri4cast.jrc.ec.europa.eu/DataPortal/RequestDataResource.aspx?idResource=36&o=&r=n) of the European Commission's Joint Research Centre. You need to create an account to access the data.

## Data exploration

### Columns and data types

In [2]:
import pandas as pd

data_path = "<path to data>"
# filename = "crop_statistics_EU_2021.csv"
# crop_stats_df = pd.read_csv(data_path + "/" + filename,
#                             delimiter=";",
#                             header=0)
# print(crop_stats_df.head(5).to_string())
# print("\n")

filename = "crop_statistics_EU_2023.csv"
crop_stats_df = pd.read_csv(data_path + "/" + filename,
                            delimiter=";",
                            # set all to str, some columns have mixed types (e.g. str and nan)
                            dtype="str",
                            header=0)
print("\n")
print(crop_stats_df.head(5).to_string())
# set YEAR and VALUE to numeric
crop_stats_df = crop_stats_df.astype({"YEAR" : "int64", "VALUE" : "float64"})
print("\n")
# print(crop_stats_df.dtypes)
for c in crop_stats_df.columns[7:]:
  print(c, crop_stats_df[c].unique())



  REGION    CROP_NAME  YEAR    VARIABLE     VALUE   UoM SOURCE CALCULATED_R CALCULATED_C CALCULATED_V ZERO_AS_NULL COHERENCE_APY COHERENCE_CROP
0   AT11  Total wheat  1975        Area   32797.0    ha    NSI          NaN          Yes          NaN          NaN           Yes            NaN
1   AT11  Total wheat  1975  Production  108564.0     t    NSI          NaN          Yes          NaN          NaN           Yes            NaN
2   AT11  Total wheat  1975       Yield      3.31  t/ha    NSI          NaN          Yes          NaN          NaN           Yes            NaN
3   AT11  Total wheat  1976        Area   37650.0    ha    NSI          NaN          Yes          NaN          NaN           Yes            NaN
4   AT11  Total wheat  1976  Production  151341.0     t    NSI          NaN          Yes          NaN          NaN           Yes            NaN


CALCULATED_R [nan 'Yes']
CALCULATED_C ['Yes' nan]
CALCULATED_V [nan 'Yes']
ZERO_AS_NULL [nan 'Yes']
COHERENCE_APY ['Yes' nan 'No']
C

### Select crops

In [3]:
crops = crop_stats_df["CROP_NAME"].unique()
print("\n")
print("Crops", crops)

selected_crops = ["Soft wheat", "Grain maize"]
crop_stats_df = crop_stats_df[crop_stats_df["CROP_NAME"].isin(selected_crops)]

for cr in selected_crops:
  print("\n")
  print("Crop:", cr)
  print("---------------------")
  # # crop area
  # print(crop_stats_df[(crop_stats_df["VARIABLE"] == "Area") &
  #                     (crop_stats_df["CROP_NAME"] == cr)].head(5).to_string())
  # print("\n")

  # crop yield
  print(crop_stats_df[(crop_stats_df["VARIABLE"] == "Yield") &
                      (crop_stats_df["CROP_NAME"] == cr)].head(5).to_string())



Crops ['Total wheat' 'Soft wheat' 'Durum wheat' 'Total barley' 'Winter barley'
 'Spring barley' 'Grain maize' 'Sunflower' 'Sugar beet']


Crop: Soft wheat
---------------------
    REGION   CROP_NAME  YEAR VARIABLE  VALUE   UoM SOURCE CALCULATED_R CALCULATED_C CALCULATED_V ZERO_AS_NULL COHERENCE_APY COHERENCE_CROP
67    AT11  Soft wheat  1995    Yield  4.798  t/ha    NSI          NaN          NaN          Yes          NaN           Yes            Yes
76    AT11  Soft wheat  1996    Yield  4.770  t/ha    NSI          NaN          NaN          Yes          NaN           Yes            Yes
85    AT11  Soft wheat  1997    Yield  4.339  t/ha    NSI          NaN          NaN          Yes          NaN           Yes            Yes
94    AT11  Soft wheat  1998    Yield  4.563  t/ha    NSI          NaN          NaN          Yes          NaN           Yes            Yes
103   AT11  Soft wheat  1999    Yield  4.938  t/ha    NSI          NaN          NaN          Yes          NaN           Yes   

### Summary by crop and country

In [13]:
def getCropCountrySummary(crop, yield_df, adm_id_col, year_col):
  countries_summary = {}
  countries = yield_df[adm_id_col].str[:2].unique()
  row_idx = 0
  column_names = ["crop_name", "country_code", "min_year", "max_year", "num_years",
                  "num_regions", "data_size"]
  for cn in countries:
    yield_cn_df = yield_df[yield_df[adm_id_col].str[:2] == cn]
    if (len(yield_cn_df.index) <= 1):
      continue

    min_year = yield_cn_df[year_col].min()
    max_year = yield_cn_df[year_col].max()
    num_years = len(yield_cn_df[year_col].unique())
    num_regions = yield_cn_df[yield_cn_df[year_col] == max_year][adm_id_col].count()
    data_size = yield_cn_df[year_col].count()
    countries_summary["row" + str(row_idx)] = [crop, cn, min_year, max_year, num_years,
                                              num_regions, data_size]
    row_idx += 1

  return countries_summary, column_names

#### Summary for Soft wheat

In [14]:
crop = "Soft wheat"
crop_yield_df = crop_stats_df[(crop_stats_df["VARIABLE"] == "Yield") &
                              (crop_stats_df["CROP_NAME"] == crop)]
countries_summary, column_names = getCropCountrySummary(crop, crop_yield_df, "REGION", "YEAR")
countries_summary_df = pd.DataFrame.from_dict(countries_summary, columns=column_names,
                                              orient="index")
print(countries_summary_df.head(30).to_string())

        crop_name country_code  min_year  max_year  num_years  num_regions  data_size
row0   Soft wheat           AT      1995      2020         26            9        234
row1   Soft wheat           BE      1975      2020         43           10        438
row2   Soft wheat           BG      1995      2020         16            6         65
row3   Soft wheat           CY      2000      2020         21            1         21
row4   Soft wheat           CZ      1998      2020         23           14        322
row5   Soft wheat           DE      1999      2020         22          253       6973
row6   Soft wheat           DK      2006      2020         15           11        165
row7   Soft wheat           EE      2004      2020         17            5         85
row8   Soft wheat           EL      1998      2019         22           52       1047
row9   Soft wheat           ES      1998      2020         23           46       1108
row10  Soft wheat           FI      1998      2020    

#### Summary for Grain maize

In [15]:
crop = "Grain maize"
crop_yield_df = crop_stats_df[(crop_stats_df["VARIABLE"] == "Yield") &
                              (crop_stats_df["CROP_NAME"] == crop)]
countries_summary, column_names = getCropCountrySummary(crop, crop_yield_df, "REGION", "YEAR")
countries_summary_df = pd.DataFrame.from_dict(countries_summary, columns=column_names,
                                              orient="index")
print(countries_summary_df.head(30).to_string())

         crop_name country_code  min_year  max_year  num_years  num_regions  data_size
row0   Grain maize           AT      1975      2020         46            9        414
row1   Grain maize           BE      2011      2020         10           10        106
row2   Grain maize           BG      1991      2020         30            6        180
row3   Grain maize           CY      2000      2020         21            1         21
row4   Grain maize           CZ      2005      2020         16           14        219
row5   Grain maize           DE      2010      2020         11           12        131
row6   Grain maize           DK      2011      2020         10           11        110
row7   Grain maize           EE      2000      2020         21            5        105
row8   Grain maize           EL      2009      2019         11           52        563
row9   Grain maize           ES      1998      2020         23           46       1157
row10  Grain maize           FI      2000  

## Data preparation

### Data cleaning

Filter based on
* Yield not null
* Coherence tests (e.g. COHERENCE_APY (yield = production/area), COHERENCE_CROP (values for subcategories of crops add to the total))
* Data size check (e.g. >= 100)
* Number of years (e.g. >= 15)

In [22]:
# Data size requirements
min_num_years = 15
min_data_size = 100

def cleanStats(df, crop_name):
  # filter out country-level stats (country code is 2 letters)
  df = df[df["REGION"].str.len() > 2]

  # filter out data that don't pass coherence tests
  # COHERENCE_CROP test only applies to wheat and barley.
  if (("wheat" in crop_name.lower()) or
      ("barley" in crop_name.lower())):
    df = df[(df["COHERENCE_APY"] == "Yes") &
            (df["COHERENCE_CROP"] == "Yes")]
  else:
    df = df[df["COHERENCE_APY"] == "Yes"]

  # filter out nan values
  return df.dropna(subset=["VALUE"])


variables = ["Yield", "Area", "Production"]
variables_rename = {
  "Yield" : "yield",
  "Area" : "harvest_area",
  "Production" : "production"
}

rename_cols = {
  "CROP_NAME" : "crop_name",
  "REGION" : "adm_id",
  "YEAR" : "harvest_year",
}

sel_cols = ["crop_name", "adm_id", "harvest_year", "production", "harvest_area", "yield"]
final_stats = None
for cr in selected_crops:
  print("\n")
  print("Crop:", cr)
  crop_stats = None
  for v in variables:
    var_stats = crop_stats_df[(crop_stats_df["VARIABLE"] == "Yield") &
                           (crop_stats_df["CROP_NAME"] == cr)].copy()
    var_stats = cleanStats(var_stats, cr)
    rename_cols["VALUE"] = variables_rename[v]
    var_stats = var_stats.rename(columns=rename_cols)
    var_stats = var_stats.drop(columns=[c for c in var_stats.columns if c not in sel_cols])
    if (crop_stats is None):
      crop_stats = var_stats
    else:
      crop_stats = crop_stats.merge(var_stats, on=["crop_name", "adm_id", "harvest_year"])

  if (final_stats is None):
    final_stats = crop_stats
  else:
    final_stats = pd.concat([final_stats, crop_stats], axis=0)

  print(crop_stats.head(10).to_string())
  # countries = crop_stats["adm_id"].str[:2].unique()
  # for cn in countries:
  #   cn_stats = crop_stats[crop_stats["adm_id"].str[:2] == cn]
  #   num_years = len(cn_stats["harvest_year"].unique())
  #   data_size = cn_stats["harvest_year"].count()
  #   if ((num_years >= min_num_years) &
  #       (data_size >= min_data_size)):
  #     crop_stats = crop_stats[crop_stats["adm_id"].str[:2] != cn]

  countries_summary, column_names = getCropCountrySummary(cr, crop_stats, "adm_id", "harvest_year")
  countries_summary_df = pd.DataFrame.from_dict(countries_summary, columns=column_names,
                                                orient="index")
  print(countries_summary_df.head(30).to_string())



Crop: Soft wheat
  adm_id   crop_name  harvest_year  yield  harvest_area  production
0   AT11  Soft wheat          1995  4.798         4.798       4.798
1   AT11  Soft wheat          1996  4.770         4.770       4.770
2   AT11  Soft wheat          1997  4.339         4.339       4.339
3   AT11  Soft wheat          1998  4.563         4.563       4.563
4   AT11  Soft wheat          1999  4.938         4.938       4.938
5   AT11  Soft wheat          2000  3.542         3.542       3.542
6   AT11  Soft wheat          2001  4.340         4.340       4.340
7   AT11  Soft wheat          2002  4.160         4.160       4.160
8   AT11  Soft wheat          2003  3.640         3.640       3.640
9   AT11  Soft wheat          2004  5.250         5.250       5.250
        crop_name country_code  min_year  max_year  num_years  num_regions  data_size
row0   Soft wheat           AT      1995      2020         26            9        171
row1   Soft wheat           BE      2000      2020         21

### Save the data

In [5]:
final_stats["source"] = "https://agri4cast.jrc.ec.europa.eu/DataPortal/"

print(final_stats.head(5).to_string())
# final_stats.to_csv(data_path + "/" + "YIELD_EU.csv", index=False)

  adm_id   crop_name  harvest_year  yield  harvest_area  production                                          source
0   AT11  Soft wheat          1995  4.798         4.798       4.798  https://agri4cast.jrc.ec.europa.eu/DataPortal/
1   AT11  Soft wheat          1996  4.770         4.770       4.770  https://agri4cast.jrc.ec.europa.eu/DataPortal/
2   AT11  Soft wheat          1997  4.339         4.339       4.339  https://agri4cast.jrc.ec.europa.eu/DataPortal/
3   AT11  Soft wheat          1998  4.563         4.563       4.563  https://agri4cast.jrc.ec.europa.eu/DataPortal/
4   AT11  Soft wheat          1999  4.938         4.938       4.938  https://agri4cast.jrc.ec.europa.eu/DataPortal/
