# Harmonized subnational crop statistics of the EU

## Data access

The data is available from the [Agri4Cast Data Portal](https://agri4cast.jrc.ec.europa.eu/DataPortal/RequestDataResource.aspx?idResource=36&o=&r=n) of the European Commission's Joint Research Centre. You need to create an account to access the data.

## Data exploration

### Columns and data types

In [53]:
import os

import pandas as pd

# Directory that holds the downloaded CSV file. Set the AGML_DATA_DIR environment
# variable to point somewhere else; without it the original local folder is used.
data_path = os.environ.get("AGML_DATA_DIR", "C:/Users/paude006/Documents/AgML/Data")

filename = "crop_statistics_EU_2023.csv"
crop_stats_df = pd.read_csv(data_path + "/" + filename,
                            delimiter=";",
                            # set all to str, some columns have mixed types (e.g. str and nan)
                            dtype="str",
                            header=0)
print("\n")
print(crop_stats_df.head(5).to_string())
# set YEAR and VALUE to numeric (everything was read as str above)
crop_stats_df = crop_stats_df.astype({"YEAR" : "int64", "VALUE" : "float64"})
print("\n")
# list the distinct values of every column from the 8th one onwards
for c in crop_stats_df.columns[7:]:
  print(c, crop_stats_df[c].unique())



  REGION    CROP_NAME  YEAR    VARIABLE     VALUE   UoM SOURCE CALCULATED_R CALCULATED_C CALCULATED_V ZERO_AS_NULL COHERENCE_APY COHERENCE_CROP
0   AT11  Total wheat  1975        Area   32797.0    ha    NSI          NaN          Yes          NaN          NaN           Yes            NaN
1   AT11  Total wheat  1975  Production  108564.0     t    NSI          NaN          Yes          NaN          NaN           Yes            NaN
2   AT11  Total wheat  1975       Yield      3.31  t/ha    NSI          NaN          Yes          NaN          NaN           Yes            NaN
3   AT11  Total wheat  1976        Area   37650.0    ha    NSI          NaN          Yes          NaN          NaN           Yes            NaN
4   AT11  Total wheat  1976  Production  151341.0     t    NSI          NaN          Yes          NaN          NaN           Yes            NaN


CALCULATED_R [nan 'Yes']
CALCULATED_C ['Yes' nan]
CALCULATED_V [nan 'Yes']
ZERO_AS_NULL [nan 'Yes']
COHERENCE_APY ['Yes' nan 'No']
C

### Select crops

In [55]:
# Distinct crop names in the frame as it currently stands.
# NOTE: crop_stats_df is overwritten a few lines below, so re-running this cell
# lists only the selected crops.
crops = crop_stats_df["CROP_NAME"].unique()
print("\n")
print("Crops", crops)

selected_crops = ["Soft wheat", "Grain maize"]
is_selected_crop = crop_stats_df["CROP_NAME"].isin(selected_crops)
crop_stats_df = crop_stats_df[is_selected_crop]

# Preview the first yield records of each selected crop.
is_yield_row = crop_stats_df["VARIABLE"] == "Yield"
for cr in selected_crops:
  print("\n")
  print("Crop:", cr)
  print("---------------------")
  is_crop_row = crop_stats_df["CROP_NAME"] == cr
  crop_yield_preview = crop_stats_df[is_yield_row & is_crop_row].head(5)
  print(crop_yield_preview.to_string())



Crops ['Soft wheat' 'Grain maize']


Crop: Soft wheat
---------------------
    REGION   CROP_NAME  YEAR VARIABLE  VALUE   UoM SOURCE CALCULATED_R CALCULATED_C CALCULATED_V ZERO_AS_NULL COHERENCE_APY COHERENCE_CROP
67    AT11  Soft wheat  1995    Yield  4.798  t/ha    NSI          NaN          NaN          Yes          NaN           Yes            Yes
76    AT11  Soft wheat  1996    Yield  4.770  t/ha    NSI          NaN          NaN          Yes          NaN           Yes            Yes
85    AT11  Soft wheat  1997    Yield  4.339  t/ha    NSI          NaN          NaN          Yes          NaN           Yes            Yes
94    AT11  Soft wheat  1998    Yield  4.563  t/ha    NSI          NaN          NaN          Yes          NaN           Yes            Yes
103   AT11  Soft wheat  1999    Yield  4.938  t/ha    NSI          NaN          NaN          Yes          NaN           Yes            Yes


Crop: Grain maize
---------------------
       REGION    CROP_NAME  YEAR VARIABLE  VAL

### Summary by crop and country

In [56]:
def getCropCountrySummary(crop, yield_df):
  """Summarise the records of one crop per country.

  The country code is the first two characters of REGION. Countries with at
  most one record are left out of the summary.

  Args:
    crop: label stored in the CROP field of every summary row.
    yield_df: DataFrame with (at least) the columns REGION and YEAR.

  Returns:
    A tuple (summary, column_names). summary maps "row0", "row1", ... to
    [crop, country, min year, max year, number of distinct years,
     number of records in the max year, number of records with a year].
    column_names holds the matching column headers.
  """
  column_names = ["CROP", "COUNTRY", "MIN_YEAR", "MAX_YEAR", "NUM_YEARS",
                  "NUM_REGIONS", "DATA_SIZE"]
  # compute the country code of every record once and reuse it as a mask source
  country_codes = yield_df["REGION"].str[:2]
  summary = {}
  for code in country_codes.unique():
    country_df = yield_df[country_codes == code]
    # a single record (or none) is not worth summarising
    if country_df.shape[0] < 2:
      continue

    years = country_df["YEAR"]
    first_year, last_year = years.min(), years.max()
    distinct_years = len(years.unique())
    # records (one per region is expected) reported in the most recent year
    regions_in_last_year = country_df.loc[years == last_year, "REGION"].count()
    record_count = years.count()
    # rows are numbered consecutively, skipped countries do not use up a number
    summary["row" + str(len(summary))] = [crop, code, first_year, last_year,
                                          distinct_years, regions_in_last_year,
                                          record_count]

  return summary, column_names

# Print one country-level summary table per selected crop.
for cr in selected_crops:
  print("\n")
  print("Crop:", cr)
  # yield records of this crop only
  crop_yield_df = crop_stats_df.loc[(crop_stats_df["CROP_NAME"] == cr) &
                                    (crop_stats_df["VARIABLE"] == "Yield")]
  countries_summary, column_names = getCropCountrySummary(cr, crop_yield_df)
  # one summary row per dictionary entry, keyed "row0", "row1", ...
  countries_summary_df = pd.DataFrame.from_dict(countries_summary,
                                                orient="index",
                                                columns=column_names)
  print(countries_summary_df.head(30).to_string())



Crop: Soft wheat
             CROP COUNTRY  MIN_YEAR  MAX_YEAR  NUM_YEARS  NUM_REGIONS  DATA_SIZE
row0   Soft wheat      AT      1995      2020         26            9        234
row1   Soft wheat      BE      1975      2020         43           10        438
row2   Soft wheat      BG      1995      2020         16            6         65
row3   Soft wheat      CY      2000      2020         21            1         21
row4   Soft wheat      CZ      1998      2020         23           14        322
row5   Soft wheat      DE      1999      2020         22          253       6973
row6   Soft wheat      DK      2006      2020         15           11        165
row7   Soft wheat      EE      2004      2020         17            5         85
row8   Soft wheat      EL      1998      2019         22           52       1047
row9   Soft wheat      ES      1998      2020         23           46       1108
row10  Soft wheat      FI      1998      2020         23           19        334
row11  So

## Data preparation

### Data cleaning

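Keep only yield records that satisfy all of the following (the last two checks are applied per crop and country):
* Yield not null
* Coherence tests passed: `COHERENCE_APY` and `COHERENCE_CROP` are both `Yes` (the names COHERENCE_BETWEEN_A_P_Y and COHERENCE_TOTAL_WHEAT do not appear in the 2023 file loaded above)
* Data size check (e.g. >= 100 records)
* Number of years (e.g. >= 15)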

In [57]:
# Data size requirements
min_num_years = 15
min_data_size = 100

yield_df = None
for cr in selected_crops:
  print("\n")
  print("Crop:", cr)
  crop_yield_df = crop_stats_df[(crop_stats_df["VARIABLE"] == "Yield") &
                                (crop_stats_df["CROP_NAME"] == cr)]
  crop_yield_df = crop_yield_df[(crop_yield_df["COHERENCE_APY"] == "Yes") &
                                (crop_yield_df["COHERENCE_CROP"] == "Yes")]
  crop_yield_df = crop_yield_df.dropna(subset=["VALUE"])

  countries = crop_yield_df["REGION"].str[:2].unique()
  for cn in countries:
    yield_cn_df = crop_yield_df[crop_yield_df["REGION"].str[:2] == cn]
    num_years = len(yield_cn_df["YEAR"].unique())
    data_size = yield_cn_df["YEAR"].count()
    if ((num_years >= min_num_years) &
        (data_size >= min_data_size)):
      if (yield_df is None):
        yield_df = yield_cn_df
      else:
        yield_df = pd.concat([yield_df, yield_cn_df], axis=0)

  countries_summary, column_names = getCropCountrySummary(cr, yield_df)
  countries_summary_df = pd.DataFrame.from_dict(countries_summary, columns=column_names,
                                                orient="index")
  print(countries_summary_df.head(30).to_string())



Crop: Soft wheat
             CROP COUNTRY  MIN_YEAR  MAX_YEAR  NUM_YEARS  NUM_REGIONS  DATA_SIZE
row0   Soft wheat      AT      1995      2020         26            9        171
row1   Soft wheat      BE      2000      2020         21           10        216
row2   Soft wheat      CZ      2000      2020         21           14        294
row3   Soft wheat      DK      2006      2020         15           11        165
row4   Soft wheat      EL      1998      2019         22           46        992
row5   Soft wheat      ES      1998      2020         23           45        721
row6   Soft wheat      FI      2000      2020         21           17        201
row7   Soft wheat      FR      1989      2020         32           93       2571
row8   Soft wheat      HU      2002      2020         19           18        336
row9   Soft wheat      IT      1995      2020         26           80       1833
row10  Soft wheat      LT      2000      2020         21           10        209
row11  So

### Save the data

In [58]:
# Keep only REGION, CROP_NAME, YEAR and the yield value (renamed to YIELD).
yield_df = yield_df.rename(columns={ "VALUE" : "YIELD"})
# errors="ignore": on a re-run the columns are already gone, which would
# otherwise raise a KeyError.
yield_df = yield_df.drop(columns=["VARIABLE", "UoM", "SOURCE",
                                  "CALCULATED_R", "CALCULATED_C", "CALCULATED_V",
                                  "ZERO_AS_NULL", "COHERENCE_APY", "COHERENCE_CROP"],
                         errors="ignore")
print(yield_df.head(5).to_string())
# The row index is written as well (first, unnamed column of the CSV).
yield_df.to_csv(data_path + "/" + "YIELD_EU.csv")

    REGION   CROP_NAME  YEAR  YIELD
67    AT11  Soft wheat  1995  4.798
76    AT11  Soft wheat  1996  4.770
85    AT11  Soft wheat  1997  4.339
94    AT11  Soft wheat  1998  4.563
103   AT11  Soft wheat  1999  4.938
