# Crop statistics from National Agricultural Statistics Service (NASS)

## Data access

The data is available from [Quickstats](https://quickstats.nass.usda.gov/) of the US Department of Agriculture, National Agricultural Statistics Service (NASS). You need to create an account and request an API key to access the data. After you have an API key, you can use the `nass_stats.R` script to download data.

## Data exploration

### Columns and data types

In [1]:
import pandas as pd

# Folder containing the downloaded NASS CSV files.
data_path = "C:/Users/paude006/Documents/git-repos/AgML-crop-yield-forecasting/data/data_US"

# The data were downloaded into 2 CSV files because NASS limits the number of
# entries in a query result.
# NOTE: The downloaded data includes years 1995-2023.
# NOTE(review): the country summaries printed further down report min_year 1990;
# confirm which year range the CSV files actually cover.
# We started from 1995 for two reasons:
# 1. Remote sensing data starts from 2000.
# 2. Data from 1995 to 1999 is useful in case we use yield trend (trend window 5).
# Yield statistics from earlier years can be downloaded if necessary by changing
# the years in nass_stats.R. Requesting too many years at once will hit the NASS
# limit; in that case run the script several times with different year ranges.
filename1 = "nass_stats1.csv"
filename2 = "nass_stats2.csv"

# Read every column as str: some columns have mixed types (e.g. str and nan).
csv_options = {"delimiter": ",", "dtype": "str", "header": 0}
crop_stats_df1 = pd.read_csv(data_path + "/" + filename1, **csv_options)
crop_stats_df2 = pd.read_csv(data_path + "/" + filename2, **csv_options)

# Stack the two parts row-wise; each part keeps its own row labels.
crop_stats_df = pd.concat([crop_stats_df1, crop_stats_df2], axis=0)

# Year and Yield become numeric; every other column stays str.
numeric_dtypes = {"Year": "int64", "Yield": "float64"}
crop_stats_df = crop_stats_df.astype(numeric_dtypes)
print("\n")
# Region identifier of the form "US-<statefp>-<countyfp>".
crop_stats_df["adm_id"] = "US-" + crop_stats_df["statefp"] + "-" + crop_stats_df["countyfp"]
print(crop_stats_df.head(5).to_string())



   Year    State   County statefp countyfp         Crop   Area  Yield Production     adm_id
0  2010  ALABAMA  COLBERT      01      033  corn_grain   20800  142.8    2971000  US-01-033
1  2009  ALABAMA  COLBERT      01      033  corn_grain   16400  126.0    2064000  US-01-033
2  2007  ALABAMA  COLBERT      01      033  corn_grain   18300   85.0    1550000  US-01-033
3  2006  ALABAMA  COLBERT      01      033  corn_grain   11400   90.0    1030000  US-01-033
4  2005  ALABAMA  COLBERT      01      033  corn_grain   10000  147.0    1472000  US-01-033


### Select crops

In [2]:
# Show which crop labels are present before filtering.
crops = crop_stats_df["Crop"].unique()
print("\n")
print("Crops", crops)

# The printed unique values show that the "Crop" labels end with a space,
# so the literals below keep that trailing space on purpose.
selected_crops = ["corn_grain ", "wheat_winter "]
is_selected_crop = crop_stats_df["Crop"].isin(selected_crops)
crop_stats_df = crop_stats_df.loc[is_selected_crop]

# Preview the first rows of every selected crop.
for cr in selected_crops:
  rows_for_crop = crop_stats_df.loc[crop_stats_df["Crop"] == cr]
  print("\n")
  print("Crop:", cr)
  print("---------------------")
  print(rows_for_crop.head(5).to_string())



Crops ['corn_grain ' 'wheat_winter ']


Crop: corn_grain 
---------------------
   Year    State   County statefp countyfp         Crop   Area  Yield Production     adm_id
0  2010  ALABAMA  COLBERT      01      033  corn_grain   20800  142.8    2971000  US-01-033
1  2009  ALABAMA  COLBERT      01      033  corn_grain   16400  126.0    2064000  US-01-033
2  2007  ALABAMA  COLBERT      01      033  corn_grain   18300   85.0    1550000  US-01-033
3  2006  ALABAMA  COLBERT      01      033  corn_grain   11400   90.0    1030000  US-01-033
4  2005  ALABAMA  COLBERT      01      033  corn_grain   10000  147.0    1472000  US-01-033


Crop: wheat_winter 
---------------------
       Year    State   County statefp countyfp           Crop  Area  Yield Production     adm_id
41549  2003  ALABAMA  COLBERT      01      033  wheat_winter   3300   41.0     135000  US-01-033
41550  2000  ALABAMA  COLBERT      01      033  wheat_winter    500   54.0      27000  US-01-033
41551  1999  ALABAMA  COLBERT  

### Summary by crop and country

In [3]:
def getCropCountrySummary(crop, yield_df, adm_id_col, year_col):
  """Summarize the data coverage of one crop per country.

  The country code is the first two characters of the values in
  ``adm_id_col``. Countries with at most one row are left out.

  Args:
    crop: crop name, copied into every summary row.
    yield_df: DataFrame with the yield records to summarize.
    adm_id_col: name of the column holding the region identifiers.
    year_col: name of the column holding the years.

  Returns:
    A tuple ``(countries_summary, column_names)``. ``countries_summary`` maps
    keys "row0", "row1", ... to lists with the crop, country code, first year,
    last year, number of distinct years, number of region entries in the last
    year and number of year entries. ``column_names`` holds the matching
    column labels in the same order.
  """
  column_names = ["crop_name", "country_code", "min_year", "max_year", "num_years",
                  "num_regions", "data_size"]
  country_codes = yield_df[adm_id_col].str[:2]
  countries_summary = {}
  for code in country_codes.unique():
    country_df = yield_df[country_codes == code]
    # A single row (or none) is not worth summarizing.
    if country_df.shape[0] <= 1:
      continue

    years = country_df[year_col]
    first_year, last_year = years.min(), years.max()
    # Regions are counted only among the rows of the most recent year.
    latest_rows = country_df[years == last_year]
    summary_row = [
      crop,
      code,
      first_year,
      last_year,
      len(years.unique()),
      latest_rows[adm_id_col].count(),
      years.count(),
    ]
    # Keys are numbered consecutively over the countries that were kept.
    countries_summary["row" + str(len(countries_summary))] = summary_row

  return countries_summary, column_names

#### Summary for winter wheat

In [4]:
# Crop label as stored in the data (including its trailing space).
crop = "wheat_winter "
crop_yield_df = crop_stats_df.loc[crop_stats_df["Crop"] == crop]
countries_summary, column_names = getCropCountrySummary(
    crop, crop_yield_df, adm_id_col="adm_id", year_col="Year"
)
# One summary row per country, labelled by the "row<i>" keys.
countries_summary_df = pd.DataFrame.from_dict(
    countries_summary, orient="index", columns=column_names
)
print(countries_summary_df.head(30).to_string())

          crop_name country_code  min_year  max_year  num_years  num_regions  data_size
row0  wheat_winter            US      1990      2023         34          826      49602


#### Summary for grain maize

In [5]:
# Crop label as stored in the data (including its trailing space).
crop = "corn_grain "
crop_yield_df = crop_stats_df.loc[crop_stats_df["Crop"] == crop]
countries_summary, column_names = getCropCountrySummary(
    crop, crop_yield_df, adm_id_col="adm_id", year_col="Year"
)
# One summary row per country, labelled by the "row<i>" keys.
countries_summary_df = pd.DataFrame.from_dict(
    countries_summary, orient="index", columns=column_names
)
print(countries_summary_df.head(30).to_string())

        crop_name country_code  min_year  max_year  num_years  num_regions  data_size
row0  corn_grain            US      1990      2023         34         1461      61703


## Data preparation

### Data cleaning

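Filter based on
* Yield not null (note: the code below drops every row that has a missing value in any of the selected columns, not only in yield)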

In [6]:
variables = ["Yield", "Area", "Production"]

# Source column name -> output column name.
rename_cols = {
  "Yield" : "yield",
  "Area" : "harvest_area",
  "Production" : "production",
  "Crop" : "crop_name",
  "Year" : "harvest_year",
}

# Columns (after renaming) that are kept in the cleaned data.
sel_cols = ["crop_name", "adm_id", "harvest_year", "production", "harvest_area", "yield"]
per_crop_frames = []

for cr in selected_crops:
  print("\n")
  print("Crop:", cr)
  renamed = crop_stats_df.loc[crop_stats_df["Crop"] == cr].rename(columns=rename_cols)
  # Keep the selected columns in the order they already have in the data.
  kept_cols = [col for col in renamed.columns if col in sel_cols]
  # NOTE: this removes rows with a missing value in ANY kept column,
  # not only rows with a missing yield.
  crop_stats = renamed[kept_cols].dropna(axis=0, how="any")
  per_crop_frames.append(crop_stats)

  print(crop_stats.head(10).to_string())

# Combine the cleaned per-crop frames row-wise (None when no crop was selected).
final_stats = pd.concat(per_crop_frames, axis=0) if per_crop_frames else None



Crop: corn_grain 
   harvest_year    crop_name harvest_area  yield production     adm_id
0          2010  corn_grain         20800  142.8    2971000  US-01-033
1          2009  corn_grain         16400  126.0    2064000  US-01-033
2          2007  corn_grain         18300   85.0    1550000  US-01-033
3          2006  corn_grain         11400   90.0    1030000  US-01-033
4          2005  corn_grain         10000  147.0    1472000  US-01-033
5          2004  corn_grain          9200  138.0    1271000  US-01-033
6          2003  corn_grain          7800  161.0    1253000  US-01-033
7          2002  corn_grain          6700  118.0     790000  US-01-033
8          2001  corn_grain          6700  145.0     972000  US-01-033
9          2000  corn_grain          7700   99.0     760000  US-01-033


Crop: wheat_winter 
       harvest_year      crop_name harvest_area  yield production     adm_id
41549          2003  wheat_winter          3300   41.0     135000  US-01-033
41550          2000  wh

### Add 2-letter country code

In [7]:
# The country code is the first two characters of adm_id.
country_codes = final_stats["adm_id"].str[:2]
# Add the new column and put the columns in their final order in one step.
col_order = ["crop_name", "country_code", "adm_id", "harvest_year", "harvest_area", "yield", "production"]
final_stats = final_stats.assign(country_code=country_codes)[col_order]

### Save the data

In [8]:
# Preview the final table, then write it without the row labels.
print(final_stats.head(5).to_string())
output_csv = "/".join([data_path, "YIELD_COUNTY_US.csv"])
final_stats.to_csv(output_csv, index=False)

     crop_name country_code     adm_id  harvest_year harvest_area  yield production
0  corn_grain            US  US-01-033          2010        20800  142.8    2971000
1  corn_grain            US  US-01-033          2009        16400  126.0    2064000
2  corn_grain            US  US-01-033          2007        18300   85.0    1550000
3  corn_grain            US  US-01-033          2006        11400   90.0    1030000
4  corn_grain            US  US-01-033          2005        10000  147.0    1472000
