In [24]:
import io
import os
from urllib.parse import quote, urlencode

import pandas as pd
import requests

In [25]:
# Read the USDA NASS Quick Stats API key from the environment so the credential
# is never stored in (or committed with) the notebook.
# Set it before launching Jupyter, e.g.:  export NASS_API_KEY="<your key>"
# NOTE(review): a key that was ever committed in plain text should be rotated.
API_KEY = os.environ.get("NASS_API_KEY", "")
if not API_KEY:
    print("WARNING: NASS_API_KEY is not set; the Quick Stats request will be sent without a key.")


In [26]:
# API documentation at https://quickstats.nass.usda.gov/api/

# Query filters sent to the Quick Stats endpoint: county-level SURVEY records
# for corn grain yield (bu/acre). "year__GE" uses the API's operator suffix to
# ask for years >= 2013.
params = {
    "source_desc": "SURVEY",
    "sector_desc": "CROPS",
    "group_desc": "FIELD CROPS",
    "commodity_desc": "CORN",
    "statisticcat_desc": "YIELD",
    "short_desc": "CORN, GRAIN - YIELD, MEASURED IN BU / ACRE",
    "domain_desc": "TOTAL",
    "agg_level_desc": "COUNTY",
    "year__GE": "2013"
}

# Percent-encode every key and value instead of joining them verbatim:
# short_desc contains spaces, commas and a "/" that are not safe to paste into
# a query string. quote_via=quote encodes spaces as %20 rather than "+".
query = urlencode(params, quote_via=quote)

URL = f"https://quickstats.nass.usda.gov/api/api_GET/?key={API_KEY}&{query}&format=CSV"
headers = {'User-Agent': ''} # NEEDED BECAUSE USDA BLOCKS 'non-browser' traffic


In [27]:
# Download the CSV export. Reuse the `headers` dict defined with the URL, bound
# the wait with a timeout so a stalled connection cannot hang the kernel, and
# fail loudly on an HTTP error (bad key, rejected query) instead of handing an
# error body to the CSV parser in the next cell.
response = requests.get(URL, headers=headers, timeout=120)
response.raise_for_status()

In [28]:
# Decode the response body as UTF-8 text and parse it as CSV; commas inside
# numeric fields are treated as thousands separators.
df = pd.read_csv(
    io.StringIO(str(response.content, "UTF-8")),
    thousands=",",
)

In [29]:
# Row count first, then the column labels returned by the API.
print(df.shape[0])
print(df.columns)

14524
Index(['source_desc', 'sector_desc', 'group_desc', 'commodity_desc',
       'class_desc', 'prodn_practice_desc', 'util_practice_desc',
       'statisticcat_desc', 'unit_desc', 'short_desc', 'domain_desc',
       'domaincat_desc', 'agg_level_desc', 'state_ansi', 'state_fips_code',
       'state_alpha', 'state_name', 'asd_code', 'asd_desc', 'county_ansi',
       'county_code', 'county_name', 'region_desc', 'zip_5', 'watershed_code',
       'watershed_desc', 'congr_district_code', 'country_code', 'country_name',
       'location_desc', 'year', 'freq_desc', 'begin_code', 'end_code',
       'reference_period_desc', 'week_ending', 'load_time', 'Value', 'CV (%)'],
      dtype='object')


That's a lot of columns. Let's keep the ones we care about.

In [30]:
# Narrow the frame to the year, location and value columns used downstream.
columnsToKeep = ["year", "state_name", "county_name", "Value"]
df = df.loc[:, columnsToKeep]

In [31]:
# Preview the first five rows of the trimmed frame.
df.head(n=5)

Unnamed: 0,year,state_name,county_name,Value
0,2019,ALABAMA,OTHER (COMBINED) COUNTIES,129.6
1,2018,ALABAMA,OTHER (COMBINED) COUNTIES,146.4
2,2017,ALABAMA,OTHER (COMBINED) COUNTIES,169.3
3,2016,ALABAMA,OTHER (COMBINED) COUNTIES,103.8
4,2015,ALABAMA,OTHER (COMBINED) COUNTIES,135.6


Now that we have the county-level corn yield data for all of the states,
let's prune it further to the counties in the states that we want.

In [32]:
# States treated as the corn belt in this analysis.
corn_belt_states = {
    "MINNESOTA",
    "SOUTH DAKOTA",
    "NEBRASKA",
    "KANSAS",
    "IOWA",
    "WISCONSIN",
    "ILLINOIS",
    "MISSOURI",
    "INDIANA",
    "OHIO",
}

# Keep only the rows from those states and renumber the index from 0.
df_corn = df.loc[df["state_name"].isin(corn_belt_states)].reset_index(drop=True)

In [33]:
# States present after filtering, then the number of distinct county names
# and distinct years (dropna=False keeps the count equal to len(unique())).
print(df_corn['state_name'].unique())
print(df_corn['county_name'].nunique(dropna=False))
print(df_corn['year'].nunique(dropna=False))

['ILLINOIS' 'INDIANA' 'IOWA' 'KANSAS' 'MINNESOTA' 'MISSOURI' 'NEBRASKA'
 'OHIO' 'SOUTH DAKOTA' 'WISCONSIN']
568
9


In [34]:
# Number of rows per county name (ignoring state), most frequent first.
(
    df_corn
    .groupby('county_name')
    .size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)

Unnamed: 0,county_name,count
372,OTHER (COMBINED) COUNTIES,349
236,JEFFERSON,67
536,WASHINGTON,67
233,JACKSON,67
83,CLAY,61
...,...,...
442,RUSSELL,1
480,STANLEY,1
494,SUMMIT,1
263,KOOCHICHING,1


There appear to be some duplicate county names.

The "OTHER (COMBINED) COUNTIES" entry makes sense, since multiple states use that same label for their extraneous counties.

However, the same shouldn't happen for other counties, unless two states simply share a county name even though
the two counties are actually different.

In [35]:
# Validate these "duplicate" counties: list every state that has rows for a
# county named CLAY.
df_corn.loc[df_corn["county_name"] == "CLAY", "state_name"].unique()

array(['ILLINOIS', 'INDIANA', 'IOWA', 'KANSAS', 'MINNESOTA', 'MISSOURI',
       'NEBRASKA', 'SOUTH DAKOTA'], dtype=object)

That's impressive. With a simple Google search, it's easy to confirm that "Clay" is a distinct county in each of these states.


Now that that's out of the way, we can proceed with trust in our data.

Currently, these values are in bushels/acre. Let's convert them to metric tons/acre by dividing by 39.3679 (bushels of corn per metric ton).

In [36]:
# Divisor used to convert bushels/acre into metric tons/acre.
BUSHELS_PER_METRIC_TON = 39.3679

# Recompute from the untouched bushels/acre values in `df` (same row filter and
# row order that produced df_corn) instead of dividing df_corn["Value"] by
# itself in place, so re-running this cell cannot apply the conversion twice.
# to_numpy() makes the assignment positional; a length mismatch raises.
df_corn["Value"] = (
    df.loc[df["state_name"].isin(corn_belt_states), "Value"].to_numpy()
    / BUSHELS_PER_METRIC_TON
)

In [37]:
# Per state: number of non-null yield records and the sum of their values.
df_dist = (
    df_corn
    .groupby('state_name')['Value']
    .agg(count='count', total='sum')
    .reset_index()
)
df_dist

Unnamed: 0,state_name,count,total
0,ILLINOIS,841,3981.74655
1,INDIANA,758,3385.387587
2,IOWA,859,4082.917809
3,KANSAS,685,2289.591266
4,MINNESOTA,657,2821.407797
5,MISSOURI,641,2458.147374
6,NEBRASKA,704,3158.878172
7,OHIO,733,3075.858758
8,SOUTH DAKOTA,453,1594.090617
9,WISCONSIN,556,2232.267914


In [38]:
# Each state's share of the summed values, rounded to 2 decimals. Note the
# column is named "percentage" but holds a fraction of 1 (e.g. 0.14), not 14.
# Uses the vectorized Series.sum()/.round() rather than the Python builtins.
df_dist['percentage'] = (df_dist['total'] / df_dist['total'].sum()).round(2)
df_dist

Unnamed: 0,state_name,count,total,percentage
0,ILLINOIS,841,3981.74655,0.14
1,INDIANA,758,3385.387587,0.12
2,IOWA,859,4082.917809,0.14
3,KANSAS,685,2289.591266,0.08
4,MINNESOTA,657,2821.407797,0.1
5,MISSOURI,641,2458.147374,0.08
6,NEBRASKA,704,3158.878172,0.11
7,OHIO,733,3075.858758,0.11
8,SOUTH DAKOTA,453,1594.090617,0.05
9,WISCONSIN,556,2232.267914,0.08


In [39]:
# Summary statistics for the converted yield values.
df_corn.loc[:, "Value"].describe()

count    6887.000000
mean        4.222491
std         0.811576
min         0.762042
25%         3.759408
50%         4.338560
75%         4.804676
max         6.266527
Name: Value, dtype: float64

It's clear that we have a pretty representative data set of corn yield for all the states in the United States corn belt.

Let's save the dataset as a csv and get the rest of our data.

In [40]:
# Save the corn-belt yield table without the index column. Create the target
# directory first so the write also succeeds on a fresh checkout where
# ../data does not exist yet.
output_path = "../data/crop_data_yield.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_corn.to_csv(output_path, index=False)

In [41]:
# Preview the first five rows of the saved table.
df_corn.head(n=5)

Unnamed: 0,year,state_name,county_name,Value
0,2019,ILLINOIS,OTHER (COMBINED) COUNTIES,4.427465
1,2021,ILLINOIS,BUREAU,5.169186
2,2020,ILLINOIS,BUREAU,5.06504
3,2019,ILLINOIS,BUREAU,4.569713
4,2018,ILLINOIS,BUREAU,5.438441
