# Data Acquisition
### Table of Contents
- [Requirements](#requirements)
- [Obtaining Historical Yield Data](#obtaining-historical-yield-data)
- [Obtaining Historical Price Received Data](#obtaining-historical-price-received-data)
- [Obtaining Historical Weather Data](#obtaining-historical-weather-data)

## Requirements

In [3]:
import requests
import urllib.parse
import pandas as pd
from io import StringIO

## Obtaining Historical Yield Data
#### (Grain & Silage)

In [4]:
# retrieving the access key for the USDA NASS API

print('\nLoading...\n')

## the key is kept in a local text file rather than hardcoded in the notebook
with open('../../keys/api_key.txt', 'r') as key_file: # file path appears as `keys/api_key.txt` in `acquisition.py`
    ### strip surrounding whitespace: a trailing newline left by a text editor would
    ### otherwise be URL-encoded into the `key` query parameter of every request
    usda_nass_key = key_file.read().strip()

print('Access key loaded.')


Loading...

Access key loaded.


In [5]:
# assignment of variables for later use

## states of interest; reused by the per-state queries further down
states = [
    'ILLINOIS',
    'INDIANA',
    'IOWA',
    'MINNESOTA',
    'MISSOURI',
    'NEBRASKA',
]

## query parameters for the corn yield request (state-level survey data, returned as CSV)
parameters = dict(
    key=usda_nass_key,
    source_desc='SURVEY',
    sector_desc='CROPS',
    group_desc='FIELD CROPS',
    commodity_desc='CORN',
    statisticcat_desc='YIELD',
    agg_level_desc='STATE',
    format='csv',
)

In [6]:
## encoding query parameters according to URL standards
encoded_parameters = urllib.parse.urlencode(parameters)

## establishing the URL base for the query
base_url = 'https://quickstats.nass.usda.gov/api/api_GET/'

## constructing the API query from the URL base and the URL encoded parameters
query = base_url+'?'+encoded_parameters

## fetching the query response
## `requests` never times out by default, so an explicit timeout (in seconds) keeps a
## stalled connection from hanging the notebook indefinitely
response = requests.get(query, timeout=60)

In [7]:
## error handling: report the status code when the query did not succeed
if response.status_code != 200:
    print(f'Request failed with status code {response.status_code}')

## otherwise turn the CSV response into the raw yield dataset
else:

    ### only the necessary columns are kept to reduce storage waste
    yield_columns = ['year', 'util_practice_desc', 'state_name', 'reference_period_desc', 'Value']

    ### parse the response text as CSV, then keep only the yearly recorded yield
    ### (not forecasts or monthly estimates)
    yield_raw = (
        pd.read_csv(StringIO(response.text))
        [yield_columns]
        .loc[lambda frame: frame['reference_period_desc'] == 'YEAR']
    )

    ### save the dataframe as a local CSV
    yield_raw.to_csv('../../data/raw/yield_raw.csv', index=False) # file path appears as `data/raw/yield_raw.csv` in `acquisition.py`
    print('Data saved as yield_raw.csv')

Data saved as yield_raw.csv


## Obtaining Historical Price Received Data
#### (By Month as well as Marketing Year)

In [None]:
# since the historical price received data is too large to be fetched in one query, we fetch each state's data separately

print('\nLoading...\n')

## establishing the URL base for the queries (identical for every state)
base_url = 'https://quickstats.nass.usda.gov/api/api_GET/'

## the only columns kept from each state's response
price_columns = ['year', 'state_name', 'reference_period_desc', 'Value']

## one dataframe per successfully fetched state; they are combined once after the loop,
## so a failed request for any state (including the first) cannot leave
## `price_received_raw` undefined for the states that follow
state_frames = []

for state in states:

    ## assignment of variables for later use
    parameters = {
        'key':usda_nass_key,
        'source_desc':'SURVEY',
        'sector_desc':'CROPS',
        'group_desc':'FIELD CROPS',
        'commodity_desc':'CORN',
        'statisticcat_desc':'PRICE RECEIVED',
        'agg_level_desc':'STATE',
        'state_name':state,
        'format':'csv'
    }

    ## encoding query parameters according to URL standards
    encoded_parameters = urllib.parse.urlencode(parameters)

    ## constructing the API query from the URL base and the URL encoded parameters
    query = base_url+'?'+encoded_parameters

    ## fetching the query response; the timeout (in seconds) keeps a stalled
    ## connection from hanging the loop indefinitely
    response = requests.get(query, timeout=60)

    ## if the query was successful, keep this state's narrowed dataframe
    if response.status_code == 200:
        data = StringIO(response.text)
        state_frames.append(pd.read_csv(data)[price_columns])

    ## error handling: report the failure and carry on with the remaining states
    else:
        print(f'Request failed for {state} with status code {response.status_code}')

## fail with a clear message instead of a NameError in the cell that saves the data
if not state_frames:
    raise RuntimeError('No price received data could be retrieved for any state.')

## combine every state's data in a single concatenation
price_received_raw = pd.concat(state_frames)


Loading...



In [9]:
# write the combined price received dataframe to a local CSV, leaving out the index

price_received_path = '../../data/raw/price_received_raw.csv' # file path appears as `data/raw/price_received_raw.csv` in `acquisition.py`
price_received_raw.to_csv(price_received_path, index=False)
print('Data saved as price_received_raw.csv')

Data saved as price_received_raw.csv


## Obtaining Historical Weather Data
#### (Average/Maximum/Minimum Temperature, Total Precipitation, & PDSI by Month from April through November)

In [10]:
# assignment of variables for later use

print('\nLoading...\n')

## three-letter month abbreviations, January through December
months = 'jan feb mar apr may jun jul aug sep oct nov dec'.split()

## numeric state identifiers placed in the NOAA download URLs;
## paired positionally with `states` when the weather data is fetched
state_numbers = [
    11,
    12,
    13,
    21,
    23,
    25,
]


Loading...



In [None]:
## one fully merged dataframe per state is collected here and stacked at the end
concatenation_list = []

## variables downloaded for every state, and the columns shared by all of their tables
weather_variables = ('tavg', 'tmax', 'tmin', 'pcp', 'pdsi')
merge_keys = ['Year', 'Month', 'State']

## iterate through each state & variable combination
for state, state_number in zip(states, state_numbers):

    merge_list = []
    for variable in weather_variables:

        ### the PDSI file is read with one skipped row, every other variable with two
        if variable == 'pdsi':
            skiprows = 1
        else:
            skiprows = 2

        link = f'https://www.ncei.noaa.gov/access/monitoring/climate-at-a-glance/statewide/time-series/{state_number}/{variable}/1/0/1895-2024/data.csv'
        downloaded = pd.read_csv(link, skiprows=skiprows)

        ### `Date` is split by position: first four characters are the year, next two the month
        date_text = downloaded['Date'].astype(str)
        tidy = (
            downloaded
            .assign(
                State=state,
                Year=date_text.str[:4].astype(int),
                Month=date_text.str[4:6].astype(int),
            )
            .rename(columns={'Value': variable})
            .drop(columns='Date')
        )

        merge_list.append(tidy)

    ### fold this state's per-variable tables into one table, a column per variable
    state_weather = merge_list[0]
    for variable_frame in merge_list[1:]:
        state_weather = state_weather.merge(variable_frame, on=merge_keys, how='outer')
    concatenation_list.append(state_weather)

weather_raw = pd.concat(concatenation_list, ignore_index=True)

In [26]:
# write the combined weather dataframe to a local CSV, leaving out the index

weather_path = '../../data/raw/weather_raw.csv' # file path appears as `data/raw/weather_raw.csv` in `acquisition.py`
weather_raw.to_csv(weather_path, index=False)
print('Data saved as weather_raw.csv')

Data saved as weather_raw.csv
