# Data Acquisition
### Table of Contents
- [Requirements](#requirements)
- [Obtaining Historical Yield Data](#obtaining-historical-yield-data)
- [Obtaining Historical Price Received Data](#obtaining-historical-price-received-data)
- [Obtaining Historical Weather Data](#obtaining-historical-weather-data)

## Requirements

In [71]:
import requests
import urllib.parse
import pandas as pd
from io import StringIO

## Obtaining Historical Yield Data
#### (Grain & Silage)

In [72]:
# retrieving the access key for the USDA NASS API
# the key is stripped because text files usually end with a newline, which would
# otherwise be URL-encoded into the `key` query parameter and sent to the API

with open('../keys/api_key.txt', 'r', encoding='utf-8') as file:
    usda_nass_key = file.read().strip()

In [73]:
# constants shared by the cells below

## the states whose data is collected
states = [
    'ILLINOIS',
    'INDIANA',
    'IOWA',
    'MINNESOTA',
    'MISSOURI',
    'NEBRASKA',
]

## the filters describing which records the yield query asks for
yield_filters = {
    'source_desc': 'SURVEY',
    'sector_desc': 'CROPS',
    'group_desc': 'FIELD CROPS',
    'commodity_desc': 'CORN',
    'statisticcat_desc': 'YIELD',
    'agg_level_desc': 'STATE',
}

## the full query parameters: access key first, then the filters, then the response format
parameters = {'key': usda_nass_key, **yield_filters, 'format': 'csv'}

In [74]:
## encoding query parameters according to URL standards
encoded_parameters = urllib.parse.urlencode(parameters)

## establishing the URL base for the query
base_url = 'https://quickstats.nass.usda.gov/api/api_GET/'

## constructing the API query from the URL base and the URL encoded parameters
query = base_url+'?'+encoded_parameters

## fetching the query response
## a timeout is set because `requests` waits indefinitely by default, which would
## leave the notebook hanging if the server stops responding
response = requests.get(query, timeout=120)

In [75]:
## error handling: any status other than 200 is reported and nothing is saved
if response.status_code != 200:
    print(f'Request failed with status code {response.status_code}')

## otherwise the response body is parsed, trimmed, and written to disk
else:

    ### wrap the response text in a file-like buffer and parse it into a pandas dataframe
    data = StringIO(response.text)
    yield_raw = pd.read_csv(data)

    ### the only columns worth storing
    kept_columns = ['year', 'util_practice_desc', 'state_name', 'reference_period_desc', 'Value']

    ### we are only interested in the yearly recorded yield, not forecasts or monthly estimates
    is_yearly = yield_raw['reference_period_desc'] == 'YEAR'

    ### apply the row filter and the column selection in a single step
    yield_raw = yield_raw.loc[is_yearly, kept_columns]

    ### save the dataframe as a local CSV
    yield_raw.to_csv('../data/raw/yield_raw.csv', index=False) # file path appears as `data/raw/yield_raw.csv` in `acquisition.py`
    print('Data saved as yield_raw.csv')

Data saved as yield_raw.csv


## Obtaining Historical Price Received Data
#### (By Month as well as Marketing Year)

In [76]:
# since the historical price received data is too large to be fetched in one query, we fetch each state's data separately

## establishing the URL base for the queries (identical for every state)
base_url = 'https://quickstats.nass.usda.gov/api/api_GET/'

## the columns kept from each state's response
price_columns = ['year', 'state_name', 'reference_period_desc', 'Value']

## one dataframe per successfully fetched state; collecting them in a list means a failed
## request for any state (including the first) cannot leave `price_received_raw` undefined
price_frames = []

for state in states:

    ## assignment of variables for later use
    parameters = {
        'key': usda_nass_key,
        'source_desc': 'SURVEY',
        'sector_desc': 'CROPS',
        'group_desc': 'FIELD CROPS',
        'commodity_desc': 'CORN',
        'statisticcat_desc': 'PRICE RECEIVED',
        'agg_level_desc': 'STATE',
        'state_name': state,
        'format': 'csv'
    }

    ## encoding query parameters according to URL standards
    encoded_parameters = urllib.parse.urlencode(parameters)

    ## constructing the API query from the URL base and the URL encoded parameters
    query = base_url+'?'+encoded_parameters

    ## fetching the query response (with a timeout so a stalled connection cannot hang the loop)
    response = requests.get(query, timeout=120)

    ## error handling: report the failed state and move on to the next one
    if response.status_code != 200:
        print(f'Request failed for {state} with status code {response.status_code}')
        continue

    ## convert the response text to a dataframe and keep only the necessary columns
    data = StringIO(response.text)
    temp_df = pd.read_csv(data)
    price_frames.append(temp_df[price_columns])

## without any data there is nothing meaningful to save, so stop with a clear message
if not price_frames:
    raise RuntimeError('No price received data could be retrieved for any state')

## stack every state's data into the single `price_received_raw` dataframe
price_received_raw = pd.concat(price_frames)

In [77]:
# write the combined price received data to disk, leaving out the dataframe index
price_received_raw.to_csv(
    path_or_buf='../data/raw/price_received_raw.csv', # file path appears as `data/raw/price_received_raw.csv` in `acquisition.py`
    index=False,
)

# confirm the save
print('Data saved as price_received_raw.csv')

Data saved as price_received_raw.csv


## Obtaining Historical Weather Data
#### (Average/Maximum/Minimum Temperature, Total Precipitation, & PDSI by Month from April through November)

In [78]:
## assignment of variables for later use
months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
state_numbers = [11, 12, 13, 21, 23, 25]
weather_variables = ['tavg', 'tmax', 'tmin', 'pcp', 'pdsi']


def _fetch_monthly_series(state_number, variable, month_number):
    """Download one state/variable/month series as a two-column dataframe.

    Parameters:
        state_number: the state's number as used in the NOAA link.
        variable: one of the entries of `weather_variables`.
        month_number: calendar month number as used in the NOAA link (4 = April).

    Returns:
        A dataframe holding `Date` (reduced to the year) and one value column
        named `<month>_<variable>`, e.g. `april_tavg`.
    """

    ### skip the appropriate number of rows when reading the CSV
    skiprows = 3 if variable == 'pdsi' else 4

    ### construct the appropriate CSV link
    ### NOTE(review): the `12` path segment looks like NOAA's time-scale selector (a 12-month
    ### window ending in `month_number`) -- confirm that a 1-month scale is not what is intended
    link = f'https://www.ncei.noaa.gov/access/monitoring/climate-at-a-glance/statewide/time-series/{state_number}/{variable}/12/{month_number}/1895-2024.csv?base_prd=true&begbaseyear=1895&endbaseyear=2000'

    series = pd.read_csv(link, skiprows=skiprows)

    ### month numbers start at 1 while `months` is indexed from 0, hence the `- 1`
    value_name = f'{months[month_number - 1]}_{variable}'

    ### keep only the date and value columns, and give the value column its appropriate name
    series = series.iloc[:, :2].rename(columns={series.columns[1]: value_name})

    ### convert the `Date` (YYYYMM) column to YYYY format for later merging
    series['Date'] = series['Date'].apply(lambda x: int(str(x)[:4]))

    return series


## one dataframe per state, stacked together once every state has been processed
state_frames = []

## iterate through each state, variable, & month combination
for state_name, state_number in zip(states, state_numbers):

    ### holds the state's merged data; `None` until the first series arrives
    state_weather = None

    for variable in weather_variables:
        for month_number in range(4, 12):
            series = _fetch_monthly_series(state_number, variable, month_number)

            #### the first series of each state starts the `state_weather` dataframe
            if state_weather is None:
                state_weather = series

            #### every later series is merged in as a new column, matched on the year
            else:
                state_weather = state_weather.merge(
                    series,
                    on='Date',
                    how='outer'
                )

    ### assign the appropriate state name for each record in the new column, `state`
    state_weather['state'] = state_name
    state_frames.append(state_weather)

## concatenate every state's dataframe into the `weather_raw` dataframe
weather_raw = pd.concat(state_frames)

In [79]:
# write the combined weather data to disk, leaving out the dataframe index
weather_raw.to_csv(
    path_or_buf='../data/raw/weather_raw.csv', # file path appears as `data/raw/weather_raw.csv` in `acquisition.py`
    index=False,
)

# confirm the save
print('Data saved as weather_raw.csv')

Data saved as weather_raw.csv
