In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re

Get crop categorization codes from metadata file

In [2]:
# Build a (year, crop_code, crop_name) lookup table from the yearly CDL
# metadata pages.
rows_list = []

for year in range(2008, 2024):

    # Locate the metadata page for this year. Sorting makes the choice
    # deterministic if the folder ever holds more than one .htm file.
    metadata_dir = Path(f"../Data/croplandcros_cdl/{year}_30m_cdls")
    metadata_htm_path = sorted(metadata_dir.glob('*.htm'))
    if not metadata_htm_path:
        raise FileNotFoundError(f"No .htm metadata file found in {metadata_dir}")

    with open(metadata_htm_path[0]) as fp:
        soup = BeautifulSoup(fp)

    # The categorization codes for the crop types are in tables that are in preformatted blocks of text
    # Get the last block, which should be the categorization table
    cat_table = soup.find_all('pre')[-1].string

    # Codes parsed for this year, used below to decide whether 176 needs filling in
    codes_this_year = set()

    # Extract lines with crop and categorization code
    for line in cat_table.splitlines():

        # Regex match number enclosed in double quotes
        result = re.search(r'\"\d*\"', line)

        # Lines without a quoted code are headers or padding: skip them
        if result is None:
            continue

        crop_code = result[0].strip('\"')
        # The crop name runs from the first capital letter to the end of the line
        crop_name = re.search(r'[A-Z].*', line)[0]

        rows_list.append({
            'year': year,
            'crop_code': crop_code,
            'crop_name': crop_name,
        })
        codes_this_year.add(crop_code)

    # Code 176 is not defined in the tables of some years (noted as the years
    # prior to 2022). Add it only when it is actually missing: appending it
    # unconditionally would create duplicate (year, crop_code) keys for years
    # that already define it, which duplicates rows in a later merge on
    # those keys.
    if '176' not in codes_this_year:
        rows_list.append({
            'year': year,
            'crop_code': '176',
            'crop_name': 'Grassland/Pasture',
        })

crop_code_df = pd.DataFrame(rows_list)

Add crop names to county crop aggregates

In [3]:
# Read the per-year CDL county aggregates and stack them into one frame.
# The yearly frames are collected in a list and concatenated once; growing
# a DataFrame with concat inside the loop re-copies all accumulated rows on
# every iteration and starts from an empty frame.
yearly_frames = []
for year in range(2008, 2024):
    df = pd.read_parquet(f"../binaries/county_crop_pixel_count_{year}.parquet")
    # Tag each row with the year of the file it came from
    df['year'] = year
    yearly_frames.append(df)

cdl_df = pd.concat(yearly_frames)

In [4]:
# Give both tables an integer 'crop_code' column so the join keys compare equal
crop_code_df = crop_code_df.astype({'crop_code': int})

# The aggregates call the column 'crop'; rename it to match before casting
cdl_df = (
    cdl_df
    .rename(columns={'crop': 'crop_code'})
    .astype({'crop_code': int})
)

In [5]:
# Left join: keep every county aggregate row and attach the crop name
# that matches its year and crop code
county_crop_df = pd.merge(
    cdl_df,
    crop_code_df,
    on=['year', 'crop_code'],
    how='left',
)

Calculate acreage, clean FIPS code, then export

In [6]:
# Pixel counts -> acres
#   one pixel is 30 m x 30 m = 900 m^2
#   one acre is taken as 4046.86 m^2
#   so one pixel is 900/4046.86 acres (about 0.2224 acres)
acres = county_crop_df['pixel_count'].mul(900/4046.86)
county_crop_df['acres'] = acres

In [7]:
# Normalise the FIPS code to a 5-character, zero-padded string, then split it:
# the first two characters are the state code, the remaining three the county code
fips_padded = county_crop_df['fips'].astype('string').str.rjust(5, fillchar='0')

county_crop_df['state_fips_code'] = fips_padded.str[0:2]
county_crop_df['county_fips_code'] = fips_padded.str[2:5]

# The combined code is no longer needed once it has been split
county_crop_df = county_crop_df.drop(columns=['fips'])

In [8]:
# Write the same table to both output locations
output_paths = (
    "../binaries/croplandcros_county_crop_acres.parquet",
    "../files_for_phil/croplandcros_county_crop_acres.parquet",
)
for output_path in output_paths:
    county_crop_df.to_parquet(output_path)