In [1]:
import os
import zipfile
import csv
from datetime import datetime
import boto3
from io import StringIO
import pandas as pd

In [2]:
## Set-up for AWS: module-level S3 client, bucket, and key prefix read by merge_files
# boto3 S3 client; the sort_and_merge docstring notes credentials are expected in ~/.aws/credentials
s3_client = boto3.client('s3')  
# Name of the bucket that receives the merged CSV
bucket_name = 'ca-climate-index'  
# Key prefix that merge_files prepends to the output file name (note the trailing '/')
directory = '1_pull_data/climate_risk/extreme_heat/loss/usda/usda_crop_loss_heat_files/'  

In [3]:
# Function to extract and read text files from zip files
def extract_and_read(file_path):
    '''
    Read every member of a zip archive as UTF-8 text.

    Parameters
    ----------
    file_path: string
              Path to the zip archive to open.

    Returns
    -------
    list of (string, list of string)
              One tuple per archive member, in the order given by the archive's
              name list: the member name and its lines, each decoded as UTF-8
              with leading/trailing whitespace stripped.
    '''
    members = []
    with zipfile.ZipFile(file_path, 'r') as archive:
        for member_name in archive.namelist():
            with archive.open(member_name) as handle:
                # Iterating the binary handle yields one raw line at a time
                cleaned = [raw.decode('utf-8').strip() for raw in handle]
            members.append((member_name, cleaned))
    return members

In [4]:
# Function to extract year from line
def get_year_from_line(line):
    '''
    Return the four-digit year found in the first '|'-separated field of a line.

    Parameters
    ----------
    line: string
              A single record line whose fields are separated by '|'.

    Returns
    -------
    string
              The first field (whitespace-stripped) when the line contains at
              least one '|' and that field is exactly four digits; otherwise ''.
    '''
    first_field, separator, _ = line.partition('|')
    # A line without any '|' has only one field and is never treated as a record
    if not separator:
        return ''
    candidate = first_field.strip()
    is_year = len(candidate) == 4 and candidate.isdigit()
    return candidate if is_year else ''

In [5]:
# Function to sort files based on the first column
def sort_files(file_contents):
    '''
    Sort the lines of each file by the year in their first '|'-separated field.

    The input lists are left untouched; each file gets a new sorted list.
    Lines for which get_year_from_line returns '' sort ahead of all lines that
    carry a year, and the sort is stable, so lines sharing a year keep their
    original relative order.

    Parameters
    ----------
    file_contents: iterable of (string, iterable of string)
              Pairs of file name and that file's lines.

    Returns
    -------
    list of (string, list of string)
              The same file names, in the same order, each paired with a new
              list holding its lines sorted by year.
    '''
    return [
        (name, sorted(lines, key=get_year_from_line))
        for name, lines in file_contents
    ]

In [6]:
# Function to keep California rows, build a DataFrame, and upload it to S3 as a .csv
def merge_files(sorted_files, output_file, client=None, bucket=None, key_prefix=None):
    '''
    Keep the California rows from all files, build a DataFrame, and upload it
    to S3 as a .csv (written without the index column).

    Each line is split on '|' and every field is whitespace-stripped. A line is
    kept when it has more than two fields and its third field is 'CA'. Rows with
    fewer fields than there are headers are right-padded with None (the same
    fill pandas applies to ragged input), so their trailing columns are empty
    and a batch made up only of short rows no longer raises.

    NOTE(review): for rows shorter than the header list, the positional mapping
    of their last fields onto the trailing headers may not match the source
    layout -- confirm against the USDA record layout.

    Parameters
    ----------
    sorted_files: iterable of (string, iterable of string)
              Pairs of file name and that file's lines; rows are emitted in the
              order in which they are encountered.
    output_file: string
              File name appended to the key prefix to form the S3 object key.
    client: object, optional
              Object exposing put_object(Bucket=..., Body=..., Key=...).
              Defaults to the module-level s3_client.
    bucket: string, optional
              Destination bucket. Defaults to the module-level bucket_name.
    key_prefix: string, optional
              Prefix prepended to output_file. Defaults to the module-level directory.

    Returns
    -------
    pandas.DataFrame
              The filtered rows with one column per header; all values are
              strings (or None where a row was padded).

    Raises
    ------
    ValueError
              Raised by pandas if any kept row has more fields than headers.
    '''
    headers = ['year', 'state_code', 'state_abbreviation', 'county_code',
               'county_name', 'commodity_code', 'commodity_name',
               'insurance_plan_code', 'insurance_plan_abbreviation',
               'stage_code', 'damage_cause_code', 'damage_description',
               'determined_acres', 'indemnity_amount']
    n_columns = len(headers)

    rows = []
    for _file_name, lines in sorted_files:
        for line in lines:
            fields = [element.strip() for element in line.split('|')]
            if len(fields) > 2 and fields[2] == 'CA':
                # Pad short rows; a non-positive multiplier adds nothing, so
                # rows that are already full (or too long) pass through as-is
                fields.extend([None] * (n_columns - len(fields)))
                rows.append(fields)

    # Create DataFrame from the list of rows
    df = pd.DataFrame(rows, columns=headers)

    # Serialize to CSV in memory, then upload
    csv_buffer = StringIO()
    df.to_csv(csv_buffer, index=False)

    # Fall back to the notebook-level AWS configuration only when not overridden
    target_client = s3_client if client is None else client
    target_bucket = bucket_name if bucket is None else bucket
    target_prefix = directory if key_prefix is None else key_prefix
    target_client.put_object(
        Bucket=target_bucket,
        Body=csv_buffer.getvalue(),
        Key=target_prefix + output_file,
    )

    return df

In [7]:
# Main function
def sort_and_merge(folder_name, output_file):
    '''
    Extracts data from every .zip file within a folder, sorts the lines of each extracted file by data
    year, merges all files together under a common set of column headers, writes the final output as a
    .csv file, and uploads it to the designated AWS bucket.

    Zip files are processed in sorted file-name order so that the row order of the merged output is
    reproducible (os.listdir itself returns names in arbitrary order). Lines are sorted by year within
    each extracted file only; the merged output is in year order overall only if the file names sort
    chronologically.

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.

    Parameters
    ----------
    folder_name: string
              The name of the folder containing zipped files for 'Cause of Loss Historical Data Files' under 'Indemnities Only'
              from the USDA's Risk Management Agency: https://legacy.rma.usda.gov/data/cause.html
    output_file: string
              Final output as a .csv file

    Returns
    -------
    The value returned by merge_files (the merged DataFrame).
    '''
    file_contents = []
    # Sort the listing: os.listdir gives no ordering guarantee
    for file in sorted(os.listdir(folder_name)):
        # Only names ending in '.zip' (case-sensitive) are read
        if file.endswith('.zip'):
            file_path = os.path.join(folder_name, file)
            file_contents.extend(extract_and_read(file_path))

    sorted_files = sort_files(file_contents)
    final_df = merge_files(sorted_files, output_file)
    print(f"Merged and sorted files written to {output_file}")

    # Return the final DataFrame if needed
    return final_df

In [8]:
# Provide folder path containing zip files and output file name
# Zip files were downloaded and placed in a folder within our local env
# (folder_name is a relative path, so it is resolved against the current working directory)
folder_name = 'usda_crop_loss_heat_files'
# File name only; merge_files prepends the S3 key prefix stored in `directory`
output_file = 'usda_crop_loss_CA_final.csv'

# Run the main function; as the last expression in the cell, the returned DataFrame is displayed
sort_and_merge(folder_name, output_file)

Merged and sorted files written to usda_crop_loss_CA_final.csv


Unnamed: 0,year,state_code,state_abbreviation,county_code,county_name,commodity_code,commodity_name,insurance_plan_code,insurance_plan_abbreviation,stage_code,damage_cause_code,damage_description,determined_acres,indemnity_amount
0,1989,06,CA,001,Alameda,9999,All Other Crops,90,APH,0H,12,Heat,000000000052,
1,1989,06,CA,001,Alameda,9999,All Other Crops,90,APH,UH,12,Heat,000000006508,
2,1989,06,CA,007,Butte,0028,ALMONDS,90,APH,04,31,Excess Moisture/Precip/Rain,000000615897,
3,1989,06,CA,007,Butte,0028,ALMONDS,90,APH,0H,31,Excess Moisture/Precip/Rain,000001606100,
4,1989,06,CA,007,Butte,0028,ALMONDS,90,APH,0H,32,Poor Drainage,000000012866,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24655,2018,06,CA,115,Yuba,9999,All Other Crops,90,APH,H,41,Frost,.5000000000,1967.7000000000
24656,2018,06,CA,115,Yuba,9999,All Other Crops,13,RI,FL,55,ARPI/SCO/STAX/MP Crops Only,.0000000000,57568.0000000000
24657,2018,06,CA,115,Yuba,9999,All Other Crops,90,APH,H,61,Wind/Excess Wind,4.5000000000,17709.3000000000
24658,2018,06,CA,999,All Other Counties,9999,All Other Crops,90,APH,H,31,Excess Moisture/Precipitation/Rain,70.8000000000,19454.0000000000
