## Cal-CRAI Data Pull -- Crop Loss Data
This notebook processes crop loss data sourced from USDA Risk Management: \
https://legacy.rma.usda.gov/data/cause.html

Data pulling includes:
* Isolating the data to a list of desired columns
* Merging all data into a single .csv file

Output is uploaded to the 1_pull_data directory within the AWS S3 bucket

In [1]:
import os
import zipfile
import csv
from datetime import datetime
import boto3
from io import StringIO
import pandas as pd
import io
import sys

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [9]:
@append_metadata
def pull_process_crop_loss(main_folder, output_folder, export=False, search_zipped=True, varname=''):
    '''
    Pulls manually downloaded crop loss files sourced from USDA: https://legacy.rma.usda.gov/data/cause.html
    Pulled TXT files are cleaned and converted to CSV's, then merged into one single CSV file.

    The source files are read from the 'ca-climate-index' S3 bucket, copied to
    main_folder, split on '|', filtered to rows whose third field is 'CA', and
    written to '<output_folder>/usda_crop_loss_merged.csv'.

    Parameters
    ----------
    main_folder: string
        Local main folder where the pulled files will be saved.
    output_folder: string
        Local folder where the merged output CSV will be saved.
    export: bool
        If True, upload the merged CSV from output_folder to the
        '1_pull_data/climate_risk/extreme_heat/loss/usda' prefix of the AWS bucket.
    search_zipped: bool
        If True, search for csv, xls, or txt files within zip files from AWS. If False, look for these file types
        directly. Extensions are matched case-insensitively.
    varname: string
        Not used inside this function body; presumably consumed by the
        append_metadata decorator -- confirm against scripts.utils.write_metadata.

    Script
    ------
    usda_crop_loss.ipynb
    '''
    print('Data Transformation: data is converted from .txt to .csv files and merged into one final .csv')

    # Set-up for AWS. A single client handles listing, downloads and uploads.
    s3_client = boto3.client('s3')
    bucket_name = 'ca-climate-index'
    directory = '1_pull_data/climate_risk/extreme_heat/loss/usda/usda_crop_loss_heat_files/'
    output_file = "usda_crop_loss_merged.csv"
    # str.endswith accepts a tuple, so one call covers every wanted extension.
    data_extensions = ('.csv', '.xls', '.txt')

    def get_year_from_line(line):
        '''Return the leading 4-digit year of a '|'-delimited line, or '' if absent.'''
        elements = line.split('|')
        if len(elements) > 1:
            potential_year = elements[0].strip()
            if len(potential_year) == 4 and potential_year.isdigit():
                return potential_year
        return ''

    def sort_files(file_contents):
        '''Sort each file's lines in place by leading year; lines without a year sort first.'''
        for _, lines in file_contents:
            lines.sort(key=get_year_from_line)
        return list(file_contents)

    def merge_files(sorted_files, output_file):
        '''Merge the California rows of every file into one CSV, saved locally and to S3.'''
        headers = ['year', 'state_code', 'state_abbreviation', 'county_code',
                   'county_name', 'commodity_code', 'commodity_name',
                   'insurance_plan_code', 'insurance_plan_abbreviation',
                   'stage_code', 'damage_cause_code', 'damage_description',
                   'determined_acres', 'indemnity_amount']

        # Keep only rows whose third field (state abbreviation) is 'CA'.
        rows = []
        for _, lines in sorted_files:
            for line in lines:
                row_data = [element.strip() for element in line.split('|')]
                if len(row_data) > 2 and row_data[2] == 'CA':
                    rows.append(row_data)

        # Every kept row must have exactly len(headers) fields, otherwise
        # pandas raises here.
        df = pd.DataFrame(rows, columns=headers)

        # Save DataFrame to local output folder
        df.to_csv(os.path.join(output_folder, output_file), index=False)

        # NOTE(review): this upload runs regardless of `export` and writes the
        # merged CSV into the same prefix the inputs are listed from -- confirm
        # that this is intended.
        s3_client.put_object(Bucket=bucket_name, Body=df.to_csv(index=False), Key=directory + output_file)

        return df

    # Ensure the local folders exist
    os.makedirs(main_folder, exist_ok=True)
    os.makedirs(output_folder, exist_ok=True)

    file_contents = []

    # A single list_objects_v2 call returns at most 1000 keys; paginate so
    # that no source file is silently skipped.
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name, Prefix=directory):
        for obj in page.get('Contents', []):
            key = obj['Key']
            key_lower = key.lower()

            # Mirror the key's path below `directory` inside main_folder
            relative_path = os.path.relpath(key, directory)
            local_file_path = os.path.join(main_folder, relative_path)
            local_dir_path = os.path.dirname(local_file_path)
            os.makedirs(local_dir_path, exist_ok=True)

            if search_zipped and key_lower.endswith('.zip'):
                # Download the zip file into memory
                zip_object = s3_client.get_object(Bucket=bucket_name, Key=key)
                zip_data = io.BytesIO(zip_object['Body'].read())

                with zipfile.ZipFile(zip_data, 'r') as zip_ref:
                    for file_name in zip_ref.namelist():
                        if not file_name.lower().endswith(data_extensions):
                            continue
                        # extract() returns the path it actually wrote, which
                        # can differ from a naive join when the member name is
                        # sanitised.
                        extracted_file_path = zip_ref.extract(file_name, local_dir_path)
                        # NOTE(review): every match, including .xls, is read
                        # as text with the platform default encoding.
                        with open(extracted_file_path, 'r') as file:
                            file_contents.append((file_name, file.readlines()))
            elif not search_zipped and key_lower.endswith(data_extensions):
                # Directly download the CSV, XLS, or TXT file and save it locally
                file_object = s3_client.get_object(Bucket=bucket_name, Key=key)
                with open(local_file_path, 'wb') as f:
                    f.write(file_object['Body'].read())

                # Read file contents
                with open(local_file_path, 'r') as file:
                    file_contents.append((key, file.readlines()))

    sorted_files = sort_files(file_contents)
    merge_files(sorted_files, output_file)

    if export:
        # Separate name so the source prefix read by merge_files is never rebound.
        export_prefix = '1_pull_data/climate_risk/extreme_heat/loss/usda'
        output_file_path = os.path.join(output_folder, output_file)
        with open(output_file_path, 'rb') as data:
            s3_client.upload_fileobj(data, bucket_name, f"{export_prefix}/{output_file}")
    else:
        # NOTE(review): this message is printed only when export is False (by
        # then merge_files has already put the CSV under `directory`) --
        # confirm whether it belongs in the export branch instead.
        print(f'{output_file} uploaded to AWS.')

In [10]:
# Call the function: read the zipped source files from S3, save them under
# 'usda_crop_loss_heat_files', and write the merged CSV to 'final_output_folder'.
# NOTE(review): varname='test' looks like a placeholder; the alternative noted
# beside this call was 'climate_usda_heat_crop_loss' -- confirm before a final run.
pull_process_crop_loss('usda_crop_loss_heat_files', 'final_output_folder', export=False, search_zipped=True, varname='test')

In [8]:
# Same call, with the same arguments, as the one in the preceding cell.
# NOTE(review): varname='test' looks like a placeholder; the alternative noted
# beside this call was 'climate_usda_heat_crop_cost' -- confirm which is intended.
pull_process_crop_loss('usda_crop_loss_heat_files', 'final_output_folder', export=False, search_zipped=True, varname='test')