In [10]:
import os
import sys
import pandas as pd
import io
import numpy as np
import boto3
import zipfile
import shutil

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.write_metadata import (
    append_metadata
)

# TODO: enable the import below once PR #42 is merged
'''
sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import (
    pull_zipped_csv, upload_csv_aws
)
'''

"\nsys.path.append(os.path.expanduser('../../'))\nfrom scripts.utils.file_helpers import (\n    pull_zipped_csv, upload_csv_aws\n)\n"

In [17]:
def _save_dataframe_as_csv(csv_source, source_name):
    """Read CSV content from `csv_source` and write it to the working directory.

    `source_name` may be an S3 key or a zip member name; only its final path
    component is used for the local file name, so nested or `../` components
    can never redirect the write outside the current working directory.
    """
    df = pd.read_csv(csv_source)
    # Keep only the last path component, then strip the '.csv' extension
    df_name = source_name.split('/')[-1][:-4]
    df.to_csv(f"{df_name}.csv", index=False)
    print(f"Saved DataFrame as '{df_name}.csv'")


def _save_csvs_from_zip(s3, bucket_name, key):
    """Download the zip at `key` into memory and save every .csv member it contains."""
    zip_object = s3.get_object(Bucket=bucket_name, Key=key)
    zip_data = io.BytesIO(zip_object['Body'].read())

    with zipfile.ZipFile(zip_data, 'r') as zip_ref:
        for file_name in zip_ref.namelist():
            if file_name.endswith('.csv'):
                with zip_ref.open(file_name) as csv_file:
                    _save_dataframe_as_csv(csv_file, file_name)


def pull_csv_from_directory(bucket_name, directory, search_zipped=True):
    """
    Pulls CSV files from a specified directory in an S3 bucket and saves each
    one, via pandas, as '<name>.csv' in the current working directory.

    Parameters:
    - bucket_name (str): The name of the S3 bucket.
    - directory (str): The directory (key prefix) within the bucket to search for CSV files.
    - search_zipped (bool): If True, search for CSV files within zip files. If False, search for CSV files directly.

    Returns:
    - None. Progress is reported with print(); a message is printed when the
      prefix contains no objects at all.

    Notes:
    - Files are re-serialized through pandas (read_csv -> to_csv), so the local
      copy is not guaranteed to be byte-identical to the object in S3.
    - Local files are named after the final path component only, for both
      plain CSV keys and CSV members inside zip archives.
    """
    # Create an S3 client
    s3 = boto3.client('s3')

    # A single list_objects_v2 call returns at most 1,000 keys; the paginator
    # follows continuation tokens so large prefixes are not silently truncated.
    paginator = s3.get_paginator('list_objects_v2')

    found_any = False
    for page in paginator.paginate(Bucket=bucket_name, Prefix=directory):
        # Pages for an empty prefix carry no 'Contents' entry
        for obj in page.get('Contents', []):
            found_any = True
            key = obj['Key']

            if search_zipped and key.endswith('.zip'):
                _save_csvs_from_zip(s3, bucket_name, key)
            elif not search_zipped and key.endswith('.csv'):
                # Directly download the CSV file into memory
                csv_object = s3.get_object(Bucket=bucket_name, Key=key)
                csv_data = io.BytesIO(csv_object['Body'].read())
                _save_dataframe_as_csv(csv_data, key)

    if not found_any:
        print("No objects found in the specified directory.")

# S3 bucket and key prefix to pull the CSV files from
bucket_name = 'ca-climate-index'
aws_dir = '3_fair_data/dummy_data/'

# Fetch plain (non-zipped) CSV objects under the prefix
pull_csv_from_directory(
    bucket_name=bucket_name,
    directory=aws_dir,
    search_zipped=False,
)

Saved DataFrame as 'DUMMY_built_summed_indicators.csv'
Saved DataFrame as 'DUMMY_climate_indicator_product.csv'
Saved DataFrame as 'DUMMY_governance_summed_indicators.csv'
Saved DataFrame as 'DUMMY_natural_summed_indicators.csv'
Saved DataFrame as 'DUMMY_society_summed_indicators.csv'


In [18]:
# CSV files in the working directory that should be relocated.
# NOTE(review): 'DUMMY_climate_indicator_product.csv' is also saved by the
# download cell but is not listed here, so it stays where it is — confirm
# that this is intentional.
source_files = [
    'DUMMY_built_summed_indicators.csv',
    'DUMMY_governance_summed_indicators.csv',
    'DUMMY_natural_summed_indicators.csv',
    'DUMMY_society_summed_indicators.csv'
]

# Define the output folder path
output_folder = 'output_folder'

# Create the output folder; exist_ok avoids the check-then-create race and
# makes re-running the cell safe when the folder is already there
os.makedirs(output_folder, exist_ok=True)

# Move each source file into the output folder (replaces copy + delete)
for file in source_files:
    # Destination keeps the original file name inside the output folder
    destination_path = os.path.join(output_folder, os.path.basename(file))

    shutil.move(file, destination_path)


In [22]:
def merge_csv_files(input_folder):
    # Initialize an empty DataFrame to store the merged data
    master_df = pd.DataFrame()

    # Iterate over each CSV file in the folder
    for filename in os.listdir(input_folder):
        if filename.endswith('.csv'):
            filepath = os.path.join(input_folder, filename)
            # Read the CSV file
            df = pd.read_csv(filepath)
            # Extract 'GEOID' and 'min_max_standardized' columns
            subset_df = df[['GEOID', 'min_max_standardized']].copy()
            # Rename 'min_max_standardized' column with filename prefix
            subset_df.rename(columns={'min_max_standardized': f"{os.path.splitext(filename)[0]}_min_max_standardized"}, inplace=True)
            # Merge with master DataFrame based on 'GEOID'
            if master_df.empty:
                master_df = subset_df
            else:
                master_df = pd.merge(master_df, subset_df, on='GEOID', how='outer')

    # Save the master DataFrame to a new CSV file
    output_filename = 'merged_data.csv'
    output_filepath = os.path.join(input_folder, output_filename)
    master_df.to_csv(output_filepath, index=False)
    print(f"Merged data saved to {output_filepath}")

# Folder containing the CSV files to merge (previously a placeholder value
# that was never passed to the function)
input_folder = 'output_folder'

# Call the function
merge_csv_files(input_folder)


Merged data saved to output_folder\merged_data.csv
