In [1]:
import pandas as pd
import os
import sys
import numpy as np
import shutil
import glob

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [2]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '3_fair_data/index_data/'

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'built_housing_median_age_metric.csv'
Saved DataFrame as 'built_housing_mobile_homes_metric.csv'
Saved DataFrame as 'built_housing_quality_metric.csv'
Saved DataFrame as 'built_metric_housing_vacancy_metric.csv'
Saved DataFrame as 'built_power_plant_metric.csv'
Saved DataFrame as 'climate_wildfire_redbooks_loss_metric.csv'
Saved DataFrame as 'climate_wildfire_redbooks_loss_metric_buildings.csv'
Saved DataFrame as 'climate_wildfire_redbooks_loss_metric_fatalities.csv'
Saved DataFrame as 'governance_emergency_management_metric.csv'
Saved DataFrame as 'governance_hazard_mitigation_metric.csv'
Saved DataFrame as 'natural_air_quality_metric.csv'
Saved DataFrame as 'society_ambulatory_disability_metric.csv'
Saved DataFrame as 'society_american_indian_alaska_native_metric.csv'
Saved DataFrame as 'society_cognitive_disability_metric.csv'
Saved DataFrame as 'society_financial_assistance_metric.csv'
Saved DataFrame as 'society_food_access_metric.csv'
Saved DataFrame as 'societ

In [3]:
# Define the output folder path
output_folder = 'output_folder'

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Find all CSV files starting with 'society_'
source_files = [file for file in glob.glob('society_*.csv') if 'social' not in file]

# Iterate through the source files and copy only CSV files to the output folder
for file in source_files:
    # Construct the destination file path
    destination_path = os.path.join(output_folder, os.path.basename(file))
    
    # Copy the file to the output folder
    shutil.copyfile(file, destination_path)
    
    # Remove the original file
    os.remove(file)

print(f"Copied and removed {len(source_files)} CSV files.")

# Delete all CSV files in the current directory that are not in the output folder
current_files = glob.glob('*.csv')
for file in current_files:
    if file not in [os.path.basename(f) for f in source_files]:
        os.remove(file)

print(f"Deleted {len(current_files) - len(source_files)} local CSV files.")


Copied and removed 21 CSV files.
Deleted 1 local CSV files.


In [4]:
# Define the output folder path
output_folder = 'output_folder'

# Get a list of all CSV files in the output folder
csv_files = glob.glob(os.path.join(output_folder, '*.csv'))

# Initialize an empty DataFrame for merging
merged_df = pd.DataFrame()
# Iterate through each CSV file and merge them on the 'census_tract' column
for file in csv_files:
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file)
    
    # Keep only the 'census_tract' and the last column from each file
    last_column = df.columns[-1]
    df = df[['census_tract', last_column]]
    
    # Merge the DataFrame with the existing merged DataFrame
    if merged_df.empty:
        merged_df = df
    else:
        merged_df = pd.merge(merged_df, df, on='census_tract', how='outer')

# Save the merged DataFrame to a CSV file
merged_df.to_csv('concatenate_society_economy_metrics.csv', index=False)

print(f"Merged CSV saved as concatenate_society_economy_metrics.csv")

Merged CSV saved as concatenate_society_economy_metrics.csv


In [6]:
society_economy_metrics = pd.read_csv('concatenate_society_economy_metrics.csv')

In [9]:
# Function to handle outliers
def handle_outliers(df, output_csv):
    # Columns to process (exclude 'census_tract')
    columns_to_process = [col for col in df.columns if col != 'census_tract']
    for column in columns_to_process:
        # Convert the column to numeric, forcing any errors to NaN
        df[column] = pd.to_numeric(df[column], errors='coerce')
        
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        max_fence = 3 * Q3
        min_fence = -3 * Q1

        # Clip the outliers
        df[column] = df[column].clip(lower=min_fence, upper=max_fence)
    # Save the updated DataFrame back to CSV
    df.to_csv(output_csv, index=False)
    return df

In [26]:
def handle_outliers(df, output_csv):
    # Columns to process (exclude 'census_tract')
    columns_to_process = [col for col in df.columns if col != 'census_tract']
    
    for column in columns_to_process:
        # Convert the column to numeric, forcing any errors to NaN
        df[column] = pd.to_numeric(df[column], errors='coerce')
        
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1

        if IQR == 0:
            print(f"Column '{column}' has no IQR (Q1 == Q3). Skipping outlier handling for this column.")
            continue

        max_fence = Q3 + 1.5 * IQR
        min_fence = Q1 - 1.5 * IQR

        print(f'For column {column}:')
        print(f'  Q1 (25th percentile): {Q1}')
        print(f'  Q3 (75th percentile): {Q3}')
        print(f'  IQR: {IQR}')
        print(f'  Max fence: {max_fence}')
        print(f'  Min fence: {min_fence}')

        # Identify outliers
        outliers = df[(df[column] > max_fence) | (df[column] < min_fence)]
        
        # Print outliers and their corresponding 'census_tract'
        if not outliers.empty:
            print(f"Outliers detected in column '{column}':")
            for _, row in outliers.iterrows():
                print(f"census_tract: {row['census_tract']}, value: {row[column]}")
        
        # Clip the outliers
        df[column] = df[column].clip(lower=min_fence, upper=max_fence)
    
    # Save the updated DataFrame back to CSV
    df.to_csv(output_csv, index=False)
    return df

# Handle outliers
handle_outlier_csv = 'no_outlier_society_economy_metrics.csv'
no_outlier_society_economy_metrics = handle_outliers(society_economy_metrics, handle_outlier_csv)
print(f"Processed and saved {handle_outlier_csv} with outlier handling.")

Column 'percent_population_ambulatory_disabilities' has no IQR (Q1 == Q3). Skipping outlier handling for this column.
For column percent_total_pop_american_indian_alaska_native:
  Q1 (25th percentile): 0.0
  Q3 (75th percentile): 1.3
  IQR: 1.3
  Max fence: 3.25
  Min fence: -1.9500000000000002
Outliers detected in column 'percent_total_pop_american_indian_alaska_native':
census_tract: 6001403301.0, value: 3.9000000000000004
census_tract: 6001404800.0, value: 3.9000000000000004
census_tract: 6001406201.0, value: 3.9000000000000004
census_tract: 6001406202.0, value: 3.9000000000000004
census_tract: 6001406500.0, value: 3.9000000000000004
census_tract: 6001407102.0, value: 3.9000000000000004
census_tract: 6001407400.0, value: 3.9000000000000004
census_tract: 6001407600.0, value: 3.9000000000000004
census_tract: 6001407800.0, value: 3.6
census_tract: 6001408600.0, value: 3.9000000000000004
census_tract: 6001409100.0, value: 3.9000000000000004
census_tract: 6001409300.0, value: 3.8
census_

Processed and saved no_outlier_society_economy_metrics.csv with outlier handling.
