## Cal-CRAI Metric Calculation
Domain: Society & Economy \
Indicator: Vulnerable Populations

This notebook calculates one metric, sourced from the California Homelessness Data Integration System:
* Metric 1: Average percentage of population receiving homeless response services per California county

In [1]:
import pandas as pd
import os
import sys
import math
import numpy as np

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [None]:
# Fetch the raw CHDIS csv files from AWS into the local 'homeless_data' folder
bucket_name, folder = 'ca-climate-index', 'homeless_data'
aws_dir = '1_pull_data/society_economy/vulnerable_populations/ca_homelessness_data_integration_system/'

pull_csv_from_directory(bucket_name, aws_dir, folder, search_zipped=False)

In [None]:
# Load the gender-demographics extract and preview its size and first rows
raw_csv_path = r'homeless_data/experiencing_homelessness_gender_demographics.csv'
homelessness_data = pd.read_csv(raw_csv_path)
print(homelessness_data.shape[0])
homelessness_data.head(10)

In [None]:
# Reduce the Continuum of Care (CoC) labels in LOCATION to bare county names.
# Replacements are applied in order: the longer ' County CoC' / ' Counties CoC'
# suffixes must be removed before the generic ' CoC' suffix, and the city-level
# CoCs are then folded into Los Angeles county.
location_replacements = [
    (' County CoC', ''),
    (' Counties CoC', ''),
    (' CoC', ''),
    ('Glendale (Los Angeles County)', 'Los Angeles'),
    ('Pasadena (Los Angeles County)', 'Los Angeles'),
    ('Long Beach (Los Angeles County)', 'Los Angeles'),
]

county_names = homelessness_data['LOCATION']
for pattern, replacement in location_replacements:
    # regex=False is explicit so the parentheses in the city labels are matched
    # literally instead of depending on the installed pandas default
    county_names = county_names.str.replace(pattern, replacement, case=False, regex=False)
homelessness_data['COUNTY'] = county_names

# Convert 'EXPERIENCING_HOMELESSNESS' to numeric; unparseable entries become NaN
homelessness_data['EXPERIENCING_HOMELESSNESS'] = pd.to_numeric(homelessness_data['EXPERIENCING_HOMELESSNESS'], errors='coerce')

# Drop any leading or trailing whitespace from the 'COUNTY' column
homelessness_data['COUNTY'] = homelessness_data['COUNTY'].str.strip()
unique_entries = homelessness_data['COUNTY'].unique()
unique_entries

In [5]:
# Split the rows using the project helper `filter_counties`, keyed on COUNTY.
# Presumably the first result holds rows whose COUNTY is a recognised California
# county and the second holds every other row (e.g. the statewide and
# multi-county entries) -- confirm against the helper's definition.
filtered_homelessness_data, omitted_data = filter_counties(homelessness_data, county_column='COUNTY')

In [None]:
# Inspect the labels that did not match a single California county.
# Once the statewide 'California' rows are removed, the remaining labels
# each name several counties.
not_statewide = omitted_data['COUNTY'] != 'California'
omitted_data = omitted_data[not_statewide]
unique_entries = omitted_data['COUNTY'].unique()
unique_entries

In [None]:
# Row count and first rows of the multi-county data
print(omitted_data.shape[0])
omitted_data.head()

### Function to handle the df holding rows with multiple counties

* splits multiple counties into their own rows
* divides the 'Experiencing_homelessness' values by the number of counties

In [None]:
# Function to split rows with multiple counties
def split_county(row):
    """Expand one multi-county row into one row per county.

    The comma-separated names in ``row['COUNTY']`` are separated and
    ``row['EXPERIENCING_HOMELESSNESS']`` is shared equally between them. True
    division is used (not floor division) so the per-county shares add back up
    to the value of the original row; the shares may therefore be fractional.

    Parameters
    ----------
    row : pandas.Series
        A record with at least the 'COUNTY' (string) and
        'EXPERIENCING_HOMELESSNESS' (numeric or NaN) entries.

    Returns
    -------
    pandas.DataFrame
        One row per county, with every other entry copied from ``row``.
    """
    # Split on commas and strip, so both 'A, B' and 'A,B' are handled
    county_names = [name.strip() for name in row['COUNTY'].split(',')]
    share = row['EXPERIENCING_HOMELESSNESS'] / len(county_names)

    split_rows = []
    for name in county_names:
        county_row = row.copy()
        county_row['COUNTY'] = name
        county_row['EXPERIENCING_HOMELESSNESS'] = share
        split_rows.append(county_row)
    return pd.DataFrame(split_rows)

# Expand every multi-county row, then stack the per-county pieces into one frame
split_frames = []
for _, multi_county_row in omitted_data.iterrows():
    split_frames.append(split_county(multi_county_row))
new_rows = pd.concat(split_frames, ignore_index=True)

# Discard rows that contain any missing value
cleaned_dropped_rows = new_rows.dropna()
print(cleaned_dropped_rows.shape[0])
cleaned_dropped_rows.head()

In [None]:
# Spot check: pull the Alpine rows so they can be compared with the grouped totals
is_alpine = cleaned_dropped_rows['COUNTY'] == 'Alpine'
alpine = cleaned_dropped_rows[is_alpine]
alpine.head()

In [None]:
# Total EXPERIENCING_HOMELESSNESS for each (COUNTY, CALENDAR_YEAR) pair
split_group_keys = ['COUNTY', 'CALENDAR_YEAR']
split_groups = cleaned_dropped_rows.groupby(split_group_keys)
summed_dropped_rows = split_groups['EXPERIENCING_HOMELESSNESS'].sum().reset_index()
print(summed_dropped_rows.shape[0])
summed_dropped_rows.head()

### Further cleaning of the data that was not omitted by the filter counties function

In [None]:
# Keep only the columns needed for the metric
keep_columns = ['CALENDAR_YEAR', 'COUNTY', 'EXPERIENCING_HOMELESSNESS']
filtered_homelessness_data = filtered_homelessness_data[keep_columns]
filtered_homelessness_data

In [None]:
# Spot check: pull the Alameda rows so their totals can be compared with the grouped data below
is_alameda = filtered_homelessness_data['COUNTY'] == 'Alameda'
alameda = filtered_homelessness_data[is_alameda]
alameda.head(6)

In [None]:
# Collapse the per-gender rows into a single total for each county and calendar year
filtered_group_keys = ['COUNTY', 'CALENDAR_YEAR']
filtered_groups = filtered_homelessness_data.groupby(filtered_group_keys)
summed_filtered_homelessness_data = filtered_groups['EXPERIENCING_HOMELESSNESS'].sum().reset_index()
summed_filtered_homelessness_data.head()

### Adding the cleaned dataframes containing homeless data together

In [None]:
# Stack the single-county totals and the split multi-county totals, then order by county
county_year_frames = [summed_filtered_homelessness_data, summed_dropped_rows]
concatenated_df = pd.concat(county_year_frames, ignore_index=True)
homelessness_county_year = concatenated_df.sort_values(by='COUNTY')
homelessness_county_year.head()

### Import California county population estimate data from AWS

In [None]:
est_pop = "s3://ca-climate-index/0_map_data/county_est_pop_2022.csv"
# Read the county population estimates, align the column names with the
# homelessness data for merging, and discard the saved index column.
# NOTE(review): the file name says 2022 but the column is labelled 2021 -- confirm the vintage.
ca_est_county_pop = (
    pd.read_csv(est_pop)
    .rename(columns={'county': 'COUNTY', 'est_total_pop': 'est_population_2021'})
    .drop(columns=['Unnamed: 0'])
)

ca_est_county_pop.head()

In [None]:
# Attach each county's population estimate to its yearly homelessness totals via 'COUNTY'
population_homelessness_merge = homelessness_county_year.merge(ca_est_county_pop, on='COUNTY')
population_homelessness_merge.head()

In [None]:
# Express each county-year count as a percentage of the county population estimate
homeless_count = population_homelessness_merge['EXPERIENCING_HOMELESSNESS']
county_population = population_homelessness_merge['est_population_2021']
population_homelessness_merge['percent_pop_homeless'] = (homeless_count / county_population) * 100
population_homelessness_merge.head()

In [None]:
# Average the yearly percentages to get one value per county
percent_by_county = population_homelessness_merge.groupby('COUNTY')['percent_pop_homeless']
average_percent_pop_homeless = percent_by_county.mean().reset_index()
average_percent_pop_homeless.tail()

In [None]:
ca_tract = "s3://ca-climate-index/0_map_data/ca_tract_county_population_2021.csv"
# Read the tract-to-county table, discard the saved index column and
# rename the county column so it matches the metric data
ca_tract_county = (
    pd.read_csv(ca_tract)
    .drop(columns='Unnamed: 0')
    .rename(columns={'County': 'COUNTY'})
)

ca_tract_county

In [None]:
# Give every tract the average percentage of its county.
# NOTE(review): how='right' keeps every county in the metric table, so tracts
# whose county has no metric value are not carried over -- confirm this is intended.
county_to_tract_homelessness = ca_tract_county.merge(average_percent_pop_homeless, on='COUNTY', how='right')
output_columns = ['Census Tract', 'COUNTY', 'percent_pop_homeless']
county_to_tract_homelessness = county_to_tract_homelessness[output_columns]

county_to_tract_homelessness

In [24]:
# Write the tract-level metric to a local csv in the working directory.
# index=False is not passed, so the DataFrame index is written as an unnamed first column.
county_to_tract_homelessness.to_csv('society_vulnerable_homelessness_metric.csv')

Function Call(s)

In [25]:
@append_metadata
def calc_homelessness_services_percent(input_csv, export=False, varname = ''):
    '''
    Calculates the average percentage of population receiving homelessness response services per California
    county, sourced from the California Homelessness Data Integration System: 
    https://data.ca.gov/dataset/homelessness-demographics. 

    "Homelessness response services" is defined by CHDIS as individuals who at any point in the selected timeframe:
    (1) accessed lodging services through emergency shelter, transitional housing, and/or safe haven projects
    (2) entered into a permanent housing project from homelessness
    (3) reported living in a homeless situation (including living in a location not meant for habitation) at
    the time they accessed other services

    Methods
    -------
    Data was cleaned to translate CHDIS "Continuum of Care" regions to county. While not an exact 1 to 1 relationship, 
    estimates were divided amongst counties so that the values sum to the Continuum of Care count.
    Estimates for each county and year were divided by 2021 American Community Survey (ACS) population
    to calculate percentage of population receiving homelessness response services.
    Percent values per county were averaged across the ~7 year dataset to calculate average percent
    of population receiving homelessness response services per CA county.
    The percent values were then merged in with 2021 ACS tract data so each tract had the avg percent homelessness
    value from its respective county.
    
    Parameters
    ----------
    input_csv: string
        path of the local csv file containing the calculated CAL CRAI homelessness response services metric
    export: True/False boolean
        False = prints the data transformation steps and does not upload the metric csv to AWS
        True = uploads the metric csv to AWS, then deletes the local copy
    varname: string
        name of the metric variable; not used inside the function body
        (presumably consumed by the append_metadata decorator -- confirm)

    Script
    ------
    society_vulnerable_homelessness.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    if not export:
        print('Data transformation: translating location column to exclusively state county name.')
        print('Data transformation: adjusting column types to numeric for future calculations.')
        print('Data transformation: splitting multi-county locations to per county. Values sum to raw data location count.')
        print('Data transformation: import 2021 ACS county data and calculate percent of population receiving homelessness response services.')
        print('Data transformation: merge with 2021 ACS tract data so each tract within a given county has that respective counties metric value.')

    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        upload_csv_aws([input_csv], bucket_name, directory)

        # input_csv is a path string, so test and remove the whole path
        # (indexing it with [0] would only look at its first character).
        # The local copy is deleted only after the upload, so a prior
        # export=False run leaves the file in place for the export run.
        if os.path.exists(input_csv):
            os.remove(input_csv)

    # Remove the raw data file from the working directory if it is there, to clear up the directory.
    # NOTE(review): the notebook reads this file from the 'homeless_data/' folder -- confirm which path should be cleaned.
    if os.path.exists('experiencing_homelessness_gender_demographics.csv'):
        os.remove('experiencing_homelessness_gender_demographics.csv')

In [26]:
# Name of the metric csv written earlier in this notebook
homelessness_metric = 'society_vulnerable_homelessness_metric.csv'

# export=False: print the transformation steps without uploading the csv to AWS
calc_homelessness_services_percent(homelessness_metric, export=False, varname='society_vulnerable_percent_homelessness_services')