## This notebook calulates the following Cal-CRAI metric within the society domain:
* the average percentage of population receiving homeless response services per California county,
 
This data is sourced from the California Homelessness Data Integration System: 
https://data.ca.gov/dataset/homelessness-demographics

In [1]:
import pandas as pd
import os
import sys
import math
import numpy as np

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [2]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/society_economy/vulnerable_populations/ca_homelessness_data_integration_system/'
folder = 'homeless_data'

pull_csv_from_directory(bucket_name, aws_dir, folder, search_zipped=False)

Saved DataFrame as 'homeless_data\experiencing_homelessness_gender_demographics.csv'


In [3]:
homelessness_data = pd.read_csv(r'homeless_data/experiencing_homelessness_gender_demographics.csv')
print(len(homelessness_data))
homelessness_data.head(10)
#os.remove('homelessness_data.csv')

1480


Unnamed: 0,CALENDAR_YEAR,LOCATION_ID,LOCATION,GENDER,EXPERIENCING_HOMELESSNESS
0,2017,All,California,Female,79670
1,2017,All,California,Male,101901
2,2017,All,California,Non-Singular Gender,148
3,2017,All,California,Questioning Gender,*
4,2017,All,California,Transgender,676
5,2017,All,California,Unknown,1505
6,2017,CA-500,Santa Clara County CoC,Female,3877
7,2017,CA-500,Santa Clara County CoC,Male,6225
8,2017,CA-500,Santa Clara County CoC,Non-Singular Gender,*
9,2017,CA-500,Santa Clara County CoC,Transgender,39


In [4]:
# Remove any non-county entries within the location column
homelessness_data['COUNTY'] = homelessness_data['LOCATION'].str.replace(' County CoC', '', case=False)
homelessness_data['COUNTY'] = homelessness_data['COUNTY'].str.replace(' Counties CoC', '', case=False)
homelessness_data['COUNTY'] = homelessness_data['COUNTY'].str.replace(' CoC', '', case=False)
homelessness_data['COUNTY'] = homelessness_data['COUNTY'].str.replace('Glendale (Los Angeles County)', 'Los Angeles', case=False)
homelessness_data['COUNTY'] = homelessness_data['COUNTY'].str.replace('Pasadena (Los Angeles County)', 'Los Angeles', case=False)
homelessness_data['COUNTY'] = homelessness_data['COUNTY'].str.replace('Long Beach (Los Angeles County)', 'Los Angeles', case=False)

# Convert 'EXPERIENCING_HOMELESSNESS' column to numeric
homelessness_data['EXPERIENCING_HOMELESSNESS'] = pd.to_numeric(homelessness_data['EXPERIENCING_HOMELESSNESS'], errors='coerce')

# Drop any leading or trailing whitespace from the 'county' column
homelessness_data['COUNTY'] = homelessness_data['COUNTY'].str.strip()
unique_entries = homelessness_data['COUNTY'].unique()
unique_entries

array(['California', 'Santa Clara', 'San Francisco', 'Alameda',
       'Sacramento', 'Sonoma', 'Contra Costa', 'Monterey, San Benito',
       'Marin', 'Santa Cruz', 'Mendocino', 'Stanislaus', 'San Joaquin',
       'San Mateo', 'Kings, Tulare', 'Fresno, Madera', 'Placer',
       'Shasta, Siskiyou, Lassen, Plumas, Del Norte, Modoc, Sierra',
       'Napa', 'Solano', 'Butte', 'Merced', 'Yolo', 'Humboldt',
       'Colusa, Glenn, Trinity', 'Yuba, Sutter', 'El Dorado',
       'Amador, Calaveras, Mariposa, Tuolumne', 'Tehama', 'Lake',
       'Alpine, Inyo, Mono', 'Nevada', 'Los Angeles', 'San Diego',
       'Orange', 'Santa Barbara', 'Kern', 'Riverside', 'San Bernardino',
       'Ventura', 'Imperial', 'San Luis Obispo'], dtype=object)

In [5]:
filtered_homelessness_data, omitted_data = filter_counties(homelessness_data, county_column='COUNTY')

In [6]:
# View unique entries from the df that holds all entries not in our California counties function
# After getting rid of California as an entry, all entries hold multiple counties
omitted_data = omitted_data[omitted_data['COUNTY'] != 'California']
unique_entries = omitted_data['COUNTY'].unique()
unique_entries

array(['Monterey, San Benito', 'Kings, Tulare', 'Fresno, Madera',
       'Shasta, Siskiyou, Lassen, Plumas, Del Norte, Modoc, Sierra',
       'Colusa, Glenn, Trinity', 'Yuba, Sutter',
       'Amador, Calaveras, Mariposa, Tuolumne', 'Alpine, Inyo, Mono'],
      dtype=object)

In [7]:
print(len(omitted_data))
omitted_data.head(5)

218


Unnamed: 0,CALENDAR_YEAR,LOCATION_ID,LOCATION,GENDER,EXPERIENCING_HOMELESSNESS,COUNTY
38,2017,CA-506,"Monterey, San Benito Counties CoC",Female,1224.0,"Monterey, San Benito"
39,2017,CA-506,"Monterey, San Benito Counties CoC",Male,988.0,"Monterey, San Benito"
40,2017,CA-506,"Monterey, San Benito Counties CoC",Non-Singular Gender,,"Monterey, San Benito"
41,2017,CA-506,"Monterey, San Benito Counties CoC",Questioning Gender,,"Monterey, San Benito"
42,2017,CA-506,"Monterey, San Benito Counties CoC",Transgender,,"Monterey, San Benito"


### Function to handle the df holding rows with multiple counties

* splits multiple counties into their own rows
* divides the 'Experiencing_homelessness' values by the number of counties

In [8]:
# Function to split rows with multiple counties
def split_county(row):
    counties = row['COUNTY'].split(', ')
    num_counties = len(counties)
    new_rows = []
    for county in counties:
        new_row = row.copy()
        new_row['COUNTY'] = county
        new_row['EXPERIENCING_HOMELESSNESS'] //= num_counties
        new_rows.append(new_row)
    return pd.DataFrame(new_rows)

# Apply the function to each row and concatenate the results
new_rows = pd.concat([split_county(row) for _, row in omitted_data.iterrows()], ignore_index=True)
cleaned_dropped_rows = new_rows.dropna()
print(len(cleaned_dropped_rows))
cleaned_dropped_rows.head(5)

406


Unnamed: 0,CALENDAR_YEAR,LOCATION_ID,LOCATION,GENDER,EXPERIENCING_HOMELESSNESS,COUNTY
0,2017,CA-506,"Monterey, San Benito Counties CoC",Female,612.0,Monterey
1,2017,CA-506,"Monterey, San Benito Counties CoC",Female,612.0,San Benito
2,2017,CA-506,"Monterey, San Benito Counties CoC",Male,494.0,Monterey
3,2017,CA-506,"Monterey, San Benito Counties CoC",Male,494.0,San Benito
10,2017,CA-513,"Kings, Tulare Counties CoC",Female,705.0,Kings


In [9]:
# Fact checking Alpine county with Alpine county after grouping the data
alpine = cleaned_dropped_rows[cleaned_dropped_rows['COUNTY']=='Alpine']
alpine.head()

Unnamed: 0,CALENDAR_YEAR,LOCATION_ID,LOCATION,GENDER,EXPERIENCING_HOMELESSNESS,COUNTY
67,2017,CA-530,"Alpine, Inyo, Mono Counties CoC",Female,26.0,Alpine
70,2017,CA-530,"Alpine, Inyo, Mono Counties CoC",Male,27.0,Alpine
156,2018,CA-530,"Alpine, Inyo, Mono Counties CoC",Female,42.0,Alpine
159,2018,CA-530,"Alpine, Inyo, Mono Counties CoC",Male,31.0,Alpine
241,2019,CA-530,"Alpine, Inyo, Mono Counties CoC",Female,26.0,Alpine


In [10]:
# Group by the 'Name' column and aggregate the 'Value' column
summed_dropped_rows = cleaned_dropped_rows.groupby(['COUNTY', 'CALENDAR_YEAR'])['EXPERIENCING_HOMELESSNESS'].sum().reset_index()
print(len(summed_dropped_rows))
summed_dropped_rows.head(5)

171


Unnamed: 0,COUNTY,CALENDAR_YEAR,EXPERIENCING_HOMELESSNESS
0,Alpine,2017,53.0
1,Alpine,2018,73.0
2,Alpine,2019,50.0
3,Alpine,2020,72.0
4,Alpine,2021,74.0


### Now further cleaning the data that got didnt get omitted from the filter county function

In [11]:
filtered_homelessness_data = filtered_homelessness_data[['CALENDAR_YEAR', 'COUNTY', 'EXPERIENCING_HOMELESSNESS']]
filtered_homelessness_data

Unnamed: 0,CALENDAR_YEAR,COUNTY,EXPERIENCING_HOMELESSNESS
6,2017,Santa Clara,3877.0
7,2017,Santa Clara,6225.0
8,2017,Santa Clara,
9,2017,Santa Clara,39.0
10,2017,Santa Clara,15.0
...,...,...,...
1475,2023,San Luis Obispo,1069.0
1476,2023,San Luis Obispo,1395.0
1477,2023,San Luis Obispo,
1478,2023,San Luis Obispo,


In [12]:
# Fact checking the totals from Alameda with the totals in Alameda after grouping the data below
alameda = filtered_homelessness_data[filtered_homelessness_data['COUNTY']=='Alameda']
alameda.head(6)

Unnamed: 0,CALENDAR_YEAR,COUNTY,EXPERIENCING_HOMELESSNESS
16,2017,Alameda,3015.0
17,2017,Alameda,3548.0
18,2017,Alameda,
19,2017,Alameda,
20,2017,Alameda,17.0
21,2017,Alameda,23.0


In [13]:
# Group the homeless data by county and calendar year and add the experiencing homeless values
# This eliminates the multiple entries from the gender column into total counts per county per year 
summed_filtered_homelessness_data = filtered_homelessness_data.groupby(['COUNTY', 'CALENDAR_YEAR'])['EXPERIENCING_HOMELESSNESS'].sum().reset_index()
summed_filtered_homelessness_data.head(5)

Unnamed: 0,COUNTY,CALENDAR_YEAR,EXPERIENCING_HOMELESSNESS
0,Alameda,2017,6603.0
1,Alameda,2018,6481.0
2,Alameda,2019,7550.0
3,Alameda,2020,9124.0
4,Alameda,2021,10118.0


### Adding the cleaned dataframes containing homeless data together

In [14]:
# Concatenate the dataframes
concatenated_df = pd.concat([summed_filtered_homelessness_data, summed_dropped_rows], ignore_index=True)
homelessness_county_year = concatenated_df.sort_values(by='COUNTY')
homelessness_county_year.head(5)

Unnamed: 0,COUNTY,CALENDAR_YEAR,EXPERIENCING_HOMELESSNESS
0,Alameda,2017,6603.0
1,Alameda,2018,6481.0
2,Alameda,2019,7550.0
3,Alameda,2020,9124.0
4,Alameda,2021,10118.0


### Import California county population estimate data from AWS

In [17]:
est_pop = "s3://ca-climate-index/0_map_data/county_est_pop_2022.csv"
ca_est_county_pop = pd.read_csv(est_pop)
# Rename columns for future merging
ca_est_county_pop = ca_est_county_pop.rename(columns={'county': 'COUNTY'})
ca_est_county_pop = ca_est_county_pop.rename(columns={'est_total_pop': 'est_population_2021'})
ca_est_county_pop = ca_est_county_pop.drop(columns=['Unnamed: 0'])

ca_est_county_pop.head()

Unnamed: 0,COUNTY,est_population_2021
0,Alameda,1663823
1,Alpine,1515
2,Amador,40577
3,Butte,213605
4,Calaveras,45674


In [18]:
# Merge population estimates and cleaned homelessness data based on the shared 'COUNTY' column
population_homelessness_merge = pd.merge(homelessness_county_year, ca_est_county_pop, on='COUNTY')
population_homelessness_merge.head(5)

Unnamed: 0,COUNTY,CALENDAR_YEAR,EXPERIENCING_HOMELESSNESS,est_population_2021
0,Alameda,2017,6603.0,1663823
1,Alameda,2018,6481.0,1663823
2,Alameda,2019,7550.0,1663823
3,Alameda,2020,9124.0,1663823
4,Alameda,2021,10118.0,1663823


In [19]:
# Calculate percent population homeless
population_homelessness_merge['percent_pop_homeless'] = (population_homelessness_merge['EXPERIENCING_HOMELESSNESS']/population_homelessness_merge['est_population_2021'])*100
population_homelessness_merge.head(5)

Unnamed: 0,COUNTY,CALENDAR_YEAR,EXPERIENCING_HOMELESSNESS,est_population_2021,percent_pop_homeless
0,Alameda,2017,6603.0,1663823,0.396857
1,Alameda,2018,6481.0,1663823,0.389525
2,Alameda,2019,7550.0,1663823,0.453774
3,Alameda,2020,9124.0,1663823,0.548376
4,Alameda,2021,10118.0,1663823,0.608118


In [20]:
# Group by COUNTY and calculate the average of percent_pop_homeless
average_percent_pop_homeless = population_homelessness_merge.groupby('COUNTY')['percent_pop_homeless'].mean().reset_index()
average_percent_pop_homeless.tail(5)

Unnamed: 0,COUNTY,percent_pop_homeless
53,Tulare,0.352914
54,Tuolumne,0.20574
55,Ventura,0.261874
56,Yolo,0.541057
57,Yuba,0.947555


In [22]:
ca_tract = "s3://ca-climate-index/0_map_data/ca_tract_county_population_2021.csv"
ca_tract_county = pd.read_csv(ca_tract)
ca_tract_county = ca_tract_county.drop(columns='Unnamed: 0')
ca_tract_county = ca_tract_county.rename(columns={'County': 'COUNTY'})

ca_tract_county

Unnamed: 0,Census Tract,COUNTYFP,COUNTY,Total Population 2021
0,6085504321,85,Santa Clara,5412
1,6085504410,85,Santa Clara,4124
2,6085507003,85,Santa Clara,3074
3,6085507004,85,Santa Clara,3926
4,6085502204,85,Santa Clara,3242
...,...,...,...,...
9124,6059001303,59,Orange,6515
9125,6059001304,59,Orange,3565
9126,6059001401,59,Orange,4756
9127,6013367200,13,Contra Costa,5869


In [23]:
county_to_tract_homelessness = pd.merge(ca_tract_county, average_percent_pop_homeless, on='COUNTY', how='right')
county_to_tract_homelessness = county_to_tract_homelessness[['Census Tract', 'COUNTY', 'percent_pop_homeless']]

county_to_tract_homelessness

Unnamed: 0,Census Tract,COUNTY,percent_pop_homeless
0,6001428301,Alameda,0.528619
1,6001428302,Alameda,0.528619
2,6001428400,Alameda,0.528619
3,6001430900,Alameda,0.528619
4,6001431000,Alameda,0.528619
...,...,...,...
9124,6115041001,Yuba,0.947555
9125,6115040304,Yuba,0.947555
9126,6115040202,Yuba,0.947555
9127,6115040201,Yuba,0.947555


In [24]:
county_to_tract_homelessness.to_csv('society_vulnerable_homelessness_metric.csv')

Function Call(s)

In [25]:
@append_metadata
def calc_homelessness_services_percent(input_csv, export=False, varname = ''):
    '''
    Calculates the average percentage of population receiving homelessness response services per California
    county, sourced from the California Homelessness Data Integration System: 
    https://data.ca.gov/dataset/homelessness-demographics. 

    "Homelessness response services" is defined by CHDIS as individuals who at any point in the selected timeframe:
    (1) accessed lodging services through emergency shelter, transitional housing, and/or safe haven projects
    (2) entered into a permanent housing project from homelessness
    (3) reported living in a homeless situation (including living in a location not meant for habitation) at
    the time they accessed other services

    Methods
    -------
    Data was cleaned to translate CHDIS "Continuum of Care" regions to county. While not an exact 1 to 1 relationship, 
    estimates were divided amongst counties so that the the values sum to the Continuum of Care count.
    Estimates for each county and year were divided by 2021 American Community Survey (ACS) population
    to calculate percentage of population receiving homelessness response services.
    Percent values per county were averaged across the ~7 year dataset to calculate average percent
    of population receiving homelessness response services per CA county.
    The percent values were then merged in with 2021 ACS tract data so each tract had the avg percent homelessness
    value from its respective county.
    
    Parameters
    ----------
    df: string
        the dataframe containing the initial homelessness response services data
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI homelessness response services metric to AWS
        True = will upload resulting df containing CAL CRAI homelessness response services metric to AWS
    export_filename: string
        name of the csv file to be uploaded to AWS

    Script
    ------
    society_vulnerable_homelessness.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    if export == False:
        print('Data transformation: translating location column to exclusively state county name.')
        print('Data transformation: adjusting column types to numeric for future calculations.')
        print('Data transformation: splitting multi-county locations to per county. Values sum to raw data location count.')
        print('Data transformation: import 2021 ACS county data and calculate percent of population receiving homelessness response services.')
        print('Data transformation: merge with 2021 ACS tract data so each tract within a given county has that respective counties metric value.')
        
    if export == True: 
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        upload_csv_aws([input_csv], bucket_name, directory)

    # Check if the file exists before attempting to remove it
    if os.path.exists('experiencing_homelessness_gender_demographics.csv'):
        os.remove('experiencing_homelessness_gender_demographics.csv')  # remove from local to clear up directory

    if os.path.exists(input_csv[0]):
        os.remove(input_csv[0])

In [26]:
homelessness_metric = 'society_vulnerable_homelessness_metric.csv'

calc_homelessness_services_percent(homelessness_metric, export=False, varname='society_vulnerable_percent_homelessness_services')