# SDCTA
# San Diego County Homelessness Program Analysis Data Cleaning

## Setup

In [103]:
#To quickly install all req, uncomment line of code below (Note: Only run once!)
#!pip install -r requirements.txt

# Imports
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering

import os

In [97]:
# Reading in data
raw = pd.read_csv("data/raw/CityExpendituresRaw.csv")
raw.head(5)

Unnamed: 0,Unique.ID,Grantor,Grantee,Program,Year,Date,EndDate,Amount,AmendmentNumber,Funding.Agency,...,Issued,Funding.Type,Years,Average.By.Year,City.Year,Population,Amount.Per.Capita,Amount.Per.PEH,Population.PEH,ExpenditureType
0,,City of Imperial Beach,,,2022.0,,,0,,,...,,,,,City of Imperial Beach|2022,,$0.00,$0.00,0.0,Other/Unknown
1,,City of Imperial Beach,,,2021.0,,,0,,,...,,,,,City of Imperial Beach|2021,,$0.00,$0.00,0.0,Other/Unknown
2,,City of Imperial Beach,,,2020.0,,,0,,,...,,,,,City of Imperial Beach|2020,,$0.00,$0.00,16.0,Other/Unknown
3,,City of Imperial Beach,,,2019.0,,,0,,,...,,,,,City of Imperial Beach|2019,,$0.00,$0.00,12.0,Other/Unknown
4,,City of Imperial Beach,,,2018.0,,,0,,,...,,,,,City of Imperial Beach|2018,,$0.00,$0.00,7.0,Other/Unknown


## Basic Filtering
Since the goal is to filter the unique programs down as small as possible with the intention of combinining programs that do the same thing, but named slightly differently, checking if any of the `Program` row strings contain the name of another program could yield some level of shrinkage.


In [98]:
# Function to update program names by checking for occurrences in the 'Program' column
def update_program_column(processed):
    """
    Update program names by checking if other rows contain the same string.

    Parameters:
    - processed: The DataFrame containing the 'Program' column.

    Returns:
    - A new DataFrame with updated program names.
    """
    # Create a mapping of unique program values
    program_mapping = {val: val for val in processed['Program'].dropna().unique()}

    # Iterate over each row in the DataFrame
    for index, row in processed.iterrows():
        program = row['Program']
        # Check if the program is a string
        if isinstance(program, str):
            # Update other occurrences of the program name
            processed.loc[processed['Program'].str.contains(program, na=False), 'Program'] = program_mapping[program]

    return processed


# Function to replace dashes with spaces in the 'Program' column
def replace_dashes_with_spaces(processed):
    """
    Replace all dashes in the 'Program' column with spaces.

    Parameters:
    - processed: The DataFrame containing the 'Program' column.

    Returns:
    - A new DataFrame with the dashes replaced by spaces.
    """
    # Replace dashes with spaces in the 'Program' column
    processed['Program'] = processed['Program'].str.replace('-', ' ', regex=False)

    return processed

# Create copy of raw data frame for data cleaing
processed = raw.copy()

# Convert all program names to lowercase for standardization
processed['Program'] = processed['Program'].str.lower()

# Replace dashes with spaces in the 'Program' column
processed = replace_dashes_with_spaces(processed)

# Update program names based on occurrences in the DataFrame
processed = update_program_column(processed)

# Display the number of unique program names before and after processing
print("Unique program names in raw data:", len(raw["Program"].unique()))
print("Unique program names in processed data:", len(processed['Program'].unique()))


  processed.loc[processed['Program'].str.contains(program, na=False), 'Program'] = program_mapping[program]
  processed.loc[processed['Program'].str.contains(program, na=False), 'Program'] = program_mapping[program]
  processed.loc[processed['Program'].str.contains(program, na=False), 'Program'] = program_mapping[program]
  processed.loc[processed['Program'].str.contains(program, na=False), 'Program'] = program_mapping[program]
  processed.loc[processed['Program'].str.contains(program, na=False), 'Program'] = program_mapping[program]
  processed.loc[processed['Program'].str.contains(program, na=False), 'Program'] = program_mapping[program]
  processed.loc[processed['Program'].str.contains(program, na=False), 'Program'] = program_mapping[program]


Unique program names in raw data: 186
Unique program names in processed data: 107


This approach yielded pretty good results. Will try manual cleaning.

In [99]:
print(processed['Program'].unique()[:35])

[nan 'rental assistance' 'homeless services' 'shelter' 'motel voucher'
 'rapid re housing' 'project h.o.p.e.' 'take back the streets'
 'work for hope' 'housing navigator' 'improve fencing' 'hvac replacements'
 'acquisition of facilility for provision of homeless'
 'railing replacement' 'security fencing' 'outreach' 'scattered site'
 'supportive service  a way back home'
 'general funding for homelessness services' 'housing stability services'
 'housing navigation services' 'hygiene supplies' 'gift cards'
 'city housing support' 'fair housing' 'program development'
 '211 assistance' 'case management'
 'provide emergency housing to imminently homeless, or episodically and chronically homeless individuals and families in the city of santee, and who are unable to access housing during the coronavirus pandemic'
 'provide support for regional homeless service providers, networking and communication for organizations serving and impacted by homeless persons, and building capacity of the east 

In [100]:
def convert_program_value(processed, value_to_convert, new_value):
    """
    Convert a specified value in the 'Program' column of the processed DataFrame.

    Parameters:
    - processed: The DataFrame containing the 'Program' column.
    - value_to_convert: The value in the 'Program' column that you want to convert.
    - new_value: The new value to replace the old value with.

    Returns:
    - A new DataFrame with the specified conversion applied.
    """
    
    # Replace the specified value in the 'Program' column
    processed['Program'] = processed['Program'].replace(value_to_convert, new_value)

    return processed

In [101]:
processed = convert_program_value(processed,
                                    'address homeless issues through case management; provide food, shelter vouchers, as well as skill development for long-term self-sufficiency to 200 residents.',  
                                   'address homeless issues through case management, provide food, shelter vouchers, and skill development for long-term self sufficiency'
)

processed = convert_program_value(processed,
                                  'outreeach',
                                  'outreach'
)
print("Unique program names in processed data:", processed['Program'].nunique())

Unique program names in processed data: 105


Manual Cleaning was horrible. Going to try more advanced techniques

In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering

def cluster_programs_with_mapping(processed):
    """
    Cluster similar program names and create a mapping of original to representative names.

    Parameters:
    - processed: DataFrame containing the 'Program' column.

    Returns:
    - processed: Same DataFrame with updated program names.
    - change_mapping: A dictionary mapping original program names to their representative names.
    """
    
    # Extract unique program names while dropping NA values
    unique_programs = processed['Program'].dropna().unique().tolist()

    # Convert the program names into a TF-IDF matrix for clustering
    vectorizer = TfidfVectorizer().fit_transform(unique_programs)
    vectors = vectorizer.toarray()

    # Perform hierarchical clustering on the TF-IDF vectors
    clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=0.5)  # Adjust threshold as necessary
    clustering_model.fit(vectors)

    # Create a mapping of clusters to program names and track changes
    cluster_mapping = {}
    change_mapping = {}

    # Iterate over each unique cluster
    for cluster in set(clustering_model.labels_):
        # Get all program names in the current cluster
        cluster_programs = [unique_programs[i] for i in range(len(unique_programs)) if clustering_model.labels_[i] == cluster]
        representative_program = cluster_programs[0]  # Use the first program in the cluster as the representative

        # Map each program in the cluster to the representative program
        for program in cluster_programs:
            cluster_mapping[program] = representative_program
            if program != representative_program:
                change_mapping[program] = representative_program  # Record the change

    # Replace program values in the processed DataFrame with their representatives
    processed['Program'] = processed['Program'].replace(cluster_mapping)

    return processed, change_mapping


# Apply Cluster Cleaning
processed, changes = cluster_programs_with_mapping(processed)

# Display the number of unique program names in the processed DataFrame
print("Unique program names after clustering:", processed['Program'].nunique())
# Show what each program was changed from and to
print("Program changes mapping:", changes)


Unique program names after clustering: 105
Program changes mapping: {}


Cluter method didn't detect anything to change. Going to save `processed` data frame for as is for now.

In [105]:
# Create the 'data/processed' directory if it doesn't exist
os.makedirs('data/processed', exist_ok=True)

# Define the path for the CSV file
csv_file_path = os.path.join('data', 'processed', 'processed.csv')

# Save the DataFrame to a CSV file
processed.to_csv(csv_file_path, index=False)

print(f"Data saved to {csv_file_path}")

Data saved to data\processed\processed.csv


During the preparation of this work the author used ChatGPT in order to streamline the creation of the functions. After using this tool, the author identified and reviewed the content as needed and takes full responsibility for the content of the code and resulting processed data.
