In [26]:
import os

from cities.queries.fips_query import CTFipsQuery
from cities.utils.data_grabber import CTDataGrabberCSV, find_repo_root, list_available_features
import time

# Repository root; later cells build the census-tract CSV paths from it.
root = find_repo_root()

#### Defining the Functions Used for the Comparison

The goal is to demonstrate that the size of the census-tract-level variables is responsible for the poor time performance of `CTFipsQuery`. To achieve this, we compare the execution time of a `CTFipsQuery` query (construction followed by `find_euclidean_kins`) on the full `population` census-tract-level variable with its execution time on the first half and the first quarter of that variable's rows.


In [28]:
def save_first_half_of_population_data(data, feature_type, fraction, file_path, pre_or_post='pre_2020'):
    """Write the leading ``fraction`` of the ``population`` rows to a CSV file.

    Despite the name, either the first half or the first quarter of the rows
    is saved, depending on ``fraction``.

    Args:
        data: Ignored. Kept only so existing call sites keep working; the
            function always builds its own ``CTDataGrabberCSV``.
        feature_type: ``'std_wide'`` or ``'wide'`` -- which representation of
            the ``population`` feature to fetch from the grabber.
        fraction: Share of rows to keep; must be ``0.25`` or ``0.5``.
        file_path: Destination CSV path. Missing parent directories are created.
        pre_or_post: Forwarded to ``CTDataGrabberCSV`` as ``ct_time_period``.

    Raises:
        ValueError: If ``fraction`` or ``feature_type`` is not supported.
    """
    if fraction not in [0.25, 0.5]:
        raise ValueError("Fraction must be 0.25 or 0.5.")

    # Validate up front so a bad argument fails before a grabber is constructed.
    if feature_type not in ('std_wide', 'wide'):
        raise ValueError("Invalid feature_type. Must be 'std_wide' or 'wide'.")

    # Use a dedicated local name instead of silently overwriting the `data` argument.
    grabber = CTDataGrabberCSV(ct_time_period=pre_or_post)

    if feature_type == 'std_wide':
        grabber.get_features_std_wide(['population'])
        pop = grabber.std_wide['population']
    else:
        grabber.get_features_wide(['population'])
        pop = grabber.wide['population']

    num_to_save = int(len(pop) * fraction)
    selected_data = pop.iloc[:num_to_save]

    # os.makedirs('') raises, so only create a directory when the path has one.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    selected_data.to_csv(file_path, index=False)



def delete_file(file_path):
    """Remove ``file_path`` if it is an existing regular file and report the outcome on stdout."""
    if not os.path.isfile(file_path):
        print(f"File not found: {file_path}")
        return

    os.remove(file_path)
    print(f"Deleted: {file_path}")



In [34]:
def measure_time(command):
    """Call ``command()`` once and return the elapsed time in seconds as a float.

    The interval is measured with ``time.perf_counter``, a monotonic
    high-resolution clock, so system wall-clock adjustments cannot skew the
    result or make it negative. The return value of ``command`` is discarded.
    """
    start_time = time.perf_counter()
    command()
    return time.perf_counter() - start_time

def run_query(fips, population_type, ct_time_period):
    """Build a ``CTFipsQuery`` for the given arguments and run ``find_euclidean_kins`` on it.

    Nothing is returned; the function exists so the whole query can be timed.
    """
    query = CTFipsQuery(fips, population_type, ct_time_period=ct_time_period)
    query.find_euclidean_kins()

def main(fips, ct_time = "pre_2020"):
    """Time ``run_query`` on the full, half and quarter population variables and print the results.

    All three queries are timed first; the three report lines are printed afterwards.
    """
    variants = ("population", "populationHalf", "populationQuarter")

    # Dict insertion order keeps both the timing order and the print order fixed.
    timings = {
        variant: measure_time(lambda v=variant: run_query(fips, v, ct_time))
        for variant in variants
    }

    for variant, seconds in timings.items():
        print(f"Time for '{variant}': {seconds:.4f} seconds")

The test for the `pre_2020` population variable

In [33]:
# Create half- and quarter-sized copies of the pre-2020 `population` files,
# time the queries against them, and always remove the copies afterwards.

# halves
std_wide_path = f"{root}/data/Census_tract_level/populationHalf_pre2020_CT_std_wide.csv"
wide_path = f"{root}/data/Census_tract_level/populationHalf_pre2020_CT_wide.csv"

# quarters
std_wide_pathQ = f"{root}/data/Census_tract_level/populationQuarter_pre2020_CT_std_wide.csv"
wide_pathQ = f"{root}/data/Census_tract_level/populationQuarter_pre2020_CT_wide.csv"

file_paths = [std_wide_path, wide_path, std_wide_pathQ, wide_pathQ]

try:
    # The first argument is ignored by save_first_half_of_population_data (it
    # builds its own grabber), so pass None rather than an undefined `data`.
    save_first_half_of_population_data(None, 'std_wide', 0.5, std_wide_path, pre_or_post='pre_2020')
    save_first_half_of_population_data(None, 'wide', 0.5, wide_path, pre_or_post='pre_2020')
    save_first_half_of_population_data(None, 'std_wide', 0.25, std_wide_pathQ, pre_or_post='pre_2020')
    save_first_half_of_population_data(None, 'wide', 0.25, wide_pathQ, pre_or_post='pre_2020')

    # the test
    fips_number = 1003010705
    main(fips_number, "pre_2020")
finally:
    # deleting the files created for the comparison, even if the run above failed
    for file_path in file_paths:
        delete_file(file_path)

Time for 'population': 47.8251 seconds
Time for 'populationHalf': 2.6128 seconds
Time for 'populationQuarter': 1.0029 seconds
Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationHalf_pre2020_CT_std_wide.csv
Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationHalf_pre2020_CT_wide.csv
Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationQuarter_pre2020_CT_std_wide.csv
Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationQuarter_pre2020_CT_wide.csv


The test for the `post_2020` population variable

In [35]:
# halfs

# Create half- and quarter-sized copies of the post-2020 `population` files,
# time the queries against them, and always remove the copies afterwards.

# halves
std_wide_path = f"{root}/data/Census_tract_level/populationHalf_post2020_CT_std_wide.csv"
wide_path = f"{root}/data/Census_tract_level/populationHalf_post2020_CT_wide.csv"

# quarters
std_wide_pathQ = f"{root}/data/Census_tract_level/populationQuarter_post2020_CT_std_wide.csv"
wide_pathQ = f"{root}/data/Census_tract_level/populationQuarter_post2020_CT_wide.csv"

file_paths = [std_wide_path, wide_path, std_wide_pathQ, wide_pathQ]

try:
    # The first argument is ignored by save_first_half_of_population_data (it
    # builds its own grabber), so pass None rather than an undefined `data`.
    save_first_half_of_population_data(None, 'std_wide', 0.5, std_wide_path, pre_or_post='post_2020')
    save_first_half_of_population_data(None, 'wide', 0.5, wide_path, pre_or_post='post_2020')
    save_first_half_of_population_data(None, 'std_wide', 0.25, std_wide_pathQ, pre_or_post='post_2020')
    save_first_half_of_population_data(None, 'wide', 0.25, wide_pathQ, pre_or_post='post_2020')

    # the test
    fips_number = 12117021203
    main(fips_number, "post_2020")
finally:
    # deleting the files created for the comparison, even if the run above failed
    for file_path in file_paths:
        delete_file(file_path)

Time for 'population': 81.2661 seconds
Time for 'populationHalf': 3.4648 seconds
Time for 'populationQuarter': 1.0445 seconds
Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationHalf_post2020_CT_std_wide.csv
Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationHalf_post2020_CT_wide.csv
Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationQuarter_post2020_CT_std_wide.csv
Deleted: /home/nikodem/Documents/pythonProjects/cities/data/Census_tract_level/populationQuarter_post2020_CT_wide.csv
