In [43]:
import os

from cities.queries.fips_query import CTFipsQuery
from cities.utils.data_grabber import CTDataGrabberCSV, find_repo_root, list_available_features
import time

root = find_repo_root()

#### Defining the Functions Used for the Comparison

The goal is to demonstrate that the size of the census tract level variables is responsible for the poor time performance of `CTFipsQuery`. To achieve this, we will compare the execution time of the `CTFipsQuery` function for the original size of the `population` census tract level variable, as well as for half and a quarter of its size.


In [44]:
def save_first_half_of_population_data(data, feature_type, fraction, file_path, pre_or_post='pre_2020'):

    if fraction not in [0.25, 0.5]:
        raise ValueError("Fraction must be 0.25 or 0.5.")

    data = CTDataGrabberCSV(ct_time_period = pre_or_post)

    if feature_type == 'std_wide':
        data.get_features_std_wide(['population'])
        pop = data.std_wide['population']
    elif feature_type == 'wide':
        data.get_features_wide(['population'])
        pop = data.wide['population']
    else:
        raise ValueError("Invalid feature_type. Must be 'std_wide' or 'wide'.")
    
    num_rows = len(pop)
    num_to_save = int(num_rows * fraction)
    
    selected_data = pop.iloc[:num_to_save]

    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    selected_data.to_csv(file_path, index=False)



def delete_file(file_path):
    if os.path.isfile(file_path):
        os.remove(file_path)
        print(f"Deleted: {file_path}")
    else:
        print(f"File not found: {file_path}")



In [45]:
def measure_time(command):

    start_time = time.time()
    command()
    end_time = time.time()
    return end_time - start_time

def run_query(fips, population_type, ct_time_period):

    f = CTFipsQuery(
        fips,
        population_type,
        ct_time_period= ct_time_period,
    )
    f.find_euclidean_kins()

def main(fips, ct_time = "pre_2020"):

    time_population = measure_time(lambda: run_query(fips, "population", ct_time))
    time_population_half = measure_time(lambda: run_query(fips, "populationHalf", ct_time))
    time_population_quarter = measure_time(lambda: run_query(fips, "populationQuarter", ct_time))

    print(f"Time for 'population': {time_population:.4f} seconds")
    print(f"Time for 'populationHalf': {time_population_half:.4f} seconds")
    print(f"Time for 'populationQuarter': {time_population_quarter:.4f} seconds")

The test for `pre_2020` population variable

In [48]:
# halfs

std_wide_path = f"{root}/data/Census_tract_level/populationHalf_pre2020_CT_std_wide.csv"
wide_path = f"{root}/data/Census_tract_level/populationHalf_pre2020_CT_wide.csv"

save_first_half_of_population_data(data, 'std_wide', 0.5, std_wide_path, pre_or_post='pre_2020')
save_first_half_of_population_data(data, 'wide', 0.5, wide_path, pre_or_post='pre_2020')

# quarters

std_wide_pathQ = f"{root}/data/Census_tract_level/populationQuarter_pre2020_CT_std_wide.csv"
wide_pathQ = f"{root}/data/Census_tract_level/populationQuarter_pre2020_CT_wide.csv"

save_first_half_of_population_data(data, 'std_wide', 0.25, std_wide_pathQ, pre_or_post='pre_2020')
save_first_half_of_population_data(data, 'wide', 0.25, wide_pathQ, pre_or_post='pre_2020')

# the test

fips_number = 1003010705
main(fips_number, "pre_2020")

# deleting the files created for the comparison
file_paths = [std_wide_path, wide_path, std_wide_pathQ, wide_pathQ]

for file_path in file_paths:
    delete_file(file_path)

Time for 'population': 8.6737 seconds
Time for 'populationHalf': 2.7692 seconds
Time for 'populationQuarter': 1.1141 seconds
Deleted: C:\Users\nikod\Documents\PythonProjects\cities/data/Census_tract_level/populationHalf_pre2020_CT_std_wide.csv
Deleted: C:\Users\nikod\Documents\PythonProjects\cities/data/Census_tract_level/populationHalf_pre2020_CT_wide.csv
Deleted: C:\Users\nikod\Documents\PythonProjects\cities/data/Census_tract_level/populationQuarter_pre2020_CT_std_wide.csv
Deleted: C:\Users\nikod\Documents\PythonProjects\cities/data/Census_tract_level/populationQuarter_pre2020_CT_wide.csv


The test for `post_2020` population variable

In [47]:
# halfs

std_wide_path = f"{root}/data/Census_tract_level/populationHalf_post2020_CT_std_wide.csv"
wide_path = f"{root}/data/Census_tract_level/populationHalf_post2020_CT_wide.csv"

save_first_half_of_population_data(data, 'std_wide', 0.5, std_wide_path, pre_or_post='post_2020')
save_first_half_of_population_data(data, 'wide', 0.5, wide_path, pre_or_post='post_2020')

# quarters

std_wide_pathQ = f"{root}/data/Census_tract_level/populationQuarter_post2020_CT_std_wide.csv"
wide_pathQ = f"{root}/data/Census_tract_level/populationQuarter_post2020_CT_wide.csv"

save_first_half_of_population_data(data, 'std_wide', 0.25, std_wide_pathQ, pre_or_post='post_2020')
save_first_half_of_population_data(data, 'wide', 0.25, wide_pathQ, pre_or_post='post_2020')

# the test

fips_number = 12117021203
main(fips_number, "post_2020")

# deleting the files created for the comparison
file_paths = [std_wide_path, wide_path, std_wide_pathQ, wide_pathQ]

for file_path in file_paths:
    delete_file(file_path)

Time for 'population': 10.5933 seconds
Time for 'populationHalf': 3.1721 seconds
Time for 'populationQuarter': 1.0914 seconds
Deleted: C:\Users\nikod\Documents\PythonProjects\cities/data/Census_tract_level/populationHalf_post2020_CT_std_wide.csv
Deleted: C:\Users\nikod\Documents\PythonProjects\cities/data/Census_tract_level/populationHalf_post2020_CT_wide.csv
Deleted: C:\Users\nikod\Documents\PythonProjects\cities/data/Census_tract_level/populationQuarter_post2020_CT_std_wide.csv
Deleted: C:\Users\nikod\Documents\PythonProjects\cities/data/Census_tract_level/populationQuarter_post2020_CT_wide.csv


In [54]:
import random
import cProfile


#variables = list_available_features(level='census_tract')
time_periods = ['pre_2020', 'post_2020']  

#variable = random.choice(variables)
variable = 'population'
ct_time_period = random.choice(time_periods)

data = CTDataGrabberCSV(ct_time_period=ct_time_period)
data.get_features_wide([variable])
var = data.wide[variable]

fips = random.choice(list(var['GeoFIPS']))



def basic_run_ctfips():

    f = CTFipsQuery(
        fips,
        variable,
        ct_time_period=ct_time_period,
    )
 
    f.find_euclidean_kins()
    f.show_kins_plot()
    

profiler_basic_ctfips = cProfile.Profile()

profiler_basic_ctfips.enable()
basic_run_ctfips()
profiler_basic_ctfips.disable()

profiler_basic_ctfips.print_stats(sort='cumulative')


         992810 function calls (981271 primitive calls) in 9.048 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    9.060    4.530 interactiveshell.py:3514(run_code)
        2    0.000    0.000    9.060    4.530 {built-in method builtins.exec}
        1    0.011    0.011    9.048    9.048 827749473.py:20(basic_run_ctfips)
        1    2.486    2.486    6.902    6.902 fips_query.py:803(__init__)
      119    4.174    0.035    4.174    0.035 {method 'reduce' of 'numpy.ufunc' objects}
        9    0.000    0.000    4.173    0.464 {method 'sum' of 'numpy.ndarray' objects}
        9    0.000    0.000    4.173    0.464 _methods.py:47(_sum)
        1    0.129    0.129    1.758    1.758 fips_query.py:237(find_euclidean_kins)
    72187    1.011    0.000    1.285    0.000 similarity_utils.py:63(generalized_euclidean_distance)
        3    0.000    0.000    0.547    0.182 data_grabber.py:80(_get_features)

In [55]:
def basic_run_ctfips():

    f = CTFipsQuery(
    fips,
    variable,
    ct_time_period = ct_time_period,
    feature_groups_with_weights={variable: 1, "urbanicity": 2},
)
 
    f.find_euclidean_kins()
    f.show_kins_plot()
    

profiler_basic_ctfips = cProfile.Profile()

profiler_basic_ctfips.enable()
basic_run_ctfips()
profiler_basic_ctfips.disable()

profiler_basic_ctfips.print_stats(sort='cumulative')

         1003385 function calls (991604 primitive calls) in 9.541 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    9.559    4.779 interactiveshell.py:3514(run_code)
        2    0.000    0.000    9.558    4.779 {built-in method builtins.exec}
        1    0.017    0.017    9.541    9.541 3385331089.py:1(basic_run_ctfips)
        1    2.491    2.491    7.335    7.335 fips_query.py:803(__init__)
      171    4.436    0.026    4.436    0.026 {method 'reduce' of 'numpy.ufunc' objects}
       13    0.000    0.000    4.435    0.341 {method 'sum' of 'numpy.ndarray' objects}
       13    0.000    0.000    4.435    0.341 _methods.py:47(_sum)
        1    0.127    0.127    1.822    1.822 fips_query.py:237(find_euclidean_kins)
    72187    0.990    0.000    1.294    0.000 similarity_utils.py:63(generalized_euclidean_distance)
        3    0.000    0.000    0.707    0.236 data_grabber.py:80(_get_features