In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from plotly import graph_objs as go

from cities.utils.data_grabber import DataGrabber, list_available_features, list_tensed_features
from cities.queries.fips_query import FipsQuery


In [2]:
# Baseline query: only the "ethnic_composition" feature group is weighted and no
# outcome variable is passed. The next cell should yield the same ordering.
new_f = FipsQuery(33011, feature_groups_with_weights={"ethnic_composition": 4}, top=20)
new_f.find_euclidean_kins()
df = new_f.euclidean_kins

# plot_weights() returned None in the recorded run (display() printed `None`),
# so call it directly instead of wrapping it in display().
new_f.plot_weights()
display(df.head())

(3081, 10)
['mexican_ethnic_composition', 'puerto_rican_ethnic_composition', 'cuban_ethnic_composition', 'other_hispanic_latino_ethnic_composition', 'white_ethnic_composition', 'black_african_american_ethnic_composition', 'american_indian_alaska_native_ethnic_composition', 'asian_ethnic_composition', 'native_hawaiian_other_pacific_islander_ethnic_composition', 'other_race_races_ethnic_composition']
(3081, 10)
3081


None

Unnamed: 0,GeoFIPS,GeoName,mexican_ethnic_composition,puerto_rican_ethnic_composition,cuban_ethnic_composition,other_hispanic_latino_ethnic_composition,white_ethnic_composition,black_african_american_ethnic_composition,american_indian_alaska_native_ethnic_composition,asian_ethnic_composition,native_hawaiian_other_pacific_islander_ethnic_composition,other_race_races_ethnic_composition,distance to 33011
1760,33011,"Hillsborough, NH",0.005682,0.012574,0.001063,0.017558,0.414191,0.011178,0.000396,0.020968,6.7e-05,0.016323,0.0
2243,42017,"Bucks, PA",0.006231,0.011642,0.000948,0.009954,0.412445,0.018679,0.000266,0.0238,6.7e-05,0.015967,0.112109
2303,44003,"Kent, RI",0.003472,0.010337,0.001352,0.013927,0.433063,0.008875,0.000685,0.015102,5.6e-05,0.01313,0.135908
1829,36021,"Columbia, NY",0.004969,0.008776,0.001145,0.010635,0.424132,0.019468,0.00026,0.009669,4.1e-05,0.020905,0.14602
303,9005,"Litchfield, CT",0.002895,0.014918,0.000942,0.016916,0.432315,0.008044,0.000319,0.009505,0.0,0.014146,0.158289


In [3]:
# Same feature weights as the previous cell, but "gdp" is additionally passed as
# the outcome variable with weight 0. This should return the same ordering of
# locations as the previous cell.
new_f = FipsQuery(
    33011,
    outcome_var="gdp",
    feature_groups_with_weights={"ethnic_composition": 4, "gdp": 0},
    top=20,
)
new_f.find_euclidean_kins()
df = new_f.euclidean_kins

# plot_weights() returned None in the recorded run (display() printed `None`),
# so call it directly instead of wrapping it in display().
new_f.plot_weights()
display(df.head())

(3081, 30)
['2001_gdp', '2002_gdp', '2003_gdp', '2004_gdp', '2005_gdp', '2006_gdp', '2007_gdp', '2008_gdp', '2009_gdp', '2010_gdp', '2011_gdp', '2013_gdp', '2014_gdp', '2015_gdp', '2016_gdp', '2017_gdp', '2018_gdp', '2019_gdp', '2020_gdp', '2021_gdp', 'mexican_ethnic_composition', 'puerto_rican_ethnic_composition', 'cuban_ethnic_composition', 'other_hispanic_latino_ethnic_composition', 'white_ethnic_composition', 'black_african_american_ethnic_composition', 'american_indian_alaska_native_ethnic_composition', 'asian_ethnic_composition', 'native_hawaiian_other_pacific_islander_ethnic_composition', 'other_race_races_ethnic_composition']
(3081, 30)
3081


None

Unnamed: 0,GeoFIPS,GeoName,2001,2002,2003,2004,2005,2006,2007,2008,...,cuban_ethnic_composition,other_hispanic_latino_ethnic_composition,white_ethnic_composition,black_african_american_ethnic_composition,american_indian_alaska_native_ethnic_composition,asian_ethnic_composition,native_hawaiian_other_pacific_islander_ethnic_composition,other_race_races_ethnic_composition,distance to 33011,percentile
0,33011,"Hillsborough, NH",78.763,81.764,88.173,92.192,93.669,96.663,97.649,97.544,...,0.001063,0.017558,0.414191,0.011178,0.000396,0.020968,6.7e-05,0.016323,0.0,78.84
1,42017,"Bucks, PA",86.661,89.196,90.155,92.713,95.581,96.274,99.513,101.034,...,0.000948,0.009954,0.412445,0.018679,0.000266,0.0238,6.7e-05,0.015967,0.112109,41.21
2,44003,"Kent, RI",85.628,90.049,93.548,98.018,100.744,106.524,99.924,97.906,...,0.001352,0.013927,0.433063,0.008875,0.000685,0.015102,5.6e-05,0.01313,0.135908,35.01
3,36021,"Columbia, NY",88.39,85.389,87.184,93.877,90.006,89.017,84.805,89.708,...,0.001145,0.010635,0.424132,0.019468,0.00026,0.009669,4.1e-05,0.020905,0.14602,14.54
4,9005,"Litchfield, CT",99.445,100.091,100.378,99.149,98.643,100.329,97.272,100.708,...,0.000942,0.016916,0.432315,0.008044,0.000319,0.009505,0.0,0.014146,0.158289,21.58


In [2]:
# Note: expect different years for different variables. The comparison period
# restricts the years *for the outcome variable only*; the other feature groups
# use whatever years are available for them, and those differ between groups.
# (Author's note: with a single feature group, choosing a weight between 1 and 4
# would make no difference; two groups are passed here.)
f = FipsQuery(
    1007,
    outcome_var="gdp",
    feature_groups_with_weights={"gdp": 1, "population": 1},
    lag=0,
    top=5,
    time_decay=1.5,
    outcome_comparison_period=(2003, 2010),
)
f.find_euclidean_kins()

# plot_weights() returned None in the recorded run (display() printed `None`),
# so call it directly instead of wrapping it in display().
f.plot_weights()

(3081, 37)
['2003_gdp', '2004_gdp', '2005_gdp', '2006_gdp', '2007_gdp', '2008_gdp', '2009_gdp', '2010_gdp', '1993_population', '1994_population', '1995_population', '1996_population', '1997_population', '1998_population', '1999_population', '2000_population', '2001_population', '2002_population', '2003_population', '2004_population', '2005_population', '2006_population', '2007_population', '2008_population', '2009_population', '2010_population', '2011_population', '2012_population', '2013_population', '2014_population', '2015_population', '2016_population', '2017_population', '2018_population', '2019_population', '2020_population', '2021_population']
(3081, 37)
3081
False
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False


  for col_name, col_data in featurewise_contributions_df.iteritems():


None

In [3]:
# SANITY_CHECK: here weights and years should be in alignment for the outcome var
f = FipsQuery(
    42001,
    "gdp",
    lag=0,
    top=5,
    time_decay=1.06,
    outcome_comparison_period=(2003, 2010),
    outcome_percentile_range=(40, 100),
)
f.find_euclidean_kins()

# You can inspect the weights resulting from your time_decay setting.
# plot_weights() returned None in the recorded run (display() printed `None`),
# so call it directly instead of wrapping it in display().
f.plot_weights()

None

In [4]:
# SANITY_CHECK: there used to be issues with weight plotting here (weights
# spilled over onto the outcome columns). Reported as fixed; check the plot.
f = FipsQuery(
    20003,
    outcome_var="gdp",
    feature_groups_with_weights={"gdp": 0, "population": 4},
    lag=3,
    top=10,
    time_decay=1.03,
)
f.find_euclidean_kins()

# plot_weights() returned None in the recorded run (display() printed `None`),
# so call it directly instead of wrapping it in display().
f.plot_weights()

None

In [5]:
# Regression check: a query without an outcome variable threw an error before.
f = FipsQuery(42001, feature_groups_with_weights={"population": 4})
f.find_euclidean_kins()

# plot_weights() returned None in the recorded run (display() printed `None`),
# so call it directly instead of wrapping it in display().
f.plot_weights()
display(f.euclidean_kins)

None

Unnamed: 0,1993_population,1994_population,1995_population,1996_population,1997_population,1998_population,1999_population,2000_population,2001_population,2002_population,...,2013_population,2014_population,2015_population,2016_population,2017_population,2018_population,2019_population,2020_population,2021_population,distance to 42001
2235,83013.0,84186.0,85063.0,86252.0,87751.0,89074.0,90363.0,91457.0,92591.0,93934.0,...,101504.0,101830.0,102411.0,102625.0,103414.0,103932.0,103778.0,103795.0,104127.0,0.000000
3006,92876.0,93717.0,94509.0,95529.0,95998.0,96512.0,96985.0,97390.0,97856.0,98097.0,...,102191.0,102495.0,102590.0,102927.0,103180.0,103754.0,104175.0,104076.0,104362.0,0.004093
1509,84234.0,85586.0,87584.0,89357.0,90710.0,91697.0,92914.0,94050.0,94872.0,95771.0,...,101823.0,102058.0,102429.0,102952.0,103563.0,103967.0,104137.0,104769.0,105231.0,0.008424
27,101310.0,102213.0,102342.0,103063.0,104129.0,104367.0,104002.0,103286.0,102976.0,102988.0,...,104249.0,103880.0,103601.0,103603.0,103854.0,103646.0,103440.0,103393.0,103162.0,0.008731
1230,112397.0,112025.0,111680.0,111231.0,111040.0,110704.0,110295.0,110192.0,109836.0,109861.0,...,107221.0,106599.0,105916.0,105184.0,104967.0,104674.0,104079.0,103594.0,102985.0,0.011795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1647,472.0,441.0,434.0,429.0,440.0,447.0,445.0,445.0,423.0,421.0,...,443.0,437.0,433.0,445.0,432.0,435.0,436.0,431.0,439.0,1.977809
2644,428.0,413.0,420.0,415.0,411.0,427.0,412.0,420.0,421.0,429.0,...,426.0,421.0,421.0,404.0,383.0,389.0,355.0,346.0,340.0,1.978614
2648,342.0,363.0,335.0,358.0,373.0,389.0,346.0,358.0,313.0,308.0,...,268.0,257.0,273.0,281.0,278.0,265.0,256.0,270.0,258.0,1.980969
2664,109.0,114.0,111.0,109.0,78.0,83.0,77.0,65.0,66.0,75.0,...,77.0,59.0,72.0,64.0,66.0,67.0,67.0,67.0,57.0,1.985240


In [5]:
# You don't want to pass an outcome and are only interested in similarities.
# The available feature groups are printed first for reference.
print(list_available_features())
f = FipsQuery(42001, feature_groups_with_weights={"population": 4, "spending_HHS": 3})
f.find_euclidean_kins()

# plot_weights() returned None in the recorded run (display() printed `None`),
# so call it directly instead of wrapping it in display().
f.plot_weights()
display(f.euclidean_kins)

['industry_other_services_total', 'population', 'industry_construction_total', 'spending_transportation', 'industry_public_administration_total', 'spending_commerce', 'urbanization', 'industry_mining_total', 'industry_information_total', 'ethnic_composition', 'industry_educational_services_total', 'industry_manufacturing_total', 'industry_real_estate_total', 'industry_transportation_warehousing_total', 'industry_utilities_total', 'industry_management_enterprises_total', 'industry_admin_support_services_total', 'industry', 'gdp', 'industry_wholesale_trade_total', 'industry_arts_recreation_total', 'industry_agriculture_total', 'industry_professional_services_total', 'transport', 'industry_retail_trade_total', 'industry_accommodation_food_services_total', 'industry_healthcare_social_services_total', 'industry_finance_insurance_total', 'spending_HHS']


None

Unnamed: 0,1993_population,1994_population,1995_population,1996_population,1997_population,1998_population,1999_population,2000_population,2001_population,2002_population,...,2013_spending_HHS,2014_spending_HHS,2015_spending_HHS,2016_spending_HHS,2017_spending_HHS,2018_spending_HHS,2019_spending_HHS,2020_spending_HHS,2021_spending_HHS,distance to 42001
2235,83013.0,84186.0,85063.0,86252.0,87751.0,89074.0,90363.0,91457.0,92591.0,93934.0,...,6.841459e+07,2.843109e+07,1.022756e+07,3.068268e+07,2.081153e+07,3.134013e+07,5.207267e+07,3.151378e+07,4.473508e+07,0.000000
3006,92876.0,93717.0,94509.0,95529.0,95998.0,96512.0,96985.0,97390.0,97856.0,98097.0,...,2.548639e+07,3.562433e+07,2.471649e+07,3.263073e+07,5.403922e+07,4.703442e+07,5.464052e+07,5.844357e+07,7.377923e+07,0.026253
1244,96534.0,97920.0,99350.0,100660.0,101474.0,102280.0,102991.0,104000.0,104848.0,105794.0,...,5.135486e+07,1.685584e+07,2.075568e+07,6.394366e+07,2.995285e+07,6.536969e+07,5.782530e+07,6.692654e+07,4.542509e+07,0.030965
7,116324.0,116161.0,116790.0,116684.0,117254.0,117179.0,114910.0,111081.0,111266.0,111625.0,...,3.943803e+07,4.531137e+07,3.266888e+07,3.188817e+07,3.195046e+07,4.203667e+07,2.714948e+07,2.231390e+07,2.433614e+07,0.033050
1853,98327.0,98509.0,98816.0,99509.0,99693.0,99603.0,99802.0,100106.0,100819.0,101763.0,...,2.743605e+07,1.820944e+07,7.596719e+06,1.534277e+07,5.635192e+07,2.467059e+07,1.403323e+07,4.650258e+07,4.890882e+07,0.034477
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2384,15660.0,15902.0,15997.0,16082.0,16053.0,16129.0,16386.0,16505.0,16351.0,16479.0,...,4.016453e+09,3.975215e+09,4.085792e+09,3.715688e+09,6.654582e+09,7.660348e+09,1.127461e+10,1.530217e+10,1.712052e+10,2.369198
2740,649226.0,671759.0,696278.0,717194.0,736587.0,761335.0,788500.0,819692.0,844877.0,848090.0,...,2.225282e+11,2.231797e+11,2.118196e+11,2.138400e+11,2.527288e+11,3.552319e+11,4.896949e+11,6.288495e+11,7.776532e+11,2.469107
198,9100159.0,9096608.0,9089015.0,9127042.0,9206538.0,9313589.0,9437290.0,9538191.0,9626034.0,9705913.0,...,2.348763e+10,2.294435e+10,2.502567e+10,2.839330e+10,3.868660e+10,5.899629e+10,1.410259e+11,2.187211e+11,2.577860e+11,2.725159
1819,298592.0,300262.0,299725.0,297680.0,296187.0,295097.0,294692.0,295106.0,296232.0,298283.0,...,3.271360e+11,3.134117e+11,3.894925e+11,3.271486e+11,7.462131e+11,6.579396e+11,8.941410e+11,9.732370e+11,9.351828e+11,2.941499


In [6]:
# You want to pass an outcome but give it weight 0 in similarity calculations.
# The available feature groups are printed first for reference.
print(list_available_features())
f = FipsQuery(
    42001,
    outcome_var="spending_HHS",
    feature_groups_with_weights={"spending_HHS": 0, "population": 4},
)
f.find_euclidean_kins()

# plot_weights() returned None in the recorded run (display() printed `None`),
# so call it directly instead of wrapping it in display().
f.plot_weights()
display(f.euclidean_kins)

['industry_other_services_total', 'population', 'industry_construction_total', 'spending_transportation', 'industry_public_administration_total', 'spending_commerce', 'urbanization', 'industry_mining_total', 'industry_information_total', 'ethnic_composition', 'industry_educational_services_total', 'industry_manufacturing_total', 'industry_real_estate_total', 'industry_transportation_warehousing_total', 'industry_utilities_total', 'industry_management_enterprises_total', 'industry_admin_support_services_total', 'industry', 'gdp', 'industry_wholesale_trade_total', 'industry_arts_recreation_total', 'industry_agriculture_total', 'industry_professional_services_total', 'transport', 'industry_retail_trade_total', 'industry_accommodation_food_services_total', 'industry_healthcare_social_services_total', 'industry_finance_insurance_total', 'spending_HHS']


None

Unnamed: 0,GeoFIPS,GeoName,2010,2011,2012,2013,2014,2015,2016,2017,...,2014_population,2015_population,2016_population,2017_population,2018_population,2019_population,2020_population,2021_population,distance to 42001,percentile
0,42001,"Adams, PA",2.771827e+07,2.855134e+07,1.427164e+07,6.841459e+07,2.843109e+07,1.022756e+07,3.068268e+07,2.081153e+07,...,101830.0,102411.0,102625.0,103414.0,103932.0,103778.0,103795.0,104127.0,0.000000,63.85
1,55039,"Fond du Lac, WI",3.610418e+07,2.696325e+07,3.833402e+07,2.548639e+07,3.562433e+07,2.471649e+07,3.263073e+07,5.403922e+07,...,102495.0,102590.0,102927.0,103180.0,103754.0,104175.0,104076.0,104362.0,0.004093,69.34
2,29071,"Franklin, MO",0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,102058.0,102429.0,102952.0,103563.0,103967.0,104137.0,104769.0,105231.0,0.008424,44.71
3,1055,"Etowah, AL",1.649418e+08,1.154164e+09,2.897688e+08,1.134255e+09,6.389997e+08,8.881343e+08,1.031911e+09,9.080343e+08,...,103880.0,103601.0,103603.0,103854.0,103646.0,103440.0,103393.0,103162.0,0.008731,90.56
4,26017,"Bay, MI",2.073759e+06,1.232872e+06,5.381210e+05,8.434320e+05,1.047389e+06,1.047389e+06,2.580589e+06,4.565004e+06,...,106599.0,105916.0,105184.0,104967.0,104674.0,104079.0,103594.0,102985.0,0.011795,52.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3077,31005,"Arthur, NE",0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,437.0,433.0,445.0,432.0,435.0,436.0,431.0,439.0,1.977809,0.00
3078,48261,"Kenedy, TX",0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,421.0,421.0,404.0,383.0,389.0,355.0,346.0,340.0,1.978614,0.00
3079,48269,"King, TX",0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,257.0,273.0,281.0,278.0,265.0,256.0,270.0,258.0,1.980969,0.00
3080,48301,"Loving, TX",0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,59.0,72.0,64.0,66.0,67.0,67.0,67.0,57.0,1.985240,0.00


In [6]:
# The other queries still work: outcome "gdp" with a negative weight plus
# "population" as a second feature group.
# (Author's note: with a single feature group, choosing a weight between 1 and 4
# would make no difference; two groups are passed here.)
f = FipsQuery(
    1007,
    outcome_var="gdp",
    feature_groups_with_weights={"gdp": -2, "population": 1},
    lag=0,
    top=5,
    time_decay=1.03,
)
f.find_euclidean_kins()

# plot_weights() returned None in the recorded run (display() printed `None`),
# so call it directly instead of wrapping it in display().
f.plot_weights()
display(f.euclidean_kins)

None

Unnamed: 0,GeoFIPS,GeoName,2001,2002,2003,2004,2005,2006,2007,2008,...,2014_population,2015_population,2016_population,2017_population,2018_population,2019_population,2020_population,2021_population,distance to 1007,percentile
0,1007,"Bibb, AL",80.443,81.527,85.124,89.317,88.782,89.597,95.308,94.745,...,22586.0,22607.0,22654.0,22606.0,22383.0,22405.0,22223.0,22477.0,0.000000,47.86
1,48109,"Culberson, TX",35.264,37.743,36.255,38.339,40.177,41.247,42.368,53.349,...,2301.0,2275.0,2244.0,2259.0,2212.0,2186.0,2193.0,2193.0,1.947278,99.97
2,48389,"Reeves, TX",46.003,49.290,44.960,41.682,39.742,41.332,41.009,41.389,...,14614.0,14936.0,14484.0,14314.0,14526.0,14847.0,14730.0,14487.0,1.968913,99.90
3,48283,"La Salle, TX",6.638,6.679,7.065,6.727,7.541,8.729,8.254,7.900,...,7115.0,7175.0,7057.0,6916.0,6808.0,6763.0,6642.0,6670.0,2.070731,96.40
4,48255,"Karnes, TX",6.498,6.891,7.331,6.759,6.537,6.290,6.662,6.937,...,14569.0,14976.0,14997.0,14990.0,15028.0,14605.0,14721.0,14754.0,2.076390,98.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3077,6073,"San Diego, CA",79.703,83.170,87.449,92.400,97.137,99.354,100.938,100.267,...,3234658.0,3262566.0,3283586.0,3293575.0,3303463.0,3297959.0,3297252.0,3286069.0,2.924300,80.21
3078,4013,"Maricopa, AZ",77.463,80.415,85.793,90.349,97.615,102.729,105.313,103.632,...,4040171.0,4105747.0,4174844.0,4231511.0,4292576.0,4363816.0,4438342.0,4496588.0,2.950617,87.18
3079,48201,"Harris, TX",73.137,72.696,72.847,79.658,80.626,87.278,94.310,91.961,...,4452976.0,4553991.0,4619635.0,4651955.0,4672445.0,4704042.0,4732491.0,4728030.0,2.952810,51.14
3080,17031,"Cook, IL",95.406,94.886,95.455,97.260,99.315,101.320,101.826,99.238,...,5320233.0,5324961.0,5320293.0,5311621.0,5297956.0,5287099.0,5262741.0,5173146.0,3.019405,52.43


In [8]:
# You don't want to pass an outcome and are only interested in similarities.
# The available feature groups are printed first for reference.
print(list_available_features())
f = FipsQuery(42001, feature_groups_with_weights={"population": 4, "spending_HHS": 3})
f.find_euclidean_kins()

# plot_weights() returned None in the recorded run (display() printed `None`),
# so call it directly instead of wrapping it in display().
f.plot_weights()
display(f.euclidean_kins)

['industry_other_services_total', 'population', 'industry_construction_total', 'spending_transportation', 'industry_public_administration_total', 'spending_commerce', 'urbanization', 'industry_mining_total', 'industry_information_total', 'ethnic_composition', 'industry_educational_services_total', 'industry_manufacturing_total', 'industry_real_estate_total', 'industry_transportation_warehousing_total', 'industry_utilities_total', 'industry_management_enterprises_total', 'industry_admin_support_services_total', 'industry', 'gdp', 'industry_wholesale_trade_total', 'industry_arts_recreation_total', 'industry_agriculture_total', 'industry_professional_services_total', 'transport', 'industry_retail_trade_total', 'industry_accommodation_food_services_total', 'industry_healthcare_social_services_total', 'industry_finance_insurance_total', 'spending_HHS']


None

Unnamed: 0,1993_population,1994_population,1995_population,1996_population,1997_population,1998_population,1999_population,2000_population,2001_population,2002_population,...,2013_spending_HHS,2014_spending_HHS,2015_spending_HHS,2016_spending_HHS,2017_spending_HHS,2018_spending_HHS,2019_spending_HHS,2020_spending_HHS,2021_spending_HHS,distance to 42001
2235,83013.0,84186.0,85063.0,86252.0,87751.0,89074.0,90363.0,91457.0,92591.0,93934.0,...,6.841459e+07,2.843109e+07,1.022756e+07,3.068268e+07,2.081153e+07,3.134013e+07,5.207267e+07,3.151378e+07,4.473508e+07,0.000000
3006,92876.0,93717.0,94509.0,95529.0,95998.0,96512.0,96985.0,97390.0,97856.0,98097.0,...,2.548639e+07,3.562433e+07,2.471649e+07,3.263073e+07,5.403922e+07,4.703442e+07,5.464052e+07,5.844357e+07,7.377923e+07,0.026253
1244,96534.0,97920.0,99350.0,100660.0,101474.0,102280.0,102991.0,104000.0,104848.0,105794.0,...,5.135486e+07,1.685584e+07,2.075568e+07,6.394366e+07,2.995285e+07,6.536969e+07,5.782530e+07,6.692654e+07,4.542509e+07,0.030965
7,116324.0,116161.0,116790.0,116684.0,117254.0,117179.0,114910.0,111081.0,111266.0,111625.0,...,3.943803e+07,4.531137e+07,3.266888e+07,3.188817e+07,3.195046e+07,4.203667e+07,2.714948e+07,2.231390e+07,2.433614e+07,0.033050
1853,98327.0,98509.0,98816.0,99509.0,99693.0,99603.0,99802.0,100106.0,100819.0,101763.0,...,2.743605e+07,1.820944e+07,7.596719e+06,1.534277e+07,5.635192e+07,2.467059e+07,1.403323e+07,4.650258e+07,4.890882e+07,0.034477
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2384,15660.0,15902.0,15997.0,16082.0,16053.0,16129.0,16386.0,16505.0,16351.0,16479.0,...,4.016453e+09,3.975215e+09,4.085792e+09,3.715688e+09,6.654582e+09,7.660348e+09,1.127461e+10,1.530217e+10,1.712052e+10,2.369198
2740,649226.0,671759.0,696278.0,717194.0,736587.0,761335.0,788500.0,819692.0,844877.0,848090.0,...,2.225282e+11,2.231797e+11,2.118196e+11,2.138400e+11,2.527288e+11,3.552319e+11,4.896949e+11,6.288495e+11,7.776532e+11,2.469107
198,9100159.0,9096608.0,9089015.0,9127042.0,9206538.0,9313589.0,9437290.0,9538191.0,9626034.0,9705913.0,...,2.348763e+10,2.294435e+10,2.502567e+10,2.839330e+10,3.868660e+10,5.899629e+10,1.410259e+11,2.187211e+11,2.577860e+11,2.725159
1819,298592.0,300262.0,299725.0,297680.0,296187.0,295097.0,294692.0,295106.0,296232.0,298283.0,...,3.271360e+11,3.134117e+11,3.894925e+11,3.271486e+11,7.462131e+11,6.579396e+11,8.941410e+11,9.732370e+11,9.351828e+11,2.941499


#### Use case: I just want a rough idea

You want to know where your jurisdiction stands relative to all the others in the country. You also want to know the share 
of locations where the most recent value of the outcome is lower than yours.

In [7]:
# Build a query for location 42001 with "gdp" as the second positional argument
# (presumably the outcome variable -- confirm against the FipsQuery signature).
f = FipsQuery(42001, "gdp")

# Compare this location's outcome against the other locations.
# NOTE(review): `sample_size` suggests random sampling; if so, consider seeding
# the RNG so that the output is reproducible -- confirm.
f.compare_my_outcome_to_others(sample_size=100, range_multiplier=10)


#### Use case: similarity in outcome patterns

You want to find the top five jurisdictions with similar GDP change patterns. You value the years nearest to the present a bit more, but you also want only the years 2003-2019 to be used for the outcome comparison.

In [8]:
# Outcome-only query on "gdp", restricted to the 2003-2019 comparison period
# and to the 40-100 outcome percentile range.
f = FipsQuery(
    42001,
    "gdp",
    lag=0,
    top=5,
    time_decay=1.06,
    outcome_comparison_period=(2003, 2019),
    outcome_percentile_range=(40, 100),
)
f.find_euclidean_kins()

# You can inspect the weights resulting from your time_decay setting.
# plot_weights() returned None in the recorded run (display() printed `None`),
# so call it directly instead of wrapping it in display().
f.plot_weights()

None

In [9]:
# Compute the distances for the current query `f` and inspect the resulting
# dataframe, which contains the ranking (see the "distance to ..." column).
f.find_euclidean_kins()
display(f.euclidean_kins)

Unnamed: 0,GeoFIPS,GeoName,2001,2002,2003,2004,2005,2006,2007,2008,...,2014,2015,2016,2017,2018,2019,2020,2021,distance to 42001,percentile
0,42001,"Adams, PA",78.619,84.689,84.475,85.860,89.556,93.508,93.154,95.379,...,100.006,102.509,103.708,108.411,105.390,103.440,97.678,102.664,0.000000,35.14
1,24013,"Carroll, MD",76.152,80.700,82.853,86.013,89.398,95.381,95.463,97.823,...,99.510,101.215,101.568,106.456,104.838,105.452,101.050,105.298,0.042184,41.95
2,28105,"Oktibbeha, MS",83.279,85.143,83.864,85.565,91.160,93.433,93.899,98.855,...,100.858,100.069,106.639,105.926,106.941,107.105,108.225,109.357,0.059161,51.43
4,36103,"Suffolk, NY",77.583,80.337,83.711,87.603,88.109,90.437,90.857,94.202,...,100.099,101.102,102.505,103.283,103.288,105.925,101.523,106.687,0.060547,44.94
5,25027,"Worcester, MA",83.429,84.954,88.210,89.841,90.625,92.234,93.830,96.316,...,101.656,103.801,104.461,104.997,107.212,107.542,104.070,109.597,0.062987,51.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3075,48127,"Dimmit, TX",9.381,10.070,10.269,10.101,10.527,13.382,13.424,12.854,...,208.912,253.553,226.448,235.298,230.138,256.745,206.367,159.131,1.225183,96.27
3076,48389,"Reeves, TX",46.003,49.290,44.960,41.682,39.742,41.332,41.009,41.389,...,212.473,334.183,416.768,607.235,843.967,1226.531,1136.483,942.206,1.239320,99.90
3078,48301,"Loving, TX",37.050,40.836,40.690,49.561,81.009,110.338,149.312,114.503,...,186.286,306.333,551.251,802.739,1064.149,1388.130,1453.540,1080.924,1.260074,99.94
3080,48283,"La Salle, TX",6.638,6.679,7.065,6.727,7.541,8.729,8.254,7.900,...,224.270,255.409,208.985,202.036,201.509,219.118,177.489,159.947,1.307401,96.40


In [10]:
# You can plot the few most similar locations for the current query `f`;
# whatever show_kins_plot() returns is kept in `fig`.
fig = f.show_kins_plot()

#### Use case: similarity in outcome patterns and some other features

Say you want to include historical population patterns in your similarity ranking. You also want to pay a bit more attention to older data points. And you can now set weights to negative values to indicate that you care about dissimilarity in that feature.

In [13]:
# Outcome "gdp" with a negative weight (dissimilarity) plus historical
# "population" as a second feature group.
# (Author's note: with a single feature group, choosing a weight between 1 and 4
# would make no difference; two groups are passed here.)
f = FipsQuery(
    1007,
    outcome_var="gdp",
    feature_groups_with_weights={"gdp": -2, "population": 1},
    lag=0,
    top=5,
    time_decay=1.03,
)
f.find_euclidean_kins()

# You can still inspect the resulting weighting.
# plot_weights() returned None in the recorded run (display() printed `None`),
# so call it directly instead of wrapping it in display().
f.plot_weights()

None

In [14]:
# You still have access to the distances and the ranking for the current
# query `f`; this time there are more columns in the dataframe (the
# population columns appear next to the outcome columns).
display(f.euclidean_kins)


Unnamed: 0,GeoFIPS,GeoName,2001,2002,2003,2004,2005,2006,2007,2008,...,2014_population,2015_population,2016_population,2017_population,2018_population,2019_population,2020_population,2021_population,distance to 1007,percentile
0,1007,"Bibb, AL",80.443,81.527,85.124,89.317,88.782,89.597,95.308,94.745,...,22586.0,22607.0,22654.0,22606.0,22383.0,22405.0,22223.0,22477.0,0.000000,47.86
1,48109,"Culberson, TX",35.264,37.743,36.255,38.339,40.177,41.247,42.368,53.349,...,2301.0,2275.0,2244.0,2259.0,2212.0,2186.0,2193.0,2193.0,1.947278,99.97
2,48389,"Reeves, TX",46.003,49.290,44.960,41.682,39.742,41.332,41.009,41.389,...,14614.0,14936.0,14484.0,14314.0,14526.0,14847.0,14730.0,14487.0,1.968913,99.90
3,48283,"La Salle, TX",6.638,6.679,7.065,6.727,7.541,8.729,8.254,7.900,...,7115.0,7175.0,7057.0,6916.0,6808.0,6763.0,6642.0,6670.0,2.070731,96.40
4,48255,"Karnes, TX",6.498,6.891,7.331,6.759,6.537,6.290,6.662,6.937,...,14569.0,14976.0,14997.0,14990.0,15028.0,14605.0,14721.0,14754.0,2.076390,98.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3077,6073,"San Diego, CA",79.703,83.170,87.449,92.400,97.137,99.354,100.938,100.267,...,3234658.0,3262566.0,3283586.0,3293575.0,3303463.0,3297959.0,3297252.0,3286069.0,2.924300,80.21
3078,4013,"Maricopa, AZ",77.463,80.415,85.793,90.349,97.615,102.729,105.313,103.632,...,4040171.0,4105747.0,4174844.0,4231511.0,4292576.0,4363816.0,4438342.0,4496588.0,2.950617,87.18
3079,48201,"Harris, TX",73.137,72.696,72.847,79.658,80.626,87.278,94.310,91.961,...,4452976.0,4553991.0,4619635.0,4651955.0,4672445.0,4704042.0,4732491.0,4728030.0,2.952810,51.14
3080,17031,"Cook, IL",95.406,94.886,95.455,97.260,99.315,101.320,101.826,99.238,...,5320233.0,5324961.0,5320293.0,5311621.0,5297956.0,5287099.0,5262741.0,5173146.0,3.019405,52.43


In [15]:
# You can still plot the few top-ranked locations for the current query `f`;
# whatever show_kins_plot() returns is kept in `fig`.
fig = f.show_kins_plot()

#### Use case: similarity of outcome with a lag

You care about similarity of outcome variables, but your question now is: which other locations were, 2 years ago, in a situation similar to mine now with respect to the outcome variable and the features?


In [16]:
# Outcome-only query on "gdp" with a lag of 2.
f = FipsQuery(42001, "gdp", lag=2, top=5, time_decay=1.06)
f.find_euclidean_kins()

# plot_weights() returned None in the recorded run (display() printed `None`),
# so call it directly instead of wrapping it in display().
f.plot_weights()

None

In [17]:
# Compute the distances for the current query `f` and display the resulting
# ranking dataframe.
f.find_euclidean_kins()
display(f.euclidean_kins)

Unnamed: 0,GeoFIPS,GeoName,2001,2002,2003,2004,2005,2006,2007,2008,...,2014,2015,2016,2017,2018,2019,2020,2021,distance to 42001,percentile
0,42001,"Adams, PA",78.619,84.689,84.475,85.860,89.556,93.508,93.154,95.379,...,100.006,102.509,103.708,108.411,105.390,103.440,97.678,102.664,0.000000,35.14
1,20161,"Riley, KS",78.822,79.879,83.156,85.898,86.815,89.029,93.378,97.320,...,102.355,102.888,103.512,102.037,100.206,100.843,100.186,101.947,0.067165,33.32
2,17001,"Adams, IL",79.654,81.654,86.491,90.160,91.589,93.582,93.367,95.001,...,102.400,101.760,104.586,97.057,99.635,98.477,93.510,100.614,0.067992,30.63
3,33019,"Sullivan, NH",87.624,93.209,91.829,96.587,97.861,95.699,97.592,92.842,...,102.845,106.400,101.283,101.850,99.204,99.205,101.346,109.571,0.068728,51.82
4,13171,"Lamar, GA",83.191,81.883,83.438,84.551,93.216,93.727,93.967,98.477,...,99.373,103.592,103.735,104.157,101.036,105.494,110.219,114.258,0.069925,62.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3077,48389,"Reeves, TX",46.003,49.290,44.960,41.682,39.742,41.332,41.009,41.389,...,212.473,334.183,416.768,607.235,843.967,1226.531,1136.483,942.206,1.257687,99.90
3078,48127,"Dimmit, TX",9.381,10.070,10.269,10.101,10.527,13.382,13.424,12.854,...,208.912,253.553,226.448,235.298,230.138,256.745,206.367,159.131,1.275203,96.27
3079,48301,"Loving, TX",37.050,40.836,40.690,49.561,81.009,110.338,149.312,114.503,...,186.286,306.333,551.251,802.739,1064.149,1388.130,1453.540,1080.924,1.303942,99.94
3080,48283,"La Salle, TX",6.638,6.679,7.065,6.727,7.541,8.729,8.254,7.900,...,224.270,255.409,208.985,202.036,201.509,219.118,177.489,159.947,1.352714,96.40


In [18]:
# Plot the few top-ranked locations for the current query `f`; whatever
# show_kins_plot() returns is kept in `fig`.
fig = f.show_kins_plot()

#### Use case: similarity of outcome and other features with a lag

In [19]:
# Outcome "gdp" is passed with weight 0, so only "population" is weighted in
# the feature groups; a lag of 3 is applied.
f = FipsQuery(
    20003,
    outcome_var="gdp",
    feature_groups_with_weights={"gdp": 0, "population": 4},
    lag=3,
    top=10,
    time_decay=1.03,
)
f.find_euclidean_kins()

# plot_weights() returned None in the recorded run (display() printed `None`),
# so call it directly instead of wrapping it in display().
f.plot_weights()



None

In [20]:
# If you want the full dataframe with distances, you can still get it by
# uncommenting the two lines below; it is just boring to print it all over
# again:
#f.find_euclidean_kins()
#display(f.euclidean_kins)

# Or, you can plot the few top-ranked locations for the current query `f`:
fig = f.show_kins_plot()

In [21]:
# You don't want to pass an outcome and are only interested in similarities
# based on the "population" feature group.
f = FipsQuery(42001, feature_groups_with_weights={"population": 4})
f.find_euclidean_kins()

# plot_weights() returned None in the recorded run (display() printed `None`),
# so call it directly instead of wrapping it in display().
f.plot_weights()
display(f.euclidean_kins)

None

Unnamed: 0,1993_population,1994_population,1995_population,1996_population,1997_population,1998_population,1999_population,2000_population,2001_population,2002_population,...,2013_population,2014_population,2015_population,2016_population,2017_population,2018_population,2019_population,2020_population,2021_population,distance to 42001
2235,83013.0,84186.0,85063.0,86252.0,87751.0,89074.0,90363.0,91457.0,92591.0,93934.0,...,101504.0,101830.0,102411.0,102625.0,103414.0,103932.0,103778.0,103795.0,104127.0,0.000000
3006,92876.0,93717.0,94509.0,95529.0,95998.0,96512.0,96985.0,97390.0,97856.0,98097.0,...,102191.0,102495.0,102590.0,102927.0,103180.0,103754.0,104175.0,104076.0,104362.0,0.004093
1509,84234.0,85586.0,87584.0,89357.0,90710.0,91697.0,92914.0,94050.0,94872.0,95771.0,...,101823.0,102058.0,102429.0,102952.0,103563.0,103967.0,104137.0,104769.0,105231.0,0.008424
27,101310.0,102213.0,102342.0,103063.0,104129.0,104367.0,104002.0,103286.0,102976.0,102988.0,...,104249.0,103880.0,103601.0,103603.0,103854.0,103646.0,103440.0,103393.0,103162.0,0.008731
1230,112397.0,112025.0,111680.0,111231.0,111040.0,110704.0,110295.0,110192.0,109836.0,109861.0,...,107221.0,106599.0,105916.0,105184.0,104967.0,104674.0,104079.0,103594.0,102985.0,0.011795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1647,472.0,441.0,434.0,429.0,440.0,447.0,445.0,445.0,423.0,421.0,...,443.0,437.0,433.0,445.0,432.0,435.0,436.0,431.0,439.0,1.977809
2644,428.0,413.0,420.0,415.0,411.0,427.0,412.0,420.0,421.0,429.0,...,426.0,421.0,421.0,404.0,383.0,389.0,355.0,346.0,340.0,1.978614
2648,342.0,363.0,335.0,358.0,373.0,389.0,346.0,358.0,313.0,308.0,...,268.0,257.0,273.0,281.0,278.0,265.0,256.0,270.0,258.0,1.980969
2664,109.0,114.0,111.0,109.0,78.0,83.0,77.0,65.0,66.0,75.0,...,77.0,59.0,72.0,64.0,66.0,67.0,67.0,67.0,57.0,1.985240
