In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from plotly import graph_objs as go

from cities.utils.data_grabber import DataGrabber
from cities.queries.fips_query import FipsQuery

#### Use case: rI just want a rough idea

You want to know where your jurisdiction stands relative to all the others in the country.

In [7]:
f  = FipsQuery(42001, "gdp")
f.where_are_we(sample_size= 100)

#### Use case: similarity in outcome patterns

You want to find top five juristdiction with similar gdp change patterns. You value times nearest to you a bit more.

In [8]:
f  = FipsQuery(42001, "gdp", lag = 0, top =5, time_decay = 1.06)
f.find_euclidean_kins()
# you can inspect the weights resulting from your time_decay setting:
display(f.weigth_plot.show())


None

In [9]:
# you can find the distances and  inspect the resulting 
# dataframe that contains the ranking:
f.find_euclidean_kins()
display(f.euclidean_kins)

Unnamed: 0,GeoFIPS,GeoName,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,distance to 42001
2236,42001,"Adams, PA",78.619,84.689,84.475,85.860,89.556,93.508,93.154,95.379,...,100.080,100.006,102.509,103.708,108.411,105.390,103.440,97.678,102.664,0.000000
1190,24013,"Carroll, MD",76.152,80.700,82.853,86.013,89.398,95.381,95.463,97.823,...,100.226,99.510,101.215,101.568,106.456,104.838,105.452,101.050,105.298,0.026198
540,15003,"Honolulu, HI",77.518,79.162,82.883,87.189,91.507,93.039,94.927,96.471,...,101.365,102.008,104.781,106.434,108.409,108.672,107.108,96.323,101.427,0.029788
1871,36103,"Suffolk, NY",77.583,80.337,83.711,87.603,88.109,90.437,90.857,94.202,...,99.697,100.099,101.102,102.505,103.283,103.288,105.925,101.523,106.687,0.034842
2300,42129,"Westmoreland, PA",84.150,86.075,88.194,92.699,95.620,95.716,96.062,98.024,...,100.238,101.619,101.374,103.245,105.277,106.227,107.686,99.995,104.881,0.035479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025,38087,"Slope, ND",34.746,11.451,39.302,32.702,39.083,45.047,57.236,64.676,...,84.679,77.426,57.979,36.621,61.571,45.949,42.019,41.446,47.427,0.629680
2709,48389,"Reeves, TX",46.003,49.290,44.960,41.682,39.742,41.332,41.009,41.389,...,121.888,212.473,334.183,416.768,607.235,843.967,1226.531,1136.483,942.206,0.666063
676,17179,"Tazewell, IL",42.031,40.832,41.214,40.862,46.054,51.686,57.682,57.881,...,74.577,72.169,58.943,50.605,40.879,38.782,36.135,33.239,35.584,0.679293
2569,48109,"Culberson, TX",35.264,37.743,36.255,38.339,40.177,41.247,42.368,53.349,...,144.271,258.311,404.721,546.092,673.326,712.048,986.320,1238.206,1185.010,0.695582


In [10]:
# you can plot the few most similar cities:
f.plot_kins()

### Use case: similarity in outcome patterns and some other features

Say you want to include historical population patterns in your similarity ranking. You also want to pay a bit more attention to older data points.

In [5]:
f  = FipsQuery(1007, outcome_var = "gdp",
               feature_groups= ["population"],
               weights= {"population":4}, #with one feature group only
               # weights 1-4 won't make a difference
               lag = 0, top =5, time_decay = 1.03)
f.find_euclidean_kins()
# you still can inspect the resulting weighing:
display(f.weigth_plot.show())


None

In [7]:
# you still have access to the distances and the ranking.
# only, this time there are more columns in the dataframe:
display(f.euclidean_kins)

# TODO as feature list length changes, distances cease to be comparable; should we leave it like that?
# or should we consider some normalization?

Unnamed: 0,GeoFIPS,GeoName,2001,2002,2003,2004,2005,2006,2007,2008,...,2013_population,2014_population,2015_population,2016_population,2017_population,2018_population,2019_population,2020_population,2021_population,distance to 1007
3,1007,"Bibb, AL",80.443,81.527,85.124,89.317,88.782,89.597,95.308,94.745,...,22544.0,22586.0,22607.0,22654.0,22606.0,22383.0,22405.0,22223.0,22477.0,0.000000
2655,48281,"Lampasas, TX",81.686,88.549,87.516,89.575,94.593,96.503,88.870,91.188,...,20115.0,20076.0,20324.0,20486.0,20812.0,21081.0,21356.0,21707.0,22252.0,0.104128
1333,27055,"Houston, MN",80.205,83.024,86.994,91.554,90.852,93.510,88.846,91.712,...,18841.0,18787.0,18793.0,18858.0,18842.0,18762.0,18841.0,18818.0,18778.0,0.125357
1679,31067,"Gage, NE",86.763,85.374,89.891,96.591,101.412,103.496,95.471,98.868,...,21845.0,21663.0,21845.0,21772.0,21818.0,21695.0,21762.0,21663.0,21616.0,0.125489
1868,36097,"Schuyler, NY",83.962,86.333,89.118,91.393,89.870,88.556,94.765,98.586,...,18455.0,18265.0,18136.0,18092.0,18044.0,17995.0,17983.0,17857.0,17752.0,0.127654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,6073,"San Diego, CA",79.703,83.170,87.449,92.400,97.137,99.354,100.938,100.267,...,3199707.0,3234658.0,3262566.0,3283586.0,3293575.0,3303463.0,3297959.0,3297252.0,3286069.0,2.177449
97,4013,"Maricopa, AZ",77.463,80.415,85.793,90.349,97.615,102.729,105.313,103.632,...,3977518.0,4040171.0,4105747.0,4174844.0,4231511.0,4292576.0,4363816.0,4438342.0,4496588.0,2.311991
2615,48201,"Harris, TX",73.137,72.696,72.847,79.658,80.626,87.278,94.310,91.961,...,4350992.0,4452976.0,4553991.0,4619635.0,4651955.0,4672445.0,4704042.0,4732491.0,4728030.0,2.382193
602,17031,"Cook, IL",95.406,94.886,95.455,97.260,99.315,101.320,101.826,99.238,...,5303041.0,5320233.0,5324961.0,5320293.0,5311621.0,5297956.0,5287099.0,5262741.0,5173146.0,2.633683


In [8]:
# you still can plot the few top ranked cities:
f.plot_kins()

2021


#### Use case: similarity of outcome with a lag

You care about similarity of outcome variables, but your question now is: what other locations were 2 years ago in a similar place to me now, when it comes to the outcome variable and the features?


In [10]:
f  = FipsQuery(42001, "gdp", lag = 2, top =5, time_decay = 1.06)
f.find_euclidean_kins()

display(f.weigth_plot.show())

None

In [11]:
f.find_euclidean_kins()
display(f.euclidean_kins)

Unnamed: 0,GeoFIPS,GeoName,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,distance to 42001
2236,42001,"Adams, PA",78.619,84.689,84.475,85.860,89.556,93.508,93.154,95.379,...,100.080,100.006,102.509,103.708,108.411,105.390,103.440,97.678,102.664,0.000000
960,20161,"Riley, KS",78.822,79.879,83.156,85.898,86.815,89.029,93.378,97.320,...,99.925,102.355,102.888,103.512,102.037,100.206,100.843,100.186,101.947,0.032475
587,17001,"Adams, IL",79.654,81.654,86.491,90.160,91.589,93.582,93.367,95.001,...,99.232,102.400,101.760,104.586,97.057,99.635,98.477,93.510,100.614,0.032873
1765,33019,"Sullivan, NH",87.624,93.209,91.829,96.587,97.861,95.699,97.592,92.842,...,101.387,102.845,106.400,101.283,101.850,99.204,99.205,101.346,109.571,0.033485
464,13171,"Lamar, GA",83.191,81.883,83.438,84.551,93.216,93.727,93.967,98.477,...,99.737,99.373,103.592,103.735,104.157,101.036,105.494,110.219,114.258,0.034046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2709,48389,"Reeves, TX",46.003,49.290,44.960,41.682,39.742,41.332,41.009,41.389,...,121.888,212.473,334.183,416.768,607.235,843.967,1226.531,1136.483,942.206,0.610786
2578,48127,"Dimmit, TX",9.381,10.070,10.269,10.101,10.527,13.382,13.424,12.854,...,169.087,208.912,253.553,226.448,235.298,230.138,256.745,206.367,159.131,0.619309
2665,48301,"Loving, TX",37.050,40.836,40.690,49.561,81.009,110.338,149.312,114.503,...,123.162,186.286,306.333,551.251,802.739,1064.149,1388.130,1453.540,1080.924,0.633216
2656,48283,"La Salle, TX",6.638,6.679,7.065,6.727,7.541,8.729,8.254,7.900,...,174.138,224.270,255.409,208.985,202.036,201.509,219.118,177.489,159.947,0.656968


In [12]:
f.plot_kins()

2021


#### Use case: similarity of outcome and other features with a lag

In [16]:
f  = FipsQuery(20003, outcome_var = "gdp",
               feature_groups= ["population"],
               weights= {"population":4}, 
               lag = 3, top =10, time_decay = 1.03)
f.find_euclidean_kins()
display(f.weigth_plot.show())

None

In [17]:
# if you want the full dataframe with distances, why not
#f.find_euclidean_kins()
#display(f.euclidean_kins)
# or, you can plot the few top ranked cities:
f.plot_kins()

2021
