# High Resolution Pop Density

Meta's High Resolution Population Density files are poorly formatted. Multiple CSVs with duplicate locations aren't the best way to store, access, or ingest this data into a spatial database. This notebook ingests the raw CSVs, converts them to Parquet files, and joins them based on location.

See:
https://data.humdata.org/dataset/united-states-high-resolution-population-density-maps-demographic-estimates

We're going to test a new pandas-like lib called "polars"

In [1]:
# ! pip install polars

In [2]:
import pandas as pd 
import polars as pl
from pathlib import Path
import os
from pathlib import Path

## Settings
Below are the settings to control the actions of this script.

In [17]:
# Raw CSV exports to process, one entry per downloaded file (multi-part
# datasets are listed part by part).
csv_list = [
    # 2020-03-07 demographic layers
    '/home/jovyan/data/HighDensityPop/USA_children_under_five_2020-03-07.csv',
    '/home/jovyan/data/HighDensityPop/USA_youth_15_24_2020-03-07.csv',
    '/home/jovyan/data/HighDensityPop/USA_elderly_60_plus_2020-03-07.csv',
    '/home/jovyan/data/HighDensityPop/USA_men_2020-03-07_part_1_of_2.csv',
    '/home/jovyan/data/HighDensityPop/USA_men_2020-03-07_part_2_of_2.csv',
    '/home/jovyan/data/HighDensityPop/USA_women_2020-03-07_part_1_of_2.csv',
    '/home/jovyan/data/HighDensityPop/USA_women_2020-03-07_part_2_of_2.csv',
    # 2019-07-01 total population, six parts
    '/home/jovyan/data/HighDensityPop/population_usa_2019-07-01_part_1_of_6.csv',
    '/home/jovyan/data/HighDensityPop/population_usa_2019-07-01_part_2_of_6.csv',
    '/home/jovyan/data/HighDensityPop/population_usa_2019-07-01_part_3_of_6.csv',
    '/home/jovyan/data/HighDensityPop/population_usa_2019-07-01_part_4_of_6.csv',
    '/home/jovyan/data/HighDensityPop/population_usa_2019-07-01_part_5_of_6.csv',
    '/home/jovyan/data/HighDensityPop/population_usa_2019-07-01_part_6_of_6.csv',
]

# Howard County envelope:
# POLYGON((-77.187113 39.103142,-77.187113 39.369323,-76.696774 39.369323,-76.696774 39.103142,-77.187113 39.103142))
# NOTE(review): the x names look swapped relative to the usual lower-left /
# upper-right convention (llx holds the eastern edge, urx the western one).
# The commented-out filter compares `urx <= longitude <= llx`, so the pair is
# self-consistent; rename both together if this is ever cleaned up.
llx, lly = -76.696774, 39.103142
urx, ury = -77.187113, 39.369323

# PostgreSQL server used by the database cells.
HOST = '10.152.63.171'
PORT = '5555'

# Alternate server:
# HOST = '10.0.0.91'
# PORT = '5555'

# Presumably the switch for the database insert step -- TODO confirm.
DO_INSERTS = True

## Steps

 * Read CSV file for category
 * Filter to Howard County
 * Write as Parquet

In [4]:
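# NOTE: one-off CSV -> Parquet conversion, currently disabled. The next section reads
# the '.parq' files this loop writes next to each CSV, so re-enable it on a fresh data directory.
# for csv_in in csv_list: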
#     print(csv_in)
#     df = pd.read_csv(csv_in)
#     df = df.rename(columns = {'Lat':'latitude','Lon':'longitude', 'Population':'population'} )
#     box1_cond = (lly <= df.latitude) & (df.latitude <= ury) & (urx <= df.longitude) & (df.longitude <= llx) 
#     df = df[box1_cond]
#     file_name = Path(csv_in).stem
#     df['type'] = file_name 
#     df.to_parquet(Path(csv_in).with_suffix('.parq'))
# df = None

## Next Steps
 * Read all Parquet files into a single dataframe
 * Collapse the `_part_N_of_M` file names in the `type` column into one name per dataset
 * Pivot on (latitude, longitude) so that each `type` becomes its own column

In [5]:
# Load every per-file Parquet extract in the data directory into one long frame.
data_dir = Path('/home/jovyan/data/HighDensityPop/')

# Sorted so the concatenation order does not depend on filesystem listing order.
parquet_files = sorted(data_dir.glob('*.parq'))
if not parquet_files:
    # pd.concat on an empty iterable fails with an opaque "No objects to
    # concatenate"; say what is actually missing instead.
    raise FileNotFoundError(
        f"No '*.parq' files found in {data_dir}; run the CSV -> Parquet conversion step first."
    )

full_df = pd.concat(pd.read_parquet(parquet_file) for parquet_file in parquet_files)


In [6]:
# Maps each source file stem (stored in the `type` column) to one label per
# dataset, so that the parts of a multi-part download share the same label.
type_dict = {
    # single-file layers keep their name
    'USA_children_under_five_2020-03-07': 'USA_children_under_five_2020-03-07',
    'USA_youth_15_24_2020-03-07': 'USA_youth_15_24_2020-03-07',
    'USA_elderly_60_plus_2020-03-07': 'USA_elderly_60_plus_2020-03-07',
    # two-part layers
    'USA_men_2020-03-07_part_1_of_2': 'USA_men_2020-03-07',
    'USA_men_2020-03-07_part_2_of_2': 'USA_men_2020-03-07',
    'USA_women_2020-03-07_part_1_of_2': 'USA_women_2020-03-07',
    'USA_women_2020-03-07_part_2_of_2': 'USA_women_2020-03-07',
    # six-part total population, relabelled
    'population_usa_2019-07-01_part_1_of_6': 'USA_total_2019-07-01',
    'population_usa_2019-07-01_part_2_of_6': 'USA_total_2019-07-01',
    'population_usa_2019-07-01_part_3_of_6': 'USA_total_2019-07-01',
    'population_usa_2019-07-01_part_4_of_6': 'USA_total_2019-07-01',
    'population_usa_2019-07-01_part_5_of_6': 'USA_total_2019-07-01',
    'population_usa_2019-07-01_part_6_of_6': 'USA_total_2019-07-01',
}

In [7]:
# Rewrite only the `type` column, swapping each file stem for its dataset label.
type_column_mapping = {'type': type_dict}
full_df = full_df.replace(to_replace=type_column_mapping)

In [8]:
# Long -> wide: one row per (latitude, longitude), one column per dataset label.
# pivot() raises if a (latitude, longitude, type) combination occurs more than once.
# NOTE(review): in the saved output every visible row shows
# USA_total_2019-07-01 == 0.0 -- worth checking whether the 2019 coordinates
# line up exactly with the 2020 layers.
location_keys = ['latitude', 'longitude']
full_df = full_df.pivot(index=location_keys, columns='type', values='population')

# 2020 total = men + women; sum(axis=1) skips NaN, so a missing layer counts as 0.
sex_columns = ['USA_men_2020-03-07', 'USA_women_2020-03-07']
full_df['Total_2020'] = full_df[sex_columns].sum(axis=1)

In [9]:
# Fill the gaps left by the pivot with 0 and move latitude/longitude from the
# index back into ordinary columns, then save a CSV copy.
full_df = full_df.fillna(0).reset_index()
full_df.to_csv('/home/jovyan/data/HighDensityPop/population_howard_county_full.csv')

In [10]:
# Preview the pivoted frame (the notebook display truncates it to the first and last rows).
full_df

type,latitude,longitude,USA_children_under_five_2020-03-07,USA_elderly_60_plus_2020-03-07,USA_men_2020-03-07,USA_total_2019-07-01,USA_women_2020-03-07,USA_youth_15_24_2020-03-07,Total_2020
0,39.103194,-77.173194,0.243826,0.929588,2.095382,0.0,2.461122,0.457174,4.556504
1,39.103194,-77.172917,0.243826,0.929588,2.095382,0.0,2.461122,0.457174,4.556504
2,39.103194,-77.172639,0.243826,0.929588,2.095382,0.0,2.461122,0.457174,4.556504
3,39.103194,-77.172361,0.243826,0.929588,2.095382,0.0,2.461122,0.457174,4.556504
4,39.103194,-77.169861,0.317703,0.907723,1.225426,0.0,2.133149,0.317703,3.358575
...,...,...,...,...,...,...,...,...,...
484386,39.369306,-76.699306,0.724950,1.933199,2.658149,0.0,3.866398,0.000000,6.524547
484387,39.369306,-76.697917,0.165703,0.138086,0.938982,0.0,1.021834,0.276171,1.960816
484388,39.369306,-76.697639,0.165703,0.138086,0.938982,0.0,1.021834,0.276171,1.960816
484389,39.369306,-76.697361,0.165703,0.138086,0.938982,0.0,1.021834,0.276171,1.960816


## Write dataframe to table


In [11]:
import geopandas as gpd
from sqlalchemy import create_engine
import psycopg2 as pg
import psycopg2.extras as extras

In [12]:
# Build a point geometry per row from longitude (x) and latitude (y), in EPSG:4326.
point_geometry = gpd.points_from_xy(full_df['longitude'], full_df['latitude'])
gdf = gpd.GeoDataFrame(full_df, geometry=point_geometry, crs="EPSG:4326")

# Explicitly mark "geometry" as the active geometry column.
gdf = gdf.set_geometry("geometry")

In [13]:
# Save the point layer as GeoParquet (path is relative to the working directory).
geoparquet_path = './data/high_density_pop.parq'
gdf.to_parquet(geoparquet_path)

In [18]:
# Append the point layer to the PostGIS table pop.high_res_pop.
# https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoDataFrame.to_postgis.html
from urllib.parse import quote_plus  # local import: only this cell needs it

# Prefer the standard libpq PGPASSWORD variable over the literal kept in the
# notebook; the literal stays only as a fallback so existing setups keep working.
# NOTE(review): drop the fallback once PGPASSWORD is set wherever this runs.
db_password = os.environ.get("PGPASSWORD", "super_secret_password")

# quote_plus keeps special characters in the password from corrupting the URL.
engine = create_engine(f"postgresql://gmu:{quote_plus(db_password)}@{HOST}:{PORT}/deer")

# if_exists='append' adds the rows again on every run, so honour the DO_INSERTS
# switch from the settings cell instead of inserting unconditionally.
if DO_INSERTS:
    gdf.to_postgis("high_res_pop", engine, schema="pop", if_exists='append')


## Refresh Materialized Views!

In [19]:
# Batch of REFRESH MATERIALIZED VIEW statements, sent to the database as a single
# string by the cell that executes SQL.
# NOTE(review): the order presumably matters if some views read from others -- confirm before reordering.
SQL = '''
REFRESH MATERIALIZED VIEW deer.lead_lag_positions WITH DATA;
REFRESH MATERIALIZED VIEW deer.monthly_hull WITH DATA;
REFRESH MATERIALIZED VIEW deer.heatmap_hex_grid_hourly WITH DATA;
REFRESH MATERIALIZED VIEW deer.heatmap_hex_grid WITH DATA;
REFRESH MATERIALIZED VIEW pop.population_hex_summary WITH DATA; 
REFRESH MATERIALIZED VIEW postgisftw.commercial_poi WITH DATA;
REFRESH MATERIALIZED VIEW pop.annual_raw_visit_counts_kde_2019 WITH DATA;
REFRESH MATERIALIZED VIEW pop.annual_raw_visit_counts_kde_2018 WITH DATA; 

REFRESH MATERIALIZED VIEW postgisftw.hex_environmental_features_30m WITH DATA; 
'''


In [20]:
%%time

conn = pg.connect(
    host=HOST, 
    port=PORT, 
    dbname='deer', 
    user='gmu', 
    password='super_secret_password', 
    connect_timeout=5
) 
cursor = conn.cursor()
cursor.execute(SQL)
conn.commit()
cursor.close()
conn.close()

CPU times: user 286 ms, sys: 922 ms, total: 1.21 s
Wall time: 50.7 s
