In [1]:
# Imports
import pandas as pd
import numpy as np
import geopandas as gpd
import shapely
import requests
import googlemaps
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")
%run -i ../notebooks/functions/functions.py

## Population

### Population Import

In [2]:
# Read population csv
df = pd.read_csv("../raw_data/density/population/population.csv")

### Cleaning

In [3]:
# Separate 2000 and 2010 data
df_2000 = df.iloc[0:195]
df_2010 = df.iloc[195:]
df_2010 = df_2010.reset_index()

In [4]:
# Change names and drop unused columns
df_2010['ntacode'] = df_2010['NTA Code']
df_2010 = df_2010.drop(['index','Borough','Year','FIPS County Code','NTA Name','NTA Code'], axis=1)

### Geocoding

In [5]:
# Read shape data
poly = gpd.read_file("../raw_data/density/population/geo_export.shp")
# Invert lat-long to conform with our standard
poly['geometry'] = poly.geometry.map(lambda polygon: shapely.ops.transform(lambda x, y: (y, x), polygon))
# Find centroids of NTAs
poly.centroid
poly["NTA_lat"] = poly.centroid.x
poly["NTA_long"] = poly.centroid.y
# Create NTA object
NTAs = poly.drop(['boro_code', 'boro_name', 'county_fip', 'shape_area', 'shape_leng', 'geometry', 'ntaname'], axis=1)

### Additional Cleaning

In [6]:
# Fix index and drop additional unused column
population = NTAs.set_index('ntacode').join(df_2010.set_index('ntacode'))
geocoded_population = population.reset_index()
geocoded_population = geocoded_population.drop(['ntacode'], axis=1)

In [7]:
# Replace 0 values with median
geocoded_population = geocoded_population.replace(to_replace={'Population': {0: df['Population'].median()}}, value=None)

### Population Export

In [8]:
# Save processed CSV
geocoded_population.to_csv('../processed_data/density/geocoded_population.csv', index=False, encoding='utf-8')

## Car Traffic

### Traffic Import

In [9]:
# Read car traffic CSV
df = pd.read_csv("../raw_data/density/traffic/traffic.csv")

### Cleaning

In [10]:
# Separate 2020 data from the rest
df = df[df["Date"].str.contains("2020") == True]
# Convert hourly data to full day average
df['traffic'] = (df.iloc[:, 5:].sum(axis=1)) / 24
# Drop unused columns
df = df[['ID', 'Roadway Name', 'To', 'traffic']].copy()

In [11]:
# Bin roadways by ID
df_mean = df.groupby(['ID']).mean()
df_mean = df_mean.reset_index()

In [12]:
# Drop duplicates
df = df.drop(['traffic'], axis=1)
df = df.drop_duplicates()

In [13]:
# Prepare traffic object for geocoding
traffic = df.set_index('ID').join(df_mean.set_index('ID'))
traffic = traffic.reset_index()
traffic = traffic.drop_duplicates(subset=['ID'], keep='first')

### Geocoding

In [14]:
# THIS HAS BEEN COMMENTED OUT FOR SAKE OF MY WALLET, DATA HAS BEEN SAVED

In [15]:
 # Create google maps client object
#gmaps = googlemaps.Client(key='GOOGLE_API_KEY')

In [16]:
 # Pull lat-long data for each roadway bin
#lat_long = []
#for i in range(len(traffic)):
#    first = traffic.iloc[i][1]
#    second = traffic.iloc[i][2]
#    geocode_result = gmaps.geocode(f'{first}, {second}, NY')
#    coords = (geocode_result[0]['geometry']['location']['lat'], geocode_result[0]['geometry']['location']['lng'])
#    lat_long.append(coords)

In [17]:
 # Prepare lat-long data
#lat = []
#lng = []
#for i in range(349):
#    lat.append(lat_long[i][0])
#    lng.append(lat_long[i][1])

In [18]:
 # Append lat-long data to dataframe
#traffic = traffic.drop(['Roadway Name', 'To', 'ID'], axis=1)
#traffic['latitude'] = lat
#traffic['longitude'] = lng

### Traffic Export

In [19]:
# COMMENTED OUT TO AVOID ERRORS STEMMING FROM API REQUESTS
# Save car traffic CSV
# traffic.to_csv('../processed_data/density/geocoded_traffic.csv', index=False, encoding='utf-8')

## Pedestrian Traffic

### Pedestrian Import

In [20]:
# Read pedestrian CSV
df = pd.read_csv("../raw_data/density/pedestrian/pedestrian.csv")

### Cleaning

In [21]:
# Drop unused columns
df = df.drop(['Borough', 'Loc', 'OBJECTID', 'To_Street', 'Index'], axis=1)
# Select used columns
df = df[['the_geom', 'Street_Nam', 'From_Stree', 'May21_AM', 'May21_PM', 'May21_MD']]
# Drop corrupted value
df = df.drop(25)

In [22]:
# Average 2021 pedestrian traffic
df['ped_traffic'] = (df.iloc[:, 4:].sum(axis=1)) / 3
df = df.drop(['May21_AM', 'May21_PM', 'May21_MD'], axis=1)

### Geocoding

In [23]:
# Break down geometry data to individual lat-long
lat = []
lng = []
for i in range(113):
    try:
        string = df['the_geom'][i]
        words = string.split()
        lng.append(float(words[1][1:]))
        lat.append(float(words[2][:-1]))
    except KeyError:
        lat.append(0)
        lng.append(0)

### Additional Cleaning

In [24]:
# Drop unused columns
df = df.drop(['Street_Nam', 'From_Stree', 'the_geom'], axis=1)
# Rename lat-long as per our conventions
df['NTA_lat'] = lat
df['NTA_long'] = lng
# Remove broken lat-long data
df = df[df.NTA_lat != 0]
# Replace 0 values with median
df.replace(to_replace={'ped_traffic': {0: df['ped_traffic'].median()}}, value=None)

Unnamed: 0,ped_traffic,NTA_lat,NTA_long
0,1657.333333,40.879199,-73.904591
1,1169.000000,40.826628,-73.921884
2,3326.000000,40.862155,-73.895358
3,790.666667,40.881287,-73.878925
4,1255.333333,40.844637,-73.889564
...,...,...,...
109,157.666667,40.819495,-73.933305
110,60.666667,40.814131,-73.933133
111,58.333333,40.807555,-73.932530
112,29.666667,40.803769,-73.928961


### Pedestrian Export

In [25]:
# Save to pedestrian CSV
df.to_csv('../processed_data/density/geocoded_pedestrian.csv', index=False, encoding='utf-8')