In [31]:
import json
import os
import geopandas as gpd
from shapely.geometry import shape
from shapely.geometry import Polygon
import pandas as pd


Census tract zones: https://www.nyc.gov/site/planning/data-maps/open-data/census-download-metadata.page

Taxi Zones: https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc

In [9]:
cwd = os.getcwd()
print("Current Working Directory:", cwd)

Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning


In [14]:
data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_other")
print(f"Data directory: {data_dir}")

# Check if the directory exists
if not os.path.exists(data_dir):
    print(f"Directory {data_dir} does not exist")
else:
    # List all files in the directory to check for existence and naming
    all_files_in_dir = os.listdir(data_dir)
    print(f"Files in directory {data_dir}: {all_files_in_dir}")

all_files = []

Data directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other
Files in directory c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other: ['combined_df.csv', 'combined_df_all_zones.csv', 'fhv_2021_01.csv', 'green_2021_01.csv', 'taxi_zones_alternate.csv', 'taxi_zones_raw.geojson', 'taxi_zone_lookup.csv', 'tract_zones_raw.geojson', 'yellow_2021_01.csv']


In [15]:
#input file
taxi_zones_raw_file = os.path.join(cwd, data_dir, "taxi_zones_raw.geojson")

#output file
taxi_zones_manhattan = os.path.join(cwd, data_dir, "taxi_zones_manhattan.geojson")

In [16]:
# Function to load and filter GeoJSON
def filter_taxi_geojson(input_file, output_file):
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    # Filter features where "borough" == "Manhattan"
    filtered_features = [feature for feature in data['features'] if feature['properties'].get('borough') == 'Manhattan']
    
    # Create a new GeoJSON structure
    filtered_geojson = {
        "type": "FeatureCollection",
        "features": filtered_features
    }
    
    # Write the filtered GeoJSON to a new file
    with open(output_file, 'w') as f:
        json.dump(filtered_geojson, f, indent=2)

In [17]:
# Filter the GeoJSON file
filter_taxi_geojson(taxi_zones_raw_file, taxi_zones_manhattan)

print(f"Filtered GeoJSON has been saved to {taxi_zones_manhattan}")

Filtered GeoJSON has been saved to c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other\taxi_zones_manhattan.geojson


In [18]:
#input file
tract_zones_raw_file = os.path.join(cwd, data_dir, "tract_zones_raw.geojson")

#output file
tract_zones_manhattan = os.path.join(cwd, data_dir, "tract_zones_manhattan.geojson")

In [19]:
def filter_tract_geojson(input_file, output_file):
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    # Filter features where "BoroName" == "Manhattan"
    filtered_features = [feature for feature in data['features'] if feature['properties'].get('BoroName') == 'Manhattan']
    
    # Create a new GeoJSON structure
    filtered_geojson = {
        "type": "FeatureCollection",
        "crs": data.get('crs', {}),
        "features": filtered_features
    }
    
    # Write the filtered GeoJSON to a new file
    with open(output_file, 'w') as f:
        json.dump(filtered_geojson, f, indent=2)

In [20]:
# Filter the GeoJSON file
filter_tract_geojson(tract_zones_raw_file, tract_zones_manhattan)

print(f"Filtered GeoJSON has been saved to {tract_zones_manhattan}")

Filtered GeoJSON has been saved to c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other\tract_zones_manhattan.geojson


In [25]:
# Load census tract GeoJSON
census_tracts = gpd.read_file(os.path.join(cwd, data_dir, "tract_zones_manhattan.geojson"))

# Load taxi zones GeoJSON
taxi_zones = gpd.read_file(os.path.join(cwd, data_dir, "taxi_zones_manhattan.geojson"))

In [26]:
# Create spatial index for census tracts
census_tracts_sindex = census_tracts.sindex

In [27]:
"""
Find Intersecting Census Tracts for Each Taxi Zone
"""

# Function to find intersecting census tracts
def find_intersections(taxi_zone_geom, census_tracts, sindex):
    possible_matches_index = list(sindex.intersection(taxi_zone_geom.bounds))
    possible_matches = census_tracts.iloc[possible_matches_index]
    precise_matches = possible_matches[possible_matches.intersects(taxi_zone_geom)]
    return precise_matches


Store the results: Loop through each taxi zone, use find_intersections to find intersecting census tracts, and calculate the intersection areas and area changes. Store the CTLabels, original areas, intersection areas, and area changes in a dictionary.

In [29]:
""" 
Loop through each taxi zone to find intersecting census tracts and record the required information:
"""

# Dictionary to store the results
taxi_zone_to_census_tracts = {}
taxi_zone_census_tracts_info = {}

for idx, taxi_zone in taxi_zones.iterrows():
    taxi_zone_geom = taxi_zone.geometry
    intersecting_census_tracts = find_intersections(taxi_zone_geom, census_tracts, census_tracts_sindex)
    
    census_tracts_info = []
    for _, census_tract in intersecting_census_tracts.iterrows():
        intersection = taxi_zone_geom.intersection(census_tract.geometry)
        intersection_area = intersection.area
        area_change_percentage = (intersection_area / census_tract.geometry.area) * 100
        
        tract_info = {
            'CTLabel': census_tract['CTLabel'],
            'original_area': census_tract.geometry.area,
            'intersection_area': intersection_area,
            'area_change_percentage': area_change_percentage
        }
        
        census_tracts_info.append(tract_info)
    
    taxi_zone_census_tracts_info[taxi_zone['location_id']] = census_tracts_info

print(taxi_zone_census_tracts_info)

{'4': [{'CTLabel': '10.02', 'original_area': 2.1731498272724617e-05, 'intersection_area': 2.606048193739001e-08, 'area_change_percentage': 0.11992031847200676}, {'CTLabel': '22.01', 'original_area': 1.7230653405222628e-05, 'intersection_area': 3.962622996295975e-08, 'area_change_percentage': 0.22997520193255702}, {'CTLabel': '20', 'original_area': 1.3735311799389601e-05, 'intersection_area': 1.3514220476494598e-05, 'area_change_percentage': 98.39034361852035}, {'CTLabel': '22.02', 'original_area': 5.98002396453943e-06, 'intersection_area': 5.971196701988194e-06, 'area_change_percentage': 99.85238750540833}, {'CTLabel': '26.01', 'original_area': 1.1300576377131507e-05, 'intersection_area': 1.1289065171391883e-05, 'area_change_percentage': 99.89813611841146}, {'CTLabel': '24', 'original_area': 1.8453785084052236e-05, 'intersection_area': 1.8058330180640233e-05, 'area_change_percentage': 97.8570526230212}, {'CTLabel': '26.02', 'original_area': 1.1039818818277425e-05, 'intersection_area': 

In [40]:
census_dir = os.path.join(os.getcwd(), "..", "Datasets", "census")
print(f"Data directory: {census_dir}")

# Check if the directory exists
if not os.path.exists(census_dir):
    print(f"Directory {census_dir} does not exist")
else:
    # List all files in the directory to check for existence and naming
    all_files_in_dir = os.listdir(census_dir)
    print(f"Files in directory {census_dir}: {all_files_in_dir}")

all_files = []

Data directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\census
Files in directory c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\census: ['DECENNIALDP2020.DP1-2024-05-27T181726.csv', 'DECENNIALDP2020_census_tract.csv', 'DECENNIALDP2020_zips.csv', 'NYC_Census_Tracts_2020.csv']


In [41]:
# Define the file path relative to the data directory
census_data_path = os.path.join(cwd, census_dir, "DECENNIALDP2020_census_tract.csv")

census_data = pd.read_csv(census_data_path, keep_default_na=True, delimiter=",", skipinitialspace=True, encoding='utf-8-sig')

In [43]:
census_data.columns = census_data.columns.str.replace('ï»¿', '')

# Create a mapping from CTLabel to census tract columns
census_tract_columns = [col for col in census_data.columns if 'Census Tract' in col and 'Count' in col]
census_tract_map = {col.split('!!')[0]: col for col in census_tract_columns}

In [44]:
census_data

Unnamed: 0,Label (Grouping),"Manhattan borough, New York County, New York!!Count","Manhattan borough, New York County, New York!!Percent",Census Tract 1; New York County; New York!!Count,Census Tract 1; New York County; New York!!Percent,Census Tract 2.01; New York County; New York!!Count,Census Tract 2.01; New York County; New York!!Percent,Census Tract 2.02; New York County; New York!!Count,Census Tract 2.02; New York County; New York!!Percent,Census Tract 5; New York County; New York!!Count,...,Census Tract 309; New York County; New York!!Count,Census Tract 309; New York County; New York!!Percent,Census Tract 311; New York County; New York!!Count,Census Tract 311; New York County; New York!!Percent,Census Tract 317.03; New York County; New York!!Count,Census Tract 317.03; New York County; New York!!Percent,Census Tract 317.04; New York County; New York!!Count,Census Tract 317.04; New York County; New York!!Percent,Census Tract 319; New York County; New York!!Count,Census Tract 319; New York County; New York!!Percent
0,SEX AND AGE,,,,,,,,,,...,,,,,,,,,,
1,Total population,1694251,100.0%,0,-,2012,100.0%,7266,100.0%,5.0,...,8594,100.0%,12.0,100.0%,5847,100.0%,10422,100.0%,3.0,100.0%
2,Under 5 years,68849,4.1%,0,-,65,3.2%,277,3.8%,1.0,...,477,5.6%,2.0,16.7%,678,11.6%,718,6.9%,0.0,0.0%
3,5 to 9 years,63039,3.7%,0,-,95,4.7%,347,4.8%,0.0,...,482,5.6%,0.0,0.0%,561,9.6%,581,5.6%,1.0,33.3%
4,10 to 14 years,63419,3.7%,0,-,110,5.5%,344,4.7%,0.0,...,572,6.7%,0.0,0.0%,355,6.1%,531,5.1%,0.0,0.0%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,Rental vacancy rate (percent) [5],6.1,(X),0.0,(X),4.2,(X),2.5,(X),100.0,...,1.6,(X),0.0,(X),10.7,(X),7.1,(X),0.0,(X)
170,HOUSING TENURE,,,,,,,,,,...,,,,,,,,,,
171,Occupied housing units,817782,100.0%,0,-,804,100.0%,3386,100.0%,0.0,...,3483,100.0%,0.0,-,2407,100.0%,5011,100.0%,0.0,-
172,Owner-occupied housing units,191489,23.4%,0,-,11,1.4%,949,28.0%,0.0,...,194,5.6%,0.0,-,233,9.7%,1547,30.9%,0.0,-


In [45]:
# Function to adjust demographic data
def adjust_demographic_data(census_tracts_info, census_data, census_tract_map):
    adjusted_data = {}
    for tract_info in census_tracts_info:
        ctlabel = tract_info['CTLabel']
        area_change_percentage = tract_info['area_change_percentage'] / 100
        
        if ctlabel in census_tract_map:
            tract_column = census_tract_map[ctlabel]
            tract_data = census_data[tract_column]
            
            for index, value in tract_data.items():
                if index not in adjusted_data:
                    adjusted_data[index] = 0
                if pd.notna(value):
                    adjusted_data[index] += value * area_change_percentage
    
    return adjusted_data


In [46]:
# Dictionary to store adjusted demographic data for each taxi zone
taxi_zone_demographics = {}

for location_id, census_tracts_info in taxi_zone_census_tracts_info.items():
    adjusted_data = adjust_demographic_data(census_tracts_info, census_data, census_tract_map)
    taxi_zone_demographics[location_id] = adjusted_data

In [48]:
# Convert the dictionary to a DataFrame for easier handling
taxi_zone_demographics_df = pd.DataFrame.from_dict(taxi_zone_demographics, orient='index')

# Save the adjusted demographic data to a new CSV file
taxi_zone_demographics_df.to_csv(os.path.join(data_dir, 'adjusted_taxi_zone_demographics.csv'))

# Print the resulting DataFrame
print(taxi_zone_demographics_df.head())

Empty DataFrame
Columns: []
Index: []
