In [1070]:
import json
import os
import geopandas as gpd
from shapely.geometry import shape
from shapely.geometry import Polygon
from shapely.ops import unary_union
import pandas as pd
from shapely import wkt

Census tract zones: https://www.nyc.gov/site/planning/data-maps/open-data/census-download-metadata.page

Taxi Zones: https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc

In [1071]:
cwd = os.getcwd()
print("Current Working Directory:", cwd)

Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning


In [1072]:
data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_other")
print(f"Data directory: {data_dir}")

# Check if the directory exists
if not os.path.exists(data_dir):
    print(f"Directory {data_dir} does not exist")
else:
    # List all files in the directory to check for existence and naming
    all_files_in_dir = os.listdir(data_dir)
    print(f"Files in directory {data_dir}: {all_files_in_dir}")

all_files = []

Data directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other
Files in directory c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other: ['census_tracts_coords.csv', 'combined_df.csv', 'combined_df_all_zones.csv', 'fhv_2021_01.csv', 'green_2021_01.csv', 'taxi_zones_alternate.csv', 'taxi_zones_coords.csv', 'taxi_zones_manhattan.geojson', 'taxi_zones_raw.geojson', 'taxi_zone_lookup.csv', 'tract_zones_manhattan.geojson', 'tract_zones_raw.geojson', 'yellow_2021_01.csv']


In [1073]:
#input file
taxi_zones_raw_file = os.path.join(cwd, data_dir, "taxi_zones_raw.geojson")

#output file
taxi_zones_manhattan = os.path.join(cwd, data_dir, "taxi_zones_manhattan.geojson")

In [1074]:
# Function to load and filter GeoJSON
def filter_taxi_geojson(input_file, output_file):
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    # Filter features where "borough" == "Manhattan"
    filtered_features = [feature for feature in data['features'] if feature['properties'].get('borough') == 'Manhattan']
    
    # Create a new GeoJSON structure
    filtered_geojson = {
        "type": "FeatureCollection",
        "features": filtered_features
    }
    
    # Write the filtered GeoJSON to a new file
    with open(output_file, 'w') as f:
        json.dump(filtered_geojson, f, indent=2)

In [1075]:
# Filter the GeoJSON file
filter_taxi_geojson(taxi_zones_raw_file, taxi_zones_manhattan)

print(f"Filtered GeoJSON has been saved to {taxi_zones_manhattan}")

Filtered GeoJSON has been saved to c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other\taxi_zones_manhattan.geojson


In [1076]:
#input file
tract_zones_raw_file = os.path.join(cwd, data_dir, "tract_zones_raw.geojson")

#output file
tract_zones_manhattan = os.path.join(cwd, data_dir, "tract_zones_manhattan.geojson")

In [1077]:
def filter_tract_geojson(input_file, output_file):
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    # Filter features where "BoroName" == "Manhattan"
    filtered_features = [feature for feature in data['features'] if feature['properties'].get('BoroName') == 'Manhattan']
    
    # Create a new GeoJSON structure
    filtered_geojson = {
        "type": "FeatureCollection",
        "crs": data.get('crs', {}),
        "features": filtered_features
    }
    
    # Write the filtered GeoJSON to a new file
    with open(output_file, 'w') as f:
        json.dump(filtered_geojson, f, indent=2)

In [1078]:
# Filter the GeoJSON file
filter_tract_geojson(tract_zones_raw_file, tract_zones_manhattan)

print(f"Filtered GeoJSON has been saved to {tract_zones_manhattan}")

Filtered GeoJSON has been saved to c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other\tract_zones_manhattan.geojson


In [1079]:
census_data_dir = os.path.join(os.getcwd(), "..", "Datasets", "census")

# Define the directory where the data is located relative to the current working directory
print("Census CSV Directory:", census_data_dir)

# Define the file path relative to the data directory
census_data_path = os.path.join(cwd, census_data_dir, "census_manhat_data.csv")

census_data = pd.read_csv(census_data_path, keep_default_na=True, delimiter=",", skipinitialspace=True, encoding='utf-8-sig')

Census CSV Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\census


In [1080]:
strings_to_remove = ["New", "York", "County", "!!", "borough", ";", ","]

# Function to clean column names
def clean_column_names(column):
    for string in strings_to_remove:
        column = column.replace(string, "").strip()
    return column

census_data.columns = census_data.columns.map(clean_column_names)

# Define a function to rename columns
def rename_columns(columns):
    new_columns = []
    for col in columns:
        # Split by spaces, join with single underscore, convert to lowercase
        new_col = '_'.join(col.split()).lower()
        new_columns.append(new_col)
    return new_columns

# Rename columns using the function
census_data.columns = rename_columns(census_data.columns)

In [1081]:
census_data = census_data.head(120)

# Filter columns that end with '_count' along with 'label_(grouping)'
count_columns = ['label_(grouping)'] + [col for col in census_data.columns if col.endswith('_count')]

# Display the filtered columns
census_data = census_data[count_columns]

In [1082]:
census_data

Unnamed: 0,label_(grouping),manhattan_count,census_tract_1_count,census_tract_2.01_count,census_tract_2.02_count,census_tract_5_count,census_tract_6_count,census_tract_7_count,census_tract_8_count,census_tract_9_count,...,census_tract_295_count,census_tract_297_count,census_tract_299_count,census_tract_303_count,census_tract_307_count,census_tract_309_count,census_tract_311_count,census_tract_317.03_count,census_tract_317.04_count,census_tract_319_count
0,SEX AND AGE,,,,,,,,,,...,,,,,,,,,,
1,Total population,1694251,0,2012,7266,5.0,11616,10542,10871,2016,...,7039,16.0,3598,3691,3427,8594,12.0,5847,10422,3.0
2,Under 5 years,68849,0,65,277,1.0,407,420,382,144,...,338,0.0,156,190,126,477,2.0,678,718,0.0
3,5 to 9 years,63039,0,95,347,0.0,477,308,401,69,...,302,0.0,197,140,146,482,0.0,561,581,1.0
4,10 to 14 years,63419,0,110,344,0.0,536,218,401,61,...,282,3.0,194,146,153,572,0.0,355,531,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,American Indian and Alaska Native ...,1895,0,0,12,0.0,16,7,4,0,...,5,0.0,3,1,2,18,1.0,0,5,0.0
116,Asian alone,219624,0,702,1832,2.0,6027,1894,8708,357,...,242,0.0,25,133,155,130,0.0,1029,2435,0.0
117,Native Hawaiian and Other Pacific ...,882,0,0,0,0.0,6,10,5,4,...,1,0.0,0,4,0,1,0.0,2,5,0.0
118,Some Other Race alone,13335,0,11,55,0.0,42,88,53,28,...,46,0.0,27,35,23,56,0.0,31,77,0.0


In [1083]:
# Define the file path relative to the data directory
census_data_path_renamed = os.path.join(cwd, census_data_dir, "census_data.csv")

# Save the DataFrame to CSV
census_data.to_csv(census_data_path_renamed, index=False, encoding='utf-8-sig')

print("DataFrame saved to:", census_data_path_renamed)

DataFrame saved to: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\census\census_data.csv


In [1084]:
census_data = pd.read_csv(census_data_path_renamed, keep_default_na=True, delimiter=",", skipinitialspace=True, encoding='utf-8-sig')

In [1085]:
# Load census tract and taxi zones GeoJSON
census_tracts_geojson = gpd.read_file(os.path.join(cwd, data_dir, "tract_zones_manhattan.geojson"))
taxi_zones_geojson = gpd.read_file(os.path.join(cwd, data_dir, "taxi_zones_manhattan.geojson"))

In [1086]:
# Define the file path
file_path = os.path.join(data_dir, "census_tracts_coords.csv")

# Save the DataFrame to CSV
census_tracts_geojson.to_csv(file_path, index=False)

print("DataFrame saved to:", file_path)

DataFrame saved to: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other\census_tracts_coords.csv


In [1087]:
# Define the file path
file_path = os.path.join(data_dir, "taxi_zones_coords.csv")

# Save the DataFrame to CSV
taxi_zones_geojson.to_csv(file_path, index=False)

print("DataFrame saved to:", file_path)

DataFrame saved to: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other\taxi_zones_coords.csv


In [1088]:
census_tracts = pd.read_csv(os.path.join(cwd, data_dir, "census_tracts_coords.csv"))
taxi_zones = pd.read_csv(os.path.join(cwd, data_dir, "taxi_zones_coords.csv"))

In [1089]:
# Function to calculate intersection and percentage overlap
def calculate_overlap(census_geom, taxi_geom):
    intersection = census_geom.intersection(taxi_geom)
    intersection_area = intersection.area
    census_area = census_geom.area
    percentage_overlap = (intersection_area / census_area) * 100
    if percentage_overlap >= 99:
        percentage_overlap = 100
    elif percentage_overlap <= 1:
        percentage_overlap = 0
    return intersection_area, census_area, percentage_overlap

In [1090]:
# Ensure all values in the geometry columns are strings
census_tracts['geometry_tract'] = census_tracts['geometry'].astype(str)
taxi_zones['geometry_taxi'] = taxi_zones['geometry'].astype(str)

# Convert geometries to shapely objects
census_tracts['geometry_tract'] = census_tracts['geometry'].apply(wkt.loads)
taxi_zones['geometry_taxi'] = taxi_zones['geometry'].apply(wkt.loads)

# Initialize an empty list to store the results
overlap_results = []

In [1091]:
# Calculate intersections and percentage overlap
for idx_tract, tract_row in census_tracts.iterrows():
    for idx_taxi, taxi_row in taxi_zones.iterrows():
        tract_geom = tract_row['geometry_tract']
        taxi_geom = taxi_row['geometry_taxi']
        
        if tract_geom.intersects(taxi_geom):
            intersection_area, tract_area, percentage_overlap = calculate_overlap(tract_geom, taxi_geom)
            
            # Append only if percentage_overlap > 0
            if percentage_overlap > 0:
                overlap_results.append({
                    'CTLabel': tract_row['CTLabel'],
                    'location_id': taxi_row['location_id'],
                    'intersection_area': intersection_area,
                    'tract_area': tract_area,
                    'percentage_overlap': percentage_overlap
                })

# Convert the results into a DataFrame
overlap_df = pd.DataFrame(overlap_results)

In [1092]:
# Define the file path relative to the data directory
overlap_df_path = os.path.join(cwd, census_data_dir, "overlap_df.csv")

# Save the DataFrame to CSV
overlap_df.to_csv(overlap_df_path, index=False, encoding='utf-8-sig')

print("DataFrame saved to:", overlap_df)

DataFrame saved to:      CTLabel  location_id  intersection_area  tract_area  percentage_overlap
0       1.00          103       6.284910e-06    0.000018           34.452613
1       1.00          103       1.184074e-05    0.000018           64.908539
2       2.01          232       9.626498e-06    0.000010          100.000000
3       6.00          232       2.554676e-05    0.000026          100.000000
4      14.01          232       9.943394e-06    0.000010          100.000000
..       ...          ...                ...         ...                 ...
313   317.04          261       5.679329e-07    0.000024            2.378531
314   194.00           74       1.866329e-05    0.000019          100.000000
315   242.00           74       3.679666e-05    0.000037          100.000000
316    13.00          261       3.368247e-05    0.000034          100.000000
317    21.00          231       2.532817e-05    0.000025          100.000000

[318 rows x 5 columns]


In [1093]:
# # Filter rows where percentage_overlap is >= 99 or <= 1
# filtered_df = overlap_df[(overlap_df['percentage_overlap'] >= 99) | (overlap_df['percentage_overlap'] <= 1)]

# # Get the number of rows in the filtered DataFrame
# num_rows = filtered_df.shape[0]
# num_rows

# 628 of 655 areas with an overlap >99% or less than 1%. rounding above code to 0% or 100%

In [1094]:
overlap_df

Unnamed: 0,CTLabel,location_id,intersection_area,tract_area,percentage_overlap
0,1.00,103,6.284910e-06,0.000018,34.452613
1,1.00,103,1.184074e-05,0.000018,64.908539
2,2.01,232,9.626498e-06,0.000010,100.000000
3,6.00,232,2.554676e-05,0.000026,100.000000
4,14.01,232,9.943394e-06,0.000010,100.000000
...,...,...,...,...,...
313,317.04,261,5.679329e-07,0.000024,2.378531
314,194.00,74,1.866329e-05,0.000019,100.000000
315,242.00,74,3.679666e-05,0.000037,100.000000
316,13.00,261,3.368247e-05,0.000034,100.000000


In [1095]:
# # Standardize CTLabel values to match the census tract column naming conventions
overlap_df['CTLabel'] = overlap_df['CTLabel'].apply(lambda x: str(int(float(x))) if float(x).is_integer() else str(x))
overlap_df

Unnamed: 0,CTLabel,location_id,intersection_area,tract_area,percentage_overlap
0,1,103,6.284910e-06,0.000018,34.452613
1,1,103,1.184074e-05,0.000018,64.908539
2,2.01,232,9.626498e-06,0.000010,100.000000
3,6,232,2.554676e-05,0.000026,100.000000
4,14.01,232,9.943394e-06,0.000010,100.000000
...,...,...,...,...,...
313,317.04,261,5.679329e-07,0.000024,2.378531
314,194,74,1.866329e-05,0.000019,100.000000
315,242,74,3.679666e-05,0.000037,100.000000
316,13,261,3.368247e-05,0.000034,100.000000


In [1096]:
taxi_zone_ids = overlap_df['location_id'].unique()
print(len(taxi_zone_ids))
taxi_zone_ids

67


array([103, 232, 148,   4, 231,  79, 125, 144, 234, 114, 113, 249, 107,
       137, 170, 164,  90,  68, 162, 161, 186, 246, 229, 100, 163, 237,
       141, 230,  48, 140, 262, 236, 142, 143, 263,  75, 239, 238,  74,
        41, 151,  24, 166,  42, 152, 116, 244, 243, 127, 153,  87, 224,
        88, 211,  45,  50,  43, 233, 194,  13, 120, 202, 209,  12, 158,
       128, 261], dtype=int64)

In [1097]:
# # Extracting all unique "CTLabel" values
# ct_labels = overlap_df['CTLabel'].unique()

# # Calculating the length of the unique "CTLabel" values list
# length_of_ct_labels = len(ct_labels.tolist())
# length_of_ct_labels

In [1098]:
census_data.head()

Unnamed: 0,label_(grouping),manhattan_count,census_tract_1_count,census_tract_2.01_count,census_tract_2.02_count,census_tract_5_count,census_tract_6_count,census_tract_7_count,census_tract_8_count,census_tract_9_count,...,census_tract_295_count,census_tract_297_count,census_tract_299_count,census_tract_303_count,census_tract_307_count,census_tract_309_count,census_tract_311_count,census_tract_317.03_count,census_tract_317.04_count,census_tract_319_count
0,SEX AND AGE,,,,,,,,,,...,,,,,,,,,,
1,Total population,1694251.0,0.0,2012.0,7266.0,5.0,11616.0,10542.0,10871.0,2016.0,...,7039.0,16.0,3598.0,3691.0,3427.0,8594.0,12.0,5847.0,10422.0,3.0
2,Under 5 years,68849.0,0.0,65.0,277.0,1.0,407.0,420.0,382.0,144.0,...,338.0,0.0,156.0,190.0,126.0,477.0,2.0,678.0,718.0,0.0
3,5 to 9 years,63039.0,0.0,95.0,347.0,0.0,477.0,308.0,401.0,69.0,...,302.0,0.0,197.0,140.0,146.0,482.0,0.0,561.0,581.0,1.0
4,10 to 14 years,63419.0,0.0,110.0,344.0,0.0,536.0,218.0,401.0,61.0,...,282.0,3.0,194.0,146.0,153.0,572.0,0.0,355.0,531.0,0.0


In [1100]:
# number_of_columns = len(census_data.columns.tolist())
# number_of_columns

In [1101]:
# Extract demographic data columns
demographic_columns = census_data.columns[census_data.columns.str.startswith('census_tract_')]

# Filter only the relevant demographic columns along with the labels
relevant_demographic_data = census_data[['label_(grouping)', 'manhattan_count'] + list(demographic_columns)]

# Display the first few rows of the relevant demographic data
relevant_demographic_data

Unnamed: 0,label_(grouping),manhattan_count,census_tract_1_count,census_tract_2.01_count,census_tract_2.02_count,census_tract_5_count,census_tract_6_count,census_tract_7_count,census_tract_8_count,census_tract_9_count,...,census_tract_295_count,census_tract_297_count,census_tract_299_count,census_tract_303_count,census_tract_307_count,census_tract_309_count,census_tract_311_count,census_tract_317.03_count,census_tract_317.04_count,census_tract_319_count
0,SEX AND AGE,,,,,,,,,,...,,,,,,,,,,
1,Total population,1694251,0,2012,7266,5.0,11616,10542,10871,2016,...,7039,16.0,3598,3691,3427,8594,12.0,5847,10422,3.0
2,Under 5 years,68849,0,65,277,1.0,407,420,382,144,...,338,0.0,156,190,126,477,2.0,678,718,0.0
3,5 to 9 years,63039,0,95,347,0.0,477,308,401,69,...,302,0.0,197,140,146,482,0.0,561,581,1.0
4,10 to 14 years,63419,0,110,344,0.0,536,218,401,61,...,282,3.0,194,146,153,572,0.0,355,531,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,American Indian and Alaska Native ...,1895,0,0,12,0.0,16,7,4,0,...,5,0.0,3,1,2,18,1.0,0,5,0.0
116,Asian alone,219624,0,702,1832,2.0,6027,1894,8708,357,...,242,0.0,25,133,155,130,0.0,1029,2435,0.0
117,Native Hawaiian and Other Pacific ...,882,0,0,0,0.0,6,10,5,4,...,1,0.0,0,4,0,1,0.0,2,5,0.0
118,Some Other Race alone,13335,0,11,55,0.0,42,88,53,28,...,46,0.0,27,35,23,56,0.0,31,77,0.0


In [1102]:
# Initialize a DataFrame to store the adjusted demographic data for each taxi zone
taxi_zone_ids = overlap_df['location_id'].unique()
adjusted_demographic_df = pd.DataFrame(index=census_data.index[1:], columns=taxi_zone_ids).fillna(0)

# Iterate through each row in the overlap_df DataFrame to get the intersection details
for idx, row in overlap_df.iterrows():
    ct_label = row['CTLabel']
    location_id = row['location_id']
    percentage_overlap = row['percentage_overlap'] / 100.0  # Convert to a fraction

    # Extract the relevant columns for the current census tract
    ct_columns = [col for col in demographic_columns if col.startswith(f'census_tract_{ct_label}')]

    for col in ct_columns:
        # Extract the demographic count
        demographic_count = census_data[col].iloc[1:]  # Skip the first row which is not numeric

        # Convert the demographic count to float after removing commas
        demographic_count = pd.to_numeric(demographic_count, errors='coerce').fillna(0)

        # Calculate the adjusted count based on the percentage overlap
        adjusted_count = demographic_count * percentage_overlap

        # Sum the adjusted counts for each location_id
        adjusted_demographic_df[location_id] += adjusted_count

  adjusted_demographic_df = pd.DataFrame(index=census_data.index[1:], columns=taxi_zone_ids).fillna(0)


In [1103]:
# Convert the dictionary to a DataFrame and ensure the columns are ordered by taxi_zone_ids
adjusted_demographic_df = pd.DataFrame(adjusted_demographic_df)

adjusted_demographic_df

Unnamed: 0,103,232,148,4,231,79,125,144,234,114,...,233,194,13,120,202,209,12,158,128,261
1,2506.157941,0.000000,0.0,0.000000,560.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,12.0,0.0,0.000000,3.0,0.000000,16.0,0.000000
2,33068.572709,6311.873817,7595.0,4995.701344,5694.594645,776.0,152.0,326.0,841.0,425.0,...,706.0,6.0,1355.208005,2.0,644.0,247.281976,0.0,475.783515,0.0,3982.077851
3,29715.478732,5760.809435,7896.0,5264.365503,5190.734373,566.0,113.0,265.0,571.0,382.0,...,578.0,3.0,1108.595432,0.0,514.0,247.281976,1.0,379.639708,0.0,2986.819264
4,29650.602237,5683.322414,7990.0,5572.802256,4839.355251,591.0,102.0,314.0,455.0,325.0,...,556.0,6.0,860.520294,0.0,414.0,207.641354,0.0,319.781299,3.0,3036.629999
5,31949.169263,6938.013472,8224.0,6830.531397,4456.548831,788.0,64.0,388.0,618.0,959.0,...,416.0,16.0,465.447538,1.0,368.0,630.474658,0.0,317.249059,1.0,2636.801581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,709.214259,150.255451,198.0,191.330123,144.512033,74.0,0.0,7.0,20.0,6.0,...,10.0,2.0,4.867110,1.0,109.0,0.000000,0.0,4.796144,0.0,47.118927
116,54712.288128,10291.770465,13765.0,8300.416385,3877.648435,2888.0,338.0,1011.0,2384.0,1686.0,...,0.0,13.0,0.000000,0.0,885.0,0.000000,0.0,703.624863,0.0,3088.000000
117,235.590927,173.209242,54.0,43.426444,30.059337,41.0,1.0,11.0,6.0,4.0,...,13.0,3.0,6.803072,0.0,97.0,7.550595,0.0,3.000000,0.0,25.118927
118,5919.910875,1438.957421,1444.0,948.906244,895.601550,372.0,26.0,135.0,171.0,82.0,...,192.0,3.0,104.960904,0.0,127.0,54.741812,0.0,111.165300,0.0,614.831469


In [1104]:
# Extract the labels and initial columns
labels_and_initial_columns = census_data[['label_(grouping)', 'manhattan_count']].iloc[1:].reset_index(drop=True)

# Reattach the labels and initial columns to the adjusted data
final_demographic_data = pd.concat(
    [labels_and_initial_columns, adjusted_demographic_df.reset_index(drop=True)], 
    axis=1
)

# Rename columns to match taxi zone IDs
final_demographic_data.columns = ['label_(grouping)', 'manhattan_count'] + list(map(str, taxi_zone_ids))


In [1105]:
# Round the demographic data to the nearest integer
final_demographic_data.iloc[:, 2:] = final_demographic_data.iloc[:, 2:].round()


final_demographic_data.head(20)

Unnamed: 0,label_(grouping),manhattan_count,103,232,148,4,231,79,125,144,...,233,194,13,120,202,209,12,158,128,261
0,Total population,1694251.0,2506.0,0.0,0.0,0.0,560.0,0.0,0.0,0.0,...,0.0,0.0,0.0,12.0,0.0,0.0,3.0,0.0,16.0,0.0
1,Under 5 years,68849.0,33069.0,6312.0,7595.0,4996.0,5695.0,776.0,152.0,326.0,...,706.0,6.0,1355.0,2.0,644.0,247.0,0.0,476.0,0.0,3982.0
2,5 to 9 years,63039.0,29715.0,5761.0,7896.0,5264.0,5191.0,566.0,113.0,265.0,...,578.0,3.0,1109.0,0.0,514.0,247.0,1.0,380.0,0.0,2987.0
3,10 to 14 years,63419.0,29651.0,5683.0,7990.0,5573.0,4839.0,591.0,102.0,314.0,...,556.0,6.0,861.0,0.0,414.0,208.0,0.0,320.0,3.0,3037.0
4,15 to 19 years,74917.0,31949.0,6938.0,8224.0,6831.0,4457.0,788.0,64.0,388.0,...,416.0,16.0,465.0,1.0,368.0,630.0,0.0,317.0,1.0,2637.0
5,20 to 24 years,137699.0,45511.0,11389.0,10843.0,7970.0,5818.0,2482.0,152.0,1360.0,...,2318.0,16.0,554.0,0.0,967.0,859.0,0.0,638.0,0.0,6838.0
6,25 to 29 years,202817.0,59286.0,9596.0,13162.0,11808.0,7905.0,3412.0,391.0,990.0,...,951.0,87.0,1262.0,3.0,1409.0,768.0,0.0,1149.0,4.0,6867.0
7,30 to 34 years,180136.0,65777.0,11853.0,16035.0,10042.0,8780.0,4066.0,399.0,1576.0,...,582.0,128.0,631.0,0.0,1393.0,790.0,0.0,1486.0,1.0,8362.0
8,35 to 39 years,131176.0,58454.0,13416.0,12404.0,8752.0,7161.0,2933.0,326.0,962.0,...,1905.0,125.0,702.0,1.0,1009.0,687.0,0.0,1018.0,1.0,6945.0
9,40 to 44 years,103841.0,47510.0,10468.0,10358.0,7218.0,7104.0,1954.0,233.0,677.0,...,1415.0,147.0,1522.0,0.0,774.0,498.0,0.0,768.0,0.0,6428.0


In [1106]:
# Define the file path relative to the data directory
final_demographic_data_path = os.path.join(cwd, census_data_dir, "final_demographic_data.csv")

# Save the DataFrame to CSV
final_demographic_data.to_csv(final_demographic_data_path, index=False, encoding='utf-8-sig')

print("DataFrame saved to:", final_demographic_data)

DataFrame saved to:                                       label_(grouping) manhattan_count  \
0                                     Total population       1,694,251   
1                                        Under 5 years          68,849   
2                                         5 to 9 years          63,039   
3                                       10 to 14 years          63,419   
4                                       15 to 19 years          74,917   
..                                                 ...             ...   
114              American Indian and Alaska Native ...           1,895   
115                                        Asian alone         219,624   
116              Native Hawaiian and Other Pacific ...             882   
117                              Some Other Race alone          13,335   
118                                  Two or More Races          62,989   

         103      232      148       4     231      79    125     144  ...  \
0     2506.0 