In [41]:
import json
import os
import geopandas as gpd
from shapely.geometry import shape
from shapely.geometry import Polygon
from shapely.ops import unary_union
import pandas as pd
from shapely import wkt

Census tract zones: https://www.nyc.gov/site/planning/data-maps/open-data/census-download-metadata.page

Taxi Zones: https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc

In [42]:
cwd = os.getcwd()
print("Current Working Directory:", cwd)

Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning


In [43]:
data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_other")
print(f"Data directory: {data_dir}")

# Check if the directory exists
if not os.path.exists(data_dir):
    print(f"Directory {data_dir} does not exist")
else:
    # List all files in the directory to check for existence and naming
    all_files_in_dir = os.listdir(data_dir)
    print(f"Files in directory {data_dir}: {all_files_in_dir}")

all_files = []

Data directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other
Files in directory c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other: ['census_tracts_coords.csv', 'combined_df.csv', 'combined_df_all_zones.csv', 'fhv_2021_01.csv', 'green_2021_01.csv', 'taxi_zones_alternate.csv', 'taxi_zones_coords.csv', 'taxi_zones_manhattan.geojson', 'taxi_zones_raw.geojson', 'taxi_zone_lookup.csv', 'tract_zones_manhattan.geojson', 'tract_zones_raw.geojson', 'yellow_2021_01.csv']


In [44]:
#input file
taxi_zones_raw_file = os.path.join(cwd, data_dir, "taxi_zones_raw.geojson")

#output file
taxi_zones_manhattan = os.path.join(cwd, data_dir, "taxi_zones_manhattan.geojson")

In [45]:
# Function to load and filter GeoJSON
def filter_taxi_geojson(input_file, output_file):
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    # Filter features where "borough" == "Manhattan"
    filtered_features = [feature for feature in data['features'] if feature['properties'].get('borough') == 'Manhattan']
    
    # Create a new GeoJSON structure
    filtered_geojson = {
        "type": "FeatureCollection",
        "features": filtered_features
    }
    
    # Write the filtered GeoJSON to a new file
    with open(output_file, 'w') as f:
        json.dump(filtered_geojson, f, indent=2)

In [46]:
# Filter the GeoJSON file
filter_taxi_geojson(taxi_zones_raw_file, taxi_zones_manhattan)

print(f"Filtered GeoJSON has been saved to {taxi_zones_manhattan}")

Filtered GeoJSON has been saved to c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other\taxi_zones_manhattan.geojson


In [47]:
#input file
tract_zones_raw_file = os.path.join(cwd, data_dir, "tract_zones_raw.geojson")

#output file
tract_zones_manhattan = os.path.join(cwd, data_dir, "tract_zones_manhattan.geojson")

In [48]:
def filter_tract_geojson(input_file, output_file):
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    # Filter features where "BoroName" == "Manhattan"
    filtered_features = [feature for feature in data['features'] if feature['properties'].get('BoroName') == 'Manhattan']
    
    # Create a new GeoJSON structure
    filtered_geojson = {
        "type": "FeatureCollection",
        "crs": data.get('crs', {}),
        "features": filtered_features
    }
    
    # Write the filtered GeoJSON to a new file
    with open(output_file, 'w') as f:
        json.dump(filtered_geojson, f, indent=2)

In [49]:
# Filter the GeoJSON file
filter_tract_geojson(tract_zones_raw_file, tract_zones_manhattan)

print(f"Filtered GeoJSON has been saved to {tract_zones_manhattan}")

Filtered GeoJSON has been saved to c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other\tract_zones_manhattan.geojson


In [50]:
census_data_dir = os.path.join(os.getcwd(), "..", "Datasets", "census")

# Define the directory where the data is located relative to the current working directory
print("Census CSV Directory:", census_data_dir)

# Define the file path relative to the data directory
census_data_path = os.path.join(cwd, census_data_dir, "census_manhat_data.csv")

census_data = pd.read_csv(census_data_path, keep_default_na=True, delimiter=",", skipinitialspace=True, encoding='utf-8-sig')

Census CSV Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\census


In [51]:
strings_to_remove = ["New", "York", "County", "!!", "borough", ";", ","]

# Function to clean column names
def clean_column_names(column):
    for string in strings_to_remove:
        column = column.replace(string, "").strip()
    return column

census_data.columns = census_data.columns.map(clean_column_names)

# Define a function to rename columns
def rename_columns(columns):
    new_columns = []
    for col in columns:
        # Split by spaces, join with single underscore, convert to lowercase
        new_col = '_'.join(col.split()).lower()
        new_columns.append(new_col)
    return new_columns

# Rename columns using the function
census_data.columns = rename_columns(census_data.columns)

# Drop columns containing '_percent'
census_data = census_data.loc[:, ~census_data.columns.str.contains('_percent')]


In [52]:
census_data

Unnamed: 0,label_(grouping),manhattan_count,census_tract_1_count,census_tract_2.01_count,census_tract_2.02_count,census_tract_5_count,census_tract_6_count,census_tract_7_count,census_tract_8_count,census_tract_9_count,...,census_tract_295_count,census_tract_297_count,census_tract_299_count,census_tract_303_count,census_tract_307_count,census_tract_309_count,census_tract_311_count,census_tract_317.03_count,census_tract_317.04_count,census_tract_319_count
0,SEX AND AGE,,,,,,,,,,...,,,,,,,,,,
1,Total population,1694251,0,2012,7266,5.0,11616,10542,10871,2016,...,7039,16.0,3598,3691,3427,8594,12.0,5847,10422,3.0
2,Under 5 years,68849,0,65,277,1.0,407,420,382,144,...,338,0.0,156,190,126,477,2.0,678,718,0.0
3,5 to 9 years,63039,0,95,347,0.0,477,308,401,69,...,302,0.0,197,140,146,482,0.0,561,581,1.0
4,10 to 14 years,63419,0,110,344,0.0,536,218,401,61,...,282,3.0,194,146,153,572,0.0,355,531,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,Rental vacancy rate (percent) [5],6.1,0.0,4.2,2.5,100.0,8.1,13.8,3.0,20.2,...,2.9,100.0,1.1,3.9,4.1,1.6,0.0,10.7,7.1,0.0
170,HOUSING TENURE,,,,,,,,,,...,,,,,,,,,,
171,Occupied housing units,817782,0,804,3386,0.0,4833,5929,4619,904,...,3283,0.0,1507,1740,1767,3483,0.0,2407,5011,0.0
172,Owner-occupied housing units,191489,0,11,949,0.0,275,850,154,222,...,324,0.0,22,128,807,194,0.0,233,1547,0.0


In [53]:
# Identify the "Total population" row by its label, if necessary
total_population_row = census_data[census_data['label_(grouping)'].str.contains('Total population', na=False)].iloc[0]

# Step 1: Convert the relevant "Total population" values to numeric, skipping the first column
total_population_numeric = pd.to_numeric(total_population_row[1:], errors='coerce')

# Step 2: Identify columns with a total population less than 500
low_population_tracts = total_population_numeric[total_population_numeric < 500].index

# Step 3: Drop the identified columns from the census_data
census_data = census_data.drop(columns=low_population_tracts)

# Display the cleaned census data
print(census_data.head())

# Output the cleaned census data and the columns that were removed
print("Columns removed:", list(low_population_tracts))

         label_(grouping) manhattan_count census_tract_2.01_count  \
0             SEX AND AGE             NaN                     NaN   
1        Total population       1,694,251                   2,012   
2           Under 5 years          68,849                      65   
3            5 to 9 years          63,039                      95   
4          10 to 14 years          63,419                     110   

  census_tract_2.02_count census_tract_6_count census_tract_7_count  \
0                     NaN                  NaN                  NaN   
1                   7,266               11,616               10,542   
2                     277                  407                  420   
3                     347                  477                  308   
4                     344                  536                  218   

  census_tract_8_count census_tract_9_count census_tract_10.01_count  \
0                  NaN                  NaN                      NaN   
1             

In [54]:
census_data = census_data.head(120)

# Filter columns that end with '_count' along with 'label_(grouping)'
count_columns = ['label_(grouping)'] + [col for col in census_data.columns if col.endswith('_count')]

# Display the filtered columns
census_data = census_data[count_columns]

In [55]:
census_data

Unnamed: 0,label_(grouping),manhattan_count,census_tract_2.01_count,census_tract_2.02_count,census_tract_6_count,census_tract_7_count,census_tract_8_count,census_tract_9_count,census_tract_10.01_count,census_tract_10.02_count,...,census_tract_287_count,census_tract_291_count,census_tract_293_count,census_tract_295_count,census_tract_299_count,census_tract_303_count,census_tract_307_count,census_tract_309_count,census_tract_317.03_count,census_tract_317.04_count
0,SEX AND AGE,,,,,,,,,,...,,,,,,,,,,
1,Total population,1694251,2012,7266,11616,10542,10871,2016,1767,6300,...,3772,10509,8035,7039,3598,3691,3427,8594,5847,10422
2,Under 5 years,68849,65,277,407,420,382,144,89,336,...,161,494,371,338,156,190,126,477,678,718
3,5 to 9 years,63039,95,347,477,308,401,69,68,365,...,142,471,415,302,197,140,146,482,561,581
4,10 to 14 years,63419,110,344,536,218,401,61,101,439,...,165,620,444,282,194,146,153,572,355,531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,American Indian and Alaska Native ...,1895,0,12,16,7,4,0,0,3,...,8,14,4,5,3,1,2,18,0,5
116,Asian alone,219624,702,1832,6027,1894,8708,357,272,1050,...,89,117,103,242,25,133,155,130,1029,2435
117,Native Hawaiian and Other Pacific ...,882,0,0,6,10,5,4,0,0,...,0,2,0,1,0,4,0,1,2,5
118,Some Other Race alone,13335,11,55,42,88,53,28,29,44,...,28,87,76,46,27,35,23,56,31,77


In [56]:
# Define the file path relative to the data directory
census_data_path_renamed = os.path.join(cwd, census_data_dir, "census_data.csv")

# Save the DataFrame to CSV
census_data.to_csv(census_data_path_renamed, index=False, encoding='utf-8-sig')

print("DataFrame saved to:", census_data_path_renamed)

DataFrame saved to: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\census\census_data.csv


In [82]:
census_data = pd.read_csv(census_data_path_renamed, keep_default_na=True, delimiter=",", skipinitialspace=True, encoding='utf-8-sig')

In [83]:
# Load census tract and taxi zones GeoJSON
census_tracts_geojson = gpd.read_file(os.path.join(cwd, data_dir, "tract_zones_manhattan.geojson"))
taxi_zones_geojson = gpd.read_file(os.path.join(cwd, data_dir, "taxi_zones_manhattan.geojson"))

In [84]:
# Define the file path
file_path = os.path.join(data_dir, "census_tracts_coords.csv")

# Save the DataFrame to CSV
census_tracts_geojson.to_csv(file_path, index=False)

print("DataFrame saved to:", file_path)

DataFrame saved to: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other\census_tracts_coords.csv


In [85]:
# Define the file path
file_path = os.path.join(data_dir, "taxi_zones_coords.csv")

# Save the DataFrame to CSV
taxi_zones_geojson.to_csv(file_path, index=False)

print("DataFrame saved to:", file_path)

DataFrame saved to: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other\taxi_zones_coords.csv


In [86]:
census_tracts = pd.read_csv(os.path.join(cwd, data_dir, "census_tracts_coords.csv"))
taxi_zones = pd.read_csv(os.path.join(cwd, data_dir, "taxi_zones_coords.csv"))

In [87]:
census_data

Unnamed: 0,label_(grouping),manhattan_count,census_tract_2.01_count,census_tract_2.02_count,census_tract_6_count,census_tract_7_count,census_tract_8_count,census_tract_9_count,census_tract_10.01_count,census_tract_10.02_count,...,census_tract_287_count,census_tract_291_count,census_tract_293_count,census_tract_295_count,census_tract_299_count,census_tract_303_count,census_tract_307_count,census_tract_309_count,census_tract_317.03_count,census_tract_317.04_count
0,SEX AND AGE,,,,,,,,,,...,,,,,,,,,,
1,Total population,1694251,2012,7266,11616,10542,10871,2016,1767,6300,...,3772,10509,8035,7039,3598,3691,3427,8594,5847,10422
2,Under 5 years,68849,65,277,407,420,382,144,89,336,...,161,494,371,338,156,190,126,477,678,718
3,5 to 9 years,63039,95,347,477,308,401,69,68,365,...,142,471,415,302,197,140,146,482,561,581
4,10 to 14 years,63419,110,344,536,218,401,61,101,439,...,165,620,444,282,194,146,153,572,355,531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,American Indian and Alaska Native ...,1895,0,12,16,7,4,0,0,3,...,8,14,4,5,3,1,2,18,0,5
116,Asian alone,219624,702,1832,6027,1894,8708,357,272,1050,...,89,117,103,242,25,133,155,130,1029,2435
117,Native Hawaiian and Other Pacific ...,882,0,0,6,10,5,4,0,0,...,0,2,0,1,0,4,0,1,2,5
118,Some Other Race alone,13335,11,55,42,88,53,28,29,44,...,28,87,76,46,27,35,23,56,31,77


In [88]:
census_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Columns: 301 entries, label_(grouping) to census_tract_317.04_count
dtypes: float64(3), object(298)
memory usage: 282.3+ KB


In [89]:
# Function to calculate intersection and percentage overlap
def calculate_overlap(census_geom, taxi_geom):
    intersection = census_geom.intersection(taxi_geom)
    intersection_area = intersection.area
    census_area = census_geom.area
    percentage_overlap = (intersection_area / census_area)
    if percentage_overlap >= .99:
        percentage_overlap = 1
    elif percentage_overlap <= .01:
        percentage_overlap = 0
    return intersection_area, census_area, percentage_overlap

In [90]:
# Ensure all values in the geometry columns are strings
census_tracts['geometry_tract'] = census_tracts['geometry'].astype(str)
taxi_zones['geometry_taxi'] = taxi_zones['geometry'].astype(str)

# Convert geometries to shapely objects
census_tracts['geometry_tract'] = census_tracts['geometry'].apply(wkt.loads)
taxi_zones['geometry_taxi'] = taxi_zones['geometry'].apply(wkt.loads)

# Initialize an empty list to store the results
overlap_results = []

In [91]:
# Calculate intersections and percentage overlap
for idx_tract, tract_row in census_tracts.iterrows():
    for idx_taxi, taxi_row in taxi_zones.iterrows():
        tract_geom = tract_row['geometry_tract']
        taxi_geom = taxi_row['geometry_taxi']
        
        if tract_geom.intersects(taxi_geom):
            intersection_area, tract_area, percentage_overlap = calculate_overlap(tract_geom, taxi_geom)
            
            # Append only if percentage_overlap > 0
            if percentage_overlap > 0:
                overlap_results.append({
                    'CTLabel': tract_row['CTLabel'],
                    'location_id': taxi_row['location_id'],
                    'intersection_area': intersection_area,
                    'tract_area': tract_area,
                    'percentage_overlap': percentage_overlap
                })

# Convert the results into a DataFrame
overlap_df = pd.DataFrame(overlap_results)

In [92]:
# Define the file path relative to the data directory
overlap_df_path = os.path.join(cwd, census_data_dir, "overlap_df.csv")

# Save the DataFrame to CSV
overlap_df.to_csv(overlap_df_path, index=False, encoding='utf-8-sig')

print("DataFrame saved to:", overlap_df)

DataFrame saved to:      CTLabel  location_id  intersection_area  tract_area  percentage_overlap
0       1.00          103       6.284910e-06    0.000018            0.344526
1       1.00          103       1.184074e-05    0.000018            0.649085
2       2.01          232       9.626498e-06    0.000010            1.000000
3       6.00          232       2.554676e-05    0.000026            1.000000
4      14.01          232       9.943394e-06    0.000010            1.000000
..       ...          ...                ...         ...                 ...
313   317.04          261       5.679329e-07    0.000024            0.023785
314   194.00           74       1.866329e-05    0.000019            1.000000
315   242.00           74       3.679666e-05    0.000037            1.000000
316    13.00          261       3.368247e-05    0.000034            1.000000
317    21.00          231       2.532817e-05    0.000025            1.000000

[318 rows x 5 columns]


In [93]:
# # Filter rows where percentage_overlap is >= 99 or <= 1
# filtered_df = overlap_df[(overlap_df['percentage_overlap'] >= 99) | (overlap_df['percentage_overlap'] <= 1)]

# # Get the number of rows in the filtered DataFrame
# num_rows = filtered_df.shape[0]
# num_rows

# 628 of 655 areas with an overlap >99% or less than 1%. rounding above code to 0% or 100%

In [94]:
# # Standardize CTLabel values to match the census tract column naming conventions
overlap_df['CTLabel'] = overlap_df['CTLabel'].apply(lambda x: str(int(float(x))) if float(x).is_integer() else str(x))
overlap_df.tail(20)

Unnamed: 0,CTLabel,location_id,intersection_area,tract_area,percentage_overlap
298,111.0,68,1.865216e-05,1.9e-05,1.0
299,238.04,202,2.586142e-05,2.6e-05,1.0
300,192.0,74,2.018926e-05,2e-05,1.0
301,245.0,244,2.265856e-05,2.3e-05,1.0
302,251.0,244,1.702275e-05,1.7e-05,1.0
303,160.02,75,5.477295e-06,5e-06,1.0
304,168.0,75,2.129262e-05,2.1e-05,1.0
305,79.0,158,2.83704e-05,3e-05,0.959229
306,69.0,158,2.325034e-05,2.3e-05,1.0
307,75.0,158,1.755923e-05,1.8e-05,1.0


In [95]:
overlap_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318 entries, 0 to 317
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CTLabel             318 non-null    object 
 1   location_id         318 non-null    int64  
 2   intersection_area   318 non-null    float64
 3   tract_area          318 non-null    float64
 4   percentage_overlap  318 non-null    float64
dtypes: float64(3), int64(1), object(1)
memory usage: 12.6+ KB


In [96]:
taxi_zone_ids = overlap_df['location_id'].unique()
print(len(taxi_zone_ids))
taxi_zone_ids

67


array([103, 232, 148,   4, 231,  79, 125, 144, 234, 114, 113, 249, 107,
       137, 170, 164,  90,  68, 162, 161, 186, 246, 229, 100, 163, 237,
       141, 230,  48, 140, 262, 236, 142, 143, 263,  75, 239, 238,  74,
        41, 151,  24, 166,  42, 152, 116, 244, 243, 127, 153,  87, 224,
        88, 211,  45,  50,  43, 233, 194,  13, 120, 202, 209,  12, 158,
       128, 261], dtype=int64)

In [108]:
# # Extracting all unique "CTLabel" values
# ct_labels = overlap_df['CTLabel'].unique()

# # Calculating the length of the unique "CTLabel" values list
# length_of_ct_labels = len(ct_labels.tolist())
# length_of_ct_labels

310

In [98]:
census_data.head()

Unnamed: 0,label_(grouping),manhattan_count,census_tract_2.01_count,census_tract_2.02_count,census_tract_6_count,census_tract_7_count,census_tract_8_count,census_tract_9_count,census_tract_10.01_count,census_tract_10.02_count,...,census_tract_287_count,census_tract_291_count,census_tract_293_count,census_tract_295_count,census_tract_299_count,census_tract_303_count,census_tract_307_count,census_tract_309_count,census_tract_317.03_count,census_tract_317.04_count
0,SEX AND AGE,,,,,,,,,,...,,,,,,,,,,
1,Total population,1694251.0,2012.0,7266.0,11616.0,10542.0,10871.0,2016.0,1767.0,6300.0,...,3772.0,10509.0,8035.0,7039.0,3598.0,3691.0,3427.0,8594.0,5847.0,10422.0
2,Under 5 years,68849.0,65.0,277.0,407.0,420.0,382.0,144.0,89.0,336.0,...,161.0,494.0,371.0,338.0,156.0,190.0,126.0,477.0,678.0,718.0
3,5 to 9 years,63039.0,95.0,347.0,477.0,308.0,401.0,69.0,68.0,365.0,...,142.0,471.0,415.0,302.0,197.0,140.0,146.0,482.0,561.0,581.0
4,10 to 14 years,63419.0,110.0,344.0,536.0,218.0,401.0,61.0,101.0,439.0,...,165.0,620.0,444.0,282.0,194.0,146.0,153.0,572.0,355.0,531.0


In [109]:
# number_of_columns = len(census_data.columns.tolist())
# number_of_columns

301

In [113]:
def adjust_columns_by_overlap(census_data, overlap_df):
    # Ensure the correct data types
    census_data.columns = census_data.columns.str.strip()
    overlap_df['CTLabel'] = overlap_df['CTLabel'].astype(str)
    
    # Clean data: remove commas and convert to float
    census_data = census_data.applymap(lambda x: float(str(x).replace(',', '')) if isinstance(x, str) and x.replace(',', '').replace('.', '').isdigit() else x)
    
    # Adjust columns in census_data according to percentage_overlap
    for index, row in overlap_df.iterrows():
        ct_label = row['CTLabel']
        location_id = row['location_id']
        percentage_overlap = row['percentage_overlap']
        
        ct_col = f'census_tract_{ct_label}_count'
        if ct_col in census_data.columns:
            adjusted_col = f'location_{location_id}_adjusted_count'
            if adjusted_col not in census_data.columns:
                census_data[adjusted_col] = 0
            census_data[adjusted_col] += census_data[ct_col].fillna(0).astype(float) * percentage_overlap
    
    return census_data

def combine_ctlabels_by_location_id(census_data, overlap_df):
    # Get a unique list of location_ids from overlap_df
    location_ids = overlap_df['location_id'].unique()
    
    # Combine CTLabels into their respective location_ids
    combined_data = pd.DataFrame()
    combined_data['label_(grouping)'] = census_data['label_(grouping)']
    
    for location_id in location_ids:
        adjusted_col = f'location_{location_id}_adjusted_count'
        if adjusted_col in census_data.columns:
            combined_data[f'location_{location_id}_count'] = census_data[adjusted_col].round().astype(int)
    
    return combined_data

adjusted_census_data = adjust_columns_by_overlap(census_data.copy(), overlap_df)
combined_census_data = combine_ctlabels_by_location_id(adjusted_census_data, overlap_df)
combined_census_data['manhattan'] = combined_census_data.iloc[:, 1:].sum(axis=1).round().astype(int)
combined_census_data.columns = combined_census_data.columns.str.replace('_count', '', regex=False)



  census_data = census_data.applymap(lambda x: float(str(x).replace(',', '')) if isinstance(x, str) and x.replace(',', '').replace('.', '').isdigit() else x)


In [114]:
combined_census_data

Unnamed: 0,label_(grouping),location_232,location_148,location_4,location_231,location_79,location_125,location_144,location_234,location_114,...,location_45,location_50,location_233,location_194,location_13,location_202,location_209,location_158,location_261,manhattan
0,SEX AND AGE,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Total population,46742,23808,26726,24124,44538,3092,12774,18107,13241,...,22658,22965,22535,1302,15805,11722,7914,11060,5650,1688750
2,Under 5 years,1825,693,925,1815,776,152,326,841,425,...,758,890,706,6,1355,644,247,476,204,68632
3,5 to 9 years,2039,641,1041,1526,566,113,265,571,382,...,823,687,578,3,1109,514,247,380,180,62859
4,10 to 14 years,2252,630,1117,1219,591,102,314,455,325,...,852,623,556,6,861,414,208,320,159,63280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,American Indian and Alaska Native ...,50,18,26,21,74,0,7,20,6,...,15,24,10,2,5,109,0,5,0,1888
116,Asian alone,13759,10729,4063,3329,6645,338,4840,2384,1686,...,15180,5076,5947,13,3366,3897,1984,704,1103,218671
117,Native Hawaiian and Other Pacific ...,7,14,16,13,41,1,11,6,4,...,16,7,13,3,7,97,8,3,0,879
118,Some Other Race alone,333,157,189,228,372,26,135,171,82,...,103,174,192,3,105,127,55,111,51,13286


In [116]:
# Define the file path relative to the data directory
combined_census_data_path = os.path.join(cwd, census_data_dir, "combined_census_data.csv")

# Save the DataFrame to CSV
combined_census_data.to_csv(combined_census_data_path, index=False, encoding='utf-8-sig')

print("DataFrame saved to:", combined_census_data)

DataFrame saved to:                                       label_(grouping)  location_232  \
0                                          SEX AND AGE             0   
1                                     Total population         46742   
2                                        Under 5 years          1825   
3                                         5 to 9 years          2039   
4                                       10 to 14 years          2252   
..                                                 ...           ...   
115              American Indian and Alaska Native ...            50   
116                                        Asian alone         13759   
117              Native Hawaiian and Other Pacific ...             7   
118                              Some Other Race alone           333   
119                                  Two or More Races          1248   

     location_148  location_4  location_231  location_79  location_125  \
0               0           0            

In [76]:
# # Initialize a DataFrame to store the adjusted demographic data for each taxi zone
# taxi_zone_ids = overlap_df['location_id'].unique()
# adjusted_demographic_df = pd.DataFrame(index=census_data.index[1:], columns=taxi_zone_ids).fillna(0)

# # Identify the columns in the census_data DataFrame that correspond to demographic data
# demographic_columns = census_data.columns[census_data.columns.str.startswith('census_tract_')]

# # Iterate through each row in the overlap_df DataFrame to get the intersection details
# for idx, row in overlap_df.iterrows():
#     ct_label = row['CTLabel']
#     location_id = row['location_id']
#     percentage_overlap = row['percentage_overlap']
    
#     print(f"\nProcessing CTLabel: {ct_label}, Location ID: {location_id}, Percentage Overlap: {percentage_overlap}")

#     # Format CTLabel to match the column names in census_data
#     ct_label_str = f'{float(ct_label):.2f}'.rstrip('0').rstrip('.')  # Convert to string and remove trailing zeros
    
#     print(f"Formatted CTLabel: {ct_label_str}")

#     # Extract the relevant columns for the current census tract
#     ct_columns = [col for col in demographic_columns if col.startswith(f'census_tract_{ct_label_str}')]

#     for col in ct_columns:
#         # Extract the demographic count
#         demographic_count = census_data[col].iloc[1:]  # Skip the first row which is non-numeric
#         print(f"\nDemographic Column: {col}")
#         print(f"Original Demographic Count:\n{demographic_count}")

#         # Convert all values to strings, remove commas, and convert to float
#         demographic_count = demographic_count.astype(str).str.replace(',', '').astype(float)
#         print(f"Numeric Demographic Count:\n{demographic_count}")

#         # Calculate the adjusted count based on the percentage overlap
#         adjusted_count = demographic_count * percentage_overlap
#         print(f"Adjusted Count:\n{adjusted_count}")

#         # Ensure that the adjusted_count is correctly added to the DataFrame
#         adjusted_demographic_df[location_id] = adjusted_demographic_df[location_id].add(adjusted_count, fill_value=0)
#         print(f"Updated Adjusted Demographic Data for Location ID {location_id}:\n{adjusted_demographic_df[location_id]}")

# # Display the final adjusted demographic DataFrame
# print("\nFinal Adjusted Demographic Data:")
# print(adjusted_demographic_df)



Processing CTLabel: 1, Location ID: 103, Percentage Overlap: 0.3445261309749672
Formatted CTLabel: 1

Demographic Column: census_tract_10.01_count
Original Demographic Count:
1      1,767
2         89
3         68
4        101
5         87
       ...  
115        0
116      272
117        0
118       29
119       73
Name: census_tract_10.01_count, Length: 119, dtype: object
Numeric Demographic Count:
1      1767.0
2        89.0
3        68.0
4       101.0
5        87.0
        ...  
115       0.0
116     272.0
117       0.0
118      29.0
119      73.0
Name: census_tract_10.01_count, Length: 119, dtype: float64
Adjusted Count:
1      608.777673
2       30.662826
3       23.427777
4       34.797139
5       29.973773
          ...    
115      0.000000
116     93.711108
117      0.000000
118      9.991258
119     25.150408
Name: census_tract_10.01_count, Length: 119, dtype: float64
Updated Adjusted Demographic Data for Location ID 103:
1      608.777673
2       30.662826
3       23.42777

  adjusted_demographic_df = pd.DataFrame(index=census_data.index[1:], columns=taxi_zone_ids).fillna(0)


Adjusted Count:
1      1267.511636
2        64.081860
3        80.619115
4        78.207432
5        69.594278
          ...     
115       3.445261
116     107.492153
117       1.722631
118      10.680310
119      67.182596
Name: census_tract_190_count, Length: 119, dtype: float64
Updated Adjusted Demographic Data for Location ID 103:
1      239933.165503
2       10262.399863
3        9230.888627
4        9198.158645
5        9578.170967
           ...      
115       202.925891
116     32130.851501
117        72.350488
118      1799.459982
119      8895.320176
Name: 103, Length: 119, dtype: float64

Demographic Column: census_tract_191_count
Original Demographic Count:
1      9,005
2        285
3        302
4        336
5        367
       ...  
115        1
116      887
117        5
118       62
119      360
Name: census_tract_191_count, Length: 119, dtype: object
Numeric Demographic Count:
1      9005.0
2       285.0
3       302.0
4       336.0
5       367.0
        ...  
115      

In [77]:
# Convert the dictionary to a DataFrame and ensure the columns are ordered by taxi_zone_ids
# adjusted_demographic_df = pd.DataFrame(adjusted_demographic_df)

# adjusted_demographic_df

Unnamed: 0,103,232,148,4,231,79,125,144,234,114,...,233,194,13,120,202,209,12,158,128,261
1,780728.124451,180799.390544,173356.0,125484.953375,96421.138726,44538.0,3092.0,12774.0,18107.0,13241.0,...,22535.0,1302.0,15804.788435,0,11722.0,7913.967072,0,11059.894768,0,102059.890481
2,33053.813323,6311.873817,7595.0,4995.701344,5692.594645,776.0,152.0,326.0,841.0,425.0,...,706.0,6.0,1355.208005,0,644.0,247.281976,0,475.783515,0,3982.077851
3,29691.632055,5760.809435,7896.0,5264.365503,5189.734373,566.0,113.0,265.0,571.0,382.0,...,578.0,3.0,1108.595432,0,514.0,247.281976,0,379.639708,0,2986.819264
4,29638.678899,5683.322414,7990.0,5572.802256,4839.355251,591.0,102.0,314.0,455.0,325.0,...,556.0,6.0,860.520294,0,414.0,207.641354,0,319.781299,0,3036.629999
5,31911.412025,8294.442322,8224.0,8446.100839,4455.548831,3350.0,64.0,388.0,618.0,959.0,...,416.0,16.0,465.447538,0,368.0,630.474658,0,317.249059,0,2636.801581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,706.233424,150.255451,198.0,191.330123,143.512033,74.0,0.0,7.0,20.0,6.0,...,10.0,2.0,4.867110,0,109.0,0.000000,0,4.796144,0,47.118927
116,103394.574323,32016.427254,22814.0,10392.758454,10737.177460,6645.0,338.0,4840.0,2384.0,1686.0,...,5947.0,13.0,3366.334976,0,3897.0,1983.918758,0,703.624863,0,14925.917225
117,235.590927,173.209242,54.0,43.426444,30.059337,41.0,1.0,11.0,6.0,4.0,...,13.0,3.0,6.803072,0,97.0,7.550595,0,3.000000,0,25.118927
118,5908.981148,1438.957421,1444.0,948.906244,895.601550,372.0,26.0,135.0,171.0,82.0,...,192.0,3.0,104.960904,0,127.0,54.741812,0,111.165300,0,614.831469


In [78]:
# # Extract the labels and initial columns
# labels_and_initial_columns = census_data[['label_(grouping)', 'manhattan_count']].iloc[1:].reset_index(drop=True)

# # Reattach the labels and initial columns to the adjusted data
# final_demographic_data = pd.concat(
#     [labels_and_initial_columns, adjusted_demographic_df.reset_index(drop=True)], 
#     axis=1
# )

# # Rename columns to match taxi zone IDs
# final_demographic_data.columns = ['label_(grouping)', 'manhattan_count'] + list(map(str, taxi_zone_ids))
# final_demographic_data.iloc[:, 2:] = final_demographic_data.iloc[:, 2:].round()


In [79]:
# final_demographic_data.head(30)

Unnamed: 0,label_(grouping),manhattan_count,103,232,148,4,231,79,125,144,...,233,194,13,120,202,209,12,158,128,261
0,Total population,1694251.0,780728.0,180799.0,173356.0,125485.0,96421.0,44538.0,3092.0,12774.0,...,22535.0,1302.0,15805.0,0,11722.0,7914.0,0,11060.0,0,102060.0
1,Under 5 years,68849.0,33054.0,6312.0,7595.0,4996.0,5693.0,776.0,152.0,326.0,...,706.0,6.0,1355.0,0,644.0,247.0,0,476.0,0,3982.0
2,5 to 9 years,63039.0,29692.0,5761.0,7896.0,5264.0,5190.0,566.0,113.0,265.0,...,578.0,3.0,1109.0,0,514.0,247.0,0,380.0,0,2987.0
3,10 to 14 years,63419.0,29639.0,5683.0,7990.0,5573.0,4839.0,591.0,102.0,314.0,...,556.0,6.0,861.0,0,414.0,208.0,0,320.0,0,3037.0
4,15 to 19 years,74917.0,31911.0,8294.0,8224.0,8446.0,4456.0,3350.0,64.0,388.0,...,416.0,16.0,465.0,0,368.0,630.0,0,317.0,0,2637.0
5,20 to 24 years,137699.0,54069.0,14361.0,11975.0,12871.0,7206.0,7941.0,152.0,1360.0,...,2318.0,16.0,554.0,0,967.0,859.0,0,638.0,0,6838.0
6,25 to 29 years,202817.0,85569.0,22972.0,17820.0,13352.0,9526.0,8662.0,391.0,2110.0,...,3729.0,87.0,1262.0,0,1409.0,768.0,0,1149.0,0,14290.0
7,30 to 34 years,180136.0,82616.0,19308.0,16035.0,11286.0,9905.0,5136.0,399.0,1576.0,...,2897.0,128.0,1726.0,0,1393.0,790.0,0,1486.0,0,13179.0
8,35 to 39 years,131176.0,60606.0,13416.0,12404.0,8752.0,8251.0,2933.0,326.0,962.0,...,1905.0,125.0,1764.0,0,1009.0,687.0,0,1018.0,0,9225.0
9,40 to 44 years,103841.0,47432.0,10468.0,10358.0,7218.0,7104.0,1954.0,233.0,677.0,...,1415.0,147.0,1522.0,0,774.0,498.0,0,768.0,0,6428.0


In [80]:
# # Define the file path relative to the data directory
# final_demographic_data_path = os.path.join(cwd, census_data_dir, "final_demographic_data.csv")

# # Save the DataFrame to CSV
# final_demographic_data.to_csv(final_demographic_data_path, index=False, encoding='utf-8-sig')

# print("DataFrame saved to:", final_demographic_data)

DataFrame saved to:                                       label_(grouping) manhattan_count  \
0                                     Total population       1,694,251   
1                                        Under 5 years          68,849   
2                                         5 to 9 years          63,039   
3                                       10 to 14 years          63,419   
4                                       15 to 19 years          74,917   
..                                                 ...             ...   
114              American Indian and Alaska Native ...           1,895   
115                                        Asian alone         219,624   
116              Native Hawaiian and Other Pacific ...             882   
117                              Some Other Race alone          13,335   
118                                  Two or More Races          62,989   

          103       232       148         4      231       79     125  \
0    780728.0  180