In [635]:
import json
import os
import geopandas as gpd
from shapely.geometry import shape
from shapely.geometry import Polygon
from shapely.ops import unary_union
import pandas as pd
from shapely import wkt
import numpy as np

Census tract zones: https://www.nyc.gov/site/planning/data-maps/open-data/census-download-metadata.page

Taxi Zones: https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc

In [597]:
cwd = os.getcwd()
print("Current Working Directory:", cwd)

Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning


In [598]:
data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_other")
print(f"Data directory: {data_dir}")

# Check if the directory exists
if not os.path.exists(data_dir):
    print(f"Directory {data_dir} does not exist")
else:
    # List all files in the directory to check for existence and naming
    all_files_in_dir = os.listdir(data_dir)
    print(f"Files in directory {data_dir}: {all_files_in_dir}")

all_files = []

Data directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other
Files in directory c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other: ['census_tracts_coords.csv', 'combined_df.csv', 'combined_df_all_zones.csv', 'fhv_2021_01.csv', 'green_2021_01.csv', 'taxi_zones_alternate.csv', 'taxi_zones_coords.csv', 'taxi_zones_manhattan.geojson', 'taxi_zones_raw.geojson', 'taxi_zone_lookup.csv', 'tract_zones_manhattan.geojson', 'tract_zones_raw.geojson', 'yellow_2021_01.csv']


In [599]:
#input file
taxi_zones_raw_file = os.path.join(cwd, data_dir, "taxi_zones_raw.geojson")

#output file
taxi_zones_manhattan = os.path.join(cwd, data_dir, "taxi_zones_manhattan.geojson")

In [600]:
# Function to load and filter GeoJSON
def filter_taxi_geojson(input_file, output_file):
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    # Filter features where "borough" == "Manhattan"
    filtered_features = [feature for feature in data['features'] if feature['properties'].get('borough') == 'Manhattan']
    
    # Create a new GeoJSON structure
    filtered_geojson = {
        "type": "FeatureCollection",
        "features": filtered_features
    }
    
    # Write the filtered GeoJSON to a new file
    with open(output_file, 'w') as f:
        json.dump(filtered_geojson, f, indent=2)

In [601]:
# Filter the GeoJSON file
filter_taxi_geojson(taxi_zones_raw_file, taxi_zones_manhattan)

print(f"Filtered GeoJSON has been saved to {taxi_zones_manhattan}")

Filtered GeoJSON has been saved to c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other\taxi_zones_manhattan.geojson


In [602]:
#input file
tract_zones_raw_file = os.path.join(cwd, data_dir, "tract_zones_raw.geojson")

#output file
tract_zones_manhattan = os.path.join(cwd, data_dir, "tract_zones_manhattan.geojson")

In [603]:
def filter_tract_geojson(input_file, output_file):
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    # Filter features where "BoroName" == "Manhattan"
    filtered_features = [feature for feature in data['features'] if feature['properties'].get('BoroName') == 'Manhattan']
    
    # Create a new GeoJSON structure
    filtered_geojson = {
        "type": "FeatureCollection",
        "crs": data.get('crs', {}),
        "features": filtered_features
    }
    
    # Write the filtered GeoJSON to a new file
    with open(output_file, 'w') as f:
        json.dump(filtered_geojson, f, indent=2)

In [604]:
# Filter the GeoJSON file
filter_tract_geojson(tract_zones_raw_file, tract_zones_manhattan)

print(f"Filtered GeoJSON has been saved to {tract_zones_manhattan}")

Filtered GeoJSON has been saved to c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other\tract_zones_manhattan.geojson


In [605]:
census_data_dir = os.path.join(os.getcwd(), "..", "Datasets", "census")

# Define the directory where the data is located relative to the current working directory
print("Census CSV Directory:", census_data_dir)

# Define the file path relative to the data directory
census_data_path = os.path.join(cwd, census_data_dir, "census_manhat_data.csv")

census_data = pd.read_csv(census_data_path, keep_default_na=True, delimiter=",", skipinitialspace=True, encoding='utf-8-sig')

Census CSV Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\census


In [606]:
def clean_and_rename_columns(df):
    df.columns = df.columns.map(lambda col: col.replace("New", "").replace("York", "").replace("County", "")
                                                .replace("!!", "").replace("borough", "").replace(";", "")
                                                .replace(",", "").strip())
    
    df.columns = ['_'.join(col.split()).lower() for col in df.columns]
    
    df = df.loc[:, ~df.columns.str.contains('_percent')]
    
    df.columns = df.columns.str.replace('census_tract_', '').str.replace('_count', '')

    return df

def clean_data(df):
    df_cleaned = df.copy()
    for column in df_cleaned.columns:
        if column != 'label_(grouping)':
            df_cleaned[column] = df_cleaned[column].astype(str).str.replace(',', '').replace('nan', None)
            df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='ignore')
    return df_cleaned

census_data_cleaned = clean_and_rename_columns(census_data)

total_population_row = census_data_cleaned[census_data_cleaned['label_(grouping)'].str.contains('Total population', na=False)]

df_cleaned = clean_data(census_data_cleaned)

# Ensure data types are compatible
for column in df_cleaned.columns:
    if column in total_population_row.columns:
        if df_cleaned[column].dtype != total_population_row[column].dtype:
            # Strip commas and convert to numeric
            total_population_row[column] = total_population_row[column].astype(str).str.replace(',', '')
            total_population_row[column] = pd.to_numeric(total_population_row[column], errors='coerce')

df_cleaned.update(total_population_row)

  df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='ignore')
  df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='ignore')
  df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='ignore')
  df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='ignore')
  df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='ignore')
  df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='ignore')
  df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='ignore')
  df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='ignore')
  df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='ignore')
  df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='ignore')
  df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='ignore')
  df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='ignore')
  df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='ignore')
  df_cleaned[column] = pd.to_numeric(d

In [607]:
# strings_to_remove = ["New", "York", "County", "!!", "borough", ";", ","]

# # Function to clean column names
# def clean_column_names(column):
#     for string in strings_to_remove:
#         column = column.replace(string, "").strip()
#     return column

# census_data.columns = census_data.columns.map(clean_column_names)

# # Define a function to rename columns
# def rename_columns(columns):
#     new_columns = []
#     for col in columns:
#         # Split by spaces, join with single underscore, convert to lowercase
#         new_col = '_'.join(col.split()).lower()
#         new_columns.append(new_col)
#     return new_columns

# # Rename columns using the function
# census_data.columns = rename_columns(census_data.columns)

# # Drop columns containing '_percent'
# census_data = census_data.loc[:, ~census_data.columns.str.contains('_percent')]


In [608]:
df_cleaned

Unnamed: 0,label_(grouping),manhattan,1,2.01,2.02,5,6,7,8,9,...,295,297,299,303,307,309,311,317.03,317.04,319
0,SEX AND AGE,,,,,,,,,,...,,,,,,,,,,
1,Total population,1694251.0,0,2012.0,7266.0,5.0,11616.0,10542.0,10871.0,2016.0,...,7039.0,16.0,3598.0,3691.0,3427.0,8594.0,12.0,5847.0,10422.0,3.0
2,Under 5 years,68849.0,0,65.0,277.0,1.0,407.0,420.0,382.0,144.0,...,338.0,0.0,156.0,190.0,126.0,477.0,2.0,678.0,718.0,0.0
3,5 to 9 years,63039.0,0,95.0,347.0,0.0,477.0,308.0,401.0,69.0,...,302.0,0.0,197.0,140.0,146.0,482.0,0.0,561.0,581.0,1.0
4,10 to 14 years,63419.0,0,110.0,344.0,0.0,536.0,218.0,401.0,61.0,...,282.0,3.0,194.0,146.0,153.0,572.0,0.0,355.0,531.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,Rental vacancy rate (percent) [5],6.1,0.0,4.2,2.5,100.0,8.1,13.8,3.0,20.2,...,2.9,100.0,1.1,3.9,4.1,1.6,0.0,10.7,7.1,0.0
170,HOUSING TENURE,,,,,,,,,,...,,,,,,,,,,
171,Occupied housing units,817782.0,0,804.0,3386.0,0.0,4833.0,5929.0,4619.0,904.0,...,3283.0,0.0,1507.0,1740.0,1767.0,3483.0,0.0,2407.0,5011.0,0.0
172,Owner-occupied housing units,191489.0,0,11.0,949.0,0.0,275.0,850.0,154.0,222.0,...,324.0,0.0,22.0,128.0,807.0,194.0,0.0,233.0,1547.0,0.0


In [609]:
# # Identify the "Total population" row by its label, if necessary
# total_population_row = census_data[census_data['label_(grouping)'].str.contains('Total population', na=False)].iloc[0]

# # Step 1: Convert the relevant "Total population" values to numeric, skipping the first column
# total_population_numeric = pd.to_numeric(total_population_row[1:], errors='coerce')

# # Step 2: Identify columns with a total population less than 500
# low_population_tracts = total_population_numeric[total_population_numeric < 500].index

# # Step 3: Drop the identified columns from the census_data
# census_data = census_data.drop(columns=low_population_tracts)

# # Display the cleaned census data
# print(census_data.head())

# # Output the cleaned census data and the columns that were removed
# print("Columns removed:", list(low_population_tracts))

In [610]:
census_data = census_data.head(120)

# Filter columns that end with '_count' along with 'label_(grouping)'
count_columns = ['label_(grouping)'] + [col for col in census_data.columns if col.endswith('_count')]

# Display the filtered columns
census_data = census_data[count_columns]

In [611]:
# # Rename the columns by stripping the prefixes and suffixes
# census_data.columns = census_data.columns.str.replace('census_tract_', '').str.replace('_count', '')

# # Convert the zone columns to appropriate datatype (float)
# zone_columns = census_data.columns[2:]  # Assuming the first two columns are non-zone data
# census_data[zone_columns] = census_data[zone_columns].apply(pd.to_numeric, errors='coerce')

In [612]:
# def clean_data(census_data):
#     df_cleaned = census_data.copy()
#     for column in df_cleaned.columns:
#         if column != 'label_(grouping)':
#             df_cleaned[column] = df_cleaned[column].astype(str).str.replace(',', '').replace('nan', None)
#             df_cleaned[column] = pd.to_numeric(df_cleaned[column], errors='ignore')
#     return df_cleaned

# df_cleaned = clean_data(census_data)

In [613]:
# Define the file path relative to the data directory
df_cleaned_path = os.path.join(cwd, census_data_dir, "census_data.csv")

# Save the DataFrame to CSV
df_cleaned.to_csv(df_cleaned_path, index=False, encoding='utf-8-sig')

print("DataFrame saved to:", df_cleaned_path)

DataFrame saved to: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\census\census_data.csv


In [614]:
census_data = pd.read_csv(df_cleaned_path, keep_default_na=True, delimiter=",", skipinitialspace=True, encoding='utf-8-sig')

In [615]:
# Load census tract and taxi zones GeoJSON
census_tracts_geojson = gpd.read_file(os.path.join(cwd, data_dir, "tract_zones_manhattan.geojson"))
taxi_zones_geojson = gpd.read_file(os.path.join(cwd, data_dir, "taxi_zones_manhattan.geojson"))

In [616]:
# Define the file path
file_path = os.path.join(data_dir, "census_tracts_coords.csv")

# Save the DataFrame to CSV
census_tracts_geojson.to_csv(file_path, index=False)

print("DataFrame saved to:", file_path)

DataFrame saved to: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other\census_tracts_coords.csv


In [617]:
# Define the file path
file_path = os.path.join(data_dir, "taxi_zones_coords.csv")

# Save the DataFrame to CSV
taxi_zones_geojson.to_csv(file_path, index=False)

print("DataFrame saved to:", file_path)

DataFrame saved to: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other\taxi_zones_coords.csv


In [618]:
census_tracts = pd.read_csv(os.path.join(cwd, data_dir, "census_tracts_coords.csv"))
taxi_zones = pd.read_csv(os.path.join(cwd, data_dir, "taxi_zones_coords.csv"))

In [619]:
census_data

Unnamed: 0,label_(grouping),manhattan,1,2.01,2.02,5,6,7,8,9,...,295,297,299,303,307,309,311,317.03,317.04,319
0,SEX AND AGE,,,,,,,,,,...,,,,,,,,,,
1,Total population,1694251.0,0,2012.0,7266.0,5.0,11616.0,10542.0,10871.0,2016.0,...,7039.0,16.0,3598.0,3691.0,3427.0,8594.0,12.0,5847.0,10422.0,3.0
2,Under 5 years,68849.0,0,65.0,277.0,1.0,407.0,420.0,382.0,144.0,...,338.0,0.0,156.0,190.0,126.0,477.0,2.0,678.0,718.0,0.0
3,5 to 9 years,63039.0,0,95.0,347.0,0.0,477.0,308.0,401.0,69.0,...,302.0,0.0,197.0,140.0,146.0,482.0,0.0,561.0,581.0,1.0
4,10 to 14 years,63419.0,0,110.0,344.0,0.0,536.0,218.0,401.0,61.0,...,282.0,3.0,194.0,146.0,153.0,572.0,0.0,355.0,531.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,Rental vacancy rate (percent) [5],6.1,0.0,4.2,2.5,100.0,8.1,13.8,3.0,20.2,...,2.9,100.0,1.1,3.9,4.1,1.6,0.0,10.7,7.1,0.0
170,HOUSING TENURE,,,,,,,,,,...,,,,,,,,,,
171,Occupied housing units,817782.0,0,804.0,3386.0,0.0,4833.0,5929.0,4619.0,904.0,...,3283.0,0.0,1507.0,1740.0,1767.0,3483.0,0.0,2407.0,5011.0,0.0
172,Owner-occupied housing units,191489.0,0,11.0,949.0,0.0,275.0,850.0,154.0,222.0,...,324.0,0.0,22.0,128.0,807.0,194.0,0.0,233.0,1547.0,0.0


In [620]:
census_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174 entries, 0 to 173
Columns: 312 entries, label_(grouping) to 319
dtypes: float64(309), object(3)
memory usage: 424.3+ KB


In [621]:
# Function to calculate intersection and percentage overlap
def calculate_overlap(census_geom, taxi_geom):
    intersection = census_geom.intersection(taxi_geom)
    intersection_area = intersection.area
    census_area = census_geom.area
    percentage_overlap = (intersection_area / census_area)
    if percentage_overlap >= .99:
        percentage_overlap = 1
    elif percentage_overlap <= .01:
        percentage_overlap = 0
    return intersection_area, census_area, percentage_overlap

In [622]:
# Ensure all values in the geometry columns are strings
census_tracts['geometry_tract'] = census_tracts['geometry'].astype(str)
taxi_zones['geometry_taxi'] = taxi_zones['geometry'].astype(str)

# Convert geometries to shapely objects
census_tracts['geometry_tract'] = census_tracts['geometry'].apply(wkt.loads)
taxi_zones['geometry_taxi'] = taxi_zones['geometry'].apply(wkt.loads)

# Initialize an empty list to store the results
overlap_results = []

In [623]:
# Calculate intersections and percentage overlap
for idx_tract, tract_row in census_tracts.iterrows():
    for idx_taxi, taxi_row in taxi_zones.iterrows():
        tract_geom = tract_row['geometry_tract']
        taxi_geom = taxi_row['geometry_taxi']
        
        if tract_geom.intersects(taxi_geom):
            intersection_area, tract_area, percentage_overlap = calculate_overlap(tract_geom, taxi_geom)
            
            # Append only if percentage_overlap > 0
            if percentage_overlap > 0:
                overlap_results.append({
                    'CTLabel': tract_row['CTLabel'],
                    'location_id': taxi_row['location_id'],
                    'intersection_area': intersection_area,
                    'tract_area': tract_area,
                    'percentage_overlap': percentage_overlap
                })

# Convert the results into a DataFrame
overlap_df = pd.DataFrame(overlap_results)

In [624]:
# # Standardize CTLabel values to match the census tract column naming conventions
overlap_df['CTLabel'] = overlap_df['CTLabel'].apply(lambda x: str(int(float(x))) if float(x).is_integer() else str(x))

In [625]:
# Define the file path relative to the data directory
overlap_df_path = os.path.join(cwd, census_data_dir, "overlap_df.csv")

# Save the DataFrame to CSV
overlap_df.to_csv(overlap_df_path, index=False, encoding='utf-8-sig')

print("DataFrame saved to:", overlap_df)

DataFrame saved to:     CTLabel  location_id  intersection_area  tract_area  percentage_overlap
0         1          103       6.284910e-06    0.000018            0.344526
1         1          103       1.184074e-05    0.000018            0.649085
2      2.01          232       9.626498e-06    0.000010            1.000000
3         6          232       2.554676e-05    0.000026            1.000000
4     14.01          232       9.943394e-06    0.000010            1.000000
..      ...          ...                ...         ...                 ...
313  317.04          261       5.679329e-07    0.000024            0.023785
314     194           74       1.866329e-05    0.000019            1.000000
315     242           74       3.679666e-05    0.000037            1.000000
316      13          261       3.368247e-05    0.000034            1.000000
317      21          231       2.532817e-05    0.000025            1.000000

[318 rows x 5 columns]


In [626]:
# # Filter rows where percentage_overlap is >= 99 or <= 1
# filtered_df = overlap_df[(overlap_df['percentage_overlap'] >= 99) | (overlap_df['percentage_overlap'] <= 1)]

# # Get the number of rows in the filtered DataFrame
# num_rows = filtered_df.shape[0]
# num_rows

# 628 of 655 areas with an overlap >99% or less than 1%. rounding above code to 0% or 100%

In [627]:
# # Standardize CTLabel values to match the census tract column naming conventions
overlap_df.tail(20)

Unnamed: 0,CTLabel,location_id,intersection_area,tract_area,percentage_overlap
298,111.0,68,1.865216e-05,1.9e-05,1.0
299,238.04,202,2.586142e-05,2.6e-05,1.0
300,192.0,74,2.018926e-05,2e-05,1.0
301,245.0,244,2.265856e-05,2.3e-05,1.0
302,251.0,244,1.702275e-05,1.7e-05,1.0
303,160.02,75,5.477295e-06,5e-06,1.0
304,168.0,75,2.129262e-05,2.1e-05,1.0
305,79.0,158,2.83704e-05,3e-05,0.959229
306,69.0,158,2.325034e-05,2.3e-05,1.0
307,75.0,158,1.755923e-05,1.8e-05,1.0


In [628]:
overlap_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318 entries, 0 to 317
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CTLabel             318 non-null    object 
 1   location_id         318 non-null    int64  
 2   intersection_area   318 non-null    float64
 3   tract_area          318 non-null    float64
 4   percentage_overlap  318 non-null    float64
dtypes: float64(3), int64(1), object(1)
memory usage: 12.6+ KB


In [629]:
taxi_zone_ids = overlap_df['location_id'].unique()
print(len(taxi_zone_ids))
taxi_zone_ids

67


array([103, 232, 148,   4, 231,  79, 125, 144, 234, 114, 113, 249, 107,
       137, 170, 164,  90,  68, 162, 161, 186, 246, 229, 100, 163, 237,
       141, 230,  48, 140, 262, 236, 142, 143, 263,  75, 239, 238,  74,
        41, 151,  24, 166,  42, 152, 116, 244, 243, 127, 153,  87, 224,
        88, 211,  45,  50,  43, 233, 194,  13, 120, 202, 209,  12, 158,
       128, 261], dtype=int64)

In [630]:
# # Extracting all unique "CTLabel" values
# ct_labels = overlap_df['CTLabel'].unique()

# # Calculating the length of the unique "CTLabel" values list
# length_of_ct_labels = len(ct_labels.tolist())
# length_of_ct_labels

In [631]:
census_data.head()

Unnamed: 0,label_(grouping),manhattan,1,2.01,2.02,5,6,7,8,9,...,295,297,299,303,307,309,311,317.03,317.04,319
0,SEX AND AGE,,,,,,,,,,...,,,,,,,,,,
1,Total population,1694251.0,0.0,2012.0,7266.0,5.0,11616.0,10542.0,10871.0,2016.0,...,7039.0,16.0,3598.0,3691.0,3427.0,8594.0,12.0,5847.0,10422.0,3.0
2,Under 5 years,68849.0,0.0,65.0,277.0,1.0,407.0,420.0,382.0,144.0,...,338.0,0.0,156.0,190.0,126.0,477.0,2.0,678.0,718.0,0.0
3,5 to 9 years,63039.0,0.0,95.0,347.0,0.0,477.0,308.0,401.0,69.0,...,302.0,0.0,197.0,140.0,146.0,482.0,0.0,561.0,581.0,1.0
4,10 to 14 years,63419.0,0.0,110.0,344.0,0.0,536.0,218.0,401.0,61.0,...,282.0,3.0,194.0,146.0,153.0,572.0,0.0,355.0,531.0,0.0


In [632]:
# number_of_columns = len(census_data.columns.tolist())
# number_of_columns

In [636]:
def adjust_columns_by_overlap(census_data, overlap_df):
    # Ensure the correct data types
    census_data.columns = census_data.columns.str.strip()
    overlap_df['CTLabel'] = overlap_df['CTLabel'].astype(str)
    
    # Clean data: replace non-numeric entries with NaN and convert to float
    def clean_value(x):
        try:
            return float(str(x).replace(',', ''))
        except ValueError:
            return np.nan
    
    census_data = census_data.applymap(lambda x: clean_value(x) if isinstance(x, str) else x)
    
    # Adjust columns in census_data according to percentage_overlap
    for index, row in overlap_df.iterrows():
        ct_label = row['CTLabel']
        location_id = row['location_id']
        percentage_overlap = row['percentage_overlap']
        
        ct_col = f'{ct_label}'
        if ct_col in census_data.columns:
            adjusted_col = f'location_{location_id}_adjusted_count'
            if adjusted_col not in census_data.columns:
                census_data[adjusted_col] = 0
            census_data[adjusted_col] += census_data[ct_col].fillna(0).astype(float) * percentage_overlap
    
    return census_data

def combine_ctlabels_by_location_id(census_data, overlap_df):
    # Get a unique list of location_ids from overlap_df
    location_ids = overlap_df['location_id'].unique()
    
    # Combine CTLabels into their respective location_ids
    combined_data = pd.DataFrame()
    combined_data['label_(grouping)'] = census_data['label_(grouping)']
    
    for location_id in location_ids:
        adjusted_col = f'location_{location_id}_adjusted_count'
        if adjusted_col in census_data.columns:
            combined_data[f'location_{location_id}_count'] = census_data[adjusted_col].round().astype(int)
    
    return combined_data

# Re-run the adjusted functions
adjusted_census_data = adjust_columns_by_overlap(census_data.copy(), overlap_df)
combined_census_data = combine_ctlabels_by_location_id(adjusted_census_data, overlap_df)

# Ensure the label column is properly included
combined_census_data['label_(grouping)'] = census_data['label_(grouping)']
combined_census_data = combined_census_data[['label_(grouping)'] + [col for col in combined_census_data.columns if col != 'label_(grouping)']]

combined_census_data['manhattan'] = combined_census_data.iloc[:, 1:].sum(axis=1).round().astype(int)
combined_census_data.columns = combined_census_data.columns.str.replace('_count', '', regex=False)


  census_data = census_data.applymap(lambda x: clean_value(x) if isinstance(x, str) else x)


In [638]:
combined_census_data

Unnamed: 0,label_(grouping),location_103,location_232,location_148,location_4,location_231,location_79,location_125,location_144,location_234,...,location_194,location_13,location_120,location_202,location_209,location_12,location_158,location_128,location_261,manhattan
0,SEX AND AGE,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Total population,4,46742,23808,26726,24124,44538,3092,12774,18107,...,1302,15805,12,11722,7914,3,11060,16,5650,1689929
2,Under 5 years,1,1825,693,925,1815,776,152,326,841,...,6,1355,2,644,247,0,476,0,204,68655
3,5 to 9 years,0,2039,641,1041,1526,566,113,265,571,...,3,1109,0,514,247,1,380,0,180,62885
4,10 to 14 years,0,2252,630,1117,1219,591,102,314,455,...,6,861,0,414,208,0,320,3,159,63296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,Rental vacancy rate (percent) [5],85,36,30,20,42,44,14,28,30,...,24,17,0,16,5,0,28,100,12,2185
170,HOUSING TENURE,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
171,Occupied housing units,0,20779,11010,11990,10066,23191,1702,6447,9700,...,22,7208,0,5114,3873,0,6009,0,3200,815681
172,Owner-occupied housing units,0,4893,925,1441,3810,3219,692,815,3329,...,7,1731,0,855,1700,0,2324,0,396,191094


In [639]:
# Define the file path relative to the data directory
combined_census_data_path = os.path.join(cwd, census_data_dir, "combined_census_data2222.csv")

# Save the DataFrame to CSV
combined_census_data.to_csv(combined_census_data_path, index=False, encoding='utf-8-sig')

print("DataFrame saved to:", combined_census_data)

DataFrame saved to:                           label_(grouping)  location_103  location_232  \
0                              SEX AND AGE             0             0   
1                         Total population             4         46742   
2                            Under 5 years             1          1825   
3                             5 to 9 years             0          2039   
4                           10 to 14 years             0          2252   
..                                     ...           ...           ...   
169      Rental vacancy rate (percent) [5]            85            36   
170                         HOUSING TENURE             0             0   
171                 Occupied housing units             0         20779   
172           Owner-occupied housing units             0          4893   
173          Renter-occupied housing units             0         15886   

     location_148  location_4  location_231  location_79  location_125  \
0               0

In [None]:
# # Initialize a DataFrame to store the adjusted demographic data for each taxi zone
# taxi_zone_ids = overlap_df['location_id'].unique()
# adjusted_demographic_df = pd.DataFrame(index=census_data.index[1:], columns=taxi_zone_ids).fillna(0)

# # Identify the columns in the census_data DataFrame that correspond to demographic data
# demographic_columns = census_data.columns[census_data.columns.str.startswith('census_tract_')]

# # Iterate through each row in the overlap_df DataFrame to get the intersection details
# for idx, row in overlap_df.iterrows():
#     ct_label = row['CTLabel']
#     location_id = row['location_id']
#     percentage_overlap = row['percentage_overlap']
    
#     print(f"\nProcessing CTLabel: {ct_label}, Location ID: {location_id}, Percentage Overlap: {percentage_overlap}")

#     # Format CTLabel to match the column names in census_data
#     ct_label_str = f'{float(ct_label):.2f}'.rstrip('0').rstrip('.')  # Convert to string and remove trailing zeros
    
#     print(f"Formatted CTLabel: {ct_label_str}")

#     # Extract the relevant columns for the current census tract
#     ct_columns = [col for col in demographic_columns if col.startswith(f'census_tract_{ct_label_str}')]

#     for col in ct_columns:
#         # Extract the demographic count
#         demographic_count = census_data[col].iloc[1:]  # Skip the first row which is non-numeric
#         print(f"\nDemographic Column: {col}")
#         print(f"Original Demographic Count:\n{demographic_count}")

#         # Convert all values to strings, remove commas, and convert to float
#         demographic_count = demographic_count.astype(str).str.replace(',', '').astype(float)
#         print(f"Numeric Demographic Count:\n{demographic_count}")

#         # Calculate the adjusted count based on the percentage overlap
#         adjusted_count = demographic_count * percentage_overlap
#         print(f"Adjusted Count:\n{adjusted_count}")

#         # Ensure that the adjusted_count is correctly added to the DataFrame
#         adjusted_demographic_df[location_id] = adjusted_demographic_df[location_id].add(adjusted_count, fill_value=0)
#         print(f"Updated Adjusted Demographic Data for Location ID {location_id}:\n{adjusted_demographic_df[location_id]}")

# # Display the final adjusted demographic DataFrame
# print("\nFinal Adjusted Demographic Data:")
# print(adjusted_demographic_df)


In [None]:
# Convert the dictionary to a DataFrame and ensure the columns are ordered by taxi_zone_ids
# adjusted_demographic_df = pd.DataFrame(adjusted_demographic_df)

# adjusted_demographic_df

In [None]:
# # Extract the labels and initial columns
# labels_and_initial_columns = census_data[['label_(grouping)', 'manhattan_count']].iloc[1:].reset_index(drop=True)

# # Reattach the labels and initial columns to the adjusted data
# final_demographic_data = pd.concat(
#     [labels_and_initial_columns, adjusted_demographic_df.reset_index(drop=True)], 
#     axis=1
# )

# # Rename columns to match taxi zone IDs
# final_demographic_data.columns = ['label_(grouping)', 'manhattan_count'] + list(map(str, taxi_zone_ids))
# final_demographic_data.iloc[:, 2:] = final_demographic_data.iloc[:, 2:].round()


In [None]:
# final_demographic_data.head(30)

In [None]:
# # Define the file path relative to the data directory
# final_demographic_data_path = os.path.join(cwd, census_data_dir, "final_demographic_data.csv")

# # Save the DataFrame to CSV
# final_demographic_data.to_csv(final_demographic_data_path, index=False, encoding='utf-8-sig')

# print("DataFrame saved to:", final_demographic_data)

In [640]:
combined_census_data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,164,165,166,167,168,169,170,171,172,173
label_(grouping),SEX AND AGE,Total population,Under 5 years,5 to 9 years,10 to 14 years,15 to 19 years,20 to 24 years,25 to 29 years,30 to 34 years,35 to 39 years,...,"Sold, not occupied","For seasonal, recreational, or occ...",All other vacants,VACANCY RATES,Homeowner vacancy rate (percent) [4],Rental vacancy rate (percent) [5],HOUSING TENURE,Occupied housing units,Owner-occupied housing units,Renter-occupied housing units
location_103,0,4,1,0,0,0,1,0,1,0,...,0,0,20,0,0,85,0,0,0,0
location_232,0,46742,1825,2039,2252,2269,3125,3813,3305,2836,...,90,104,255,0,60,36,0,20779,4893,15886
location_148,0,23808,693,641,630,768,2616,3789,2811,1757,...,8,35,169,0,7,30,0,11010,925,10085
location_4,0,26726,925,1041,1117,1253,2555,3100,2440,1804,...,21,23,111,0,12,20,0,11990,1441,10548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
location_12,0,3,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
location_158,0,11060,476,380,320,317,638,1149,1486,1018,...,26,266,227,0,15,28,0,6009,2324,3684
location_128,0,16,0,0,3,1,0,4,1,1,...,0,0,0,0,100,100,0,0,0,0
location_261,0,5650,204,180,159,133,681,1230,964,589,...,0,22,33,0,3,12,0,3200,396,2804
