In [1]:
import pandas as pd 
import numpy as np

In [2]:
# read in the neighborhood prioritization map dataset
prioritization_df = pd.read_csv('/Users/Marcy_Student/Desktop/Food Insecurity Analysis/datasets/messy/Neighborhood Prioritization Map 2024.csv')

In [3]:
# Display basic information about the dataset like shape, columns, data types, missing values, and duplicates
print("\n\n--- Neighborhood Prioritization Dataset ---")
print(f"Shape: {prioritization_df.shape}")
print(f"Columns: {list(prioritization_df.columns)}")
print(f"\nData types:\n{prioritization_df.dtypes}")
print(f"\nMissing values:\n{prioritization_df.isnull().sum()}")
print(f"Duplicate rows: {prioritization_df.duplicated().sum()}")



--- Neighborhood Prioritization Dataset ---
Shape: (197, 15)
Columns: ['Weighted.Score', 'NTA', 'NTA.Name', 'Geometry', 'Latitude (generated)', 'Longitude (generated)', 'Food.Insecure.Percentage Rank', 'Food.Insecure.Percentage', 'Rank', 'sg abv ca Rank', 'Sg Abv Ca', 'Unemployment.Rate Rank', 'Unemployment.Rate', 'Vulnerable.Population.Percentage Rank', 'Vulnerable.Population.Percentage']

Data types:
Weighted.Score                           float64
NTA                                       object
NTA.Name                                  object
Geometry                                  object
Latitude (generated)                     float64
Longitude (generated)                    float64
Food.Insecure.Percentage Rank              int64
Food.Insecure.Percentage                  object
Rank                                       int64
sg abv ca Rank                             int64
Sg Abv Ca                                float64
Unemployment.Rate Rank                     int64
Unem

In [4]:
prioritization_df.head()

Unnamed: 0,Weighted.Score,NTA,NTA.Name,Geometry,Latitude (generated),Longitude (generated),Food.Insecure.Percentage Rank,Food.Insecure.Percentage,Rank,sg abv ca Rank,Sg Abv Ca,Unemployment.Rate Rank,Unemployment.Rate,Vulnerable.Population.Percentage Rank,Vulnerable.Population.Percentage
0,8.221,BK0104,East Williamsburg,MultiPolygon,40.714789,-73.932444,1,35.99%,1,1,2776626.0,126,6.38%,146,12.43%
1,8.0704,BX0501,University Heights (South)-Morris Heights,MultiPolygon,40.85093,-73.918963,14,29.44%,2,6,1669389.0,20,11.98%,34,19.63%
2,7.6866,BX0901,Soundview-Bruckner-Bronx River,MultiPolygon,40.830599,-73.872393,36,22.63%,3,7,1625976.0,32,10.06%,25,21.43%
3,7.3895,MN1202,Washington Heights (North),MultiPolygon,40.857729,-73.9355,28,24.29%,4,9,1463457.0,19,12.25%,41,18.57%
4,7.2775,BK1503,Sheepshead Bay-Manhattan Beach-Gerritsen Beach,MultiPolygon,40.587225,-73.933868,42,21.11%,5,3,1907056.0,170,4.91%,81,15.95%


In [5]:
# Let's rename the columns to all lowercase and replace spaces with underscores for consistency
prioritization_df.columns = prioritization_df.columns.str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '').str.replace('.', '_')

In [6]:
# Clean Neighborhood Dataset
prioritization_clean = prioritization_df.copy()

In [7]:
# Clean percentage columns (remove % and convert to float)
def clean_percentage(x):
    if pd.isna(x):
        return np.nan
    if isinstance(x, str):
        return float(x.replace('%', ''))
    return x

In [8]:
# Apply cleaning function to relevant columns by replacing the original columns with cleaned versions

prioritization_clean['food_insecure_percentage'] = prioritization_clean['food_insecure_percentage'].apply(clean_percentage)
prioritization_clean['unemployment_rate'] = prioritization_clean['unemployment_rate'].apply(clean_percentage)
prioritization_clean['vulnerable_population_percentage'] = prioritization_clean['vulnerable_population_percentage'].apply(clean_percentage)
prioritization_clean['supply_gap'] = prioritization_clean['sg_abv_ca']

In [9]:
# A functtion that extrats the borough from NTA code
def extract_borough(nta):
    if nta.startswith('BX'):
        return 'Bronx'
    elif nta.startswith('BK'):
        return 'Brooklyn'
    elif nta.startswith('MN'):
        return 'Manhattan'
    elif nta.startswith('QN'):
        return 'Queens'
    elif nta.startswith('SI'):
        return 'Staten Island'
    else:
        return 'Unknown'

In [10]:
# Apply the function to create a new 'borough' column (Feature Engineering)
prioritization_clean['borough'] = prioritization_clean['nta'].apply(extract_borough)

In [11]:
print(f"\nBorough distribution by Neighborhood:\n{prioritization_clean['borough'].value_counts()}")


Borough distribution by Neighborhood:
borough
Queens           59
Brooklyn         53
Bronx            37
Manhattan        32
Staten Island    16
Name: count, dtype: int64


In [12]:
prioritization_clean.head()

Unnamed: 0,weighted_score,nta,nta_name,geometry,latitude_generated,longitude_generated,food_insecure_percentage_rank,food_insecure_percentage,rank,sg_abv_ca_rank,sg_abv_ca,unemployment_rate_rank,unemployment_rate,vulnerable_population_percentage_rank,vulnerable_population_percentage,supply_gap,borough
0,8.221,BK0104,East Williamsburg,MultiPolygon,40.714789,-73.932444,1,35.99,1,1,2776626.0,126,6.38,146,12.43,2776626.0,Brooklyn
1,8.0704,BX0501,University Heights (South)-Morris Heights,MultiPolygon,40.85093,-73.918963,14,29.44,2,6,1669389.0,20,11.98,34,19.63,1669389.0,Bronx
2,7.6866,BX0901,Soundview-Bruckner-Bronx River,MultiPolygon,40.830599,-73.872393,36,22.63,3,7,1625976.0,32,10.06,25,21.43,1625976.0,Bronx
3,7.3895,MN1202,Washington Heights (North),MultiPolygon,40.857729,-73.9355,28,24.29,4,9,1463457.0,19,12.25,41,18.57,1463457.0,Manhattan
4,7.2775,BK1503,Sheepshead Bay-Manhattan Beach-Gerritsen Beach,MultiPolygon,40.587225,-73.933868,42,21.11,5,3,1907056.0,170,4.91,81,15.95,1907056.0,Brooklyn


In [13]:
# let's rename the 'nta' column to 'nta_id' for clarity and to match the naming convention of the other datasets we will be using for analysis
prioritization_clean['nta_id'] = prioritization_clean['nta']

In [14]:
prioritization_clean.drop(columns=['nta'], inplace=True)

In [15]:
#Let's reorder the columns for better readability
prioritization_clean = prioritization_clean[['nta_id', 'nta_name', 'borough', 'food_insecure_percentage','food_insecure_percentage_rank', 'unemployment_rate', 'unemployment_rate_rank', 'vulnerable_population_percentage', 'vulnerable_population_percentage_rank', 'supply_gap', 'weighted_score', 'latitude_generated', 'latitude_generated']]

In [16]:
prioritization_clean.head()

Unnamed: 0,nta_id,nta_name,borough,food_insecure_percentage,food_insecure_percentage_rank,unemployment_rate,unemployment_rate_rank,vulnerable_population_percentage,vulnerable_population_percentage_rank,supply_gap,weighted_score,latitude_generated,latitude_generated.1
0,BK0104,East Williamsburg,Brooklyn,35.99,1,6.38,126,12.43,146,2776626.0,8.221,40.714789,40.714789
1,BX0501,University Heights (South)-Morris Heights,Bronx,29.44,14,11.98,20,19.63,34,1669389.0,8.0704,40.85093,40.85093
2,BX0901,Soundview-Bruckner-Bronx River,Bronx,22.63,36,10.06,32,21.43,25,1625976.0,7.6866,40.830599,40.830599
3,MN1202,Washington Heights (North),Manhattan,24.29,28,12.25,19,18.57,41,1463457.0,7.3895,40.857729,40.857729
4,BK1503,Sheepshead Bay-Manhattan Beach-Gerritsen Beach,Brooklyn,21.11,42,4.91,170,15.95,81,1907056.0,7.2775,40.587225,40.587225


In [17]:
# prioritization_clean.to_csv('/Users/Marcy_Student/Desktop/Food Insecurity Analysis/datasets/cleaned/cleaned_neighborhood_prioritization.csv', index=False)