In [3]:
# 01_exploration.ipynb

# --- Notebook: Initial Exploration of Zoning Text Documents ---


In [4]:
# 📦 Import libraries
import pandas as pd
import re

In [5]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [6]:
import spacy
from collections import Counter

In [7]:
pip install pdfplumber

Note: you may need to restart the kernel to use updated packages.


In [8]:
import pdfplumber

# Path to your zoning PDF file
pdf_path = "/Users/bhavinikapur/Desktop/power parks- ai/Article IV- Manufacturing Zoning District.pdf"

In [9]:
# Initialize empty string to collect text
full_text = ""

In [10]:
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            full_text += text + "\n"

In [11]:
# Save as plain text for later use
with open("/Users/bhavinikapur/Desktop/power parks- ai/sample_zoning_text.txt", "w", encoding="utf-8") as f:
    f.write(full_text)

print("✅ PDF text extraction complete! Saved as sample_zoning_text.txt")

✅ PDF text extraction complete! Saved as sample_zoning_text.txt


In [12]:
# --- Load sample zoning text file ---
# Replace this path with your actual file location
with open("/Users/bhavinikapur/Desktop/power parks- ai/sample_zoning_text.txt", "r", encoding="utf-8") as f:
    zoning_text = f.read()

print("✅ Loaded zoning text document. First 1000 characters preview:")
print(zoning_text[:1000])

✅ Loaded zoning text document. First 1000 characters preview:
ZZoonniinngg RReessoolluuttiioonn
TTHHEE CCIITTYY OOFF NNEEWW YYOORRKK CCIITTYY PPLLAANNNNIINNGG CCOOMMMMIISSSSIIOONN
EErriicc AAddaammss,, MMaayyoorr DDaanniieell RR.. GGaarrooddnniicckk,, CChhaaiirr
Article IV - Manufacturing District Regulations
File generated by https://zr.planning.nyc.gov on 7/1/2025
ZZoonniinngg RReessoolluuttiioonn
TTHHEE CCIITTYY OOFF NNEEWW YYOORRKK CCIITTYY PPLLAANNNNIINNGG CCOOMMMMIISSSSIIOONN
EErriicc AAddaammss,, MMaayyoorr DDaanniieell RR.. GGaarrooddnniicckk,, CChhaaiirr
Chapter 1 - Statement of Legislative Intent
File generated by https://zr.planning.nyc.gov on 7/1/2025
Chapter 1 - Statement of Legislative Intent
41-00 - GENERAL PURPOSES OF MANUFACTURING DISTRICTS
LAST AMENDED
12/21/1989
The Manufacturing Districts established in this Resolution are designed to promote and protect public health, safety, and
general welfare. These general goals include, among others, the following specific pur

In [13]:
import re

In [14]:
# --- Basic cleaning ---
# Remove extra whitespace
cleaned_text = re.sub(r'\s+', ' ', zoning_text)
print("\n✅ Cleaned text sample:")
print(cleaned_text[:500])


✅ Cleaned text sample:
ZZoonniinngg RReessoolluuttiioonn TTHHEE CCIITTYY OOFF NNEEWW YYOORRKK CCIITTYY PPLLAANNNNIINNGG CCOOMMMMIISSSSIIOONN EErriicc AAddaammss,, MMaayyoorr DDaanniieell RR.. GGaarrooddnniicckk,, CChhaaiirr Article IV - Manufacturing District Regulations File generated by https://zr.planning.nyc.gov on 7/1/2025 ZZoonniinngg RReessoolluuttiioonn TTHHEE CCIITTYY OOFF NNEEWW YYOORRKK CCIITTYY PPLLAANNNNIINNGG CCOOMMMMIISSSSIIOONN EErriicc AAddaammss,, MMaayyoorr DDaanniieell RR.. GGaarrooddnniicckk,, CCh


In [15]:
# Load English model; download if not yet installed
try:
    nlp = spacy.load("en_core_web_sm")
except:
    import os
    os.system("python -m spacy download en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")


In [16]:
doc = nlp(cleaned_text)
print("✅ spaCy processing complete!")


✅ spaCy processing complete!


In [17]:
print("\n✅ Named Entities (first 20):")
for ent in doc.ents[:20]:
    print(f"{ent.text} ({ent.label_})")



✅ Named Entities (first 20):
ZZoonniinngg RReessoolluuttiioonn TTHHEE (PERSON)
PPLLAANNNNIINNGG CCOOMMMMIISSSSIIOONN (PERSON)
MMaayyoorr DDaanniieell RR (PERSON)
CChhaaiirr Article IV - Manufacturing (PERSON)
7/1/2025 ZZoonniinngg RReessoolluuttiioonn TTHHEE (FAC)
PPLLAANNNNIINNGG CCOOMMMMIISSSSIIOONN (PERSON)
MMaayyoorr DDaanniieell RR (PERSON)
CChhaaiirr Chapter 1 - Statement of (PERSON)
Legislative Intent File (ORG)
7/1/2025 Chapter 1 - Statement of Legislative (PRODUCT)
Resolution (FAC)
Resolution (FAC)
41-10 (CARDINAL)
4/9/1981 (DATE)
Residence (ORG)
M1-5B (PRODUCT)
M1-5M (PRODUCT)
M1-6M Districts (PRODUCT)
M1-1D (PRODUCT)
M1-2D (PRODUCT)


In [18]:
# Define keywords to check
keywords = [
    "renewable energy", "solar", "wind", "battery", 
    "data center", "manufacturing", "industrial", 
    "special permit", "environmental", "community", 
    "equity", "justice", "public health"
]

# Lowercase text for search
text_lower = cleaned_text.lower()

# Count keyword occurrences
keyword_counts = {kw: text_lower.count(kw) for kw in keywords}

print("\n✅ Keyword frequencies:")
for kw, count in keyword_counts.items():
    print(f"{kw}: {count}")



✅ Keyword frequencies:
renewable energy: 4
solar: 3
wind: 22
battery: 1
data center: 0
manufacturing: 245
industrial: 39
special permit: 74
environmental: 7
community: 71
equity: 0
justice: 0
public health: 6


In [19]:
tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
token_freq = Counter(tokens)

print("\n✅ Most common tokens (excluding stop words and punctuation):")
print(token_freq.most_common(20))



✅ Most common tokens (excluding stop words and punctuation):
[('shall', 577), ('●', 570), ('Section', 464), ('M1', 450), ('P', 332), ('uses', 309), ('Districts', 298), ('42', 290), ('feet', 288), ('lot', 273), ('area', 271), ('Use', 259), ('use', 257), ('AMENDED', 253), ('parking', 248), ('Group', 234), ('permitted', 231), ('M2', 214), ('provisions', 212), ('street', 210)]
