In [4]:
import requests
import pandas as pd

# Download a sample of NYC parking violations from the NYC Open Data (Socrata)
# API in pages, drop rows missing address fields, and save the result as CSV.

# NYC Open Data API endpoint
base_url = "https://data.cityofnewyork.us/resource/pvqr-7yc4.json"

# Settings
limit = 50000         # Rows per request
offset = 0            # Starting index
max_records = 200000  # Total you want
request_timeout = 60  # Seconds to wait on the server before giving up
required_columns = ['house_number', 'street_name', 'violation_county']
# NOTE(review): a later cell reads "data/nyc_parking_violations_sample.csv";
# confirm which location is intended.
output_path = "nyc_parking_violations_sample.csv"

# Container for all records
data = []

while offset < max_records:
    params = {
        # Clamp the last page so we never fetch more than max_records,
        # even when max_records is not a multiple of limit.
        "$limit": min(limit, max_records - offset),
        "$offset": offset,
        # Offset paging is only stable with an explicit sort order; without
        # it, pages may overlap or skip rows between requests.
        "$order": ":id",
    }

    try:
        # A timeout keeps a stalled connection from hanging the cell forever.
        response = requests.get(base_url, params=params, timeout=request_timeout)
    except requests.RequestException as exc:
        print(f"Request failed at offset {offset}: {exc}")
        break

    if response.status_code != 200:
        print(f"Request failed at offset {offset} with status code {response.status_code}")
        break

    batch = response.json()

    if not batch:
        print("No more data returned.")
        break

    data.extend(batch)
    # Advance by what was actually received so a short page is not skipped over.
    offset += len(batch)

    print(f"✅ Retrieved {len(batch)} records (Total so far: {len(data)})")

# Convert to DataFrame
df = pd.DataFrame(data)
rows_downloaded = len(df)

if df.empty:
    # dropna(subset=...) would raise KeyError on a frame with no columns.
    print("⚠️ No records were downloaded; nothing was saved.")
else:
    # A required column can be absent from the frame entirely when no
    # downloaded record carries that key; treat that as "missing everywhere"
    # instead of letting dropna raise KeyError.
    for column in required_columns:
        if column not in df.columns:
            df[column] = pd.NA

    df = df.dropna(subset=required_columns)
    df.to_csv(output_path, index=False)

    # Report the real row count: dropna (and an early loop exit) mean it is
    # generally not equal to max_records.
    print(f"📦 Done! Saved {len(df):,} of {rows_downloaded:,} downloaded rows to {output_path}")


✅ Retrieved 50000 records (Total so far: 50000)
✅ Retrieved 50000 records (Total so far: 100000)
✅ Retrieved 50000 records (Total so far: 150000)
✅ Retrieved 50000 records (Total so far: 200000)
📦 Done! Saved 200.000 rows to nyc_parking_violations_sample.csv


In [10]:
import requests
import pandas as pd

# Fetch one more page of parking violations from the NYC Open Data (Socrata)
# API and append the rows not already present in the local CSV.

# NYC Open Data API endpoint
base_url = "https://data.cityofnewyork.us/resource/pvqr-7yc4.json"

# Load existing dataset
csv_path = "data/nyc_parking_violations_sample.csv"
# Read summons_number as text: left to inference, read_csv yields integers,
# while the JSON records below carry the value as a string, so the membership
# test would never match and no duplicate would ever be filtered.
existing_df = pd.read_csv(csv_path, low_memory=False, dtype={'summons_number': str})

existing_ids = set(existing_df['summons_number'].dropna())

# Rows lacking any of these fields are excluded, matching the filter that
# the download cell applied when it created the CSV.
required_columns = ['house_number', 'street_name', 'violation_county']

# Download up to 50,000 new rows
limit = 50000
# NOTE(review): the CSV holds only rows that survived the dropna filter, so
# its length can be smaller than the number of API rows already consumed.
# The page may therefore overlap earlier downloads; the summons_number filter
# below removes the overlap.
offset = len(existing_df)

params = {
    "$limit": limit,
    "$offset": offset,
    # Offset paging is only stable with an explicit sort order.
    "$order": ":id",
}

# A timeout keeps a stalled connection from hanging the cell forever.
response = requests.get(base_url, params=params, timeout=60)
if response.status_code == 200:
    new_data = response.json()
    if new_data:
        new_df = pd.DataFrame(new_data)

        # Normalise the key to text so it compares equal to existing_ids.
        new_df['summons_number'] = new_df['summons_number'].astype(str)

        # Filter out known summons numbers before merge, and collapse any
        # repeats inside the downloaded page itself.
        new_df = new_df[~new_df['summons_number'].isin(existing_ids)]
        new_df = new_df.drop_duplicates(subset='summons_number')

        # Keep the dataset's invariant: no rows with missing address fields.
        # A column absent from the whole page counts as missing everywhere.
        for column in required_columns:
            if column not in new_df.columns:
                new_df[column] = pd.NA
        new_df = new_df.dropna(subset=required_columns)

        combined_df = pd.concat([existing_df, new_df], ignore_index=True)
        combined_df.to_csv(csv_path, index=False)

        print(f"📦 Downloaded: {len(new_data):,} rows")
        print(f"🧹 New unique rows added: {len(new_df):,}")
        print(f"📊 Total dataset size now: {len(combined_df):,}")
    else:
        print("🚫 No new data returned from API.")
else:
    print(f"❌ Request failed with status code {response.status_code}")


📦 Downloaded: 50,000 rows
🧹 New unique rows added: 50,000
📊 Total dataset size now: 318,384
