In [54]:
import os
import requests
import random
import csv
from datetime import datetime, timedelta

In [55]:
# API endpoint for the dataset metadata (describes the dataset's columns)
metadata_url = "https://data.cityofnewyork.us/api/views/bkfu-528j"

# Fetch the metadata. requests waits indefinitely by default, so an explicit
# timeout (seconds) makes a stalled connection fail fast instead of hanging
# the cell.
response = requests.get(metadata_url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON metadata
    metadata = response.json()

    # Print each column's display name next to the field name used in API
    # queries and in the returned records
    for column in metadata['columns']:
        print(f"Name: {column['name']}, Field Name: {column['fieldName']}")
else:
    print(f"Failed to retrieve metadata: {response.status_code}, {response.text}")


Name: Event ID, Field Name: event_id
Name: Event Name, Field Name: event_name
Name: Start Date/Time, Field Name: start_date_time
Name: End Date/Time, Field Name: end_date_time
Name: Event Agency, Field Name: event_agency
Name: Event Type, Field Name: event_type
Name: Event Borough, Field Name: event_borough
Name: Event Location, Field Name: event_location
Name: Event Street Side, Field Name: event_street_side
Name: Street Closure Type, Field Name: street_closure_type
Name: Community Board, Field Name: community_board
Name: Police Precinct, Field Name: police_precinct


In [56]:
# API endpoint for the dataset's records (same dataset id, bkfu-528j, as the
# metadata URL). scrape_random_events_per_day sends its GET requests here and
# parses the responses as JSON.
api_url = "https://data.cityofnewyork.us/resource/bkfu-528j.json"


In [57]:
# Keep only the events whose borough field is exactly 'Manhattan'
def filter_events_for_manhattan(events):
    """Return the events whose 'event_borough' value equals 'Manhattan'.

    Args:
        events: Iterable of dict-like event records (each must support .get).

    Returns:
        A new list, in the original order, of the records whose
        'event_borough' is exactly 'Manhattan'. The comparison is
        case-sensitive, and records without that key are dropped.
    """
    manhattan_only = []
    for record in events:
        borough = record.get('event_borough')
        if borough == 'Manhattan':
            manhattan_only.append(record)
    return manhattan_only

In [58]:
# Function to scrape a random sample of Manhattan events for every day in a range
def scrape_random_events_per_day(start_date, end_date, output_file_path,
                                 sample_size=100, seed=None, timeout=30,
                                 row_limit=50000):
    """Write a random per-day sample of Manhattan events to a CSV file.

    Walks one day at a time from ``start_date`` through ``end_date``
    (inclusive), asks the API at the module-level ``api_url`` for the
    Manhattan events whose ``start_date_time`` falls on that day, and writes
    ``sample_size`` randomly chosen events to ``output_file_path``.

    Days with fewer than ``sample_size`` matching events are skipped entirely
    and reported with a printed message. Days whose request fails (non-200
    status or a network error/timeout) are reported and skipped too, so one
    bad day does not abort the whole run.

    Args:
        start_date: First day to scrape (``datetime``).
        end_date: Last day to scrape, inclusive (``datetime``).
        output_file_path: Path of the CSV to write; an existing file is
            overwritten.
        sample_size: Number of events to keep per day. Defaults to 100.
        seed: Optional seed for a private random generator, so the sampling
            can be reproduced for identical API responses. ``None``
            (default) uses the shared ``random`` module state.
        timeout: Seconds to wait on each HTTP request before giving up.
        row_limit: Value sent as the ``$limit`` query parameter, i.e. the
            maximum number of rows requested for a single day.

    Returns:
        None. The results are written to ``output_file_path``.
    """
    # CSV header -> API field name, in output column order.
    column_fields = {
        "Event ID": "event_id",
        "Event Name": "event_name",
        "Start Date/Time": "start_date_time",
        "End Date/Time": "end_date_time",
        "Event Agency": "event_agency",
        "Event Type": "event_type",
        "Event Borough": "event_borough",
        "Event Location": "event_location",
        "Event Street Side": "event_street_side",
        "Street Closure Type": "street_closure_type",
        "Community Board": "community_board",
        "Police Precinct": "police_precinct",
    }

    # A private generator keeps a seeded run independent of other code that
    # touches the global random state.
    rng = random.Random(seed) if seed is not None else random

    with open(output_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(column_fields))
        writer.writeheader()

        current_date = start_date
        while current_date <= end_date:
            # Define the date range for the current day
            date_str = current_date.strftime("%Y-%m-%d")
            start_datetime = f"{date_str}T00:00:00"
            end_datetime = f"{date_str}T23:59:59"

            # Filter by date and borough on the server side. Without $limit
            # the API returns only its default-sized first page (1000 rows
            # per the SODA documentation), so on a busy day the "random"
            # sample would be drawn from an arbitrary subset of the events.
            params = {
                "$where": (
                    f"start_date_time >= '{start_datetime}' "
                    f"AND start_date_time <= '{end_datetime}' "
                    "AND event_borough='Manhattan'"
                ),
                "$limit": row_limit,
            }

            try:
                # Explicit timeout: requests would otherwise wait forever on
                # a stalled connection.
                response = requests.get(api_url, params=params, timeout=timeout)
            except requests.RequestException as exc:
                print(f"Failed to retrieve events for {date_str}: {exc}")
            else:
                if response.status_code == 200:
                    # Re-check the borough client-side in case the response
                    # contains rows from other boroughs.
                    manhattan_events = filter_events_for_manhattan(response.json())
                    if len(manhattan_events) >= sample_size:
                        for event in rng.sample(manhattan_events, sample_size):
                            writer.writerow({
                                header: event.get(field, "")
                                for header, field in column_fields.items()
                            })
                    else:
                        print(f"Not enough events for {date_str}.")
                else:
                    print(f"Failed to retrieve events for {date_str}: {response.status_code}, {response.text}")

            # Move to the next day
            current_date += timedelta(days=1)

In [59]:
# Date window to scrape; both ends are included
start_date, end_date = datetime(2021, 1, 1), datetime(2024, 4, 30)

# The CSV goes into ../Datasets/events relative to the current working
# directory; the folder is created if it is missing
event_dir = os.path.join(os.getcwd(), "..", "Datasets", "events")
os.makedirs(event_dir, exist_ok=True)
output_file_path = os.path.join(event_dir, "nyc_historical_events.csv")

# Run the scrape over the whole window
scrape_random_events_per_day(start_date, end_date, output_file_path)

Not enough events for 2023-02-13.
Not enough events for 2023-02-15.
Not enough events for 2023-02-16.
Not enough events for 2023-02-17.
Not enough events for 2023-02-18.
Not enough events for 2023-02-19.
Not enough events for 2023-02-20.
Not enough events for 2023-02-21.
Not enough events for 2023-02-22.
Not enough events for 2023-02-23.
Not enough events for 2023-02-24.
Not enough events for 2023-02-25.
Not enough events for 2023-02-26.
Not enough events for 2023-02-27.
Not enough events for 2023-02-28.
Not enough events for 2023-03-01.
Not enough events for 2023-03-02.
Not enough events for 2023-03-03.
Not enough events for 2023-03-04.
Not enough events for 2023-03-05.
Not enough events for 2023-03-06.
Not enough events for 2023-03-07.
Not enough events for 2023-03-08.
Not enough events for 2023-03-09.
Not enough events for 2023-03-10.
Not enough events for 2023-03-12.
Not enough events for 2023-03-15.
Not enough events for 2023-03-16.
Not enough events for 2023-03-17.
Not enough eve

Not enough events for 74:
# Days reported as "Not enough events" by the scrape, as ISO date strings in
# chronological order, grouped by month for readability.
missing_days = [
    # February 2023
    "2023-02-13", "2023-02-15", "2023-02-16", "2023-02-17",
    "2023-02-18", "2023-02-19", "2023-02-20", "2023-02-21",
    "2023-02-22", "2023-02-23", "2023-02-24", "2023-02-25",
    "2023-02-26", "2023-02-27", "2023-02-28",
    # March 2023
    "2023-03-01", "2023-03-02", "2023-03-03", "2023-03-04",
    "2023-03-05", "2023-03-06", "2023-03-07", "2023-03-08",
    "2023-03-09", "2023-03-10", "2023-03-12", "2023-03-15",
    "2023-03-16", "2023-03-17", "2023-03-18", "2023-03-19",
    "2023-03-20", "2023-03-21", "2023-03-22", "2023-03-23",
    "2023-03-24", "2023-03-25", "2023-03-26", "2023-03-27",
    "2023-03-28", "2023-03-29", "2023-03-30", "2023-03-31",
    # April 2023
    "2023-04-01", "2023-04-02", "2023-04-03", "2023-04-04",
    "2023-04-05", "2023-04-06", "2023-04-07", "2023-04-08",
    "2023-04-09", "2023-04-10", "2023-04-11", "2023-04-12",
    "2023-04-13", "2023-04-14", "2023-04-15", "2023-04-16",
    "2023-04-17", "2023-04-18", "2023-04-19",
    # March 2024
    "2024-03-18", "2024-03-19", "2024-03-20", "2024-03-25",
    "2024-03-26", "2024-03-27", "2024-03-28", "2024-03-29",
    "2024-03-30",
    # April 2024
    "2024-04-02", "2024-04-07", "2024-04-30",
]