In [13]:
import pandas as pd

# Build a one-column frame holding every calendar day of the queried window;
# each of these days is meant to receive a total_event count later on.
queried_start_date, queried_end_date = (
    pd.to_datetime(bound) for bound in ('2020-01-01', '2023-06-01')
)

# Daily frequency is the pandas default, and both endpoints are included.
queried_date_range = pd.date_range(queried_start_date, queried_end_date)

# index=False drops the datetime index in favour of a plain 0..n-1 index,
# leaving the days in a single 'date' column.
queried_date_range_df = queried_date_range.to_frame(index=False, name='date')
queried_date_range_df

Unnamed: 0,date
0,2020-01-01
1,2020-01-02
2,2020-01-03
3,2020-01-04
4,2020-01-05
...,...
1243,2023-05-28
1244,2023-05-29
1245,2023-05-30
1246,2023-05-31


In [14]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Listing type to scrape; it is used in the URL path, the checkpoint file name
# and the name of the count column.
QUERIED_TEXT = 'events'

# Column that stores the per-date count and CSV file used as a resumable checkpoint.
total_column = f'total_{QUERIED_TEXT}'
checkpoint_path = f'donyc_{QUERIED_TEXT}.csv'

# Seconds to wait for a response before giving up, so a stalled connection
# cannot hang the cell forever.
REQUEST_TIMEOUT = 30

queried_start_date = pd.to_datetime('2020-01-01')
queried_end_date = pd.to_datetime('2023-06-01')
queried_date_range = pd.date_range(start=queried_start_date, end=queried_end_date)

# Set the base URL: /<listing type>/<year>/<month>/<day>?page=<page number>
base_url = 'https://donyc.com/{}/{}/{}/{}?page={}'

# Resume from the checkpoint written by a previous run; on a first run (no
# checkpoint yet) start from the full queried date range instead of crashing.
try:
    queried_date_range_df = pd.read_csv(checkpoint_path, parse_dates=['date'])
except FileNotFoundError:
    queried_date_range_df = pd.DataFrame({'date': queried_date_range})

# Make sure the count column exists so the "already populated" check below
# works on a fresh frame; NaN marks a date that has not been scraped yet.
if total_column not in queried_date_range_df.columns:
    queried_date_range_df[total_column] = float('nan')


def _href_matches_date(href, date_prefix):
    """Tell whether ``href`` lies under ``date_prefix`` for exactly that day.

    A plain ``startswith`` check is ambiguous: the prefix for day 1
    ('/events/2023/3/1') would also accept links for days 10-19
    ('/events/2023/3/10/...'). The character following the prefix must
    therefore not be another digit.

    Parameters:
        href: value of the anchor's ``href`` attribute, or None when absent.
        date_prefix: path prefix '/<listing type>/<year>/<month>/<day>'.

    Returns:
        True when ``href`` starts with ``date_prefix`` and the day number
        does not continue past the prefix, otherwise False.
    """
    if not href or not href.startswith(date_prefix):
        return False
    following = href[len(date_prefix):len(date_prefix) + 1]
    return not following.isdigit()


# Loop through each date in the frame
for i, row in queried_date_range_df.iterrows():

    date = row['date']

    # Skip dates whose count was already stored by an earlier run
    if not pd.isna(row[total_column]):
        print(f'{date} is already populated')
        continue

    # Format the URL with the year, month, day, and page number
    year = date.year
    month = date.month
    day = date.day
    page_num = 1
    url = base_url.format(QUERIED_TEXT, year, month, day, page_num)

    # Event links for this date are expected to start with this path
    href_prefix = '/{}/{}/{}/{}'.format(QUERIED_TEXT, year, month, day)

    # Initialize the count to 0
    count = 0

    # Loop through each page of events for the current date
    while True:
        # Make a request to the current page
        response = requests.get(url, timeout=REQUEST_TIMEOUT)

        # A missing page is handled like an empty one (end of pagination).
        # Any other error status aborts the run instead of being parsed as
        # "no events": the date then stays unpopulated and is retried on the
        # next run rather than having a wrong count written to the checkpoint.
        if response.status_code == 404:
            break
        response.raise_for_status()

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all div elements with a class attribute starting with 'ds-listing event-card'
        event_cards = soup.select('div[class^="ds-listing event-card"]')

        # If no event cards are found, break out of the loop
        if not event_cards:
            break

        # Count the cards that contain a title link for the current date
        for card in event_cards:
            anchor = card.find(
                'a',
                href=lambda href: _href_matches_date(href, href_prefix),
                class_='ds-listing-event-title url summary',
            )
            if anchor:
                count += 1

        # Increment the page number and update the URL
        print(page_num)
        page_num += 1
        url = base_url.format(QUERIED_TEXT, year, month, day, page_num)

    # Store the count for the current date and write the checkpoint right away,
    # so an interrupted run loses at most the date being processed
    queried_date_range_df.loc[i, total_column] = count
    queried_date_range_df.to_csv(checkpoint_path, index=False)
    print(queried_date_range_df.loc[i])

2020-01-01 00:00:00 is already populated
2020-01-02 00:00:00 is already populated
2020-01-03 00:00:00 is already populated
2020-01-04 00:00:00 is already populated
2020-01-05 00:00:00 is already populated
2020-01-06 00:00:00 is already populated
2020-01-07 00:00:00 is already populated
2020-01-08 00:00:00 is already populated
2020-01-09 00:00:00 is already populated
2020-01-10 00:00:00 is already populated
2020-01-11 00:00:00 is already populated
2020-01-12 00:00:00 is already populated
2020-01-13 00:00:00 is already populated
2020-01-14 00:00:00 is already populated
2020-01-15 00:00:00 is already populated
2020-01-16 00:00:00 is already populated
2020-01-17 00:00:00 is already populated
2020-01-18 00:00:00 is already populated
2020-01-19 00:00:00 is already populated
2020-01-20 00:00:00 is already populated
2020-01-21 00:00:00 is already populated
2020-01-22 00:00:00 is already populated
2020-01-23 00:00:00 is already populated
2020-01-24 00:00:00 is already populated
2020-01-25 00:00