In [5]:
import pandas as pd
import numpy as np

# Load the school data
school_data = pd.read_csv('2019_-_2020_School_Locations_20240617.csv')

# Convert open_date to datetime; values that cannot be parsed become NaT
school_data['open_date'] = pd.to_datetime(school_data['open_date'], errors='coerce')

# Fill missing status descriptions with 'Open' (assuming if status is not specified, the school is open)
school_data['Status_descriptions'] = school_data['Status_descriptions'].fillna('Open')

# Filter for relevant columns
school_data = school_data[['open_date', 'Status_descriptions', 'LONGITUDE', 'LATITUDE', 'Location_Category_Description']]

# Sort school data by location and open date so that, within one location,
# the first later row is the next school to open there
school_data = school_data.sort_values(by=['LONGITUDE', 'LATITUDE', 'open_date'])

# Calculate the closure datetime for each school: a school is treated as
# closed once a later school opens at the exact same coordinates; otherwise it
# stays open until the end of the study period.
end_of_period = pd.Timestamp('2024-12-31 23:59:59')
closure_dates = []
for _, school in school_data.iterrows():
    later_openings = school_data.loc[
        (school_data['LONGITUDE'] == school['LONGITUDE'])
        & (school_data['LATITUDE'] == school['LATITUDE'])
        & (school_data['open_date'] > school['open_date']),
        'open_date',
    ]
    # .iloc[0] keeps every entry a pd.Timestamp (no np.datetime64 mixed in)
    closure_dates.append(end_of_period if later_openings.empty else later_openings.iloc[0])

school_data['closure_date'] = closure_dates
# Schools flagged as 'Closed' get a zero-length lifetime, so they are never counted
school_data.loc[school_data['Status_descriptions'] == 'Closed', 'closure_date'] = school_data['open_date']

# Generate hourly datetime range from 2013 to 2024
# NOTE(review): pandas >= 2.2 deprecates the 'H' alias in favour of 'h';
# switch once older pandas versions no longer need to be supported.
hourly_dates = pd.date_range(start='2013-01-01', end='2024-12-31 23:00:00', freq='H')
hourly_df = pd.DataFrame(hourly_dates, columns=['datetime'])

# School types to report, one output column each
school_types = [
    'Elementary', 'High school', 'Junior High-Intermediate-Middle', 'K-8',
    'Secondary School', 'K-12 all grades', 'Early Childhood', 'Ungraded',
    'Collaborative or Multi-graded'
]


def _count_open_schools(schools, instants):
    """Count, for every instant, the rows with open_date <= instant < closure_date.

    Rows whose closure is not strictly after their opening (including rows
    with a missing date) can never satisfy the condition and are dropped.
    For the remaining rows the count equals
    (#open_date <= instant) - (#closure_date <= instant), which two binary
    searches over the sorted dates give without a Python loop over hours.
    """
    lifetimes = schools[schools['closure_date'] > schools['open_date']]
    opened = lifetimes['open_date'].sort_values().searchsorted(instants, side='right')
    closed = lifetimes['closure_date'].sort_values().searchsorted(instants, side='right')
    return opened - closed


# Count the number of functional schools for each type at each hour
for school_type in school_types:
    type_schools = school_data[school_data['Location_Category_Description'] == school_type]
    hourly_df[school_type] = _count_open_schools(type_schools, hourly_dates)

# Save the results to a new CSV file
hourly_df.to_csv('school_counts_by_hour.csv', index=False)


  hourly_dates = pd.date_range(start='2013-01-01', end='2024-12-31 23:00:00', freq='H')


In [19]:
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar
from datetime import datetime, timedelta
import numpy as np
import holidays

# Define function to categorize holidays
def _nth_weekday_of_month(year, month, weekday, n):
    """Return the day-of-month of the n-th `weekday` (Monday=0) in a month."""
    first_weekday = pd.Timestamp(year=year, month=month, day=1).weekday()
    return 1 + (weekday - first_weekday) % 7 + 7 * (n - 1)


def get_holiday_categories(date):
    """Classify the calendar day containing `date` into holiday categories.

    Parameters
    ----------
    date : datetime-like
        Any value accepted by ``pd.Timestamp``. The time of day is ignored,
        so every hour of a holiday gets the same flags.

    Returns
    -------
    dict
        Keys ``'national_holiday'``, ``'religious_holiday'``,
        ``'special_day'`` and ``'school_holiday'``, each 0 or 1.
    """
    # All rules below are whole-day rules; compare on the midnight-normalized
    # day so that e.g. 15:00 on Halloween still matches Halloween.
    day = pd.Timestamp(date).normalize()
    year = day.year

    def on(month, day_of_month, in_year=year):
        return pd.Timestamp(year=in_year, month=month, day=day_of_month)

    us_holidays = holidays.US(years=year)
    ny2_holidays = holidays.US(subdiv='NY', years=year)

    # Last Friday of April
    last_day_of_april = on(4, 30)
    arbor_day = last_day_of_april - pd.DateOffset(days=(last_day_of_april.weekday() - 4) % 7)
    # Tuesday following the first Monday of November
    election_day = on(11, _nth_weekday_of_month(year, 11, 0, 1) + 1)

    ny_holidays = {
        "Lincoln's Birthday": on(2, 12),
        # Only treated as a special day in even years
        "Election Day": election_day if year % 2 == 0 else None,
        "Juneteenth": on(6, 19),
        "Valentine's Day": on(2, 14),
        "St. Patrick's Day": on(3, 17),
        "April Fool's Day": on(4, 1),
        "Mother's Day": on(5, _nth_weekday_of_month(year, 5, 6, 2)),  # 2nd Sunday of May
        "Father's Day": on(6, _nth_weekday_of_month(year, 6, 6, 3)),  # 3rd Sunday of June
        "Flag Day": on(6, 14),
        "Halloween": on(10, 31),
        "Groundhog Day": on(2, 2),
        "Arbor Day": arbor_day,
        "Patriot Day": on(9, 11),
        "Constitution Day": on(9, 17),
        "Christmas Eve": on(12, 24),
        "New Year's Eve": on(12, 31),
    }

    # NOTE: these are fixed placeholder ("example") dates reused for every
    # year; the real observances move from year to year.
    religious_holidays = {
        "Hanukkah": on(12, 10),
        "Ramadan": on(4, 12),
        "Eid al-Fitr": on(5, 12),
        "Eid al-Adha": on(7, 20),
        "Diwali": on(11, 4),
        "Vesak": on(5, 26),
        "Lunar New Year": on(2, 12),
        "Passover": on(3, 27),
        "Rosh Hashanah": on(9, 6),
        "Yom Kippur": on(9, 15),
        "Good Friday": on(4, 2),
        "Easter": on(4, 4),
    }

    # Inclusive (first_day, last_day) ranges; single days repeat the same date.
    thanksgiving = _nth_weekday_of_month(year, 11, 3, 4)  # 4th Thursday of November
    labor_day = on(9, _nth_weekday_of_month(year, 9, 0, 1))  # 1st Monday of September
    columbus_day = on(10, _nth_weekday_of_month(year, 10, 0, 2))  # 2nd Monday of October
    mlk_day = on(1, _nth_weekday_of_month(year, 1, 0, 3))  # 3rd Monday of January
    memorial_day = on(5, 31 - on(5, 31).weekday())  # last Monday of May
    school_holidays = {
        "Summer Vacation": (on(6, 25), on(9, 6)),
        "Winter Recess": (on(12, 24), on(1, 2, in_year=year + 1)),
        # The recess that began the previous December still covers Jan 1-2
        "Winter Recess (carried over)": (on(12, 24, in_year=year - 1), on(1, 2)),
        "Midwinter Recess": (on(2, 15), on(2, 19)),
        "Spring Recess": (on(4, 1), on(4, 9)),
        "Labor Day": (labor_day, labor_day),
        "Rosh Hashanah": (on(9, 6), on(9, 6)),  # Example date
        "Yom Kippur": (on(9, 15), on(9, 15)),  # Example date
        "Columbus Day": (columbus_day, columbus_day),
        "Election Day": (election_day, election_day),
        "Veterans Day": (on(11, 11), on(11, 11)),
        "Thanksgiving Break": (on(11, thanksgiving), on(11, thanksgiving + 1)),
        "Dr. Martin Luther King Jr. Day": (mlk_day, mlk_day),
        "Lunar New Year": (on(2, 12), on(2, 12)),  # Example date
        "Memorial Day": (memorial_day, memorial_day),
    }

    calendar_day = day.date()
    in_ny_calendar = calendar_day in ny2_holidays
    # NOTE(review): the original additionally required `date not in ny_holidays`,
    # but that tested the dict's *name* keys and so was always true. The
    # resulting behaviour (flag every day in the NY calendar) is preserved;
    # presumably federal holidays were meant to be excluded - confirm.
    in_special_list = any(day == d for d in ny_holidays.values() if d is not None)

    return {
        'national_holiday': int(calendar_day in us_holidays),
        'religious_holiday': int(any(day == d for d in religious_holidays.values())),
        'special_day': int(in_ny_calendar or in_special_list),
        'school_holiday': int(any(start <= day <= end for start, end in school_holidays.values())),
    }

# Generate hourly datetime range from 2013 to 2024
hourly_dates = pd.date_range(start='2013-01-01', end='2024-12-31 23:00:00', freq='H')
hourly_df = pd.DataFrame(hourly_dates, columns=['datetime'])

category_columns = ['national_holiday', 'religious_holiday', 'special_day', 'school_holiday']

# Holiday status is a property of the calendar day, not of the hour, so the
# categories are computed once per day (at midnight) and then copied to all
# 24 hours of that day. This makes every hour of a holiday carry the flag and
# avoids calling the categoriser 24 times per day.
days = hourly_df['datetime'].dt.normalize()
unique_days = days.drop_duplicates()
daily_flags = pd.DataFrame(
    [get_holiday_categories(day) for day in unique_days],
    index=pd.DatetimeIndex(unique_days),
)

# Apply holiday categories to the dataframe
for column in category_columns:
    hourly_df[column] = days.map(daily_flags[column]).astype('int64')

# Save the results to a new CSV file
hourly_df.to_csv('holiday_counts_by_hour.csv', index=False)


  hourly_dates = pd.date_range(start='2013-01-01', end='2024-12-31 23:00:00', freq='H')


In [22]:
import pandas as pd

# Load the landmarks data
landmark_data = pd.read_csv('Individual_Landmark_Sites_20240617.csv')

# Convert DesDate to datetime; values that cannot be parsed become NaT
landmark_data['DesDate'] = pd.to_datetime(landmark_data['DesDate'], errors='coerce')

# Filter for relevant columns (copy so the column assignments below do not
# write into a view of the full table)
landmark_data = landmark_data[['DesDate', 'Shape_Leng', 'Shape_Area']].copy()

# Fill NaN values in Shape_Leng and Shape_Area with 0 (assuming missing values mean 0)
landmark_data['Shape_Leng'] = landmark_data['Shape_Leng'].fillna(0)
landmark_data['Shape_Area'] = landmark_data['Shape_Area'].fillna(0)

# Generate hourly datetime range from 2013 to 2024
hourly_dates = pd.date_range(start='2013-01-01', end='2024-12-31 23:00:00', freq='H')
hourly_df = pd.DataFrame(hourly_dates, columns=['datetime'])

# Landmarks without a usable DesDate can never satisfy DesDate <= hour, so
# they are dropped; the rest are ordered by date so that "everything
# designated up to this hour" is simply a prefix of the table.
designated = landmark_data.dropna(subset=['DesDate']).sort_values('DesDate')

# For each hour, the number of landmarks with DesDate <= hour
num_designated = designated['DesDate'].searchsorted(hourly_dates, side='right')
hourly_df['NumLandmarksOpened'] = num_designated

# Running totals with a leading 0.0, so position k holds the sum over the
# first k landmarks (k == 0 -> nothing designated yet)
for column in ('Shape_Leng', 'Shape_Area'):
    running_total = pd.concat(
        [pd.Series([0.0]), designated[column].cumsum()], ignore_index=True
    )
    hourly_df[f'Sum_{column}'] = running_total.to_numpy()[num_designated]

# Save the results to a new CSV file
hourly_df.to_csv('landmark_counts_by_hour.csv', index=False)


  hourly_dates = pd.date_range(start='2013-01-01', end='2024-12-31 23:00:00', freq='H')


In [None]:
import pandas as pd

# Load the business data
business_data = pd.read_csv('Legally_Operating_Businesses_20240617.csv')

# Convert License Creation Date and License Expiration Date to datetime;
# values that cannot be parsed become NaT
business_data['License Creation Date'] = pd.to_datetime(business_data['License Creation Date'], errors='coerce')
business_data['License Expiration Date'] = pd.to_datetime(business_data['License Expiration Date'], errors='coerce')

# Filter for relevant columns
business_data = business_data[['License Type', 'License Expiration Date', 'License Status', 'License Creation Date', 'Industry']]

# Only consider businesses that are active
business_data = business_data[business_data['License Status'] == 'Active']

# Generate hourly datetime range from 2013 to 2024
hourly_dates = pd.date_range(start='2013-01-01', end='2024-12-31 23:00:00', freq='H')
hourly_df = pd.DataFrame(hourly_dates, columns=['datetime'])

# One output column per License Type and per Industry
license_types = business_data['License Type'].unique()
industries = business_data['Industry'].unique()


def _count_active_licenses(licenses, instants):
    """Count, for every instant, the rows with creation <= instant <= expiration.

    Rows whose expiration precedes their creation (or with a missing date)
    can never match and are dropped. For the remaining rows the count equals
    (#creation <= instant) - (#expiration < instant), obtained with two binary
    searches over the sorted dates instead of a Python loop over hours.
    """
    valid = licenses[licenses['License Expiration Date'] >= licenses['License Creation Date']]
    created = valid['License Creation Date'].sort_values().searchsorted(instants, side='right')
    expired = valid['License Expiration Date'].sort_values().searchsorted(instants, side='left')
    return created - expired


# Count the number of open businesses for each License Type and Industry at each hour.
# A missing (NaN) category never equals itself, so its column stays all zeros.
counts = {}
for license_type in license_types:
    of_type = business_data[business_data['License Type'] == license_type]
    counts[f'LicenseType_{license_type}'] = _count_active_licenses(of_type, hourly_dates)

for industry in industries:
    in_industry = business_data[business_data['Industry'] == industry]
    counts[f'Industry_{industry}'] = _count_active_licenses(in_industry, hourly_dates)

# Attach all count columns at once rather than inserting them one by one
hourly_df = pd.concat([hourly_df, pd.DataFrame(counts, index=hourly_df.index)], axis=1)

# Save the results to a new CSV file
hourly_df.to_csv('business_counts_by_hour.csv', index=False)


In [None]:
import pandas as pd

# Load the event data
event_data = pd.read_csv('NYC_Permitted_Event_Information_-_Historical_20240617.csv')

# Convert Start Date/Time and End Date/Time to datetime; values that cannot
# be parsed become NaT
event_data['Start Date/Time'] = pd.to_datetime(event_data['Start Date/Time'], errors='coerce')
event_data['End Date/Time'] = pd.to_datetime(event_data['End Date/Time'], errors='coerce')

# Filter for relevant columns (copy so the assignment below does not write
# into a view of the full table)
event_data = event_data[['Start Date/Time', 'End Date/Time', 'Event Type', 'Street Closure Type']].copy()

# Fill NaN values in Street Closure Type with 'None'
event_data['Street Closure Type'] = event_data['Street Closure Type'].fillna('None')

# Generate hourly datetime range from 2013 to 2024
hourly_dates = pd.date_range(start='2013-01-01', end='2024-12-31 23:00:00', freq='H')
hourly_df = pd.DataFrame(hourly_dates, columns=['datetime'])

event_types = event_data['Event Type'].unique()


def _count_active_events(events, instants):
    """Count, for every instant, the rows with start <= instant <= end.

    Rows that end before they start (or with a missing date) can never match
    and are dropped. For the remaining rows the count equals
    (#start <= instant) - (#end < instant), obtained with two binary searches
    over the sorted dates instead of a Python loop over hours.
    """
    valid = events[events['End Date/Time'] >= events['Start Date/Time']]
    started = valid['Start Date/Time'].sort_values().searchsorted(instants, side='right')
    ended = valid['End Date/Time'].sort_values().searchsorted(instants, side='left')
    return started - ended


# Count the number of events, constructions, and street closures at each hour.
# Per-type columns come first, followed by the three summary columns.
# A missing (NaN) event type never equals itself, so its column stays all zeros.
counts = {}
for event_type in event_types:
    of_type = event_data[event_data['Event Type'] == event_type]
    counts[f'EventType_{event_type}'] = _count_active_events(of_type, hourly_dates)

counts['NumEvents'] = _count_active_events(event_data, hourly_dates)
counts['NumConstructions'] = _count_active_events(
    event_data[event_data['Event Type'] == 'Construction'], hourly_dates
)
counts['NumStreetClosures'] = _count_active_events(
    event_data[event_data['Street Closure Type'] != 'None'], hourly_dates
)

# Attach all count columns at once rather than inserting them one by one
hourly_df = pd.concat([hourly_df, pd.DataFrame(counts, index=hourly_df.index)], axis=1)

# Save the results to a new CSV file
hourly_df.to_csv('event_counts_by_hour.csv', index=False)
