In [1]:
pip install fake_useragent


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
from bs4 import BeautifulSoup
import json
import time
import pandas as pd
import numpy as np
from fake_useragent import UserAgent

# Base URL for Eventbrite event listings with a placeholder for the page number
base_url = 'https://www.eventbrite.com/d/ma--boston/all-events/?page=1'

# Number of pages to scrape
num_pages = 50

# User-Agent rotation setup
ua = UserAgent()

# List to store all event data
all_events = []

# Loop through pages
for page_num in range(1, num_pages + 1):
    success = False  # Flag to handle retries
    while not success:
        # Update headers with a random User-Agent
        headers = {"User-Agent": ua.random}

        # Make the request
        r = requests.get(base_url.format(page_num), headers=headers)

        # Check response status
        if r.status_code == 429:  # Rate limiting
            print(f"Rate limit hit on page {page_num}. Retrying after delay...")
            time.sleep(30)  # Backoff delay for 429
        elif r.status_code == 200:
            print(f"Success: {r.status_code} for page {page_num}")
            success = True  # Exit retry loop

            # Parse the HTML content
            soup = BeautifulSoup(r.content, 'html.parser')

            # Extract JSON-LD script tag
            script_tag = soup.find('script', type='application/ld+json')

            if script_tag:
                try:
                    json_data = json.loads(script_tag.string)

                    # Extract events data from JSON-LD
                    events = json_data.get("itemListElement", [])

                    for event_data in events:
                        event = event_data.get('item', {})
                        event_name = event.get('name', 'N/A')
                        event_description = event.get('description', 'N/A')
                        event_url = event.get('url', 'N/A')
                        event_location = event.get('location', {}).get('name', 'N/A')
                        event_start = event.get('startDate', 'N/A')
                        event_end = event.get('endDate', 'N/A')

                        # Geo information (latitude and longitude)
                        geo_info = event.get('location', {}).get('geo', {})
                        latitude = geo_info.get('latitude', 'N/A')
                        longitude = geo_info.get('longitude', 'N/A')

                        # Append event details to the list
                        all_events.append({
                            'name': event_name,
                            'description': event_description,
                            'url': event_url,
                            'location': event_location,
                            'start_date': event_start,
                            'end_date': event_end,
                            'latitude': latitude,
                            'longitude': longitude
                        })
                except Exception as e:
                    print(f"Error parsing JSON-LD data on page {page_num}: {e}")
            else:
                print(f"No JSON-LD script tag found on page {page_num}")
        else:
            print(f"Failed: {r.status_code} for page {page_num}")
            break  # Exit the loop for non-retriable errors

        time.sleep(10)

# Save the scraped data to a JSON file
json_file = 'event_data.json'
with open(json_file, 'w', encoding='utf-8') as file:
    json.dump(all_events, file, ensure_ascii=False, indent=4)

print(f"Data successfully saved to {json_file}")

# Print a summary of the scraped events
for idx, event in enumerate(all_events[:10], start=1):  # Display the first 10 events
    print(f"Event {idx}:")
    print(f"Name: {event['name']}")
    print(f"Description: {event['description']}")
    print(f"URL: {event['url']}")
    print(f"Location: {event['location']}")
    print(f"Start Date: {event['start_date']}")
    print(f"End Date: {event['end_date']}")
    print(f"Latitude: {event['latitude']}")
    print(f"Longitude: {event['longitude']}")
    print()

Success: 200 for page 1
Success: 200 for page 2
Success: 200 for page 3
Success: 200 for page 4
Success: 200 for page 5
Success: 200 for page 6
Success: 200 for page 7
Success: 200 for page 8
Success: 200 for page 9
Success: 200 for page 10
Success: 200 for page 11
Success: 200 for page 12
Success: 200 for page 13
Success: 200 for page 14
Success: 200 for page 15
Success: 200 for page 16
Success: 200 for page 17
Success: 200 for page 18
Success: 200 for page 19
Success: 200 for page 20
Success: 200 for page 21
Success: 200 for page 22
Success: 200 for page 23
Success: 200 for page 24
Success: 200 for page 25
Success: 200 for page 26
Success: 200 for page 27
Success: 200 for page 28
Success: 200 for page 29
Success: 200 for page 30
Success: 200 for page 31
Success: 200 for page 32
Success: 200 for page 33
Success: 200 for page 34
Success: 200 for page 35
Success: 200 for page 36
Success: 200 for page 37
Success: 200 for page 38
Success: 200 for page 39
Success: 200 for page 40
Success: 

In [3]:
df = pd.DataFrame(all_events)

df = df.drop_duplicates()

df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')
df['end_date'] = pd.to_datetime(df['end_date'], errors='coerce')

df = df.dropna(subset=['start_date', 'location'])

In [4]:
df['day_of_week'] = df['start_date'].dt.day_name()
df['start_hour'] = df['start_date'].dt.hour

df['duration_hours'] = (df['end_date'] - df['start_date']).dt.total_seconds() / 3600

duration_bins = [0, 2, 4, 6, 8, 12, 24, df['duration_hours'].max()]
duration_labels = ['0-2 hours', '2-4 hours', '4-6 hours', '6-8 hours', '8-12 hours', '12-24 hours', '24+ hours']
df['duration_category'] = pd.cut(df['duration_hours'], bins=duration_bins, labels=duration_labels, right=False)
df['duration_days'] = (df['end_date'] - df['start_date']).dt.days

In [5]:
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')

df = df.dropna(subset=['latitude', 'longitude'])

In [6]:
df['description'] = df['description'].str.replace('\n', ' ').str.lower().str.strip()

In [7]:
# Display first few rows to verify data preparation
print(df.head())

# Check for any remaining missing values
print("\nMissing values:\n", df.isnull().sum())

                                                name  \
0  Gold Rush Bar Crawl: Boston's Biggest Saint Pa...   
1  Get Lucky Pub Crawl 2025: Boston's Original Sa...   
2         MIT Sloan Sports Analytics Conference 2025   
3        Global Families in Business Conference 2025   
4                   Hundred-Year Book Debate of 1925   

                                         description  \
0  one ticket for all access to 40+ of boston's b...   
1  20,000+ people bar hopping through west end, f...   
2  the 19th annual mit sloan sports analytics con...   
3  the global families business conference explor...   
4   vote for the best book from a hundred years ago!   

                                                 url  \
0  https://www.eventbrite.com/e/gold-rush-bar-cra...   
1  https://www.eventbrite.com/e/get-lucky-pub-cra...   
2  https://www.eventbrite.com/e/mit-sloan-sports-...   
3  https://www.eventbrite.com/e/global-families-i...   
4  https://www.eventbrite.com/e/hundred-year-b

In [8]:
# Descriptive statistics table
descriptive_stats = df.describe()

# Missing values table
missing_values = df.isnull().sum().to_frame(name='Missing Values')

# Display tables
print("Descriptive Statistics:")
display(descriptive_stats)

print("\nMissing Values:")
display(missing_values)

Descriptive Statistics:


Unnamed: 0,start_date,end_date,latitude,longitude,start_hour,duration_hours,duration_days
count,20,20,20.0,20.0,20.0,20.0,20.0
mean,2025-03-11 12:00:00,2025-03-11 21:36:00,42.362834,-71.090249,0.0,9.6,0.4
min,2025-03-02 00:00:00,2025-03-02 00:00:00,42.342639,-71.131718,0.0,0.0,0.0
25%,2025-03-06 00:00:00,2025-03-06 18:00:00,42.347818,-71.119759,0.0,0.0,0.0
50%,2025-03-09 00:00:00,2025-03-09 00:00:00,42.355389,-71.096249,0.0,0.0,0.0
75%,2025-03-15 00:00:00,2025-03-15 06:00:00,42.367715,-71.064984,0.0,24.0,1.0
max,2025-04-17 00:00:00,2025-04-19 00:00:00,42.443142,-71.029931,0.0,48.0,2.0
std,,,0.023946,0.031954,0.0,16.333369,0.680557



Missing Values:


Unnamed: 0,Missing Values
name,0
description,0
url,0
location,0
start_date,0
end_date,0
latitude,0
longitude,0
day_of_week,0
start_hour,0


In [9]:
df.shape

(20, 13)