In [5]:
!pip install beautifulsoup4 requests



In [6]:
import time
import requests
from bs4 import BeautifulSoup
import csv

In [7]:
# Function to scrape a single page
def scrape_page(url):
    try:
        res = requests.get(url)
        # Check if the request was successful
        if res.status_code == 200:
            soup = BeautifulSoup(res.text, "html.parser")
            selector = "div.search-result-preview > div > h3 > a"
            a_eles = soup.select(selector)
            return [x['href'] for x in a_eles]
        else:
            print(f"Failed to retrieve page: {url}")
            return []
    except Exception as e:
        print(f"Error occurred: {e}")
        return []

# Main scraping function
def scrape_events(base_url, pages):
    all_urls = []
    for i in range(1, pages + 1):
        time.sleep(1)  # Delay to respect the server
        page_url = f"{base_url}/page/{i}"
        urls = scrape_page(page_url)
        all_urls.extend(urls)
        print(f"Scraped page {i}, found {len(urls)} URLs")
    return all_urls

# Base URL for the events list
base_url = "https://visitseattle.org/events"
pages = 48

# Scrape the events
event_urls = scrape_events(base_url, pages)
print(event_urls)

# Print the URLs in the requested format
print("[")
for url in event_urls:
    print(f" '{url}',")
print("]")


Scraped page 1, found 9 URLs
Scraped page 2, found 9 URLs
Scraped page 3, found 9 URLs
Scraped page 4, found 9 URLs
Scraped page 5, found 9 URLs
Scraped page 6, found 9 URLs
Scraped page 7, found 9 URLs
Scraped page 8, found 9 URLs
Scraped page 9, found 9 URLs
Scraped page 10, found 9 URLs
Scraped page 11, found 9 URLs
Scraped page 12, found 9 URLs
Scraped page 13, found 9 URLs
Scraped page 14, found 9 URLs
Scraped page 15, found 9 URLs
Scraped page 16, found 9 URLs
Scraped page 17, found 9 URLs
Scraped page 18, found 9 URLs
Scraped page 19, found 9 URLs
Scraped page 20, found 9 URLs
Scraped page 21, found 9 URLs
Scraped page 22, found 9 URLs
Scraped page 23, found 9 URLs
Scraped page 24, found 9 URLs
Scraped page 25, found 9 URLs
Scraped page 26, found 9 URLs
Scraped page 27, found 9 URLs
Scraped page 28, found 9 URLs
Scraped page 29, found 9 URLs
Scraped page 30, found 9 URLs
Scraped page 31, found 9 URLs
Scraped page 32, found 9 URLs
Scraped page 33, found 9 URLs
Scraped page 34, fo

In [8]:
# Function to scrape details from an event page
def scrape_event_details(url):
    try:
        res = requests.get(url)
        if res.status_code == 200:
            soup = BeautifulSoup(res.text, "html.parser")
            name = soup.select_one("#body > div.global-wrapper > div.container-event-detail.padding-top-bottom > div:nth-child(1) > div.medium-6.columns.event-top > h1").text.strip()
            date = soup.select_one("#body > div.global-wrapper > div.container-event-detail.padding-top-bottom > div:nth-child(1) > div.medium-6.columns.event-top > h4 > span:nth-child(1)").text.strip()
            location = soup.select_one("#body > div.global-wrapper > div.container-event-detail.padding-top-bottom > div:nth-child(1) > div.medium-6.columns.event-top > h4 > span:nth-child(2)").text.strip()
            event_type = soup.select_one("#body > div.global-wrapper > div.container-event-detail.padding-top-bottom > div:nth-child(1) > div.medium-6.columns.event-top > a:nth-child(3)").text.strip()
            region = soup.select_one("#body > div.global-wrapper > div.container-event-detail.padding-top-bottom > div:nth-child(1) > div.medium-6.columns.event-top > a:nth-child(4)").text.strip()
            return [name, date, location, event_type, region]
        else:
            print(f"Failed to retrieve event details: {url}")
            return [None] * 5
    except Exception as e:
        print(f"Error occurred while scraping {url}: {e}")
        return [None] * 5

# Scrape event details and store in a CSV file
def store_events_details(urls, filename):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write the header
        writer.writerow(["Name", "Date", "Location", "Type", "Region"])
        # Write event details
        for url in urls:
            time.sleep(1)  # Delay to respect the server
            event_details = scrape_event_details(url)
            if all(event_details):
                writer.writerow(event_details)
                print(f"Scraped and stored details for {url}")
            else:
                print(f"Skipped incomplete data for {url}")

# Call the function to store event details
store_events_details(event_urls, "events.csv")


Scraped and stored details for https://visitseattle.org/events/steinunn-porarinsdottir-wayfinders/
Scraped and stored details for https://visitseattle.org/events/hanako-oleary-izanami/
Scraped and stored details for https://visitseattle.org/events/gage-alumni-show/
Scraped and stored details for https://visitseattle.org/events/genre-box/
Scraped and stored details for https://visitseattle.org/events/bohemia/
Scraped and stored details for https://visitseattle.org/events/alice-in-shadowland/
Scraped and stored details for https://visitseattle.org/events/lee-ritenour-and-dave-grusin/
Scraped and stored details for https://visitseattle.org/events/tacoma-home-garden-show/
Scraped and stored details for https://visitseattle.org/events/annie-eastwood-with-kimball-and-the-fugitives/
Scraped and stored details for https://visitseattle.org/events/black-and-boujee-panel-discussion/
Scraped and stored details for https://visitseattle.org/events/celebrate-asia/
Scraped and stored details for https

In [9]:
import requests

def get_lat_lon(location):
    params = {
        'q': location,
        'format': 'json'
    }
    response = requests.get("https://nominatim.openstreetmap.org/search", params=params)
    if response.status_code == 200:
        results = response.json()
        if results:
            # Assuming the first result is the most relevant one
            return results[0]['lat'], results[0]['lon']
        else:
            print(f"No results found for location: {location}")
    else:
        print(f"Error retrieving data from Nominatim for location: {location}")
    return None, None

# Test the function and print latitude and longitude
for url in event_urls:
    location = scrape_event_details(url)
    if location:
        lat, lon = get_lat_lon(location)
        if lat and lon:
            print(f"URL: {url}, Latitude: {lat}, Longitude: {lon}")
        else:
            print(f"Location not found for URL: {url}")
    else:
        print(f"Could not find location for URL: {url}")

URL: https://visitseattle.org/events/steinunn-porarinsdottir-wayfinders/, Latitude: 37.539526, Longitude: -122.0096885
URL: https://visitseattle.org/events/hanako-oleary-izanami/, Latitude: 47.615375, Longitude: -122.3270507
URL: https://visitseattle.org/events/gage-alumni-show/, Latitude: 47.650775949999996, Longitude: -122.37810030313341
URL: https://visitseattle.org/events/genre-box/, Latitude: 34.047996, Longitude: -118.2521002
URL: https://visitseattle.org/events/bohemia/, Latitude: 34.047996, Longitude: -118.2521002
URL: https://visitseattle.org/events/alice-in-shadowland/, Latitude: 47.615375, Longitude: -122.3270507
URL: https://visitseattle.org/events/lee-ritenour-and-dave-grusin/, Latitude: 34.047996, Longitude: -118.2521002
URL: https://visitseattle.org/events/tacoma-home-garden-show/, Latitude: 52.34106825, Longitude: -8.145015276623376
URL: https://visitseattle.org/events/annie-eastwood-with-kimball-and-the-fugitives/, Latitude: 47.671350450000006, Longitude: -122.34388239

In [11]:
import requests
from datetime import datetime

def get_gridpoint(lat, lon):
    """
    Get the gridpoint for a given latitude and longitude.
    """
    url = f"https://api.weather.gov/points/{lat},{lon}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        return data['properties']['gridId'], data['properties']['gridX'], data['properties']['gridY']
    else:
        print(f"Error getting gridpoint: {response.text}")
        return None, None, None

def get_weather_forecast(lat, lon, date):
    """
    Get the weather forecast for a specific latitude, longitude, and date.
    """
    grid_id, grid_x, grid_y = get_gridpoint(lat, lon)
    if grid_id is None:
        return None

    # Fetch the forecast
    url = f"https://api.weather.gov/gridpoints/{grid_id}/{grid_x},{grid_y}/forecast"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error getting forecast: {response.text}")
        return None

    # Parse the forecast data
    forecast_data = response.json()
    for period in forecast_data['properties']['periods']:
        forecast_date = datetime.strptime(period['startTime'], '%Y-%m-%dT%H:%M:%S%z').date()
        if forecast_date == date:
            return {
                'date': forecast_date,
                'temperature': period['temperature'],
                'windSpeed': period['windSpeed'],
                'shortForecast': period['shortForecast']
            }

    return None

In [13]:
import pandas as pd

# Assuming events_df is your DataFrame with event data
# Ensure get_lat_lon and get_weather_forecast functions are defined as previously discussed

# Create a new list to store the extended event data
extended_event_data = []

for index, row in events_df.iterrows():
    event_name = row['Name']
    event_date_str = row['Date']
    event_date = parse_event_date(event_date_str)  # Your function to parse date
    location = row['Location']
    event_type = row['Type']
    region = row['Region']

    lat, lon = get_lat_lon(location)  # Your function to get lat, lon from location
    weather_info = None
    if lat and lon and event_date:
        weather = get_weather_forecast(lat, lon, event_date)
        if weather:
            weather_info = f"Temp: {weather['temperature']}°F, Wind: {weather['windSpeed']}, {weather['shortForecast']}"

    extended_event_data.append({
        'Name': event_name,
        'Date': event_date_str,
        'Location': location,
        'Type': event_type,
        'Region': region,
        'Latitude': lat,
        'Longitude': lon,
        'Weather': weather_info or 'N/A'
    })

# Convert the list of data to a DataFrame
extended_events_df = pd.DataFrame(extended_event_data)

# Write the DataFrame to a CSV file
extended_events_df.to_csv('extended_events_with_weather.csv', index=False)


Error getting gridpoint: {
    "correlationId": "f025218",
    "title": "Data Unavailable For Requested Point",
    "type": "https://api.weather.gov/problems/InvalidPoint",
    "status": 404,
    "detail": "Unable to provide data for requested point 52.9517,-1.1451",
    "instance": "https://api.weather.gov/requests/f025218"
}
No results found for location: The Neptune Theatre
No results found for location: Meany Center for the Performing Arts
Error getting gridpoint: {
    "correlationId": "6818e9b6",
    "title": "Data Unavailable For Requested Point",
    "type": "https://api.weather.gov/problems/InvalidPoint",
    "status": 404,
    "detail": "Unable to provide data for requested point -13.2489,-41.0276",
    "instance": "https://api.weather.gov/requests/6818e9b6"
}
Error getting gridpoint: {
    "correlationId": "6818e9b6",
    "title": "Data Unavailable For Requested Point",
    "type": "https://api.weather.gov/problems/InvalidPoint",
    "status": 404,
    "detail": "Unable to p