In [1]:
from bs4 import BeautifulSoup
import requests # For fetching HTML from a URL

def _split_artist_names(text):
    """Split a credit string such as "A, B and C" into a list of names."""
    # " and " is treated as one more separator, so fold it into the comma form
    # before splitting. Blank fragments (e.g. from "A,, B") are dropped.
    normalized = text.replace(' and ', ', ')
    return [name.strip() for name in normalized.split(',') if name.strip()]


def _extract_artists(item):
    """Return headliner names followed by support names for one event <li>."""
    artists_p_tag = item.find('p', class_='artists')
    if artists_p_tag is None:
        return []

    artists = []
    # Headliners are read from <strong>, support acts from <span class="support">.
    credit_tags = (
        artists_p_tag.find('strong'),
        artists_p_tag.find('span', class_='support'),
    )
    for tag in credit_tags:
        if tag is None:
            continue
        # get_text(strip=True) alone glues stripped text nodes together with no
        # separator, so "<a>A</a> and <a>B</a>" would collapse to "AandB" and
        # never be split. Joining the nodes with a space keeps " and " intact.
        artists.extend(_split_artist_names(tag.get_text(' ', strip=True)))
    return artists


def _extract_venue(item):
    """Return the venue link text inside <p class="location">, or "N/A"."""
    location_p_tag = item.find('p', class_='location')
    if location_p_tag is None:
        return "N/A"
    venue_link_tag = location_p_tag.find('a', class_='venue-link')
    if venue_link_tag is None:
        return "N/A"
    return venue_link_tag.get_text(strip=True)


def _extract_date(item):
    """Return the date text for one event <li>, or "N/A" if none is found."""
    # find_previous_sibling already walks back over every earlier sibling, so
    # the nearest preceding <li class="date-element"> is found without a
    # hand-written sibling loop.
    preceding_date_li = item.find_previous_sibling('li', class_='date-element')
    if preceding_date_li is not None:
        date_time_tag = preceding_date_li.find('time', datetime=True)
        if date_time_tag is not None:
            # Human-readable text of the <time> tag; the machine-readable
            # value is available as date_time_tag['datetime'] if needed.
            return date_time_tag.get_text(strip=True)

    # Fallback: use the event <li>'s own non-empty 'title' attribute.
    if item.has_attr('title') and item['title']:
        return item['title']
    return "N/A"


def extract_event_data(html_content):
    """
    Extracts artists, venues, and dates from the Songkick HTML source.

    Every <li class="event-listings-element"> is treated as one event. Events
    for which no artist name could be extracted are skipped.

    Args:
        html_content (str): The HTML content of the page.

    Returns:
        list: A list of dictionaries, where each dictionary contains
              'artists' (list of str), 'venue' (str), and 'date' (str)
              for an event. 'venue' and 'date' are "N/A" when not found.
              The list is empty (and a hint is printed) when the page
              contains no event listing elements.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    event_list_items = soup.find_all('li', class_='event-listings-element')
    if not event_list_items:
        print("No event listings found with class 'event-listings-element'.")
        print("Please check CSS selectors or if the page content is loaded correctly (e.g., JavaScript might be required for dynamic content).")
        return []

    events_data = []
    for item in event_list_items:
        artists = _extract_artists(item)
        # Only keep events with at least one artist, as that's the key piece of info.
        if not artists:
            continue
        events_data.append({
            'artists': artists,
            'venue': _extract_venue(item),
            'date': _extract_date(item),
        })

    return events_data

if __name__ == '__main__':
    # Script entry point: fetch the listing page, run the extractor on it and
    # print every extracted event (or troubleshooting hints if none were found).
    target_url = "https://www.songkick.com/metro-areas/26330-us-sf-bay-area"
    print(f"Attempting to fetch and parse: {target_url}")

    # It's good practice to send some headers to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9', # Often helps to get content in a predictable language
    }

    try:
        # The timeout keeps the script from hanging on an unresponsive server.
        response = requests.get(target_url, headers=headers, timeout=10)

        # Raises requests.exceptions.HTTPError for 4xx / 5xx responses.
        response.raise_for_status()

        html_doc_from_url = response.text

        # --- Optional: Save HTML to a file for inspection ---
        # with open("songkick_sf_bay_area.html", "w", encoding="utf-8") as f:
        #     f.write(html_doc_from_url)
        # print("HTML content saved to songkick_sf_bay_area.html")
        # --- End Optional ---

        print("\n--- HTML Content Fetched Successfully (first 500 chars) ---")
        print(html_doc_from_url[:500] + "...\n")

        # Call the extraction function with the fetched HTML
        extracted_data = extract_event_data(html_doc_from_url)

        if extracted_data:
            print(f"\n--- Extracted {len(extracted_data)} Events ---")
            for i, event in enumerate(extracted_data):
                print(f"\n--- Event {i+1} ---")
                print(f"Artists: {event['artists']}")
                print(f"Venue:   {event['venue']}")
                print(f"Date:    {event['date']}")
        else:
            print("\nNo data extracted from the URL.")
            print("Possible reasons:")
            print("- The website structure might have changed.")
            print("- The content might be loaded dynamically by JavaScript (requests library doesn't execute JS).")
            print("- CSS selectors in the script might need adjustment.")

    except requests.exceptions.HTTPError as errh:
        print(f"Http Error: {errh}")
        # Read the status from the exception itself rather than probing
        # locals(): at module/notebook scope a 'response' left over from an
        # earlier run could otherwise be reported by mistake.
        failed_response = errh.response
        print(f"Status Code: {failed_response.status_code if failed_response is not None else 'N/A'}")
    except requests.exceptions.ConnectionError as errc:
        print(f"Error Connecting: {errc}")
    except requests.exceptions.Timeout as errt:
        print(f"Timeout Error: {errt}")
    except requests.exceptions.RequestException as err:
        print(f"An error occurred during the web request: {err}")
    except Exception as e:
        # Top-level boundary of the script: report anything unexpected
        # (including parsing errors) instead of showing a traceback.
        print(f"An unexpected error occurred: {e}")

Attempting to fetch and parse: https://www.songkick.com/metro-areas/26330-us-sf-bay-area

--- HTML Content Fetched Successfully (first 500 chars) ---
<!DOCTYPE html>
<html lang="en" xmlns:og="http://opengraphprotocol.org/schema/" xmlns:fb="http://www.facebook.com/2008/fbml">
  <head prefix="og: http://ogp.me/ns# fb: http://www.facebook.com/2008/fbml songkick-concerts: http://ogp.me/ns/fb/songkick-concerts#">

   <meta name="robots" content="all">

      <!-- OneTrust Cookies Consent Notice start -->
<script id="onetrustcdn" src="https://cdn.cookielaw.org/scripttemplates/otSDKStub.js" data-document-language="true" type="text/javascript" charse...


--- Extracted 50 Events ---

--- Event 1 ---
Artists: ['Yerba Buena Gardens Festival 2025', 'Madison McFerrin']
Venue:   Yerba Buena Gardens
Date:    Saturday 10 May 2025 – Thursday 29 May 2025

--- Event 2 ---
Artists: ['Jack White']
Venue:   Fox Theater
Date:    Friday 16 May 2025

--- Event 3 ---
Artists: ['James Taylor', 'Tiny Habits']
Ve