In [4]:
from bs4 import BeautifulSoup
import requests # For fetching HTML from a URL

# Placeholder stored when a venue or date cannot be located in the markup.
_UNKNOWN = "N/A"


def _split_artist_names(tag):
    """
    Splits the text of an artist tag into individual artist names.

    The text is split on commas and on the word ' and ' (surrounded by
    spaces), so a single act whose name contains either is split as well.

    Args:
        tag: A BeautifulSoup tag, or None when the tag was not found.

    Returns:
        list: Non-empty, whitespace-stripped names in document order
              ([] when tag is None).
    """
    if tag is None:
        return []
    text = tag.get_text(strip=True).replace(' and ', ', ')
    return [name.strip() for name in text.split(',') if name.strip()]


def _find_event_date(item):
    """
    Determines the display date for one event <li>.

    The date is the text of the <time datetime=...> tag inside the nearest
    preceding sibling <li class="date-element">. If that yields nothing, the
    event item's own 'title' attribute is used when it is non-empty.

    Args:
        item: The BeautifulSoup tag of the event listing.

    Returns:
        str: The date text, or "N/A" when no date could be found.
    """
    date_str = _UNKNOWN
    # find_previous_sibling already walks every earlier sibling, so no
    # manual .previous_sibling loop is needed as a fallback.
    date_li = item.find_previous_sibling('li', class_='date-element')
    if date_li is not None:
        time_tag = date_li.find('time', datetime=True)
        if time_tag is not None:
            date_str = time_tag.get_text(strip=True)

    if date_str == _UNKNOWN and item.get('title'):
        date_str = item['title']
    return date_str


def extract_event_data(html_content):
    """
    Extracts artists, venues, and dates from the Songkick HTML source.
    Events on the same day at the same venue are merged, combining their artists.

    Venue matching for merging is case-insensitive. Listings whose venue or
    date could not be determined are never merged with one another, because
    an unknown value does not identify a single venue or day.

    Args:
        html_content (str): The HTML content of the page.

    Returns:
        list: A list of dictionaries, in first-seen page order, where each
              dictionary contains 'artists' (list of str, unique and sorted),
              'venue' (str, "N/A" if not found), and 'date' (str, "N/A" if
              not found) for an event. Listings without any artist name are
              skipped. An empty list is returned (and a hint is printed) when
              the page contains no 'event-listings-element' items.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Each individual event is an <li> with class 'event-listings-element'
    event_list_items = soup.find_all('li', class_='event-listings-element')

    if not event_list_items:
        print("No event listings found with class 'event-listings-element'.")
        print("Please check CSS selectors or if the page content is loaded correctly (e.g., JavaScript might be required for dynamic content).")
        return []

    events_data = []
    # (date, venue_lowercase) -> event dict already present in events_data.
    mergeable_events = {}

    for item in event_list_items:
        # --- Extract Artists ---
        artists_p_tag = item.find('p', class_='artists')
        if artists_p_tag is None:
            continue
        artist_names = set(_split_artist_names(artists_p_tag.find('strong')))
        artist_names.update(
            _split_artist_names(artists_p_tag.find('span', class_='support'))
        )
        # Only proceed if we found artists for the current item
        if not artist_names:
            continue

        # --- Extract Venue ---
        venue_name = _UNKNOWN
        location_p_tag = item.find('p', class_='location')
        if location_p_tag is not None:
            venue_link_tag = location_p_tag.find('a', class_='venue-link')
            if venue_link_tag is not None:
                venue_name = venue_link_tag.get_text(strip=True)

        # --- Extract Date ---
        date_str = _find_event_date(item)

        # Only a known venue on a known date identifies one event; unknown
        # or empty values must not collapse unrelated listings together.
        can_merge = (
            venue_name not in ('', _UNKNOWN) and date_str not in ('', _UNKNOWN)
        )
        event_key = (date_str, venue_name.lower())

        existing_event = mergeable_events.get(event_key) if can_merge else None
        if existing_event is not None:
            # Same date and venue seen before: merge artists, keep them unique and sorted.
            existing_event['artists'] = sorted(
                artist_names.union(existing_event['artists'])
            )
            continue

        event = {
            'artists': sorted(artist_names),
            'venue': venue_name,  # Store original venue name (original casing)
            'date': date_str,
        }
        events_data.append(event)
        if can_merge:
            mergeable_events[event_key] = event

    return events_data

# Script entry point: fetch the Songkick SF Bay Area page, extract the
# de-duplicated events, and print those held at a fixed list of venues.
if __name__ == '__main__':
    # URL to fetch and parse
    target_url = "https://www.songkick.com/metro-areas/26330-us-sf-bay-area"
    print(f"Attempting to fetch and parse: {target_url}")

    # It's good practice to send some headers to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',  # Often helps to get content in a predictable language
    }

    # Target venues, lowercase. Each entry is matched as a SUBSTRING of the
    # lowercased venue name, so partial names such as "bill graham" work.
    target_venues_lower = [
        "the chapel",
        "the independent",
        "the fillmore",
        "the regency ballroom",
        "bill graham",  # "Bill Graham Civic Auditorium"
        "the warfield",  # Assuming this is a known venue, might need more specific name
        "the masonic",
        "great american music hall",
        "august hall",
        "chase center",
        "oracle park",
        "the midway",
        "1015 folsom",
    ]

    try:
        # Timeout keeps the script from hanging on an unresponsive server.
        response = requests.get(target_url, headers=headers, timeout=10)

        # Raises requests.exceptions.HTTPError for 4xx/5xx responses.
        response.raise_for_status()

        html_doc_from_url = response.text

        # --- Optional: Save HTML to a file for inspection ---
        # with open("songkick_sf_bay_area.html", "w", encoding="utf-8") as f:
        #     f.write(html_doc_from_url)
        # print("HTML content saved to songkick_sf_bay_area.html")
        # --- End Optional ---

        print("\n--- HTML Content Fetched Successfully (first 500 chars) ---")
        print(html_doc_from_url[:500] + "...\n")

        # The extracted events are already de-duplicated by date and venue.
        all_extracted_data = extract_event_data(html_doc_from_url)

        if all_extracted_data:
            print(f"\n--- Extracted {len(all_extracted_data)} Events Total (after de-duplication) ---")

            # Keep only events whose venue name contains one of the targets.
            filtered_events = [
                event
                for event in all_extracted_data
                if any(
                    target_venue in event.get('venue', '').lower()
                    for target_venue in target_venues_lower
                )
            ]

            if filtered_events:
                print(f"\n--- Found {len(filtered_events)} Events at Specified Venues ---")
                for event_number, event in enumerate(filtered_events, start=1):
                    print(f"\n--- Event {event_number} (Filtered) ---")
                    print(f"Artists: {event['artists']}")
                    print(f"Venue:   {event['venue']}")
                    print(f"Date:    {event['date']}")
            else:
                print("\nNo events found at the specified venues.")

        else:
            print("\nNo data extracted from the URL.")
            print("Possible reasons:")
            print("- The website structure might have changed.")
            print("- The content might be loaded dynamically by JavaScript (requests library doesn't execute JS).")
            print("- CSS selectors in the script might need adjustment.")

    except requests.exceptions.HTTPError as errh:
        print(f"Http Error: {errh}")
        # Read the status from the exception itself: a module-level `response`
        # may be a stale object left over from an earlier run (e.g. in a notebook).
        failed_response = errh.response
        status_code = failed_response.status_code if failed_response is not None else 'N/A'
        print(f"Status Code: {status_code}")
    except requests.exceptions.ConnectionError as errc:
        print(f"Error Connecting: {errc}")
    except requests.exceptions.Timeout as errt:
        print(f"Timeout Error: {errt}")
    except requests.exceptions.RequestException as err:
        print(f"An error occurred during the web request: {err}")
    except Exception as e:
        # Top-level boundary: report anything unexpected (including parsing
        # errors from extract_event_data) instead of a raw traceback.
        print(f"An unexpected error occurred: {e}")

Attempting to fetch and parse: https://www.songkick.com/metro-areas/26330-us-sf-bay-area

--- HTML Content Fetched Successfully (first 500 chars) ---
<!DOCTYPE html>
<html lang="en" xmlns:og="http://opengraphprotocol.org/schema/" xmlns:fb="http://www.facebook.com/2008/fbml">
  <head prefix="og: http://ogp.me/ns# fb: http://www.facebook.com/2008/fbml songkick-concerts: http://ogp.me/ns/fb/songkick-concerts#">

   <meta name="robots" content="all">

      <!-- OneTrust Cookies Consent Notice start -->
<script id="onetrustcdn" src="https://cdn.cookielaw.org/scripttemplates/otSDKStub.js" data-document-language="true" type="text/javascript" charse...


--- Extracted 48 Events Total (after de-duplication) ---

--- Found 19 Events at Specified Venues ---

--- Event 1 (Filtered) ---
Artists: ['White Denim']
Venue:   Great American Music Hall
Date:    Friday 16 May 2025

--- Event 2 (Filtered) ---
Artists: ['ONE OK ROCK', 'Stand Atlantic']
Venue:   Bill Graham Civic Auditorium
Date:    Friday 1