# Music
## Symphony
### https://www.pittsburghsymphony.org/


In [15]:
###################################### Events ######################################

import json
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class PittsburghSymphonyScraper:
    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=chrome_options)
        self.base_url = "https://www.pittsburghsymphony.org/calendar?page="
        self.result_file = "../raw_documents/Pittsburgh_Symphony.json"

    def fetch_page(self, page_num):
        url = self.base_url + str(page_num)
        try:
            self.driver.get(url)
            time.sleep(3)  # wait the page to load
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, "html.parser")
            return soup
        except Exception as e:
            print(f"Error fetching page {page_num}: {e}")
            return None

    def extract_event_info(self, soup):
        events = []
        event_list = soup.find_all('article', class_='event')
        for event in event_list:
            try:
                title = event.find('h3', class_='title').get_text(strip=True)
                time = event.find('time', class_='range').get_text(strip=True)
                venue = event.find('div', class_='venue').get_text(strip=True)
                organization = event.find('div', class_='organization').get_text(strip=True)

                event_info = {
                    "event_name": title,
                    "event_time": time,
                    "venue": venue,
                    "organization": organization
                }
                events.append(event_info)
            except Exception as e:
                print(f"Error extracting event info: {e}")
        return events

    def append_to_json(self, events):    
        # write in time
        try:
            with open(self.result_file, 'a') as f:
                for event in events:
                    json.dump(event, f, indent=4)
                    f.write("\n")
        except Exception as e:
            print(f"Error writing to JSON file: {e}")

    def scrape(self):
        for page_num in range(1, 6):
            print(f"Scraping page {page_num}...")
            soup = self.fetch_page(page_num)
            if soup:
                events = self.extract_event_info(soup)
                self.append_to_json(events)

    def close(self):
        self.driver.quit()


if __name__ == "__main__":
    scraper = PittsburghSymphonyScraper()
    scraper.scrape()
    scraper.close()
    print("Pittsburgh Symphony Scraping completed.")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Pittsburgh Symphony Scraping completed.


In [14]:
###################################### Musicians ######################################
import requests
from bs4 import BeautifulSoup
import json
import os

# Pittsburgh Symphony Orchestra musicians page URL
url = "https://www.pittsburghsymphony.org/pso_home/web/musicians"

# Send a GET request to the page
response = requests.get(url)

# Parse the page content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Initialize a dictionary to store musician data
musicians_data = {}

# Define the path of the existing JSON file
json_file_path = "../raw_documents/Pittsburgh_Symphony.json"

# If the file exists, load the existing data
if os.path.exists(json_file_path):
    with open(json_file_path, "r") as json_file:
        try:
            musicians_data = json.load(json_file)
        except json.JSONDecodeError:
            musicians_data = {}

# Define a helper function to get musician introduction from subpages
def get_musician_introduction(subpage_url):
    try:
        subpage_response = requests.get(subpage_url)
        subpage_soup = BeautifulSoup(subpage_response.content, "html.parser")
        bio_text_div = subpage_soup.find("div", class_="bio-text")
        if bio_text_div:
            return bio_text_div.get_text(strip=True, separator=" ")
    except Exception as e:
        print(f"Error accessing {subpage_url}: {e}")
    return None

# Loop over sections such as First Violin, Second Violin, etc.
for section in soup.find_all("h3"):
    section_name = section.get_text().strip()
    # If this section already exists in the JSON, skip it to avoid duplicates
    if section_name in musicians_data:
        continue
    
    musicians_data[section_name] = []
    
    # Find the <p> tag containing musician list under each section
    musician_list = section.find_next("p")
    
    # Check if musician_list exists
    if musician_list:
        # Get all musician names and titles within the <p> tag
        for musician in musician_list.find_all("a"):
            musician_name = musician.get_text(strip=True)
            musician_title = musician_list.get_text(strip=True).split('|')[1].strip() if '|' in musician_list.get_text() else ""
            musician_data = {
                "name": musician_name,
                "title": musician_title
            }
            
            # If musician has a subpage, fetch their introduction
            musician_subpage_url = musician.get("href")
            if musician_subpage_url:
                full_subpage_url = f"https://www.pittsburghsymphony.org{musician_subpage_url}"
                introduction = get_musician_introduction(full_subpage_url)
                if introduction:
                    musician_data["introduction"] = introduction
            
            # Append each musician's data to the list under the section
            musicians_data[section_name].append(musician_data)

# Append new data to the JSON file
with open(json_file_path, "w") as json_file:
    json.dump(musicians_data, json_file, indent=4)
    json_file.write("\n")  # Optional newline for better readability

print("Musicians' data has been appended to the JSON file.")

Musicians' data has been appended to the JSON file.


## Opera
### https://pittsburghopera.org/

In [2]:
import json
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)  # Adjusted timeout for slower pages
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """set domain name and URL"""
        self.current_domain = url.split("/")[2]  # 例如 'www.example.com'
        self.current_url = url

    def fetch(self, url: str, raw_html: bool = False):
        """capture the webpage and choose whether to remove JS or CSS"""
        soup = self.get_soup(url)
        if not raw_html:
            soup = self.remove_js_css(soup)
        return soup, self.get_links(soup), self.get_title(soup)

    def get_html(self, url: str):
        """get HTML"""
        self.set_domain(url)  # 调用 set_domain 方法
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """parse and get BeautifulSoup object"""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def get_links(self, soup: BeautifulSoup):
        """get external links"""
        links = [link.get("href") for link in soup.find_all("a")]
        return self.filter_links(links)

    def get_title(self, soup: BeautifulSoup):
        """title"""
        if soup.title is None:
            return f"untitled_{self.get_timestamp()}"
        title = soup.title.string.replace(" ", "_").replace("/", "__")
        return title.replace("\n", "")

    def remove_js_css(self, soup: BeautifulSoup):
        """remove JS or CSS"""
        for script in soup(["script", "style"]):
            script.extract()  # 移除所有的 <script> 和 <style> 标签
        return soup

    def remove_unnecessary_sections(self, soup: BeautifulSoup):
        """remove header and footer"""
        for tag in ["header", "footer", "nav", "aside"]:
            for element in soup.find_all(tag):
                element.decompose()
        return soup

    def filter_links(self, links: list):
        filtered_links = []
        for link in links:
            if link is None:
                continue
            elif link.startswith("#"):
                continue 
            elif link.startswith("http"):
                filtered_links.append(link)
            elif link.startswith("//"):
                filtered_links.append(f"https:{link}")
            elif link.startswith("/"):
                filtered_links.append(f"https://{self.current_domain}{link}") 
            else:
                filtered_links.append(f"{self.current_url}/{link}")
        return filtered_links

    def get_timestamp(self):
        return datetime.now().strftime("%Y%m%d_%H%M%S")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()


def organize_content_by_heading(soup: BeautifulSoup):
    data = {}
    current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None

    for tag in soup.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p']):
        if tag.name == 'h2':
            current_h2 = tag.get_text(strip=True)
            data[current_h2] = {}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h2][current_h3] = {}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h2][current_h3][current_h4] = {}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h2][current_h3][current_h4][current_h5] = {}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h2][current_h3][current_h4][current_h5][current_h6] = {}
        elif tag.name == 'p':
            if current_h6:
                data[current_h2][current_h3][current_h4][current_h5][current_h6].setdefault('content', []).append(tag.get_text(strip=True))
            elif current_h5:
                data[current_h2][current_h3][current_h4][current_h5].setdefault('content', []).append(tag.get_text(strip=True))
            elif current_h4:
                data[current_h2][current_h3][current_h4].setdefault('content', []).append(tag.get_text(strip=True))
            elif current_h3:
                data[current_h2][current_h3].setdefault('content', []).append(tag.get_text(strip=True))
            elif current_h2:
                data[current_h2].setdefault('content', []).append(tag.get_text(strip=True))
    
    return data


def save_to_json(data, filename):
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")


def scrape_links(scraper_, links):
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            
            soup = scraper_.remove_js_css(soup)
            soup = scraper_.remove_unnecessary_sections(soup)

            page_data = organize_content_by_heading(soup)
            page_data["url"] = link  

            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")
    
    return all_data


if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://pittsburghopera.org/",
        "https://pittsburghopera.org/about/mission-history",
        "https://pittsburghopera.org/about/inclusion-diversity-equity-accessibility-idea/",
        "https://pittsburghopera.org/calendar?timequery=week&prev=71+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=55+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=47+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=39+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=31+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=23+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=15+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=7+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-1+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-9+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-17+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-25+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-33+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-41+&start=1722484800000&end=17251451400000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-57+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-97+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-105+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-113+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-121+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-129+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-145+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-153+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-161+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-169+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=month&prev=-169+&start=1743480000000&end=1746053940000",
        "https://pittsburghopera.org/calendar?timequery=month&prev=-169+&start=1746072000000&end=1748732340000",
        "https://pittsburghopera.org/season/tosca",
        "https://pittsburghopera.org/season/cavalleria-rusticana-pagliacci",
        "https://pittsburghopera.org/season/armida",
        "https://pittsburghopera.org/season/madama-butterfly"
        "https://pittsburghopera.org/season/woman-with-eyes-closed",
        "https://pittsburghopera.org/season/past-seasons",
        "https://pittsburghopera.org/season/the-barber-of-seville?hsLang=en",
        "https://pittsburghopera.org/season/the-flying-dutchman?hsLang=en",
        "https://pittsburghopera.org/season/iphigenie-en-tauride?hsLang=en",
        "https://pittsburghopera.org/season/proving-up?hsLang=en",
        "https://pittsburghopera.org/season/la-traviata?hsLang=en",
        "https://pittsburghopera.org/season/the-passion-of-mary-cardwell-dawson?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/rusalka?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-marriage-of-figaro?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/ariodante?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/il-trovatore?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/denis-katya?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/we-shall-not-be-moved?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-magic-flute?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-rose-elf?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/in-a-grove?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/carmen?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/blue?hsLang=en"
        "https://pittsburghopera.org/season/past-seasons/semele?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/cosi-fan-tutte?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/soldier-songs?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/charlie-parkers-yardbird?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/last-american-hammer?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/alcina?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/florencia-en-el-amazonas?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/don-giovanni?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/madama-butterfly?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/afterwards?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/glory-denied?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/la-boheme?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/don-pasquale?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/hansel-gretel?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/savage-winter?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-long-walk?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-elixir-of-love?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/moby-dick?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-marriage-of-figaro?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/tosca-2017?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-summer-king?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/turandot?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/as-one?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/richard-the-lionheart?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/salome?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/la-traviata-2016?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-rakes-progress?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/twenty-seven?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-barber-of-seville-2016?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/little-women?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/cos%C3%AC-fan-tutte-2015?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/nabucco?hsLang=en",
        "https://pittsburghopera.org/season/free-low-cost-events",
        "https://pittsburghopera.org/season/free-low-cost-events/songshop",
        "https://pittsburghopera.org/season/free-low-cost-events/opera-up-close",
        "https://pittsburghopera.org/season/free-low-cost-events/pre-opera-talks",
        "https://pittsburghopera.org/season/free-low-cost-events/meet-the-artists",
        "https://pittsburghopera.org/season/free-low-cost-events/community-concerts",
        "https://pittsburghopera.org/season/free-low-cost-events/allegheny-county-summer-concert-series",
        "https://pittsburghopera.org/season/free-low-cost-events/wqed-broadcasts",
        "https://pittsburghopera.org/season/special-events",
        "https://pittsburghopera.org/season/special-events/diamond-horseshoe-ball",
        "https://pittsburghopera.org/season/special-events/pittsburgh-opera-fashion-event",
        "https://pittsburghopera.org/season/special-events/maecenas",
        "https://pittsburghopera.org/resident-artists/2024-25resident-artists",
        "https://pittsburghopera.org/resident-artists/faculty-administration/",
        "https://pittsburghopera.org/resident-artists/auditions/",
        "https://pittsburghopera.org/resident-artists/history-alumni/",
        "https://pittsburghopera.org/our-team/orchestra",
        "https://pittsburghopera.org/our-team/chorus",
        "https://pittsburghopera.org/facilities/pittsburgh-opera-headquarters/",
        "https://pittsburghopera.org/facilities/office-hours",
        "https://pittsburghopera.org/facilities/production-rentals",
        "https://pittsburghopera.org/facilities/hold-your-event-at-pittsburgh-opera/"
    ]

    scraped_data = scrape_links(scraper_, links)
    
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Opera.json")

    scraper_.close()

Fetching: https://pittsburghopera.org/
Fetching: https://pittsburghopera.org/about/mission-history
Fetching: https://pittsburghopera.org/about/inclusion-diversity-equity-accessibility-idea/
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=71+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=55+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=47+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=39+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=31+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=23+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=15+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?

In [21]:
######### Add Event Name #########
import json
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(20)  # Adjusted timeout for slower pages
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """set domain name and URL"""
        self.current_domain = url.split("/")[2]  # 例如 'www.example.com'
        self.current_url = url

    def fetch(self, url: str, raw_html: bool = False):
        """capture the webpage and choose whether to remove JS or CSS"""
        soup = self.get_soup(url)
        if not raw_html:
            soup = self.remove_js_css(soup)
        return soup, self.get_links(soup), self.get_title(soup)

    def get_html(self, url: str):
        """get HTML"""
        self.set_domain(url)  # 调用 set_domain 方法
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """parse and get BeautifulSoup object"""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def get_links(self, soup: BeautifulSoup):
        """get external links"""
        links = [link.get("href") for link in soup.find_all("a")]
        return self.filter_links(links)

    def get_title(self, soup: BeautifulSoup):
        """title"""
        if soup.title is None:
            return f"untitled_{self.get_timestamp()}"
        title = soup.title.string.replace(" ", "_").replace("/", "__")
        return title.replace("\n", "")

    def remove_js_css(self, soup: BeautifulSoup):
        """remove JS or CSS"""
        for script in soup(["script", "style"]):
            script.extract()  # 移除所有的 <script> 和 <style> 标签
        return soup

    def remove_unnecessary_sections(self, soup: BeautifulSoup):
        """remove header and footer"""
        for tag in ["header", "footer", "nav", "aside"]:
            for element in soup.find_all(tag):
                element.decompose()
        return soup

    def filter_links(self, links: list):
        filtered_links = []
        for link in links:
            if link is None:
                continue
            elif link.startswith("#"):
                continue 
            elif link.startswith("http"):
                filtered_links.append(link)
            elif link.startswith("//"):
                filtered_links.append(f"https:{link}")
            elif link.startswith("/"):
                filtered_links.append(f"https://{self.current_domain}{link}") 
            else:
                filtered_links.append(f"{self.current_url}/{link}")
        return filtered_links

    def get_timestamp(self):
        return datetime.now().strftime("%Y%m%d_%H%M%S")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def save_to_json(data, filename):
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")


def scrape_links(scraper_, links):
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            
            page_data = {}
            event_name_tag = soup.find("h4")
            event_name = event_name_tag.text.strip() if event_name_tag else "No Event Name"
            page_data["event_name"] = event_name  # 存储事件名称

            soup = scraper_.remove_js_css(soup)
            soup = scraper_.remove_unnecessary_sections(soup)

            content_data = organize_content_by_heading(soup)
            page_data.update(content_data)
            page_data["url"] = link  

            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")
    
    return all_data


if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://pittsburghopera.org/",
        "https://pittsburghopera.org/about/mission-history",
        "https://pittsburghopera.org/about/inclusion-diversity-equity-accessibility-idea/",
        "https://pittsburghopera.org/calendar?timequery=week&prev=71+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=55+&start=1722484800000&end=1725145140000"
        "https://pittsburghopera.org/calendar?timequery=week&prev=47+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=39+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=31+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=23+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=15+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=7+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-1+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-9+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-17+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-25+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-33+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-41+&start=1722484800000&end=17251451400000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-57+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-97+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-105+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-113+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-121+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-129+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-145+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-153+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-161+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-169+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=month&prev=-169+&start=1743480000000&end=1746053940000",
        "https://pittsburghopera.org/calendar?timequery=month&prev=-169+&start=1746072000000&end=1748732340000",
        "https://pittsburghopera.org/season/tosca",
        "https://pittsburghopera.org/season/cavalleria-rusticana-pagliacci",
        "https://pittsburghopera.org/season/armida",
        "https://pittsburghopera.org/season/madama-butterfly"
        "https://pittsburghopera.org/season/woman-with-eyes-closed",
        "https://pittsburghopera.org/season/past-seasons",
        "https://pittsburghopera.org/season/the-barber-of-seville?hsLang=en",
        "https://pittsburghopera.org/season/the-flying-dutchman?hsLang=en",
        "https://pittsburghopera.org/season/iphigenie-en-tauride?hsLang=en",
        "https://pittsburghopera.org/season/proving-up?hsLang=en",
        "https://pittsburghopera.org/season/la-traviata?hsLang=en",
        "https://pittsburghopera.org/season/the-passion-of-mary-cardwell-dawson?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/rusalka?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-marriage-of-figaro?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/ariodante?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/il-trovatore?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/denis-katya?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/we-shall-not-be-moved?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-magic-flute?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-rose-elf?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/in-a-grove?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/carmen?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/blue?hsLang=en"
        "https://pittsburghopera.org/season/past-seasons/semele?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/cosi-fan-tutte?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/soldier-songs?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/charlie-parkers-yardbird?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/last-american-hammer?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/alcina?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/florencia-en-el-amazonas?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/don-giovanni?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/madama-butterfly?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/afterwards?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/glory-denied?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/la-boheme?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/don-pasquale?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/hansel-gretel?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/savage-winter?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-long-walk?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-elixir-of-love?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/moby-dick?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-marriage-of-figaro?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/tosca-2017?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-summer-king?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/turandot?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/as-one?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/richard-the-lionheart?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/salome?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/la-traviata-2016?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-rakes-progress?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/twenty-seven?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/the-barber-of-seville-2016?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/little-women?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/cos%C3%AC-fan-tutte-2015?hsLang=en",
        "https://pittsburghopera.org/season/past-seasons/nabucco?hsLang=en",
        "https://pittsburghopera.org/season/free-low-cost-events",
        "https://pittsburghopera.org/season/free-low-cost-events/songshop",
        "https://pittsburghopera.org/season/free-low-cost-events/opera-up-close",
        "https://pittsburghopera.org/season/free-low-cost-events/pre-opera-talks",
        "https://pittsburghopera.org/season/free-low-cost-events/meet-the-artists",
        "https://pittsburghopera.org/season/free-low-cost-events/community-concerts",
        "https://pittsburghopera.org/season/free-low-cost-events/allegheny-county-summer-concert-series",
        "https://pittsburghopera.org/season/free-low-cost-events/wqed-broadcasts",
        "https://pittsburghopera.org/season/special-events",
        "https://pittsburghopera.org/season/special-events/diamond-horseshoe-ball",
        "https://pittsburghopera.org/season/special-events/pittsburgh-opera-fashion-event",
        "https://pittsburghopera.org/season/special-events/maecenas",
        "https://pittsburghopera.org/resident-artists/2024-25resident-artists",
        "https://pittsburghopera.org/resident-artists/faculty-administration/",
        "https://pittsburghopera.org/resident-artists/auditions/",
        "https://pittsburghopera.org/resident-artists/history-alumni/",
        "https://pittsburghopera.org/our-team/orchestra",
        "https://pittsburghopera.org/our-team/chorus",
        "https://pittsburghopera.org/facilities/pittsburgh-opera-headquarters/",
        "https://pittsburghopera.org/facilities/office-hours",
        "https://pittsburghopera.org/facilities/production-rentals",
        "https://pittsburghopera.org/facilities/hold-your-event-at-pittsburgh-opera/"
    ]

    scraped_data = scrape_links(scraper_, links)
    
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Opera_New.json")

    scraper_.close()

Fetching: https://pittsburghopera.org/
Fetching: https://pittsburghopera.org/about/mission-history
Fetching: https://pittsburghopera.org/about/inclusion-diversity-equity-accessibility-idea/
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=71+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=55+&start=1722484800000&end=1725145140000https://pittsburghopera.org/calendar?timequery=week&prev=47+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=39+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=31+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=23+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=15+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=w

In [29]:
import json
import re

def clean_text(text: str):
    """清理文本中的多余换行符和空白字符"""
    # 替换多个连续的换行符为单个空格
    cleaned_text = re.sub(r'\n+', ' ', text)
    # 移除头尾多余的空白字符
    return cleaned_text.strip()

def clean_json_file(input_file, output_file):
    """加载JSON文件，清理内容后保存"""
    try:
        # 读取现有的 JSON 文件
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # 遍历每一项并清理内容
        for entry in data:
            if 'content' in entry:
                entry['content'] = clean_text(entry['content'])

        # 将清理后的数据写入新的 JSON 文件
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

        print(f"Cleaned data saved to {output_file}")

    except Exception as e:
        print(f"Error processing JSON file: {e}")

if __name__ == "__main__":
    input_file = "../raw_documents/Pittsburgh_Opera.json"  # 现有的JSON文件路径
    output_file = "../raw_documents/Pittsburgh_Opera_Cleaned.json"  # 清理后的JSON文件路径

    # 调用清理函数
    clean_json_file(input_file, output_file)

Cleaned data saved to ../raw_documents/Pittsburgh_Opera_Cleaned.json


## Cultural Trust
### https://trustarts.org/

In [9]:
import json
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)  # Adjusted timeout for slower pages
        self.base_url = "https://trustarts.org"

    def get_soup(self, url: str):
        """获取网页内容并解析为 BeautifulSoup 对象"""
        self.driver.get(url)
        time.sleep(2)  # 等待页面加载完成
        return BeautifulSoup(self.driver.page_source, "html.parser")

    # def fetch_event_details(self, subpage_url):
    #     """进入子页面获取详细信息"""
    #     full_url = f"{self.base_url}{subpage_url}"
    #     soup = self.get_soup(full_url)

    #     # 获取 introduction 信息
    #     intro_tag = soup.find('article', class_='description')
    #     introduction = intro_tag.get_text(strip=True) if intro_tag else None

    #     # 获取 address 信息
    #     location_section = soup.find('section', class_='location')
    #     address_tag = location_section.find('li', class_='address') if location_section else None
    #     if address_tag:
    #         street_address = address_tag.find('span', property='streetAddress').get_text(strip=True)
    #         locality = address_tag.find('span', property='addressLocality').get_text(strip=True)
    #         region = address_tag.find('span', property='addressRegion').get_text(strip=True)
    #         postal_code = address_tag.find('span', property='postalCode').get_text(strip=True)
    #         address = f"{street_address}, {locality}, {region} {postal_code}"
    #     else:
    #         address = None

    #     return introduction, address

    def scrape_event(self, soup):
        """爬取单个事件信息"""
        events = []
        event_tags = soup.find_all('article', class_='event')

        for event_tag in event_tags:
            try:
                title = event_tag.find('h3', class_='title').get_text(strip=True)
                date = event_tag.find('time', class_='range').get_text(strip=True)

                # 获取场地和组织者信息
                venue = event_tag.find('div', class_='venue').get_text(strip=True)
                organization = event_tag.find('div', class_='organization').get_text(strip=True)

                # 获取分类
                categories = [cat.get_text(strip=True) for cat in event_tag.find_all('li', class_='category')]

                # 获取子页面URL
                subpage_url = event_tag.find('a')['href']

                # # 进入子页面获取更多信息
                # introduction, address = self.fetch_event_details(subpage_url)

                event_data = {
                    "title": title,
                    "date": date,
                    "venue": venue,
                    "organization": organization,
                    "category": categories,
                    "url": f"{self.base_url}{subpage_url}",
                    # "introduction": introduction,
                    # "address": address
                }

                events.append(event_data)

            except AttributeError as e:
                print(f"Error parsing event: {e}")
        return events

    def scrape_events_from_pages(self, urls):
        """从多个页面爬取事件信息"""
        all_events = []
        for url in urls:
            print(f"Scraping page: {url}")
            soup = self.get_soup(url)
            events = self.scrape_event(soup)
            all_events.extend(events)
        return all_events

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()


def save_to_json(data, filename):
    """保存数据到JSON文件"""
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")


if __name__ == "__main__":
    scraper = Scraper()

    # 需要爬取的页面URL
    urls = [
        "https://trustarts.org/",
        "https://trustarts.org/calendar?utf8=%E2%9C%93&utf8=%E2%9C%93&genre=All+Genres&organization_id=&start_date=&end_date=2017%2F06%2F14&filter%5Bmin%5D=2024-10-15T13%3A07%3A06-04%3A00&filter%5Bmax%5D=2026-04-15+13%3A07%3A06+-0400&filter%5Bcurrent_page%5D=production",
        "https://trustarts.org/calendar?end_date=722&genre=&order_by=production&page=2",
        "https://trustarts.org/calendar?ad=40&am=broad&askid=2f811b4c-704f-4054-a821-1b9934816698-0-ab_msb&l=sem&o=22837&page=3&q=Byham+Theater+Pittsburgh&qsrc=999",
        "https://trustarts.org/calendar?cluid=3794577&page=4",
        "https://trustarts.org/calendar?end_date=106&genre=&page=5",
        "https://trustarts.org/calendar?cid=Tess_Order&cluid=294&page=6",
        "https://trustarts.org/calendar?ad=102&am=532&an=msn_s&l=sem&o=22837&page=7",
        "https://trustarts.org/calendar?end_date=364&order_by=production&page=8",
        "https://trustarts.org/calendar?ad=102&am=532&an=324&l=sem&page=9&q=594&qsrc=999",
        "https://trustarts.org/calendar?ad=172&am=broad&an=msn_s&l=sem&o=22837&page=10&q=Pittsburgh%2BCultural%2BTrust&qsrc=999",
        "https://trustarts.org/calendar?end_date=&genre=All+Genres&order_by=production&page=11",
        "https://trustarts.org/calendar?cid=tess_order&cluid=830&page=12",
        "https://trustarts.org/calendar?cid=tess_order&cluid=830&page=13",
        "https://trustarts.org/calendar?cid=tess_order&cluid=830&page=14",
        "https://trustarts.org/calendar?cid=12&page=15",
        "https://trustarts.org/calendar?ad=102&page=16&q=Benedum%2BCenter&qsrc=274"
    ]

    # 开始爬取
    scraped_events = scraper.scrape_events_from_pages(urls)

    # 保存为 JSON 文件
    save_to_json(scraped_events, "../raw_documents/Pittsburgh_Trustarts.json")

    # 关闭爬虫
    scraper.close()

# if __name__ == "__main__":
#     scraper_ = Scraper()

#     links = [
        
#         "https://trustarts.org/pct_home/events/series",
#         "https://trustarts.org/pct_home/events/festivals",
#         "https://trustarts.org/pct_home/events/groups",
#         "https://trustarts.org/pct_home/events/university-student-tickets",
#         "https://trustarts.org/pct_home/events/seating-charts",
#         "https://trustarts.org/pct_home/events/faq---ticketing",
#         "https://trustarts.org/pct_home/events/gift-cards",
#         "https://trustarts.org/pct_home/events/official-ticket-source",
#         "https://trustarts.org/pct_home/events/venue-tours",
#         "https://trustarts.org/pct_home/visual-arts#current",
#         "https://trustarts.org/pct_home/visual-arts#upcoming",
#         "https://trustarts.org/pct_home/visual-arts#galleries",
#         "https://trustarts.org/pct_home/engagement",
#         "https://trustarts.org/pct_home/engagement/lullaby-project",
#         "https://trustarts.org/pct_home/engagement/broadway-talk-back-series",
#         "https://trustarts.org/pct_home/engagement/community-classes-with-mr-messado",
#         "https://trustarts.org/pct_home/engagement/cultural-celebrations",
#         "https://trustarts.org/pct_home/visit",
#         "https://trustarts.org/pct_home/about",
           
#     ]

#     scraped_data = scrape_links(scraper_, links)
    
#     save_to_json(scraped_data, "../raw_documents/Pittsburgh_Trustarts.json")

#     scraper_.close()

Scraping page: https://trustarts.org/
Scraping page: https://trustarts.org/calendar?utf8=%E2%9C%93&utf8=%E2%9C%93&genre=All+Genres&organization_id=&start_date=&end_date=2017%2F06%2F14&filter%5Bmin%5D=2024-10-15T13%3A07%3A06-04%3A00&filter%5Bmax%5D=2026-04-15+13%3A07%3A06+-0400&filter%5Bcurrent_page%5D=production
Error parsing event: 'NoneType' object has no attribute 'get_text'
Error parsing event: 'NoneType' object has no attribute 'get_text'
Error parsing event: 'NoneType' object has no attribute 'get_text'
Error parsing event: 'NoneType' object has no attribute 'get_text'
Error parsing event: 'NoneType' object has no attribute 'get_text'
Scraping page: https://trustarts.org/calendar?end_date=722&genre=&order_by=production&page=2
Error parsing event: 'NoneType' object has no attribute 'get_text'
Error parsing event: 'NoneType' object has no attribute 'get_text'
Error parsing event: 'NoneType' object has no attribute 'get_text'
Error parsing event: 'NoneType' object has no attribute '

In [12]:
#################### Except events, more information ####################
import json
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os


class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)  # Adjusted timeout for slower pages
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """Set domain name and URL."""
        self.current_domain = url.split("/")[2]  # 例如 'www.example.com'
        self.current_url = url

    def fetch(self, url: str, raw_html: bool = False):
        """Capture the webpage and choose whether to remove JS or CSS."""
        soup = self.get_soup(url)
        if not raw_html:
            soup = self.remove_js_css(soup)
        return soup, self.get_links(soup), self.get_title(soup)

    def get_html(self, url: str):
        """Get HTML."""
        self.set_domain(url)  # 调用 set_domain 方法
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and get BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def get_links(self, soup: BeautifulSoup):
        """Get external links."""
        links = [link.get("href") for link in soup.find_all("a")]
        return self.filter_links(links)

    def get_title(self, soup: BeautifulSoup):
        """Get the title."""
        if soup.title is None:
            return f"untitled_{self.get_timestamp()}"
        title = soup.title.string.replace(" ", "_").replace("/", "__")
        return title.replace("\n", "")

    def remove_js_css(self, soup: BeautifulSoup):
        """Remove JS and CSS."""
        for script in soup(["script", "style"]):
            script.extract()  # 移除所有的 <script> 和 <style> 标签
        return soup

    def remove_unnecessary_sections(self, soup: BeautifulSoup):
        """Remove unnecessary sections such as header, footer, nav, etc."""
        for tag in ["header", "footer", "nav", "aside"]:
            for element in soup.find_all(tag):
                element.decompose()
        return soup

    def filter_links(self, links: list):
        filtered_links = []
        for link in links:
            if link is None:
                continue
            elif link.startswith("#"):
                continue
            elif link.startswith("http"):
                filtered_links.append(link)
            elif link.startswith("//"):
                filtered_links.append(f"https:{link}")
            elif link.startswith("/"):
                filtered_links.append(f"https://{self.current_domain}{link}")
            else:
                filtered_links.append(f"{self.current_url}/{link}")
        return filtered_links

    def get_timestamp(self):
        return datetime.now().strftime("%Y%m%d_%H%M%S")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()


def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None

    # Traverse all the relevant tags including headings, paragraphs, divs, articles, spans, etc.
    for tag in soup.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'article']):
        if tag.name == 'h2':
            current_h2 = tag.get_text(strip=True)
            data[current_h2] = {}
            current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower-level headings
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h2][current_h3] = {}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h2][current_h3][current_h4] = {}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h2][current_h3][current_h4][current_h5] = {}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h2][current_h3][current_h4][current_h5][current_h6] = {}
        elif tag.name in ['p', 'div', 'span', 'article']:
            # Only add content if it's not empty
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h2][current_h3][current_h4][current_h5][current_h6].setdefault('content', []).append(content)
                elif current_h5:
                    data[current_h2][current_h3][current_h4][current_h5].setdefault('content', []).append(content)
                elif current_h4:
                    data[current_h2][current_h3][current_h4].setdefault('content', []).append(content)
                elif current_h3:
                    data[current_h2][current_h3].setdefault('content', []).append(content)
                elif current_h2:
                    data[current_h2].setdefault('content', []).append(content)

    return data


def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            # 读取现有的 JSON 文件并合并
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            # 合并新的数据到现有数据
            if isinstance(existing_data, list):
                existing_data.extend(data)  # 如果是列表，将数据合并
            else:
                existing_data.update(data)  # 如果是字典，更新数据
        else:
            existing_data = data  # 如果文件不存在，则新建

        # 保存合并后的数据到文件
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")


def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)

            soup = scraper_.remove_js_css(soup)
            soup = scraper_.remove_unnecessary_sections(soup)

            page_data = organize_content_by_heading(soup)
            page_data["url"] = link

            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data


if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://trustarts.org/pct_home/events/series",
        "https://trustarts.org/pct_home/events/festivals",
        "https://trustarts.org/pct_home/events/groups",
        "https://trustarts.org/pct_home/events/university-student-tickets",
        "https://trustarts.org/pct_home/events/seating-charts",
        "https://trustarts.org/pct_home/events/faq---ticketing",
        "https://trustarts.org/pct_home/events/gift-cards",
        "https://trustarts.org/pct_home/events/official-ticket-source",
        "https://trustarts.org/pct_home/events/venue-tours",
        "https://trustarts.org/pct_home/visual-arts#current",
        "https://trustarts.org/pct_home/visual-arts#upcoming",
        "https://trustarts.org/pct_home/visual-arts#galleries",
        "https://trustarts.org/pct_home/engagement",
        "https://trustarts.org/pct_home/engagement/lullaby-project",
        "https://trustarts.org/pct_home/engagement/broadway-talk-back-series",
        "https://trustarts.org/pct_home/engagement/community-classes-with-mr-messado",
        "https://trustarts.org/pct_home/engagement/cultural-celebrations",
        "https://trustarts.org/pct_home/visit",
        "https://trustarts.org/pct_home/about"
    ]

    scraped_data = scrape_links(scraper_, links)

    # 合并并保存到原来的文件，而不是覆盖
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Trustarts_copy.json")

    scraper_.close()

Fetching: https://trustarts.org/pct_home/events/series
Fetching: https://trustarts.org/pct_home/events/festivals
Fetching: https://trustarts.org/pct_home/events/groups
Fetching: https://trustarts.org/pct_home/events/university-student-tickets
Fetching: https://trustarts.org/pct_home/events/seating-charts
Fetching: https://trustarts.org/pct_home/events/faq---ticketing
Fetching: https://trustarts.org/pct_home/events/gift-cards
Fetching: https://trustarts.org/pct_home/events/official-ticket-source
Fetching: https://trustarts.org/pct_home/events/venue-tours
Fetching: https://trustarts.org/pct_home/visual-arts#current
Fetching: https://trustarts.org/pct_home/visual-arts#upcoming
Fetching: https://trustarts.org/pct_home/visual-arts#galleries
Fetching: https://trustarts.org/pct_home/engagement
Fetching: https://trustarts.org/pct_home/engagement/lullaby-project
Fetching: https://trustarts.org/pct_home/engagement/broadway-talk-back-series
Fetching: https://trustarts.org/pct_home/engagement/comm

# Museum
## Carnegie Museum
### https://carnegiemuseums.org/

In [25]:
import json
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(20)  # Adjusted timeout for slower pages
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """set domain name and URL"""
        self.current_domain = url.split("/")[2]
        self.current_url = url

    def fetch(self, url: str):
        """capture the webpage"""
        soup = self.get_soup(url)
        return soup

    def get_html(self, url: str):
        """get HTML"""
        self.set_domain(url)
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """parse and get BeautifulSoup object"""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")
    
    def remove_unnecessary_sections(self, soup: BeautifulSoup):
        """remove header and footer"""
        for tag in ["header", "footer", "nav", "aside"]:
            for element in soup.find_all(tag):
                element.decompose()
        return soup

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def scrape_links(scraper_, links):
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)

            # 查找所有的事件卡片
            events = soup.find_all('article', class_='event-card')

            for event in events:
                page_data = {}

                # 提取事件名称
                event_name_tag = event.find('h2')
                event_name = event_name_tag.text.strip() if event_name_tag else "No Event Name"
                page_data["event_name"] = event_name

                # 提取开始和结束日期
                event_start = event.get('data-event-start', 'No Start Date')
                event_end = event.get('data-event-end', 'No End Date')
                page_data["event_start"] = event_start
                page_data["event_end"] = event_end

                # 提取场馆
                venue_tag = event.find('a', class_='event-card__venue-tag')
                venue = venue_tag.text.strip() if venue_tag else "No Venue"
                page_data["venue"] = venue

                # 提取事件类型
                event_type_tag = event.find('a', class_='event-card__event-type')
                event_type = event_type_tag.text.strip() if event_type_tag else "No Event Type"
                page_data["event_type"] = event_type

                # 提取是否是 Featured Event
                featured_flag_tag = event.find('span', class_='event-card__featured-flag')
                featured_flag = "Featured" if featured_flag_tag else "Not Featured"
                page_data["featured_flag"] = featured_flag

                # 将结果添加到最终的数据列表中
                all_data.append(page_data)

        except Exception as e:
            print(f"Error fetching {link}: {e}")
    
    return all_data

def save_to_json(data, filename):
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://carnegiemuseums.org/events/",
        "https://carnegiemuseums.org/events/page/2/",
        "https://carnegiemuseums.org/events/page/3/",
        "https://carnegiemuseums.org/events/page/4/",
        "https://carnegiemuseums.org/events/page/5/"
    ]

    scraped_data = scrape_links(scraper_, links)
    
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    scraper_.close()


# if __name__ == "__main__":
#     scraper_ = Scraper()

#     links = [
#         # "https://carnegiemnh.org/explore/explore/exhibitions/",
#         # "https://carnegiesciencecenter.org/exhibits/",
#         # "https://www.warhol.org/exhibitions/",
#         # "https://www.warhol.org/calendar/?_ga=2.140397363.1042301114.1729024596-1109875255.1729024596",

#     ]

#     scraped_data = scrape_links(scraper_, links)
    
#     save_to_json(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

#     scraper_.close()

Fetching: https://carnegiemuseums.org/events/
Fetching: https://carnegiemuseums.org/events/page/2/
Fetching: https://carnegiemuseums.org/events/page/3/
Fetching: https://carnegiemuseums.org/events/page/4/
Fetching: https://carnegiemuseums.org/events/page/5/
Data saved to ../raw_documents/Pittsburgh_Carnegie_Museums.json


In [27]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os


class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(20)  # Adjusted timeout for slower pages
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """set domain name and URL"""
        self.current_domain = url.split("/")[2]
        self.current_url = url

    def fetch(self, url: str):
        """capture the webpage"""
        soup = self.get_soup(url)
        return soup

    def get_html(self, url: str):
        """get HTML"""
        self.set_domain(url)
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """parse and get BeautifulSoup object"""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")
    
    def remove_unnecessary_sections(self, soup: BeautifulSoup):
        """remove header and footer"""
        for tag in ["header", "footer", "nav", "aside"]:
            for element in soup.find_all(tag):
                element.decompose()
        return soup

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()


def scrape_page(scraper_, link):
    try:
        print(f"Fetching: {link}")
        soup = scraper_.get_soup(link)

        # 移除 header 和 footer
        scraper_.remove_unnecessary_sections(soup)

        page_data = []
        # 提取所有的 h2 标题及其对应的正文
        sections = soup.find_all(['h2', 'p'])
        current_title = None

        for element in sections:
            if element.name == 'h2':
                current_title = element.text.strip()
            elif element.name == 'p' and current_title:
                # 将标题和正文存入 page_data
                page_data.append({
                    "title": current_title,
                    "content": element.text.strip()
                })

        return page_data

    except Exception as e:
        print(f"Error fetching {link}: {e}")
        return []


def append_to_json_file(data, filename):
    try:
        # 读取现有的 JSON 文件
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
        else:
            existing_data = []

        # 添加新数据
        existing_data.extend(data)

        # 将更新后的数据保存回文件
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)

        print(f"Data appended to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")


if __name__ == "__main__":
    scraper_ = Scraper()

    link = "https://carnegieart.org/about/our-story/"  # 你要爬取的页面链接

    # 爬取页面内容
    scraped_data = scrape_page(scraper_, link)

    # 将数据追加到现有的 JSON 文件中
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    scraper_.close()

Fetching: https://carnegieart.org/about/our-story/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json


In [35]:
import json
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(20)  # Adjusted timeout for slower pages
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """set domain name and URL"""
        self.current_domain = url.split("/")[2]
        self.current_url = url

    def fetch(self, url: str):
        """capture the webpage"""
        soup = self.get_soup(url)
        return soup

    def get_html(self, url: str):
        """get HTML"""
        self.set_domain(url)
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """parse and get BeautifulSoup object"""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def scrape_page(scraper_, link):
    try:
        print(f"Fetching: {link}")
        soup = scraper_.get_soup(link)

        page_data = []
        # 查找所有展览事件卡片
        events = soup.find_all('div', class_='cmoa-grid-item')

        for event in events:
            event_data = {}

            # 获取事件名称
            event_name_tag = event.find('a', class_='font-bold')
            event_name = event_name_tag.text.strip() if event_name_tag else "No Event Name"
            event_data["event_name"] = event_name

            # 获取展览时间
            time_tag = event.find('div', class_='break-words')
            event_time = time_tag.text.strip() if time_tag else "No Time Information"
            event_data["time"] = event_time

            # 获取展览地点
            location_tag = event.find('ul', class_='metadata')
            if location_tag:
                location = location_tag.find('li').text.strip()
            else:
                location = "No Location"
            event_data["location"] = location

            # 将该展览信息添加到列表中
            page_data.append(event_data)

        return page_data

    except Exception as e:
        print(f"Error fetching {link}: {e}")
        return []

def append_to_json_file(data, filename):
    try:
        # 检查文件是否存在
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
        else:
            existing_data = []

        # 添加新数据
        existing_data.extend(data)

        # 保存回 JSON 文件
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)

        print(f"Data appended to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

if __name__ == "__main__":
    scraper_ = Scraper()

    # 要爬取的网页链接
    link = "https://carnegieart.org/art/whats-on-view/"

    # 爬取页面内容
    scraped_data = scrape_page(scraper_, link)

    # 将数据追加到现有的 JSON 文件中
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    scraper_.close()

Fetching: https://carnegieart.org/art/whats-on-view/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json


In [36]:
import json
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(20)  # Adjusted timeout for slower pages
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """set domain name and URL"""
        self.current_domain = url.split("/")[2]
        self.current_url = url

    def fetch(self, url: str):
        """capture the webpage"""
        soup = self.get_soup(url)
        return soup

    def get_html(self, url: str):
        """get HTML"""
        self.set_domain(url)
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """parse and get BeautifulSoup object"""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def scrape_page(scraper_, link):
    try:
        print(f"Fetching: {link}")
        soup = scraper_.get_soup(link)

        page_data = []
        # 查找所有展览事件卡片
        events = soup.find_all('div', class_='cmoa-grid-item')

        for event in events:
            event_data = {}

            # 获取事件名称
            event_name_tag = event.find('a', class_='font-bold')
            event_name = event_name_tag.text.strip() if event_name_tag else "No Event Name"
            event_data["event_name"] = event_name

            # 获取展览时间
            time_tag = event.find('div', class_='break-words')
            event_time = time_tag.text.strip() if time_tag else "No Time Information"
            event_data["time"] = event_time

            # 获取展览地点
            location_tag = event.find('ul', class_='metadata')
            if location_tag:
                location = location_tag.find('li').text.strip()
            else:
                location = "No Location"
            event_data["location"] = location

            # 将该展览信息添加到列表中
            page_data.append(event_data)

        return page_data

    except Exception as e:
        print(f"Error fetching {link}: {e}")
        return []

def append_to_json_file(data, filename):
    try:
        # 检查文件是否存在
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
        else:
            existing_data = []

        # 添加新数据
        existing_data.extend(data)

        # 保存回 JSON 文件
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)

        print(f"Data appended to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

if __name__ == "__main__":
    scraper_ = Scraper()

    # 要爬取的网页链接
    link = "https://carnegieart.org/art/whats-on-view/page/2/"

    # 爬取页面内容
    scraped_data = scrape_page(scraper_, link)

    # 将数据追加到现有的 JSON 文件中
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    scraper_.close()

Fetching: https://carnegieart.org/art/whats-on-view/page/2/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json


In [37]:
import json
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(20)  # Adjusted timeout for slower pages
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """set domain name and URL"""
        self.current_domain = url.split("/")[2]
        self.current_url = url

    def fetch(self, url: str):
        """capture the webpage"""
        soup = self.get_soup(url)
        return soup

    def get_html(self, url: str):
        """get HTML"""
        self.set_domain(url)
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """parse and get BeautifulSoup object"""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def scrape_page(scraper_, link):
    try:
        print(f"Fetching: {link}")
        soup = scraper_.get_soup(link)

        page_data = []
        # 查找所有展览事件卡片
        events = soup.find_all('div', class_='cmoa-grid-item')

        for event in events:
            event_data = {}

            # 获取事件名称
            event_name_tag = event.find('a', class_='font-bold')
            event_name = event_name_tag.text.strip() if event_name_tag else "No Event Name"
            event_data["event_name"] = event_name

            # 获取展览时间
            time_tag = event.find('div', class_='break-words')
            event_time = time_tag.text.strip() if time_tag else "No Time Information"
            event_data["time"] = event_time

            # 获取展览地点
            location_tag = event.find('ul', class_='metadata')
            if location_tag:
                location = location_tag.find('li').text.strip()
            else:
                location = "No Location"
            event_data["location"] = location

            # 将该展览信息添加到列表中
            page_data.append(event_data)

        return page_data

    except Exception as e:
        print(f"Error fetching {link}: {e}")
        return []

def append_to_json_file(data, filename):
    try:
        # 检查文件是否存在
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
        else:
            existing_data = []

        # 添加新数据
        existing_data.extend(data)

        # 保存回 JSON 文件
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)

        print(f"Data appended to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

if __name__ == "__main__":
    scraper_ = Scraper()

    link = "https://carnegieart.org/art/whats-on-view/page/3/"
    scraped_data = scrape_page(scraper_, link)
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    link = "https://carnegieart.org/art/whats-on-view/page/4/"
    scraped_data = scrape_page(scraper_, link)
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    link = "https://carnegieart.org/art/whats-on-view/page/5/"
    scraped_data = scrape_page(scraper_, link)
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    link = "https://carnegieart.org/art/whats-on-view/page/6/"
    scraped_data = scrape_page(scraper_, link)
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    link = "https://carnegieart.org/art/whats-on-view/page/7/"
    scraped_data = scrape_page(scraper_, link)
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    link = "https://carnegieart.org/art/whats-on-view/page/8/"
    scraped_data = scrape_page(scraper_, link)
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    scraper_.close()

Fetching: https://carnegieart.org/art/whats-on-view/page/3/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json
Fetching: https://carnegieart.org/art/whats-on-view/page/4/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json
Fetching: https://carnegieart.org/art/whats-on-view/page/5/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json
Fetching: https://carnegieart.org/art/whats-on-view/page/6/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json
Fetching: https://carnegieart.org/art/whats-on-view/page/7/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json
Fetching: https://carnegieart.org/art/whats-on-view/page/8/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json


In [45]:
import json
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(30)  # Increase timeout for slower pages
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """Set domain name and URL"""
        self.current_domain = url.split("/")[2]
        self.current_url = url

    def fetch(self, url: str):
        """Capture the webpage"""
        soup = self.get_soup(url)
        return soup

    def get_html(self, url: str):
        """Get HTML content"""
        self.set_domain(url)
        self.driver.get(url)
        try:
            # Explicit wait for the main content to load (adjust the selector as necessary)
            WebDriverWait(self.driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.item")))
        except Exception as e:
            print(f"Error while waiting for the page to load: {e}")
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object"""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def scrape_page(scraper_, link):
    try:
        print(f"Fetching: {link}")
        soup = scraper_.get_soup(link)

        page_data = []
        
        # 查找所有活动卡片
        events = soup.find_all('div', class_='item')

        for event in events:
            event_data = {}

            # 获取事件名称
            event_name_tag = event.find('h3')
            event_name = event_name_tag.text.strip() if event_name_tag else 'No Event Name'
            event_data['event_name'] = event_name

            # 获取时间
            time_tag = event.find('time')
            event_time = time_tag.text.strip() if time_tag else 'No Time Information'
            event_data['time'] = event_time

            # 获取地点
            location_tag = event.find('span', class_='screen-reader-text', string='Location:')
            if location_tag:
                # 获取 span 标签后面的所有兄弟节点，包括文本和 <br> 标签
                location_parts = location_tag.find_parent('p').contents
                location = ''.join([str(part).strip() for part in location_parts if isinstance(part, str)]).replace('<br>', ', ')
                location = location.replace('Location:', '').strip()
            else:
                location = 'No Location'
            event_data['location'] = location

            # 获取事件类型
            event_type_list = event.find_all('li')
            event_type = ', '.join([et.text.strip() for et in event_type_list if et])
            event_data['event_type'] = event_type

            # 获取受众类型
            audience_tag = event.find('h4', string=lambda x: 'Audience' in x)
            if audience_tag:
                audience_list = audience_tag.find_next('ul').find_all('li')
                audience = ', '.join([a.text.strip() for a in audience_list if a])
            else:
                audience = 'No Audience Information'
            event_data['audience'] = audience

            # 将该事件信息添加到列表中
            page_data.append(event_data)

        return page_data

    except Exception as e:
        print(f"Error fetching {link}: {e}")
        return []
        
def scrape_multiple_pages(scraper_, links):
    all_data = []
    
    # 遍历每个链接并爬取数据
    for link in links:
        data = scrape_page(scraper_, link)
        all_data.extend(data)  # 将每个页面的数据合并到总数据中

    return all_data

def append_to_json_file(data, filename):
    try:
        # 检查文件是否存在
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
        else:
            existing_data = []

        # 添加新数据
        existing_data.extend(data)

        # 保存回 JSON 文件
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)

        print(f"Data appended to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

if __name__ == "__main__":
    scraper_ = Scraper()

    # 要爬取的网页链接列表
    links = [
        "https://www.warhol.org/calendar/",
        "https://www.warhol.org/calendar/?date=2024-11-01&days=30&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2024-12-01&days=31&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-01-01&days=31&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-02-01&days=28&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-03-01&days=31&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-04-01&days=30&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-05-01&days=31&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-06-01&days=30&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-07-01&days=31&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-08-01&days=31&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-09-01&days=30&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-10-01&days=31&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-11-01&days=30&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-12-01&days=31&0=#calendar-header"
    ]

    # 爬取所有页面的内容
    scraped_data = scrape_multiple_pages(scraper_, links)

    # 将数据追加到现有的 JSON 文件中
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    scraper_.close()

Fetching: https://www.warhol.org/calendar/
Fetching: https://www.warhol.org/calendar/?date=2024-11-01&days=30&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2024-12-01&days=31&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-01-01&days=31&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-02-01&days=28&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-03-01&days=31&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-04-01&days=30&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-05-01&days=31&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-06-01&days=30&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-07-01&days=31&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-08-01&days=31&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-09-01&days=30&0=#calendar-header
Fetching: h

## Heinz History Center
### https://www.heinzhistorycenter.org/

In [46]:
import json
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os


class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)  # Adjusted timeout for slower pages
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """Set domain name and URL."""
        self.current_domain = url.split("/")[2]  # 例如 'www.example.com'
        self.current_url = url

    def fetch(self, url: str, raw_html: bool = False):
        """Capture the webpage and choose whether to remove JS or CSS."""
        soup = self.get_soup(url)
        if not raw_html:
            soup = self.remove_js_css(soup)
        return soup, self.get_links(soup), self.get_title(soup)

    def get_html(self, url: str):
        """Get HTML."""
        self.set_domain(url)  # 调用 set_domain 方法
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and get BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def get_links(self, soup: BeautifulSoup):
        """Get external links."""
        links = [link.get("href") for link in soup.find_all("a")]
        return self.filter_links(links)

    def remove_js_css(self, soup: BeautifulSoup):
        """Remove JS and CSS."""
        for script in soup(["script", "style"]):
            script.extract()  # 移除所有的 <script> 和 <style> 标签
        return soup

    def remove_unnecessary_sections(self, soup: BeautifulSoup):
        """Remove unnecessary sections such as header, footer, nav, etc."""
        for tag in ["header", "footer", "nav", "aside"]:
            for element in soup.find_all(tag):
                element.decompose()
        return soup

    def filter_links(self, links: list):
        filtered_links = []
        for link in links:
            if link is None:
                continue
            elif link.startswith("#"):
                continue
            elif link.startswith("http"):
                filtered_links.append(link)
            elif link.startswith("//"):
                filtered_links.append(f"https:{link}")
            elif link.startswith("/"):
                filtered_links.append(f"https://{self.current_domain}{link}")
            else:
                filtered_links.append(f"{self.current_url}/{link}")
        return filtered_links

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()


def extract_event_data(soup: BeautifulSoup):
    """Extract event data from the soup."""
    events = []

    # 找到所有包含活动信息的卡片
    event_cards = soup.find_all("div", class_="card_body")
    for card in event_cards:
        event = {}
        # 提取活动名称
        event_name_tag = card.find("h3", class_="card_title")
        event['event_name'] = event_name_tag.get_text(strip=True) if event_name_tag else "No Event Name"

        # 提取活动时间
        time_tag = card.find("span", class_="card_time")
        event['time'] = time_tag.get_text(strip=True) if time_tag else "No Time Information"

        # 提取活动地点
        location_tag = card.find("span", class_="card_location")
        event['location'] = location_tag.get_text(strip=True) if location_tag else "No Location"

        # 提取活动描述
        description_tag = card.find("div", class_="card_description")
        event['description'] = description_tag.get_text(strip=True) if description_tag else "No Description"

        # 添加到事件列表
        events.append(event)

    return events


def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            # 读取现有的 JSON 文件并合并
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            # 合并新的数据到现有数据
            if isinstance(existing_data, list):
                existing_data.extend(data)  # 如果是列表，将数据合并
            else:
                existing_data.update(data)  # 如果是字典，更新数据
        else:
            existing_data = data  # 如果文件不存在，则新建

        # 保存合并后的数据到文件
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")


def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)

            soup = scraper_.remove_js_css(soup)
            soup = scraper_.remove_unnecessary_sections(soup)

            page_data = extract_event_data(soup)
            all_data.extend(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data


if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://www.heinzhistorycenter.org/events/"
        "https://www.heinzhistorycenter.org/",
        "https://www.heinzhistorycenter.org/research/detre-library-archives/",
        "https://www.heinzhistorycenter.org/whats-on/exhibits/",

    ]

    scraped_data = scrape_links(scraper_, links)

    # 合并并保存到原来的文件，而不是覆盖
    save_to_json(scraped_data, "../raw_documents/Heinz_History_Center.json")

    scraper_.close()

Fetching: https://www.heinzhistorycenter.org/events/https://www.heinzhistorycenter.org/
Fetching: https://www.heinzhistorycenter.org/research/detre-library-archives/
Fetching: https://www.heinzhistorycenter.org/whats-on/exhibits/
Data saved to ../raw_documents/Heinz_History_Center.json


In [48]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower headings
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            # 读取现有的 JSON 文件并合并
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            # 合并新的数据到现有数据
            if isinstance(existing_data, list):
                existing_data.extend(data)  # 如果是列表，将数据合并
            else:
                existing_data.update(data)  # 如果是字典，更新数据
        else:
            existing_data = data  # 如果文件不存在，则新建

        # 保存合并后的数据到文件
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            # 提取内容并按照标题层级组织
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link  # 保存爬取的链接
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://www.heinzhistorycenter.org/whats-on/sports-museum/exhibits/",
        "https://www.heinzhistorycenter.org/whats-on/fort-pitt/",
        "https://www.heinzhistorycenter.org/whats-on/meadowcroft/exhibits/",
        "https://www.heinzhistorycenter.org/whats-on/exhibits/past-exhibits/"
    ]

    # 爬取所有页面的内容
    scraped_data = scrape_links(scraper_, links)

    # 保存到 JSON 文件，追加到现有内容中
    save_to_json(scraped_data, "../raw_documents/Heinz_History_Center.json")

    scraper_.close()

Fetching: https://www.heinzhistorycenter.org/whats-on/sports-museum/exhibits/
Fetching: https://www.heinzhistorycenter.org/whats-on/fort-pitt/
Fetching: https://www.heinzhistorycenter.org/whats-on/meadowcroft/exhibits/
Fetching: https://www.heinzhistorycenter.org/whats-on/exhibits/past-exhibits/
Data saved to ../raw_documents/Heinz_History_Center.json


## The Frick
### https://www.thefrickpittsburgh.org/

In [49]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower headings
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            # 读取现有的 JSON 文件并合并
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            # 合并新的数据到现有数据
            if isinstance(existing_data, list):
                existing_data.extend(data)  # 如果是列表，将数据合并
            else:
                existing_data.update(data)  # 如果是字典，更新数据
        else:
            existing_data = data  # 如果文件不存在，则新建

        # 保存合并后的数据到文件
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            # 提取内容并按照标题层级组织
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link  # 保存爬取的链接
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://www.thefrickpittsburgh.org/calendar?search=1&page=1&search_date_from=10%2F01%2F2024&search_date_to=3%2F31%2F2025",
        "https://www.thefrickpittsburgh.org/calendar?search=1&page=2&search_date_from=10%2F01%2F2024&search_date_to=3%2F31%2F2025",
        "https://www.thefrickpittsburgh.org/stories",
        "https://www.thefrickpittsburgh.org/exhibitions",
        "https://www.thefrickpittsburgh.org/plan-your-visit",
        "https://www.thefrickpittsburgh.org/mission",
        "https://www.thefrickpittsburgh.org/collection",
        "https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Carriage%22+OR+object_type%3A%22Cars+and+Carriages%22&limit=40",
        "https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Costume%22&limit=40",
        "https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Decorative+Arts%22&limit=40",
        "https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Painting%22&limit=40",
        "https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Photography%22&limit=40",
        "https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Works+on+Paper%2FDrawing%22+OR+object_type%3A%22Works+on+Paper%2FPrint%22&limit=40"
    ]

    # 爬取所有页面的内容
    scraped_data = scrape_links(scraper_, links)

    # 保存到 JSON 文件，追加到现有内容中
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Frick.json")

    scraper_.close()

Fetching: https://www.thefrickpittsburgh.org/calendar?search=1&page=1&search_date_from=10%2F01%2F2024&search_date_to=3%2F31%2F2025
Fetching: https://www.thefrickpittsburgh.org/calendar?search=1&page=2&search_date_from=10%2F01%2F2024&search_date_to=3%2F31%2F2025
Fetching: https://www.thefrickpittsburgh.org/stories
Fetching: https://www.thefrickpittsburgh.org/exhibitions
Fetching: https://www.thefrickpittsburgh.org/plan-your-visit
Fetching: https://www.thefrickpittsburgh.org/mission
Fetching: https://www.thefrickpittsburgh.org/collection
Fetching: https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Carriage%22+OR+object_type%3A%22Cars+and+Carriages%22&limit=40
Fetching: https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Costume%22&limit=40
Fetching: https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Decorative+Arts%22&limit=40
Fetching: https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Painting%22&lim

## More

# Food
## Food Festivals
### https://www.visitpittsburgh.com/events-festivals/food-festivals/

In [50]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower headings
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            # 读取现有的 JSON 文件并合并
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            # 合并新的数据到现有数据
            if isinstance(existing_data, list):
                existing_data.extend(data)  # 如果是列表，将数据合并
            else:
                existing_data.update(data)  # 如果是字典，更新数据
        else:
            existing_data = data  # 如果文件不存在，则新建

        # 保存合并后的数据到文件
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            # 提取内容并按照标题层级组织
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link  # 保存爬取的链接
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://www.visitpittsburgh.com/events-festivals/food-festivals/",
        "https://www.visitpittsburgh.com/blog/",
        "https://www.visitpittsburgh.com/plan-your-trip/",
        "https://www.visitpittsburgh.com/events-festivals/",
        "https://www.visitpittsburgh.com/events-festivals/?page=2",
        "https://www.visitpittsburgh.com/events-festivals/?page=3",
        "https://www.visitpittsburgh.com/events-festivals/?page=4",
        "https://www.visitpittsburgh.com/events-festivals/?page=5",
        "https://www.visitpittsburgh.com/events-festivals/?page=6",
        "https://www.visitpittsburgh.com/events-festivals/?page=7",
        "https://www.visitpittsburgh.com/events-festivals/?page=8",
        "https://www.visitpittsburgh.com/events-festivals/?page=9",
        "https://www.visitpittsburgh.com/events-festivals/?page=10",
        "https://www.visitpittsburgh.com/events-festivals/?page=11",
        "https://www.visitpittsburgh.com/events-festivals/?page=12",
        "https://www.visitpittsburgh.com/events-festivals/?page=13",
        "https://www.visitpittsburgh.com/events-festivals/?page=14",
        "https://www.visitpittsburgh.com/events-festivals/?page=15",
        "https://www.visitpittsburgh.com/events-festivals/?page=16",
        "https://www.visitpittsburgh.com/things-to-do/",
        "https://www.visitpittsburgh.com/restaurants-culinary/",
        "https://www.visitpittsburgh.com/restaurants-culinary/craft-breweries/",
        "https://www.visitpittsburgh.com/things-to-do/family-fun/",
        "https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/",
        "https://www.visitpittsburgh.com/things-to-do/arts-culture/",
        "https://www.visitpittsburgh.com/meetings-and-events/",
        "https://www.visitpittsburgh.com/events-festivals/annual-events/",
        "https://www.visitpittsburgh.com/events-festivals/holidays/",
        "https://www.visitpittsburgh.com/events-festivals/halloween-events/",
        "https://www.visitpittsburgh.com/events-festivals/film-festivals/",
        "https://www.visitpittsburgh.com/events-festivals/food-festivals/soul-food-festival/",
        "https://www.visitpittsburgh.com/events-festivals/food-festivals/the-original-pittsburgh-taco-festival/",
        "https://www.visitpittsburgh.com/blog/top-beer-festivals-to-attend-in-pittsburgh-this-fall/"
       
    ]

    # 爬取所有页面的内容
    scraped_data = scrape_links(scraper_, links)

    # 保存到 JSON 文件，追加到现有内容中
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Food_Festivals.json")

    scraper_.close()

Fetching: https://www.visitpittsburgh.com/events-festivals/food-festivals/
Fetching: https://www.visitpittsburgh.com/blog/
Fetching: https://www.visitpittsburgh.com/plan-your-trip/
Fetching: https://www.visitpittsburgh.com/events-festivals/
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=2
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=3
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=4
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=5
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=6
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=7
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=8
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=9
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=10
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=11
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=12
Fetching: https://w

In [57]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML with explicit wait for dynamic content."""
        self.driver.get(url)
        
        try:
            # Explicit wait for a specific element that ensures page has loaded dynamically
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))  # Example: wait for body tag to appear
            )
        except Exception as e:
            print(f"Error while waiting for page to load: {e}")
        
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower headings
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            # 读取现有的 JSON 文件并合并
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            # 合并新的数据到现有数据
            if isinstance(existing_data, list):
                existing_data.extend(data)  # 如果是列表，将数据合并
            else:
                existing_data.update(data)  # 如果是字典，更新数据
        else:
            existing_data = data  # 如果文件不存在，则新建

        # 保存合并后的数据到文件
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            # 提取内容并按照标题层级组织
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link  # 保存爬取的链接
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://www.visitpittsburgh.com/events-festivals/?page=14",
        "https://www.visitpittsburgh.com/events-festivals/?page=15",
        "https://www.visitpittsburgh.com/events-festivals/?page=16",
        "https://www.visitpittsburgh.com/things-to-do/",
        "https://www.visitpittsburgh.com/restaurants-culinary/",
        "https://www.visitpittsburgh.com/restaurants-culinary/craft-breweries/",
        "https://www.visitpittsburgh.com/things-to-do/family-fun/",
        "https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/",
        "https://www.visitpittsburgh.com/things-to-do/arts-culture/",
        "https://www.visitpittsburgh.com/meetings-and-events/",
        "https://www.visitpittsburgh.com/events-festivals/annual-events/",
        "https://www.visitpittsburgh.com/events-festivals/holidays/",
        "https://www.visitpittsburgh.com/events-festivals/halloween-events/",
        "https://www.visitpittsburgh.com/events-festivals/film-festivals/",
        "https://www.visitpittsburgh.com/events-festivals/food-festivals/soul-food-festival/",
        "https://www.visitpittsburgh.com/events-festivals/food-festivals/the-original-pittsburgh-taco-festival/",
        "https://www.visitpittsburgh.com/blog/top-beer-festivals-to-attend-in-pittsburgh-this-fall/"
    ]

    # 爬取所有页面的内容
    scraped_data = scrape_links(scraper_, links)

    # 保存到 JSON 文件，追加到现有内容中
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Food_Festivals.json")

    scraper_.close()

Fetching: https://www.visitpittsburgh.com/events-festivals/?page=14
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=15
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=16
Fetching: https://www.visitpittsburgh.com/things-to-do/
Fetching: https://www.visitpittsburgh.com/restaurants-culinary/
Fetching: https://www.visitpittsburgh.com/restaurants-culinary/craft-breweries/
Fetching: https://www.visitpittsburgh.com/things-to-do/family-fun/
Fetching: https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/
Fetching: https://www.visitpittsburgh.com/things-to-do/arts-culture/
Fetching: https://www.visitpittsburgh.com/meetings-and-events/
Fetching: https://www.visitpittsburgh.com/events-festivals/annual-events/
Fetching: https://www.visitpittsburgh.com/events-festivals/holidays/
Fetching: https://www.visitpittsburgh.com/events-festivals/halloween-events/
Fetching: https://www.visitpittsburgh.com/events-festivals/film-festivals/
Fetching: https://www.

## Picklesburgh
### https://www.picklesburgh.com/

In [51]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower headings
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            # 读取现有的 JSON 文件并合并
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            # 合并新的数据到现有数据
            if isinstance(existing_data, list):
                existing_data.extend(data)  # 如果是列表，将数据合并
            else:
                existing_data.update(data)  # 如果是字典，更新数据
        else:
            existing_data = data  # 如果文件不存在，则新建

        # 保存合并后的数据到文件
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            # 提取内容并按照标题层级组织
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link  # 保存爬取的链接
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://www.picklesburgh.com/",
        "https://www.picklesburgh.com/vendors/",
        "https://www.picklesburgh.com/entertainment/",
        "https://www.picklesburgh.com/games/",
        "https://www.picklesburgh.com/festival-schedule/lil-gherkins-activity-area/",
        "https://www.picklesburgh.com/taste-of-picklesburgh/",
        "https://www.picklesburgh.com/news/",
        "https://www.picklesburgh.com/accessibility/",
        "https://www.picklesburgh.com/visit/getting-here/"
       
    ]

    # 爬取所有页面的内容
    scraped_data = scrape_links(scraper_, links)

    # 保存到 JSON 文件，追加到现有内容中
    save_to_json(scraped_data, "../raw_documents/Picklesburgh.json")

    scraper_.close()

Fetching: https://www.picklesburgh.com/
Fetching: https://www.picklesburgh.com/vendors/
Fetching: https://www.picklesburgh.com/entertainment/
Fetching: https://www.picklesburgh.com/games/
Fetching: https://www.picklesburgh.com/festival-schedule/lil-gherkins-activity-area/
Fetching: https://www.picklesburgh.com/taste-of-picklesburgh/
Fetching: https://www.picklesburgh.com/news/
Fetching: https://www.picklesburgh.com/accessibility/
Fetching: https://www.picklesburgh.com/visit/getting-here/
Data saved to ../raw_documents/Picklesburgh.json


## Pittsburgh Taco Fest
### https://www.pghtacofest.com/

In [53]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower headings
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            # 读取现有的 JSON 文件并合并
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            # 合并新的数据到现有数据
            if isinstance(existing_data, list):
                existing_data.extend(data)  # 如果是列表，将数据合并
            else:
                existing_data.update(data)  # 如果是字典，更新数据
        else:
            existing_data = data  # 如果文件不存在，则新建

        # 保存合并后的数据到文件
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            # 提取内容并按照标题层级组织
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link  # 保存爬取的链接
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://www.pghtacofest.com/",
        "https://www.pghtacofest.com/about",
        "https://www.pghtacofest.com/vendors",
        "https://www.pghtacofest.com/faqs"
       
    ]

    # 爬取所有页面的内容
    scraped_data = scrape_links(scraper_, links)

    # 保存到 JSON 文件，追加到现有内容中
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Taco_Festival.json")

    scraper_.close()

Fetching: https://www.pghtacofest.com/
Fetching: https://www.pghtacofest.com/about
Fetching: https://www.pghtacofest.com/vendors
Fetching: https://www.pghtacofest.com/faqs
Data saved to ../raw_documents/Pittsburgh_Taco_Festival.json


## Pittsburgh Restaurant Week
### https://pittsburghrestaurantweek.com/

In [54]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower headings
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            # 读取现有的 JSON 文件并合并
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            # 合并新的数据到现有数据
            if isinstance(existing_data, list):
                existing_data.extend(data)  # 如果是列表，将数据合并
            else:
                existing_data.update(data)  # 如果是字典，更新数据
        else:
            existing_data = data  # 如果文件不存在，则新建

        # 保存合并后的数据到文件
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            # 提取内容并按照标题层级组织
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link  # 保存爬取的链接
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://pittsburghrestaurantweek.com/",
        "https://pittsburghrestaurantweek.com/about/",
        "https://pittsburghrestaurantweek.com/about/history/",
        "https://pittsburghrestaurantweek.com/restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2024-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2024-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2023-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2023-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2022-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2022-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2021-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2021-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2020-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2020-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2019-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2019-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2018-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2018-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2017-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2017-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2016-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2016-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2015-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2015-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2014-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2014-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2013-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2013-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2012-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2012-restaurants/"
       
    ]

    # 爬取所有页面的内容
    scraped_data = scrape_links(scraper_, links)

    # 保存到 JSON 文件，追加到现有内容中
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Restaurant_Week.json")

    scraper_.close()

Fetching: https://pittsburghrestaurantweek.com/
Fetching: https://pittsburghrestaurantweek.com/about/
Fetching: https://pittsburghrestaurantweek.com/about/history/
Fetching: https://pittsburghrestaurantweek.com/restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/summer-2024-restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/winter-2024-restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/summer-2023-restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/winter-2023-restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/summer-2022-restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/winter-2022-restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/summer-2021-restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/winter-2021-restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/summer-2020-restaurants/
Fetching: https://pi

## Little Italy Days
### https://littleitalydays.com/

In [55]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower headings
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            # 读取现有的 JSON 文件并合并
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            # 合并新的数据到现有数据
            if isinstance(existing_data, list):
                existing_data.extend(data)  # 如果是列表，将数据合并
            else:
                existing_data.update(data)  # 如果是字典，更新数据
        else:
            existing_data = data  # 如果文件不存在，则新建

        # 保存合并后的数据到文件
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            # 提取内容并按照标题层级组织
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link  # 保存爬取的链接
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://littleitalydays.com/",
        "https://littleitalydays.com/entertainment-schedule/",
        "https://littleitalydays.com/getting-around/",
        "https://littleitalydays.com/about-us/",
        "https://littleitalydays.com/faq/",
        "https://littleitalydays.com/bloomfield-businesses/"
    ]

    # 爬取所有页面的内容
    scraped_data = scrape_links(scraper_, links)

    # 保存到 JSON 文件，追加到现有内容中
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Little_Italy_Day.json")

    scraper_.close()

Fetching: https://littleitalydays.com/
Fetching: https://littleitalydays.com/entertainment-schedule/
Fetching: https://littleitalydays.com/getting-around/
Fetching: https://littleitalydays.com/about-us/
Fetching: https://littleitalydays.com/faq/
Fetching: https://littleitalydays.com/bloomfield-businesses/
Data saved to ../raw_documents/Pittsburgh_Little_Italy_Day.json


## Banana Split Fest
### https://bananasplitfest.com/

In [56]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower headings
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            # 读取现有的 JSON 文件并合并
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            # 合并新的数据到现有数据
            if isinstance(existing_data, list):
                existing_data.extend(data)  # 如果是列表，将数据合并
            else:
                existing_data.update(data)  # 如果是字典，更新数据
        else:
            existing_data = data  # 如果文件不存在，则新建

        # 保存合并后的数据到文件
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            # 提取内容并按照标题层级组织
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link  # 保存爬取的链接
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://bananasplitfest.com/",
        "https://bananasplitfest.com/activities/",
        "https://bananasplitfest.com/events/princess-pageant/",
        "https://bananasplitfest.com/activities/crafts-games-activities/",
        "https://bananasplitfest.com/activities/participating-vendors/",
        "https://bananasplitfest.com/activities/food/",
        "https://bananasplitfest.com/activities/over-21-area/",
        "https://bananasplitfest.com/events/",
        "https://bananasplitfest.com/events/5k-banana-run/",
        "https://bananasplitfest.com/events/great-american-banana-baking-contest/",
        "https://bananasplitfest.com/events/banana-challenge/",
        "https://bananasplitfest.com/events/blood-drive/",
        "https://bananasplitfest.com/events/cornhole-tournament/",
        "https://bananasplitfest.com/events/car-show/",
        "https://bananasplitfest.com/events/yellow-tie-gala/",
        "https://bananasplitfest.com/activities/entertainment/",
        "https://bananasplitfest.com/schedule/",
        "https://bananasplitfest.com/information/parking/",
        "https://bananasplitfest.com/information/plan-your-visit/",
        "https://bananasplitfest.com/history/",
        "https://bananasplitfest.com/information/media/"
    ]

    # 爬取所有页面的内容
    scraped_data = scrape_links(scraper_, links)

    # 保存到 JSON 文件，追加到现有内容中
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Banana_Split_Festival.json")

    scraper_.close()

Fetching: https://bananasplitfest.com/
Fetching: https://bananasplitfest.com/activities/
Fetching: https://bananasplitfest.com/events/princess-pageant/
Fetching: https://bananasplitfest.com/activities/crafts-games-activities/
Fetching: https://bananasplitfest.com/activities/participating-vendors/
Fetching: https://bananasplitfest.com/activities/food/
Fetching: https://bananasplitfest.com/activities/over-21-area/
Fetching: https://bananasplitfest.com/events/
Fetching: https://bananasplitfest.com/events/5k-banana-run/
Fetching: https://bananasplitfest.com/events/great-american-banana-baking-contest/
Fetching: https://bananasplitfest.com/events/banana-challenge/
Fetching: https://bananasplitfest.com/events/blood-drive/
Fetching: https://bananasplitfest.com/events/cornhole-tournament/
Fetching: https://bananasplitfest.com/events/car-show/
Fetching: https://bananasplitfest.com/events/yellow-tie-gala/
Fetching: https://bananasplitfest.com/activities/entertainment/
Fetching: https://bananaspl