# Music
## Symphony
### https://www.pittsburghsymphony.org/


In [15]:
###################################### Events ######################################
import json
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

class PittsburghSymphonyScraper:
    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=chrome_options)
        self.base_url = "https://www.pittsburghsymphony.org/calendar?page="
        self.result_file = "../raw_documents/Pittsburgh_Symphony.json"

    def fetch_page(self, page_num):
        url = self.base_url + str(page_num)
        try:
            self.driver.get(url)
            time.sleep(3)  # wait the page to load
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, "html.parser")
            return soup
        except Exception as e:
            print(f"Error fetching page {page_num}: {e}")
            return None

    def extract_event_info(self, soup):
        events = []
        event_list = soup.find_all('article', class_='event')
        for event in event_list:
            try:
                title = event.find('h3', class_='title').get_text(strip=True)
                time = event.find('time', class_='range').get_text(strip=True)
                venue = event.find('div', class_='venue').get_text(strip=True)
                organization = event.find('div', class_='organization').get_text(strip=True)

                event_info = {
                    "event_name": title,
                    "event_time": time,
                    "venue": venue,
                    "organization": organization
                }
                events.append(event_info)
            except Exception as e:
                print(f"Error extracting event info: {e}")
        return events

    def append_to_json(self, events):    
        # write in time
        try:
            with open(self.result_file, 'a') as f:
                for event in events:
                    json.dump(event, f, indent=4)
                    f.write("\n")
        except Exception as e:
            print(f"Error writing to JSON file: {e}")

    def scrape(self):
        for page_num in range(1, 6):
            print(f"Scraping page {page_num}...")
            soup = self.fetch_page(page_num)
            if soup:
                events = self.extract_event_info(soup)
                self.append_to_json(events)

    def close(self):
        self.driver.quit()


if __name__ == "__main__":
    scraper = PittsburghSymphonyScraper()
    scraper.scrape()
    scraper.close()
    print("Pittsburgh Symphony Scraping completed.")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Pittsburgh Symphony Scraping completed.


In [14]:
###################################### Musicians ######################################
import requests
from bs4 import BeautifulSoup
import json
import os

url = "https://www.pittsburghsymphony.org/pso_home/web/musicians"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

musicians_data = {}
json_file_path = "../raw_documents/Pittsburgh_Symphony.json"

if os.path.exists(json_file_path):
    with open(json_file_path, "r") as json_file:
        try:
            musicians_data = json.load(json_file)
        except json.JSONDecodeError:
            musicians_data = {}

def get_musician_introduction(subpage_url):
    try:
        subpage_response = requests.get(subpage_url)
        subpage_soup = BeautifulSoup(subpage_response.content, "html.parser")
        bio_text_div = subpage_soup.find("div", class_="bio-text")
        if bio_text_div:
            return bio_text_div.get_text(strip=True, separator=" ")
    except Exception as e:
        print(f"Error accessing {subpage_url}: {e}")
    return None

for section in soup.find_all("h3"):
    section_name = section.get_text().strip()
    if section_name in musicians_data:
        continue
    musicians_data[section_name] = []
    musician_list = section.find_next("p")
    
    if musician_list:
        for musician in musician_list.find_all("a"):
            musician_name = musician.get_text(strip=True)
            musician_title = musician_list.get_text(strip=True).split('|')[1].strip() if '|' in musician_list.get_text() else ""
            musician_data = {
                "name": musician_name,
                "title": musician_title
            }
            
            musician_subpage_url = musician.get("href")
            if musician_subpage_url:
                full_subpage_url = f"https://www.pittsburghsymphony.org{musician_subpage_url}"
                introduction = get_musician_introduction(full_subpage_url)
                if introduction:
                    musician_data["introduction"] = introduction
            
            musicians_data[section_name].append(musician_data)

with open(json_file_path, "w") as json_file:
    json.dump(musicians_data, json_file, indent=4)
    json_file.write("\n") 

print("Musicians' data has been appended to the JSON file.")

Musicians' data has been appended to the JSON file.


In [5]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower headings
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            if isinstance(existing_data, list):
                existing_data.extend(data)
            else:
                existing_data.update(data) 
        else:
            existing_data = data 

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link  
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()
    links = [
        "https://www.pittsburghsymphony.org/pso_home/web/visit-landing/frequently-asked-questions",
        "https://www.pittsburghsymphony.org/pso_home/web/tickets-landing/seating-charts",
        "https://www.pittsburghsymphony.org/pso_home/web/give-landing/corporate-partnerships/dining-partners",
        "https://www.pittsburghsymphony.org/pso_home/web/visit-landing/directions-parking-lodging",
        "https://www.pittsburghsymphony.org/pso_home/web/visit-landing/accessibility-and-information"
    ]
    scraped_data = scrape_links(scraper_, links)
    save_to_json(scraped_data, "../raw_documents/Music_Culture/1_Symphony_Others.json")

    scraper_.close()

Fetching: https://www.pittsburghsymphony.org/pso_home/web/visit-landing/frequently-asked-questions
Fetching: https://www.pittsburghsymphony.org/pso_home/web/tickets-landing/seating-charts
Fetching: https://www.pittsburghsymphony.org/pso_home/web/give-landing/corporate-partnerships/dining-partners
Fetching: https://www.pittsburghsymphony.org/pso_home/web/visit-landing/directions-parking-lodging
Fetching: https://www.pittsburghsymphony.org/pso_home/web/visit-landing/accessibility-and-information
Data saved to ../raw_documents/Music_Culture/1_Symphony_Others.json


In [4]:
import json

def flatten_event(event):
    """Flatten an event entry into a single line of text."""
    parts = []
    if "event_name" in event:
        parts.append(f"Event Name: {event['event_name']}")
    if "event_time" in event:
        parts.append(f"Event Time: {event['event_time']}")
    if "venue" in event:
        parts.append(f"Venue: {event['venue']}")
    if "organization" in event:
        parts.append(f"Organization: {event['organization']}")
    return ", ".join(parts)

def flatten_section(section_name, section_data):
    """Flatten a section with subsections into a single line of text."""
    return f"{section_name}: " + " ".join(section_data)

def process_json(data):
    """Process the JSON data to flatten events and sections."""
    flattened = []
    
    for entry in data:
        if "event_name" in entry:
            flattened.append(flatten_event(entry))
        else:
            for key, value in entry.items():
                if isinstance(value, dict):
                    for subkey, subvalue in value.items():
                        if isinstance(subvalue, str) and subvalue.strip():
                            flattened.append(f"{subkey}: {subvalue}")
                        elif isinstance(subvalue, list):
                            flattened.append(flatten_section(subkey, subvalue))
                elif isinstance(value, str) and value.strip():
                    flattened.append(f"{key}: {value}")
    
    return flattened

with open('../raw_documents/Music_Culture/1_Symphony_All.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

flattened_data = process_json(data)

with open('../raw_documents/Music_Culture/1_Music_Symphony.txt', 'w', encoding='utf-8') as txt_file:
    for line in flattened_data:
        txt_file.write(line + '\n')

print("The file has been processed and saved as '1_Symphony_All.txt'")

The file has been processed and saved as '1_Symphony_All.txt'


In [6]:
import json

def format_musician_data(instrument, musician):
    """Format each musician's data into a sentence."""
    name = musician['name']
    title = musician['title'] if musician['title'] else f"{instrument} musician"
    return f"For {instrument}, {name} is {title}."

def process_json(data, output_file_path):
    """Process the JSON data and append formatted text to the file."""
    with open(output_file_path, 'a', encoding='utf-8') as file:
        if isinstance(data, list):
            for section in data:
                for instrument, musicians in section.items():
                    for musician in musicians:
                        sentence = format_musician_data(instrument, musician)
                        file.write(sentence + '\n')
        elif isinstance(data, dict):
            for instrument, musicians in data.items():
                for musician in musicians:
                    sentence = format_musician_data(instrument, musician)
                    file.write(sentence + '\n')

with open('../raw_documents/Music_Culture/1_Symphony_All.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
process_json(data, '../raw_documents/Music_Culture/1_Music_Symphony.txt')
print("Data has been successfully appended.")

Data has been successfully appended.


In [12]:
import json

with open('../raw_documents/Music_Culture/1_Symphony_All.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

def flatten_json_content(data):
    flattened_text = ""
    for item in data:
        if isinstance(item, str):
            flattened_text += item + ": " 
        elif isinstance(item, list):
            for sentence in item:
                flattened_text += sentence + " "  
        flattened_text += "\n\n"
    return flattened_text.strip()

output_text = flatten_json_content(data)
with open('output.txt', 'w', encoding='utf-8') as file:
    file.write(output_text)
print("Text has been successfully written to output.txt.")

Text has been successfully written to output.txt.


## Opera
### https://pittsburghopera.org/

In [2]:
import json
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """set domain name and URL"""
        self.current_domain = url.split("/")[2]  
        self.current_url = url

    def fetch(self, url: str, raw_html: bool = False):
        """capture the webpage and choose whether to remove JS or CSS"""
        soup = self.get_soup(url)
        if not raw_html:
            soup = self.remove_js_css(soup)
        return soup, self.get_links(soup), self.get_title(soup)

    def get_html(self, url: str):
        """get HTML"""
        self.set_domain(url) 
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """parse and get BeautifulSoup object"""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def get_links(self, soup: BeautifulSoup):
        """get external links"""
        links = [link.get("href") for link in soup.find_all("a")]
        return self.filter_links(links)

    def get_title(self, soup: BeautifulSoup):
        """title"""
        if soup.title is None:
            return f"untitled_{self.get_timestamp()}"
        title = soup.title.string.replace(" ", "_").replace("/", "__")
        return title.replace("\n", "")

    def remove_js_css(self, soup: BeautifulSoup):
        """remove JS or CSS"""
        for script in soup(["script", "style"]):
            script.extract()  
        return soup

    def remove_unnecessary_sections(self, soup: BeautifulSoup):
        """remove header and footer"""
        for tag in ["header", "footer", "nav", "aside"]:
            for element in soup.find_all(tag):
                element.decompose()
        return soup

    def filter_links(self, links: list):
        filtered_links = []
        for link in links:
            if link is None:
                continue
            elif link.startswith("#"):
                continue 
            elif link.startswith("http"):
                filtered_links.append(link)
            elif link.startswith("//"):
                filtered_links.append(f"https:{link}")
            elif link.startswith("/"):
                filtered_links.append(f"https://{self.current_domain}{link}") 
            else:
                filtered_links.append(f"{self.current_url}/{link}")
        return filtered_links

    def get_timestamp(self):
        return datetime.now().strftime("%Y%m%d_%H%M%S")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()


def organize_content_by_heading(soup: BeautifulSoup):
    data = {}
    current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None

    for tag in soup.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p']):
        if tag.name == 'h2':
            current_h2 = tag.get_text(strip=True)
            data[current_h2] = {}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h2][current_h3] = {}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h2][current_h3][current_h4] = {}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h2][current_h3][current_h4][current_h5] = {}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h2][current_h3][current_h4][current_h5][current_h6] = {}
        elif tag.name == 'p':
            if current_h6:
                data[current_h2][current_h3][current_h4][current_h5][current_h6].setdefault('content', []).append(tag.get_text(strip=True))
            elif current_h5:
                data[current_h2][current_h3][current_h4][current_h5].setdefault('content', []).append(tag.get_text(strip=True))
            elif current_h4:
                data[current_h2][current_h3][current_h4].setdefault('content', []).append(tag.get_text(strip=True))
            elif current_h3:
                data[current_h2][current_h3].setdefault('content', []).append(tag.get_text(strip=True))
            elif current_h2:
                data[current_h2].setdefault('content', []).append(tag.get_text(strip=True))
    
    return data


def save_to_json(data, filename):
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")


def scrape_links(scraper_, links):
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            
            soup = scraper_.remove_js_css(soup)
            soup = scraper_.remove_unnecessary_sections(soup)

            page_data = organize_content_by_heading(soup)
            page_data["url"] = link  

            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")
    
    return all_data


if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://pittsburghopera.org/calendar?timequery=week&prev=7+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-1+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-9+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-17+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-25+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-33+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-41+&start=1722484800000&end=17251451400000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-57+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-97+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-105+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-113+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-121+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-129+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-145+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-153+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-161+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=week&prev=-169+&start=1722484800000&end=1725145140000",
        "https://pittsburghopera.org/calendar?timequery=month&prev=-169+&start=1743480000000&end=1746053940000",
        "https://pittsburghopera.org/calendar?timequery=month&prev=-169+&start=1746072000000&end=1748732340000",
    ]

    scraped_data = scrape_links(scraper_, links)
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Opera.json")
    scraper_.close()

Fetching: https://pittsburghopera.org/
Fetching: https://pittsburghopera.org/about/mission-history
Fetching: https://pittsburghopera.org/about/inclusion-diversity-equity-accessibility-idea/
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=71+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=55+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=47+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=39+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=31+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=23+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?timequery=week&prev=15+&start=1722484800000&end=1725145140000
Fetching: https://pittsburghopera.org/calendar?

In [29]:
%pip install webdriver-manager
%pip install --upgrade selenium webdriver-manager

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [32]:
import re

input_file = 'input.txt'
output_file = '../raw_documents/Music_Culture/2_Opera.txt'

def process_event_block(event_block):
    event_name = ''
    event_date = ''
    event_time = ''
    venue = 'No venue'

    event_name_match = re.search(r'title\.textContent\s*=\s*"(.+?)"', event_block)
    if event_name_match:
        event_name = event_name_match.group(1).strip()

    event_date_match = re.search(r'paragraphDate\.textContent\s*=\s*"(.+?)"', event_block)
    if event_date_match:
        event_date = event_date_match.group(1).strip()

    event_time_match = re.search(r'paragraphTime\.textContent\s*=\s*"(.+?)"', event_block)
    if event_time_match:
        event_time = event_time_match.group(1).strip()

    venue_match = re.search(r'paragraph\.innerHTML\s*=\s*\'(.+?)\'', event_block)
    if venue_match:
        venue_html = venue_match.group(1).strip()
        venue = re.sub(r'<.*?>', '', venue_html)
    if venue == '':
        venue = None

    if venue:
        event_detail = f"Event Name: {event_name}, Event Time: {event_date}{event_time}, Venue: {venue}, Organization: Pittsburgh Opera"
    else:
        event_detail = f"Event Name: {event_name}, Event Time: {event_date}{event_time}, Organization: Pittsburgh Opera"

    return event_detail

def append_to_txt_file(event_details):
    with open(output_file, 'a', encoding='utf-8') as file:
        for event in event_details:
            file.write(event + '\n')

def main():
    with open(input_file, 'r', encoding='utf-8') as file:
        content = file.read()

    event_blocks = re.split(r'(?:title\.textContent)', content)
    event_blocks = ['title.textContent' + block for block in event_blocks if block.strip()]

    event_details = []
    for block in event_blocks:
        event_detail = process_event_block(block)
        event_details.append(event_detail)

    append_to_txt_file(event_details)

if __name__ == "__main__":
    main()

活动信息已成功追加到文件 ../raw_documents/Music_Culture/2_Opera.txt 中！


In [44]:
import requests
from bs4 import BeautifulSoup
import sys
import re

def extract_text_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  
        soup = BeautifulSoup(response.content, 'html.parser')

        title = soup.title.string.strip() if soup.title else 'No Title'

        for tag in soup(["script", "style", "header", "footer", "nav", "aside"]):
            tag.decompose()

        for tag in soup.find_all(class_=re.compile(".*(nav|footer|menu|social|sidebar).*", re.I)):
            tag.decompose()
        for tag in soup.find_all(id=re.compile(".*(nav|footer|menu|social|sidebar).*", re.I)):
            tag.decompose()

        stop_phrases = [
            "Follow us on", "Contact Us", "Privacy Policy", "©",
            "BUY TICKETS", "GIVE NOW", "SUBSCRIBE", "Site Map",
            "Board Of Directors Login"
        ]

        content = []
        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li']):
            text = ' '.join(element.stripped_strings)
            if text:
                if any(phrase.lower() in text.lower() for phrase in stop_phrases):
                    continue
                content.append(text)

        if not content:
            print(f"No content found for {url}", file=sys.stderr)
            return f"URL: {url}\nTitle: {title}\n\nNo relevant content found.\n\n{'-'*80}\n\n"

        formatted_paragraphs = []
        current_paragraph = ''
        for line in content:
            if line.endswith(':'):
                if current_paragraph:
                    formatted_paragraphs.append(current_paragraph.strip('; '))
                current_paragraph = line.rstrip(':') + ': '
            else:
                current_paragraph += line + '; '

        if current_paragraph:
            formatted_paragraphs.append(current_paragraph.strip('; '))

        formatted_text = '\n\n'.join(formatted_paragraphs)

        return f"URL: {url}\nTitle: {title}\n\n{formatted_text}\n\n{'-'*80}\n\n"
    except Exception as e:
        print(f"Error fetching {url}: {e}", file=sys.stderr)
        return ''

def append_text_to_file(text, file_path):
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(text)

if __name__ == "__main__":
    links = [
        "https://pittsburghopera.org/tickets/opera-faqs/",
        "https://pittsburghopera.org/tickets/subscribe-today",
        "https://pittsburghopera.org/tickets/season-pass",
        "https://pittsburghopera.org/tickets/groups-tickets",
        "https://pittsburghopera.org/tickets/promotions-and-discounts/",
        "https://pittsburghopera.org/tickets/student-tickets",
        "https://pittsburghopera.org/tickets/cheap-seats",
        "https://pittsburghopera.org/tickets/give-the-gift-of-opera/",
        "https://pittsburghopera.org/tickets/ticket-offices-policies/",
        "https://pittsburghopera.org/tickets/accessibility/",
        "https://pittsburghopera.org/tickets/free-rideshare-vouchers/",
        "https://pittsburghopera.org/tickets/free-childcare-services"
    ]
    output_file = '../raw_documents/Music_Culture/2_Opera.txt'

    for url in links:
        text_content = extract_text_from_url(url)
        append_text_to_file(text_content, output_file)

In [29]:
import json
import re

def clean_text(text: str):
    cleaned_text = re.sub(r'\n+', ' ', text)
    return cleaned_text.strip()

def clean_json_file(input_file, output_file):
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for entry in data:
            if 'content' in entry:
                entry['content'] = clean_text(entry['content'])

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

        print(f"Cleaned data saved to {output_file}")

    except Exception as e:
        print(f"Error processing JSON file: {e}")

if __name__ == "__main__":
    input_file = "../raw_documents/Pittsburgh_Opera.json"  
    output_file = "../raw_documents/Pittsburgh_Opera_Cleaned.json"
    clean_json_file(input_file, output_file)

Cleaned data saved to ../raw_documents/Pittsburgh_Opera_Cleaned.json


## Cultural Trust
### https://trustarts.org/

In [9]:
import json
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10) 
        self.base_url = "https://trustarts.org"

    def get_soup(self, url: str):
        self.driver.get(url)
        time.sleep(2)  
        return BeautifulSoup(self.driver.page_source, "html.parser")
    
    def scrape_event(self, soup):
        events = []
        event_tags = soup.find_all('article', class_='event')

        for event_tag in event_tags:
            try:
                title = event_tag.find('h3', class_='title').get_text(strip=True)
                date = event_tag.find('time', class_='range').get_text(strip=True)

                venue = event_tag.find('div', class_='venue').get_text(strip=True)
                organization = event_tag.find('div', class_='organization').get_text(strip=True)

                categories = [cat.get_text(strip=True) for cat in event_tag.find_all('li', class_='category')]

                subpage_url = event_tag.find('a')['href']

                event_data = {
                    "title": title,
                    "date": date,
                    "venue": venue,
                    "organization": organization,
                    "category": categories,
                    "url": f"{self.base_url}{subpage_url}",
                }

                events.append(event_data)

            except AttributeError as e:
                print(f"Error parsing event: {e}")
        return events

    def scrape_events_from_pages(self, urls):
        all_events = []
        for url in urls:
            print(f"Scraping page: {url}")
            soup = self.get_soup(url)
            events = self.scrape_event(soup)
            all_events.extend(events)
        return all_events

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()


def save_to_json(data, filename):
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")


if __name__ == "__main__":
    scraper = Scraper()
    urls = [
        "https://trustarts.org/",
        "https://trustarts.org/calendar?utf8=%E2%9C%93&utf8=%E2%9C%93&genre=All+Genres&organization_id=&start_date=&end_date=2017%2F06%2F14&filter%5Bmin%5D=2024-10-15T13%3A07%3A06-04%3A00&filter%5Bmax%5D=2026-04-15+13%3A07%3A06+-0400&filter%5Bcurrent_page%5D=production",
        "https://trustarts.org/calendar?end_date=722&genre=&order_by=production&page=2",
        "https://trustarts.org/calendar?ad=40&am=broad&askid=2f811b4c-704f-4054-a821-1b9934816698-0-ab_msb&l=sem&o=22837&page=3&q=Byham+Theater+Pittsburgh&qsrc=999",
        "https://trustarts.org/calendar?cluid=3794577&page=4",
        "https://trustarts.org/calendar?end_date=106&genre=&page=5",
        "https://trustarts.org/calendar?cid=Tess_Order&cluid=294&page=6",
        "https://trustarts.org/calendar?ad=102&am=532&an=msn_s&l=sem&o=22837&page=7",
        "https://trustarts.org/calendar?end_date=364&order_by=production&page=8",
        "https://trustarts.org/calendar?ad=102&am=532&an=324&l=sem&page=9&q=594&qsrc=999",
        "https://trustarts.org/calendar?ad=172&am=broad&an=msn_s&l=sem&o=22837&page=10&q=Pittsburgh%2BCultural%2BTrust&qsrc=999",
        "https://trustarts.org/calendar?end_date=&genre=All+Genres&order_by=production&page=11",
        "https://trustarts.org/calendar?cid=tess_order&cluid=830&page=12",
        "https://trustarts.org/calendar?cid=tess_order&cluid=830&page=13",
        "https://trustarts.org/calendar?cid=tess_order&cluid=830&page=14",
        "https://trustarts.org/calendar?cid=12&page=15",
        "https://trustarts.org/calendar?ad=102&page=16&q=Benedum%2BCenter&qsrc=274"
    ]

    scraped_events = scraper.scrape_events_from_pages(urls)
    save_to_json(scraped_events, "../raw_documents/Pittsburgh_Trustarts.json")
    scraper.close()

Scraping page: https://trustarts.org/
Scraping page: https://trustarts.org/calendar?utf8=%E2%9C%93&utf8=%E2%9C%93&genre=All+Genres&organization_id=&start_date=&end_date=2017%2F06%2F14&filter%5Bmin%5D=2024-10-15T13%3A07%3A06-04%3A00&filter%5Bmax%5D=2026-04-15+13%3A07%3A06+-0400&filter%5Bcurrent_page%5D=production
Error parsing event: 'NoneType' object has no attribute 'get_text'
Error parsing event: 'NoneType' object has no attribute 'get_text'
Error parsing event: 'NoneType' object has no attribute 'get_text'
Error parsing event: 'NoneType' object has no attribute 'get_text'
Error parsing event: 'NoneType' object has no attribute 'get_text'
Scraping page: https://trustarts.org/calendar?end_date=722&genre=&order_by=production&page=2
Error parsing event: 'NoneType' object has no attribute 'get_text'
Error parsing event: 'NoneType' object has no attribute 'get_text'
Error parsing event: 'NoneType' object has no attribute 'get_text'
Error parsing event: 'NoneType' object has no attribute '

In [12]:
#################### Except events, more information ####################
import json
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10) 
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """Set domain name and URL."""
        self.current_domain = url.split("/")[2] 
        self.current_url = url

    def fetch(self, url: str, raw_html: bool = False):
        """Capture the webpage and choose whether to remove JS or CSS."""
        soup = self.get_soup(url)
        if not raw_html:
            soup = self.remove_js_css(soup)
        return soup, self.get_links(soup), self.get_title(soup)

    def get_html(self, url: str):
        """Get HTML."""
        self.set_domain(url)
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and get BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def get_links(self, soup: BeautifulSoup):
        """Get external links."""
        links = [link.get("href") for link in soup.find_all("a")]
        return self.filter_links(links)

    def get_title(self, soup: BeautifulSoup):
        """Get the title."""
        if soup.title is None:
            return f"untitled_{self.get_timestamp()}"
        title = soup.title.string.replace(" ", "_").replace("/", "__")
        return title.replace("\n", "")

    def remove_js_css(self, soup: BeautifulSoup):
        """Remove JS and CSS."""
        for script in soup(["script", "style"]):
            script.extract() 
        return soup

    def remove_unnecessary_sections(self, soup: BeautifulSoup):
        """Remove unnecessary sections such as header, footer, nav, etc."""
        for tag in ["header", "footer", "nav", "aside"]:
            for element in soup.find_all(tag):
                element.decompose()
        return soup

    def filter_links(self, links: list):
        filtered_links = []
        for link in links:
            if link is None:
                continue
            elif link.startswith("#"):
                continue
            elif link.startswith("http"):
                filtered_links.append(link)
            elif link.startswith("//"):
                filtered_links.append(f"https:{link}")
            elif link.startswith("/"):
                filtered_links.append(f"https://{self.current_domain}{link}")
            else:
                filtered_links.append(f"{self.current_url}/{link}")
        return filtered_links

    def get_timestamp(self):
        return datetime.now().strftime("%Y%m%d_%H%M%S")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()


def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None

    for tag in soup.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'article']):
        if tag.name == 'h2':
            current_h2 = tag.get_text(strip=True)
            data[current_h2] = {}
            current_h3 = current_h4 = current_h5 = current_h6 = None 
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h2][current_h3] = {}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h2][current_h3][current_h4] = {}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h2][current_h3][current_h4][current_h5] = {}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h2][current_h3][current_h4][current_h5][current_h6] = {}
        elif tag.name in ['p', 'div', 'span', 'article']:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h2][current_h3][current_h4][current_h5][current_h6].setdefault('content', []).append(content)
                elif current_h5:
                    data[current_h2][current_h3][current_h4][current_h5].setdefault('content', []).append(content)
                elif current_h4:
                    data[current_h2][current_h3][current_h4].setdefault('content', []).append(content)
                elif current_h3:
                    data[current_h2][current_h3].setdefault('content', []).append(content)
                elif current_h2:
                    data[current_h2].setdefault('content', []).append(content)

    return data


def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            if isinstance(existing_data, list):
                existing_data.extend(data)
            else:
                existing_data.update(data) 
        else:
            existing_data = data 

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")


def scrape_links(scraper_, links):
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)

            soup = scraper_.remove_js_css(soup)
            soup = scraper_.remove_unnecessary_sections(soup)

            page_data = organize_content_by_heading(soup)
            page_data["url"] = link

            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data


if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://trustarts.org/pct_home/events/series",
        "https://trustarts.org/pct_home/events/festivals",
        "https://trustarts.org/pct_home/events/groups",
        "https://trustarts.org/pct_home/events/university-student-tickets",
        "https://trustarts.org/pct_home/events/seating-charts",
        "https://trustarts.org/pct_home/events/faq---ticketing",
        "https://trustarts.org/pct_home/events/gift-cards",
        "https://trustarts.org/pct_home/events/official-ticket-source",
        "https://trustarts.org/pct_home/events/venue-tours",
        "https://trustarts.org/pct_home/visual-arts#current",
        "https://trustarts.org/pct_home/visual-arts#upcoming",
        "https://trustarts.org/pct_home/visual-arts#galleries",
        "https://trustarts.org/pct_home/engagement",
        "https://trustarts.org/pct_home/engagement/lullaby-project",
        "https://trustarts.org/pct_home/engagement/broadway-talk-back-series",
        "https://trustarts.org/pct_home/engagement/community-classes-with-mr-messado",
        "https://trustarts.org/pct_home/engagement/cultural-celebrations",
        "https://trustarts.org/pct_home/visit",
        "https://trustarts.org/pct_home/about"
    ]

    scraped_data = scrape_links(scraper_, links)
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Trustarts_copy.json")
    scraper_.close()

Fetching: https://trustarts.org/pct_home/events/series
Fetching: https://trustarts.org/pct_home/events/festivals
Fetching: https://trustarts.org/pct_home/events/groups
Fetching: https://trustarts.org/pct_home/events/university-student-tickets
Fetching: https://trustarts.org/pct_home/events/seating-charts
Fetching: https://trustarts.org/pct_home/events/faq---ticketing
Fetching: https://trustarts.org/pct_home/events/gift-cards
Fetching: https://trustarts.org/pct_home/events/official-ticket-source
Fetching: https://trustarts.org/pct_home/events/venue-tours
Fetching: https://trustarts.org/pct_home/visual-arts#current
Fetching: https://trustarts.org/pct_home/visual-arts#upcoming
Fetching: https://trustarts.org/pct_home/visual-arts#galleries
Fetching: https://trustarts.org/pct_home/engagement
Fetching: https://trustarts.org/pct_home/engagement/lullaby-project
Fetching: https://trustarts.org/pct_home/engagement/broadway-talk-back-series
Fetching: https://trustarts.org/pct_home/engagement/comm

In [55]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re

visited = set()

def is_valid_url(url, base_url):
    return url.startswith(base_url)

def extract_subpages(url, base_url, current_depth, max_depth):
    if current_depth > max_depth or url in visited:
        return
    
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
        }
        response = requests.get(url)
        response.raise_for_status() 
        visited.add(url)  
        
        soup = BeautifulSoup(response.content, 'html.parser')
        print(url) 
    
        for link in soup.find_all('a', href=True):
            subpage_url = link['href']
            subpage_url = urljoin(base_url, subpage_url)
            
            if is_valid_url(subpage_url, base_url) and subpage_url not in visited:
                extract_subpages(subpage_url, base_url, current_depth + 1, max_depth)

        time.sleep(5)
    
    except Exception as e:
        print(f"Error fetching {url}: {e}")

if __name__ == "__main__":
    # base_url = "https://trustarts.org/pct_home/"
    base_url = "https://trustarts.org/"
    start_urls = [
        "https://trustarts.org/"
        # "https://trustarts.org/pct_home/events",
        # "https://trustarts.org/pct_home/visual-arts",
        # "https://trustarts.org/pct_home/engagement",
        # "https://trustarts.org/pct_home/support",
        # "https://trustarts.org/pct_home/visit",
        # "https://trustarts.org/pct_home/about"
    ]
    
    max_depth = 5 
    for start_url in start_urls:
        extract_subpages(start_url, base_url, current_depth=0, max_depth=max_depth)

Error fetching https://trustarts.org: 403 Client Error: Forbidden for url: https://trustarts.org/


# Museum
## Carnegie Museum
### https://carnegiemuseums.org/

In [25]:
import json
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(20) 
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """set domain name and URL"""
        self.current_domain = url.split("/")[2]
        self.current_url = url

    def fetch(self, url: str):
        """capture the webpage"""
        soup = self.get_soup(url)
        return soup

    def get_html(self, url: str):
        """get HTML"""
        self.set_domain(url)
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """parse and get BeautifulSoup object"""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")
    
    def remove_unnecessary_sections(self, soup: BeautifulSoup):
        """remove header and footer"""
        for tag in ["header", "footer", "nav", "aside"]:
            for element in soup.find_all(tag):
                element.decompose()
        return soup

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def scrape_links(scraper_, links):
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            events = soup.find_all('article', class_='event-card')

            for event in events:
                page_data = {}

                event_name_tag = event.find('h2')
                event_name = event_name_tag.text.strip() if event_name_tag else "No Event Name"
                page_data["event_name"] = event_name

                event_start = event.get('data-event-start', 'No Start Date')
                event_end = event.get('data-event-end', 'No End Date')
                page_data["event_start"] = event_start
                page_data["event_end"] = event_end

                venue_tag = event.find('a', class_='event-card__venue-tag')
                venue = venue_tag.text.strip() if venue_tag else "No Venue"
                page_data["venue"] = venue

                event_type_tag = event.find('a', class_='event-card__event-type')
                event_type = event_type_tag.text.strip() if event_type_tag else "No Event Type"
                page_data["event_type"] = event_type

                featured_flag_tag = event.find('span', class_='event-card__featured-flag')
                featured_flag = "Featured" if featured_flag_tag else "Not Featured"
                page_data["featured_flag"] = featured_flag

                all_data.append(page_data)

        except Exception as e:
            print(f"Error fetching {link}: {e}")
    
    return all_data

def save_to_json(data, filename):
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://carnegiemuseums.org/events/",
        "https://carnegiemuseums.org/events/page/2/",
        "https://carnegiemuseums.org/events/page/3/",
        "https://carnegiemuseums.org/events/page/4/",
        "https://carnegiemuseums.org/events/page/5/"
    ]

    scraped_data = scrape_links(scraper_, links)
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")
    scraper_.close()

Fetching: https://carnegiemuseums.org/events/
Fetching: https://carnegiemuseums.org/events/page/2/
Fetching: https://carnegiemuseums.org/events/page/3/
Fetching: https://carnegiemuseums.org/events/page/4/
Fetching: https://carnegiemuseums.org/events/page/5/
Data saved to ../raw_documents/Pittsburgh_Carnegie_Museums.json


In [27]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os

class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(20) 
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """set domain name and URL"""
        self.current_domain = url.split("/")[2]
        self.current_url = url

    def fetch(self, url: str):
        """capture the webpage"""
        soup = self.get_soup(url)
        return soup

    def get_html(self, url: str):
        """get HTML"""
        self.set_domain(url)
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """parse and get BeautifulSoup object"""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")
    
    def remove_unnecessary_sections(self, soup: BeautifulSoup):
        """remove header and footer"""
        for tag in ["header", "footer", "nav", "aside"]:
            for element in soup.find_all(tag):
                element.decompose()
        return soup

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()


def scrape_page(scraper_, link):
    try:
        print(f"Fetching: {link}")
        soup = scraper_.get_soup(link)
        scraper_.remove_unnecessary_sections(soup)

        page_data = []
        sections = soup.find_all(['h2', 'p'])
        current_title = None

        for element in sections:
            if element.name == 'h2':
                current_title = element.text.strip()
            elif element.name == 'p' and current_title:
                page_data.append({
                    "title": current_title,
                    "content": element.text.strip()
                })

        return page_data

    except Exception as e:
        print(f"Error fetching {link}: {e}")
        return []


def append_to_json_file(data, filename):
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
        else:
            existing_data = []
        existing_data.extend(data)

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)

        print(f"Data appended to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")


if __name__ == "__main__":
    scraper_ = Scraper()

    link = "https://carnegieart.org/about/our-story/"

    scraped_data = scrape_page(scraper_, link)
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")
    scraper_.close()

Fetching: https://carnegieart.org/about/our-story/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json


In [35]:
import json
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(20)  
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """set domain name and URL"""
        self.current_domain = url.split("/")[2]
        self.current_url = url

    def fetch(self, url: str):
        """capture the webpage"""
        soup = self.get_soup(url)
        return soup

    def get_html(self, url: str):
        """get HTML"""
        self.set_domain(url)
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """parse and get BeautifulSoup object"""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def scrape_page(scraper_, link):
    try:
        print(f"Fetching: {link}")
        soup = scraper_.get_soup(link)

        page_data = []
        events = soup.find_all('div', class_='cmoa-grid-item')

        for event in events:
            event_data = {}
            event_name_tag = event.find('a', class_='font-bold')
            event_name = event_name_tag.text.strip() if event_name_tag else "No Event Name"
            event_data["event_name"] = event_name

            time_tag = event.find('div', class_='break-words')
            event_time = time_tag.text.strip() if time_tag else "No Time Information"
            event_data["time"] = event_time

            location_tag = event.find('ul', class_='metadata')
            if location_tag:
                location = location_tag.find('li').text.strip()
            else:
                location = "No Location"
            event_data["location"] = location

            page_data.append(event_data)

        return page_data

    except Exception as e:
        print(f"Error fetching {link}: {e}")
        return []

def append_to_json_file(data, filename):
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
        else:
            existing_data = []

        existing_data.extend(data)

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)

        print(f"Data appended to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

if __name__ == "__main__":
    scraper_ = Scraper()
    link = "https://carnegieart.org/art/whats-on-view/"
    scraped_data = scrape_page(scraper_, link)
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")
    scraper_.close()

Fetching: https://carnegieart.org/art/whats-on-view/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json


In [36]:
import json
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(20)  
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """set domain name and URL"""
        self.current_domain = url.split("/")[2]
        self.current_url = url

    def fetch(self, url: str):
        """capture the webpage"""
        soup = self.get_soup(url)
        return soup

    def get_html(self, url: str):
        """get HTML"""
        self.set_domain(url)
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """parse and get BeautifulSoup object"""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def scrape_page(scraper_, link):
    try:
        print(f"Fetching: {link}")
        soup = scraper_.get_soup(link)

        page_data = []
        events = soup.find_all('div', class_='cmoa-grid-item')

        for event in events:
            event_data = {}

            event_name_tag = event.find('a', class_='font-bold')
            event_name = event_name_tag.text.strip() if event_name_tag else "No Event Name"
            event_data["event_name"] = event_name

            time_tag = event.find('div', class_='break-words')
            event_time = time_tag.text.strip() if time_tag else "No Time Information"
            event_data["time"] = event_time

            location_tag = event.find('ul', class_='metadata')
            if location_tag:
                location = location_tag.find('li').text.strip()
            else:
                location = "No Location"
            event_data["location"] = location

            page_data.append(event_data)

        return page_data

    except Exception as e:
        print(f"Error fetching {link}: {e}")
        return []

def append_to_json_file(data, filename):
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
        else:
            existing_data = []

        existing_data.extend(data)

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)

        print(f"Data appended to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

if __name__ == "__main__":
    scraper_ = Scraper()
    link = "https://carnegieart.org/art/whats-on-view/page/2/"
    scraped_data = scrape_page(scraper_, link)
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")
    scraper_.close()

Fetching: https://carnegieart.org/art/whats-on-view/page/2/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json


In [37]:
import json
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(20) 
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """set domain name and URL"""
        self.current_domain = url.split("/")[2]
        self.current_url = url

    def fetch(self, url: str):
        """capture the webpage"""
        soup = self.get_soup(url)
        return soup

    def get_html(self, url: str):
        """get HTML"""
        self.set_domain(url)
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """parse and get BeautifulSoup object"""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def scrape_page(scraper_, link):
    try:
        print(f"Fetching: {link}")
        soup = scraper_.get_soup(link)

        page_data = []
        events = soup.find_all('div', class_='cmoa-grid-item')

        for event in events:
            event_data = {}
            event_name_tag = event.find('a', class_='font-bold')
            event_name = event_name_tag.text.strip() if event_name_tag else "No Event Name"
            event_data["event_name"] = event_name

            time_tag = event.find('div', class_='break-words')
            event_time = time_tag.text.strip() if time_tag else "No Time Information"
            event_data["time"] = event_time

            location_tag = event.find('ul', class_='metadata')
            if location_tag:
                location = location_tag.find('li').text.strip()
            else:
                location = "No Location"
            event_data["location"] = location

            page_data.append(event_data)

        return page_data

    except Exception as e:
        print(f"Error fetching {link}: {e}")
        return []

def append_to_json_file(data, filename):
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
        else:
            existing_data = []
        existing_data.extend(data)

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)

        print(f"Data appended to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

if __name__ == "__main__":
    scraper_ = Scraper()

    link = "https://carnegieart.org/art/whats-on-view/page/3/"
    scraped_data = scrape_page(scraper_, link)
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    link = "https://carnegieart.org/art/whats-on-view/page/4/"
    scraped_data = scrape_page(scraper_, link)
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    link = "https://carnegieart.org/art/whats-on-view/page/5/"
    scraped_data = scrape_page(scraper_, link)
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    link = "https://carnegieart.org/art/whats-on-view/page/6/"
    scraped_data = scrape_page(scraper_, link)
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    link = "https://carnegieart.org/art/whats-on-view/page/7/"
    scraped_data = scrape_page(scraper_, link)
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    link = "https://carnegieart.org/art/whats-on-view/page/8/"
    scraped_data = scrape_page(scraper_, link)
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")

    scraper_.close()

Fetching: https://carnegieart.org/art/whats-on-view/page/3/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json
Fetching: https://carnegieart.org/art/whats-on-view/page/4/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json
Fetching: https://carnegieart.org/art/whats-on-view/page/5/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json
Fetching: https://carnegieart.org/art/whats-on-view/page/6/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json
Fetching: https://carnegieart.org/art/whats-on-view/page/7/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json
Fetching: https://carnegieart.org/art/whats-on-view/page/8/
Data appended to ../raw_documents/Pittsburgh_Carnegie_Museums.json


In [45]:
import json
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(30) 
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """Set domain name and URL"""
        self.current_domain = url.split("/")[2]
        self.current_url = url

    def fetch(self, url: str):
        """Capture the webpage"""
        soup = self.get_soup(url)
        return soup

    def get_html(self, url: str):
        """Get HTML content"""
        self.set_domain(url)
        self.driver.get(url)
        try:
            WebDriverWait(self.driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.item")))
        except Exception as e:
            print(f"Error while waiting for the page to load: {e}")
        return self.driver.page_source

    def get_soup(self, url: str):
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def scrape_page(scraper_, link):
    try:
        print(f"Fetching: {link}")
        soup = scraper_.get_soup(link)

        page_data = []

        events = soup.find_all('div', class_='item')

        for event in events:
            event_data = {}
            event_name_tag = event.find('h3')
            event_name = event_name_tag.text.strip() if event_name_tag else 'No Event Name'
            event_data['event_name'] = event_name

            time_tag = event.find('time')
            event_time = time_tag.text.strip() if time_tag else 'No Time Information'
            event_data['time'] = event_time

            location_tag = event.find('span', class_='screen-reader-text', string='Location:')
            if location_tag:
                location_parts = location_tag.find_parent('p').contents
                location = ''.join([str(part).strip() for part in location_parts if isinstance(part, str)]).replace('<br>', ', ')
                location = location.replace('Location:', '').strip()
            else:
                location = 'No Location'
            event_data['location'] = location

            event_type_list = event.find_all('li')
            event_type = ', '.join([et.text.strip() for et in event_type_list if et])
            event_data['event_type'] = event_type

            audience_tag = event.find('h4', string=lambda x: 'Audience' in x)
            if audience_tag:
                audience_list = audience_tag.find_next('ul').find_all('li')
                audience = ', '.join([a.text.strip() for a in audience_list if a])
            else:
                audience = 'No Audience Information'
            event_data['audience'] = audience

            page_data.append(event_data)

        return page_data

    except Exception as e:
        print(f"Error fetching {link}: {e}")
        return []
        
def scrape_multiple_pages(scraper_, links):
    all_data = []
    
    for link in links:
        data = scrape_page(scraper_, link)
        all_data.extend(data)

    return all_data

def append_to_json_file(data, filename):
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
        else:
            existing_data = []

        existing_data.extend(data)

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)

        print(f"Data appended to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

if __name__ == "__main__":
    scraper_ = Scraper()
    links = [
        "https://www.warhol.org/calendar/",
        "https://www.warhol.org/calendar/?date=2024-11-01&days=30&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2024-12-01&days=31&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-01-01&days=31&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-02-01&days=28&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-03-01&days=31&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-04-01&days=30&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-05-01&days=31&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-06-01&days=30&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-07-01&days=31&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-08-01&days=31&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-09-01&days=30&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-10-01&days=31&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-11-01&days=30&0=#calendar-header",
        "https://www.warhol.org/calendar/?date=2025-12-01&days=31&0=#calendar-header"
    ]

    scraped_data = scrape_multiple_pages(scraper_, links)
    append_to_json_file(scraped_data, "../raw_documents/Pittsburgh_Carnegie_Museums.json")
    scraper_.close()

Fetching: https://www.warhol.org/calendar/
Fetching: https://www.warhol.org/calendar/?date=2024-11-01&days=30&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2024-12-01&days=31&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-01-01&days=31&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-02-01&days=28&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-03-01&days=31&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-04-01&days=30&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-05-01&days=31&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-06-01&days=30&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-07-01&days=31&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-08-01&days=31&0=#calendar-header
Fetching: https://www.warhol.org/calendar/?date=2025-09-01&days=30&0=#calendar-header
Fetching: h

## Heinz History Center
### https://www.heinzhistorycenter.org/

In [46]:
import json
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)
        self.current_url = None
        self.current_domain = None

    def set_domain(self, url: str):
        """Set domain name and URL."""
        self.current_domain = url.split("/")[2]  
        self.current_url = url

    def fetch(self, url: str, raw_html: bool = False):
        """Capture the webpage and choose whether to remove JS or CSS."""
        soup = self.get_soup(url)
        if not raw_html:
            soup = self.remove_js_css(soup)
        return soup, self.get_links(soup), self.get_title(soup)

    def get_html(self, url: str):
        """Get HTML."""
        self.set_domain(url)
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and get BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def get_links(self, soup: BeautifulSoup):
        """Get external links."""
        links = [link.get("href") for link in soup.find_all("a")]
        return self.filter_links(links)

    def remove_js_css(self, soup: BeautifulSoup):
        """Remove JS and CSS."""
        for script in soup(["script", "style"]):
            script.extract() 
        return soup

    def remove_unnecessary_sections(self, soup: BeautifulSoup):
        """Remove unnecessary sections such as header, footer, nav, etc."""
        for tag in ["header", "footer", "nav", "aside"]:
            for element in soup.find_all(tag):
                element.decompose()
        return soup

    def filter_links(self, links: list):
        filtered_links = []
        for link in links:
            if link is None:
                continue
            elif link.startswith("#"):
                continue
            elif link.startswith("http"):
                filtered_links.append(link)
            elif link.startswith("//"):
                filtered_links.append(f"https:{link}")
            elif link.startswith("/"):
                filtered_links.append(f"https://{self.current_domain}{link}")
            else:
                filtered_links.append(f"{self.current_url}/{link}")
        return filtered_links

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()


def extract_event_data(soup: BeautifulSoup):
    """Extract event data from the soup."""
    events = []
    event_cards = soup.find_all("div", class_="card_body")
    for card in event_cards:
        event = {}
        event_name_tag = card.find("h3", class_="card_title")
        event['event_name'] = event_name_tag.get_text(strip=True) if event_name_tag else "No Event Name"

        time_tag = card.find("span", class_="card_time")
        event['time'] = time_tag.get_text(strip=True) if time_tag else "No Time Information"

        location_tag = card.find("span", class_="card_location")
        event['location'] = location_tag.get_text(strip=True) if location_tag else "No Location"

        description_tag = card.find("div", class_="card_description")
        event['description'] = description_tag.get_text(strip=True) if description_tag else "No Description"

        events.append(event)

    return events


def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)

            if isinstance(existing_data, list):
                existing_data.extend(data) 
            else:
                existing_data.update(data)  
        else:
            existing_data = data 

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")


def scrape_links(scraper_, links):
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)

            soup = scraper_.remove_js_css(soup)
            soup = scraper_.remove_unnecessary_sections(soup)

            page_data = extract_event_data(soup)
            all_data.extend(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data


if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://www.heinzhistorycenter.org/events/"
        "https://www.heinzhistorycenter.org/",
        "https://www.heinzhistorycenter.org/research/detre-library-archives/",
        "https://www.heinzhistorycenter.org/whats-on/exhibits/",

    ]

    scraped_data = scrape_links(scraper_, links)
    save_to_json(scraped_data, "../raw_documents/Heinz_History_Center.json")
    scraper_.close()

Fetching: https://www.heinzhistorycenter.org/events/https://www.heinzhistorycenter.org/
Fetching: https://www.heinzhistorycenter.org/research/detre-library-archives/
Fetching: https://www.heinzhistorycenter.org/whats-on/exhibits/
Data saved to ../raw_documents/Heinz_History_Center.json


In [48]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower headings
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            if isinstance(existing_data, list):
                existing_data.extend(data)
            else:
                existing_data.update(data) 
        else:
            existing_data = data

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link  
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://www.heinzhistorycenter.org/whats-on/sports-museum/exhibits/",
        "https://www.heinzhistorycenter.org/whats-on/fort-pitt/",
        "https://www.heinzhistorycenter.org/whats-on/meadowcroft/exhibits/",
        "https://www.heinzhistorycenter.org/whats-on/exhibits/past-exhibits/"
    ]

    scraped_data = scrape_links(scraper_, links)
    save_to_json(scraped_data, "../raw_documents/Heinz_History_Center.json")
    scraper_.close()

Fetching: https://www.heinzhistorycenter.org/whats-on/sports-museum/exhibits/
Fetching: https://www.heinzhistorycenter.org/whats-on/fort-pitt/
Fetching: https://www.heinzhistorycenter.org/whats-on/meadowcroft/exhibits/
Fetching: https://www.heinzhistorycenter.org/whats-on/exhibits/past-exhibits/
Data saved to ../raw_documents/Heinz_History_Center.json


## The Frick
### https://www.thefrickpittsburgh.org/

In [49]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None 
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            if isinstance(existing_data, list):
                existing_data.extend(data) 
            else:
                existing_data.update(data) 
        else:
            existing_data = data

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link  
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://www.thefrickpittsburgh.org/calendar?search=1&page=1&search_date_from=10%2F01%2F2024&search_date_to=3%2F31%2F2025",
        "https://www.thefrickpittsburgh.org/calendar?search=1&page=2&search_date_from=10%2F01%2F2024&search_date_to=3%2F31%2F2025",
        "https://www.thefrickpittsburgh.org/stories",
        "https://www.thefrickpittsburgh.org/exhibitions",
        "https://www.thefrickpittsburgh.org/plan-your-visit",
        "https://www.thefrickpittsburgh.org/mission",
        "https://www.thefrickpittsburgh.org/collection",
        "https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Carriage%22+OR+object_type%3A%22Cars+and+Carriages%22&limit=40",
        "https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Costume%22&limit=40",
        "https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Decorative+Arts%22&limit=40",
        "https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Painting%22&limit=40",
        "https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Photography%22&limit=40",
        "https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Works+on+Paper%2FDrawing%22+OR+object_type%3A%22Works+on+Paper%2FPrint%22&limit=40"
    ]
    scraped_data = scrape_links(scraper_, links)
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Frick.json")
    scraper_.close()

Fetching: https://www.thefrickpittsburgh.org/calendar?search=1&page=1&search_date_from=10%2F01%2F2024&search_date_to=3%2F31%2F2025
Fetching: https://www.thefrickpittsburgh.org/calendar?search=1&page=2&search_date_from=10%2F01%2F2024&search_date_to=3%2F31%2F2025
Fetching: https://www.thefrickpittsburgh.org/stories
Fetching: https://www.thefrickpittsburgh.org/exhibitions
Fetching: https://www.thefrickpittsburgh.org/plan-your-visit
Fetching: https://www.thefrickpittsburgh.org/mission
Fetching: https://www.thefrickpittsburgh.org/collection
Fetching: https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Carriage%22+OR+object_type%3A%22Cars+and+Carriages%22&limit=40
Fetching: https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Costume%22&limit=40
Fetching: https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Decorative+Arts%22&limit=40
Fetching: https://collection.thefrickpittsburgh.org/objects?query=object_type%3A%22Painting%22&lim

## More

# Food
## Food Festivals
### https://www.visitpittsburgh.com/events-festivals/food-festivals/

In [50]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower headings
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            if isinstance(existing_data, list):
                existing_data.extend(data) 
            else:
                existing_data.update(data)  
        else:
            existing_data = data  

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link 
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://www.visitpittsburgh.com/events-festivals/food-festivals/",
        "https://www.visitpittsburgh.com/blog/",
        "https://www.visitpittsburgh.com/plan-your-trip/",
        "https://www.visitpittsburgh.com/events-festivals/",
        "https://www.visitpittsburgh.com/events-festivals/?page=2",
        "https://www.visitpittsburgh.com/events-festivals/?page=3",
        "https://www.visitpittsburgh.com/events-festivals/?page=4",
        "https://www.visitpittsburgh.com/events-festivals/?page=5",
        "https://www.visitpittsburgh.com/events-festivals/?page=6",
        "https://www.visitpittsburgh.com/events-festivals/?page=7",
        "https://www.visitpittsburgh.com/events-festivals/?page=8",
        "https://www.visitpittsburgh.com/events-festivals/?page=9",
        "https://www.visitpittsburgh.com/events-festivals/?page=10",
        "https://www.visitpittsburgh.com/events-festivals/?page=11",
        "https://www.visitpittsburgh.com/events-festivals/?page=12",
        "https://www.visitpittsburgh.com/events-festivals/?page=13",
        "https://www.visitpittsburgh.com/events-festivals/?page=14",
        "https://www.visitpittsburgh.com/events-festivals/?page=15",
        "https://www.visitpittsburgh.com/events-festivals/?page=16",
        "https://www.visitpittsburgh.com/things-to-do/",
        "https://www.visitpittsburgh.com/restaurants-culinary/",
        "https://www.visitpittsburgh.com/restaurants-culinary/craft-breweries/",
        "https://www.visitpittsburgh.com/things-to-do/family-fun/",
        "https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/",
        "https://www.visitpittsburgh.com/things-to-do/arts-culture/",
        "https://www.visitpittsburgh.com/meetings-and-events/",
        "https://www.visitpittsburgh.com/events-festivals/annual-events/",
        "https://www.visitpittsburgh.com/events-festivals/holidays/",
        "https://www.visitpittsburgh.com/events-festivals/halloween-events/",
        "https://www.visitpittsburgh.com/events-festivals/film-festivals/",
        "https://www.visitpittsburgh.com/events-festivals/food-festivals/soul-food-festival/",
        "https://www.visitpittsburgh.com/events-festivals/food-festivals/the-original-pittsburgh-taco-festival/",
        "https://www.visitpittsburgh.com/blog/top-beer-festivals-to-attend-in-pittsburgh-this-fall/"
       
    ]
    scraped_data = scrape_links(scraper_, links)
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Food_Festivals.json")
    scraper_.close()

Fetching: https://www.visitpittsburgh.com/events-festivals/food-festivals/
Fetching: https://www.visitpittsburgh.com/blog/
Fetching: https://www.visitpittsburgh.com/plan-your-trip/
Fetching: https://www.visitpittsburgh.com/events-festivals/
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=2
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=3
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=4
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=5
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=6
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=7
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=8
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=9
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=10
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=11
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=12
Fetching: https://w

In [57]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML with explicit wait for dynamic content."""
        self.driver.get(url)
        
        try:
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body")) 
            )
        except Exception as e:
            print(f"Error while waiting for page to load: {e}")
        
        return self.driver.page_source

    def get_soup(self, url: str):
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):

    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            if isinstance(existing_data, list):
                existing_data.extend(data)  
            else:
                existing_data.update(data) 
        else:
            existing_data = data 
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link 
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")
    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://www.visitpittsburgh.com/events-festivals/?page=14",
        "https://www.visitpittsburgh.com/events-festivals/?page=15",
        "https://www.visitpittsburgh.com/events-festivals/?page=16",
        "https://www.visitpittsburgh.com/things-to-do/",
        "https://www.visitpittsburgh.com/restaurants-culinary/",
        "https://www.visitpittsburgh.com/restaurants-culinary/craft-breweries/",
        "https://www.visitpittsburgh.com/things-to-do/family-fun/",
        "https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/",
        "https://www.visitpittsburgh.com/things-to-do/arts-culture/",
        "https://www.visitpittsburgh.com/meetings-and-events/",
        "https://www.visitpittsburgh.com/events-festivals/annual-events/",
        "https://www.visitpittsburgh.com/events-festivals/holidays/",
        "https://www.visitpittsburgh.com/events-festivals/halloween-events/",
        "https://www.visitpittsburgh.com/events-festivals/film-festivals/",
        "https://www.visitpittsburgh.com/events-festivals/food-festivals/soul-food-festival/",
        "https://www.visitpittsburgh.com/events-festivals/food-festivals/the-original-pittsburgh-taco-festival/",
        "https://www.visitpittsburgh.com/blog/top-beer-festivals-to-attend-in-pittsburgh-this-fall/"
    ]

    scraped_data = scrape_links(scraper_, links)
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Food_Festivals.json")
    scraper_.close()

Fetching: https://www.visitpittsburgh.com/events-festivals/?page=14
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=15
Fetching: https://www.visitpittsburgh.com/events-festivals/?page=16
Fetching: https://www.visitpittsburgh.com/things-to-do/
Fetching: https://www.visitpittsburgh.com/restaurants-culinary/
Fetching: https://www.visitpittsburgh.com/restaurants-culinary/craft-breweries/
Fetching: https://www.visitpittsburgh.com/things-to-do/family-fun/
Fetching: https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/
Fetching: https://www.visitpittsburgh.com/things-to-do/arts-culture/
Fetching: https://www.visitpittsburgh.com/meetings-and-events/
Fetching: https://www.visitpittsburgh.com/events-festivals/annual-events/
Fetching: https://www.visitpittsburgh.com/events-festivals/holidays/
Fetching: https://www.visitpittsburgh.com/events-festivals/halloween-events/
Fetching: https://www.visitpittsburgh.com/events-festivals/film-festivals/
Fetching: https://www.

## Picklesburgh
### https://www.picklesburgh.com/

In [51]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower headings
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            if isinstance(existing_data, list):
                existing_data.extend(data)
            else:
                existing_data.update(data)  
        else:
            existing_data = data  

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link 
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")
    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://www.picklesburgh.com/",
        "https://www.picklesburgh.com/vendors/",
        "https://www.picklesburgh.com/entertainment/",
        "https://www.picklesburgh.com/games/",
        "https://www.picklesburgh.com/festival-schedule/lil-gherkins-activity-area/",
        "https://www.picklesburgh.com/taste-of-picklesburgh/",
        "https://www.picklesburgh.com/news/",
        "https://www.picklesburgh.com/accessibility/",
        "https://www.picklesburgh.com/visit/getting-here/"
       
    ]
    scraped_data = scrape_links(scraper_, links)
    save_to_json(scraped_data, "../raw_documents/Picklesburgh.json")
    scraper_.close()

Fetching: https://www.picklesburgh.com/
Fetching: https://www.picklesburgh.com/vendors/
Fetching: https://www.picklesburgh.com/entertainment/
Fetching: https://www.picklesburgh.com/games/
Fetching: https://www.picklesburgh.com/festival-schedule/lil-gherkins-activity-area/
Fetching: https://www.picklesburgh.com/taste-of-picklesburgh/
Fetching: https://www.picklesburgh.com/news/
Fetching: https://www.picklesburgh.com/accessibility/
Fetching: https://www.picklesburgh.com/visit/getting-here/
Data saved to ../raw_documents/Picklesburgh.json


In [4]:
import requests
from bs4 import BeautifulSoup
import sys
import re

def extract_text_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status() 
        soup = BeautifulSoup(response.content, 'html.parser')

        title = soup.title.string.strip() if soup.title else 'No Title'
        for tag in soup(["script", "style", "header", "footer", "nav", "aside"]):
            tag.decompose()
        for tag in soup.find_all(class_=re.compile(".*(nav|footer|menu|social|sidebar).*", re.I)):
            tag.decompose()
        for tag in soup.find_all(id=re.compile(".*(nav|footer|menu|social|sidebar).*", re.I)):
            tag.decompose()

        stop_phrases = [
            "Follow us on", "Contact Us", "Privacy Policy", "©",
            "BUY TICKETS", "GIVE NOW", "SUBSCRIBE", "Site Map",
            "Board Of Directors Login"
        ]

        content = []
        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li']):
            text = ' '.join(element.stripped_strings)
            if text:
                if any(phrase.lower() in text.lower() for phrase in stop_phrases):
                    continue  
                content.append(text)

        if not content:
            print(f"No content found for {url}", file=sys.stderr)
            return f"URL: {url}\nTitle: {title}\n\nNo relevant content found.\n\n{'-'*80}\n\n"

        formatted_paragraphs = []
        current_paragraph = ''
        for line in content:
            if line.endswith(':'):
                if current_paragraph:
                    formatted_paragraphs.append(current_paragraph.strip('; '))
                current_paragraph = line.rstrip(':') + ': '
            else:
                current_paragraph += line + '; '

        if current_paragraph:
            formatted_paragraphs.append(current_paragraph.strip('; '))

        formatted_text = '\n\n'.join(formatted_paragraphs)

        return f"URL: {url}\nTitle: {title}\n\n{formatted_text}\n\n{'-'*80}\n\n"
    except Exception as e:
        print(f"Error fetching {url}: {e}", file=sys.stderr)
        return ''

def append_text_to_file(text, file_path):
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(text)

if __name__ == "__main__":
    links = [
        "https://www.picklesburgh.com/",
        "https://www.picklesburgh.com/vendors/",
        "https://www.picklesburgh.com/entertainment/",
        "https://www.picklesburgh.com/games/",
        "https://www.picklesburgh.com/festival-schedule/lil-gherkins-activity-area/",
        "https://www.picklesburgh.com/taste-of-picklesburgh/",
        "https://www.picklesburgh.com/news/",
        "https://www.picklesburgh.com/accessibility/",
        "https://www.picklesburgh.com/visit/getting-here/"
    ]
    output_file = '../raw_documents/Music_Culture/7_Picklesburgh.txt'

    for url in links:
        text_content = extract_text_from_url(url)
        append_text_to_file(text_content, output_file)

Error fetching https://www.picklesburgh.com/: 403 Client Error: Forbidden for url: https://www.picklesburgh.com/
Error fetching https://www.picklesburgh.com/vendors/: 403 Client Error: Forbidden for url: https://www.picklesburgh.com/vendors/
Error fetching https://www.picklesburgh.com/entertainment/: 403 Client Error: Forbidden for url: https://www.picklesburgh.com/entertainment/
Error fetching https://www.picklesburgh.com/games/: 403 Client Error: Forbidden for url: https://www.picklesburgh.com/games/
Error fetching https://www.picklesburgh.com/festival-schedule/lil-gherkins-activity-area/: 403 Client Error: Forbidden for url: https://www.picklesburgh.com/festival-schedule/lil-gherkins-activity-area/
Error fetching https://www.picklesburgh.com/taste-of-picklesburgh/: 403 Client Error: Forbidden for url: https://www.picklesburgh.com/taste-of-picklesburgh/
Error fetching https://www.picklesburgh.com/news/: 403 Client Error: Forbidden for url: https://www.picklesburgh.com/news/
Error fet

## Pittsburgh Taco Fest
### https://www.pghtacofest.com/

In [53]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower headings
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            if isinstance(existing_data, list):
                existing_data.extend(data)
            else:
                existing_data.update(data) 
        else:
            existing_data = data 

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link 
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://www.pghtacofest.com/",
        "https://www.pghtacofest.com/about",
        "https://www.pghtacofest.com/vendors",
        "https://www.pghtacofest.com/faqs"
       
    ]

    scraped_data = scrape_links(scraper_, links)
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Taco_Festival.json")
    scraper_.close()

Fetching: https://www.pghtacofest.com/
Fetching: https://www.pghtacofest.com/about
Fetching: https://www.pghtacofest.com/vendors
Fetching: https://www.pghtacofest.com/faqs
Data saved to ../raw_documents/Pittsburgh_Taco_Festival.json


## Pittsburgh Restaurant Week
### https://pittsburghrestaurantweek.com/

In [54]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            if isinstance(existing_data, list):
                existing_data.extend(data)
            else:
                existing_data.update(data)  
        else:
            existing_data = data 

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link 
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://pittsburghrestaurantweek.com/",
        "https://pittsburghrestaurantweek.com/about/",
        "https://pittsburghrestaurantweek.com/about/history/",
        "https://pittsburghrestaurantweek.com/restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2024-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2024-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2023-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2023-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2022-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2022-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2021-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2021-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2020-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2020-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2019-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2019-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2018-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2018-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2017-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2017-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2016-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2016-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2015-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2015-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2014-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2014-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2013-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2013-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/summer-2012-restaurants/",
        "https://pittsburghrestaurantweek.com/restaurants/winter-2012-restaurants/"
       
    ]

    scraped_data = scrape_links(scraper_, links)
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Restaurant_Week.json")
    scraper_.close()

Fetching: https://pittsburghrestaurantweek.com/
Fetching: https://pittsburghrestaurantweek.com/about/
Fetching: https://pittsburghrestaurantweek.com/about/history/
Fetching: https://pittsburghrestaurantweek.com/restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/summer-2024-restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/winter-2024-restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/summer-2023-restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/winter-2023-restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/summer-2022-restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/winter-2022-restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/summer-2021-restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/winter-2021-restaurants/
Fetching: https://pittsburghrestaurantweek.com/restaurants/summer-2020-restaurants/
Fetching: https://pi

## Little Italy Days
### https://littleitalydays.com/

In [55]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower headings
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            if isinstance(existing_data, list):
                existing_data.extend(data) 
            else:
                existing_data.update(data)
        else:
            existing_data = data 

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link  
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()
    links = [
        "https://littleitalydays.com/",
        "https://littleitalydays.com/entertainment-schedule/",
        "https://littleitalydays.com/getting-around/",
        "https://littleitalydays.com/about-us/",
        "https://littleitalydays.com/faq/",
        "https://littleitalydays.com/bloomfield-businesses/"
    ]

    scraped_data = scrape_links(scraper_, links)
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Little_Italy_Day.json")
    scraper_.close()

Fetching: https://littleitalydays.com/
Fetching: https://littleitalydays.com/entertainment-schedule/
Fetching: https://littleitalydays.com/getting-around/
Fetching: https://littleitalydays.com/about-us/
Fetching: https://littleitalydays.com/faq/
Fetching: https://littleitalydays.com/bloomfield-businesses/
Data saved to ../raw_documents/Pittsburgh_Little_Italy_Day.json


In [3]:
import requests
from bs4 import BeautifulSoup
import sys
import re

def extract_text_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  
        soup = BeautifulSoup(response.content, 'html.parser')
        title = soup.title.string.strip() if soup.title else 'No Title'

        for tag in soup(["script", "style", "header", "footer", "nav", "aside"]):
            tag.decompose()
        for tag in soup.find_all(class_=re.compile(".*(nav|footer|menu|social|sidebar).*", re.I)):
            tag.decompose()
        for tag in soup.find_all(id=re.compile(".*(nav|footer|menu|social|sidebar).*", re.I)):
            tag.decompose()

        stop_phrases = [
            "Follow us on", "Contact Us", "Privacy Policy", "©",
            "BUY TICKETS", "GIVE NOW", "SUBSCRIBE", "Site Map",
            "Board Of Directors Login"
        ]

        content = []
        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li']):
            text = ' '.join(element.stripped_strings)
            if text:
                if any(phrase.lower() in text.lower() for phrase in stop_phrases):
                    continue  
                content.append(text)
        if not content:
            print(f"No content found for {url}", file=sys.stderr)
            return f"URL: {url}\nTitle: {title}\n\nNo relevant content found.\n\n{'-'*80}\n\n"

        formatted_paragraphs = []
        current_paragraph = ''
        for line in content:
            if line.endswith(':'):
                if current_paragraph:
                    formatted_paragraphs.append(current_paragraph.strip('; '))
                current_paragraph = line.rstrip(':') + ': '
            else:
                current_paragraph += line + '; '

        if current_paragraph:
            formatted_paragraphs.append(current_paragraph.strip('; '))
        formatted_text = '\n\n'.join(formatted_paragraphs)

        return f"URL: {url}\nTitle: {title}\n\n{formatted_text}\n\n{'-'*80}\n\n"
    except Exception as e:
        print(f"Error fetching {url}: {e}", file=sys.stderr)
        return ''

def append_text_to_file(text, file_path):
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(text)

if __name__ == "__main__":
    links = [
        "https://littleitalydays.com/",
        "https://littleitalydays.com/entertainment-schedule/",
        "https://littleitalydays.com/getting-around/",
        "https://littleitalydays.com/about-us/",
        "https://littleitalydays.com/faq/",
        "https://littleitalydays.com/bloomfield-businesses/"
    ]
    output_file = '../raw_documents/Music_Culture/11_Little_Italy_Day.txt'

    for url in links:
        text_content = extract_text_from_url(url)
        append_text_to_file(text_content, output_file)

Error fetching https://littleitalydays.com/: 403 Client Error: Forbidden for url: https://littleitalydays.com/
Error fetching https://littleitalydays.com/entertainment-schedule/: 403 Client Error: Forbidden for url: https://littleitalydays.com/entertainment-schedule/
Error fetching https://littleitalydays.com/getting-around/: 403 Client Error: Forbidden for url: https://littleitalydays.com/getting-around/
Error fetching https://littleitalydays.com/about-us/: 403 Client Error: Forbidden for url: https://littleitalydays.com/about-us/
Error fetching https://littleitalydays.com/faq/: 403 Client Error: Forbidden for url: https://littleitalydays.com/faq/
Error fetching https://littleitalydays.com/bloomfield-businesses/: 403 Client Error: Forbidden for url: https://littleitalydays.com/bloomfield-businesses/


## Banana Split Fest
### https://bananasplitfest.com/

In [56]:
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

class Scraper:
    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.driver.set_page_load_timeout(10)

    def get_html(self, url: str):
        """Get HTML."""
        self.driver.get(url)
        return self.driver.page_source

    def get_soup(self, url: str):
        """Parse and return BeautifulSoup object."""
        html = self.get_html(url)
        return BeautifulSoup(html, "html.parser")

    def close(self):
        self.driver.quit()

    def __del__(self):
        self.close()

def organize_content_by_heading(soup: BeautifulSoup):
    """Organize content based on headings, ensuring all relevant text is captured."""
    data = {}
    current_h1, current_h2, current_h3, current_h4, current_h5, current_h6 = None, None, None, None, None, None

    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span']):
        if tag.name == 'h1':
            current_h1 = tag.get_text(strip=True)
            data[current_h1] = {'content': ""}
            current_h2 = current_h3 = current_h4 = current_h5 = current_h6 = None  # Reset lower headings
        elif tag.name == 'h2' and current_h1:
            current_h2 = tag.get_text(strip=True)
            data[current_h1][current_h2] = {'content': ""}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3] = {'content': ""}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4] = {'content': ""}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5] = {'content': ""}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6] = {'content': ""}
        else:
            content = tag.get_text(strip=True)
            if content:
                if current_h6:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5][current_h6]['content'] += content + " "
                elif current_h5:
                    data[current_h1][current_h2][current_h3][current_h4][current_h5]['content'] += content + " "
                elif current_h4:
                    data[current_h1][current_h2][current_h3][current_h4]['content'] += content + " "
                elif current_h3:
                    data[current_h1][current_h2][current_h3]['content'] += content + " "
                elif current_h2:
                    data[current_h1][current_h2]['content'] += content + " "
                elif current_h1:
                    data[current_h1]['content'] += content + " "

    return data

def save_to_json(data, filename):
    """Save data to a JSON file, ensuring no overwrite of existing content."""
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            if isinstance(existing_data, list):
                existing_data.extend(data)  
            else:
                existing_data.update(data)  
        else:
            existing_data = data 

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")

def scrape_links(scraper_, links):
    """Scrape multiple links and return organized data."""
    all_data = []
    for link in links:
        try:
            print(f"Fetching: {link}")
            soup = scraper_.get_soup(link)
            page_data = organize_content_by_heading(soup)
            page_data['url'] = link 
            all_data.append(page_data)
        except Exception as e:
            print(f"Error fetching {link}: {e}")

    return all_data

if __name__ == "__main__":
    scraper_ = Scraper()

    links = [
        "https://bananasplitfest.com/",
        "https://bananasplitfest.com/activities/",
        "https://bananasplitfest.com/events/princess-pageant/",
        "https://bananasplitfest.com/activities/crafts-games-activities/",
        "https://bananasplitfest.com/activities/participating-vendors/",
        "https://bananasplitfest.com/activities/food/",
        "https://bananasplitfest.com/activities/over-21-area/",
        "https://bananasplitfest.com/events/",
        "https://bananasplitfest.com/events/5k-banana-run/",
        "https://bananasplitfest.com/events/great-american-banana-baking-contest/",
        "https://bananasplitfest.com/events/banana-challenge/",
        "https://bananasplitfest.com/events/blood-drive/",
        "https://bananasplitfest.com/events/cornhole-tournament/",
        "https://bananasplitfest.com/events/car-show/",
        "https://bananasplitfest.com/events/yellow-tie-gala/",
        "https://bananasplitfest.com/activities/entertainment/",
        "https://bananasplitfest.com/schedule/",
        "https://bananasplitfest.com/information/parking/",
        "https://bananasplitfest.com/information/plan-your-visit/",
        "https://bananasplitfest.com/history/",
        "https://bananasplitfest.com/information/media/"
    ]

    scraped_data = scrape_links(scraper_, links)
    save_to_json(scraped_data, "../raw_documents/Pittsburgh_Banana_Split_Festival.json")
    scraper_.close()

Fetching: https://bananasplitfest.com/
Fetching: https://bananasplitfest.com/activities/
Fetching: https://bananasplitfest.com/events/princess-pageant/
Fetching: https://bananasplitfest.com/activities/crafts-games-activities/
Fetching: https://bananasplitfest.com/activities/participating-vendors/
Fetching: https://bananasplitfest.com/activities/food/
Fetching: https://bananasplitfest.com/activities/over-21-area/
Fetching: https://bananasplitfest.com/events/
Fetching: https://bananasplitfest.com/events/5k-banana-run/
Fetching: https://bananasplitfest.com/events/great-american-banana-baking-contest/
Fetching: https://bananasplitfest.com/events/banana-challenge/
Fetching: https://bananasplitfest.com/events/blood-drive/
Fetching: https://bananasplitfest.com/events/cornhole-tournament/
Fetching: https://bananasplitfest.com/events/car-show/
Fetching: https://bananasplitfest.com/events/yellow-tie-gala/
Fetching: https://bananasplitfest.com/activities/entertainment/
Fetching: https://bananaspl