# Scrape Static Pages
This screen scraper will get all paragraphs, headers, and tables from a page. It is useful for pages that are standalone

In [76]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: Fetch the webpage
urls = [
    # Wikipedia Pages
    "https://en.wikipedia.org/wiki/Pittsburgh",
    "https://en.wikipedia.org/wiki/History_of_Pittsburgh",
    "https://en.wikipedia.org/wiki/Pittsburgh_Steelers",
    "https://en.wikipedia.org/wiki/Pittsburgh_Penguins",
    "https://en.wikipedia.org/wiki/Pittsburgh_Pirates",
    "https://en.wikipedia.org/wiki/Pittsburgh_Panthers_football",
    # Britannica Pages
    "https://www.britannica.com/place/Pittsburgh",
    "https://www.britannica.com/topic/Carnegie-Mellon-University",
    
    ]
corpus = []
for url in urls:
    response = requests.get(url)
    texts = []

    # Step 2: Parse the HTML
    soup = BeautifulSoup(response.content, 'html.parser')

    # Step 3: Extract useful text and table content in order
    content_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'table'])

    # Step 4: Output the text and table content in order
    for element in content_elements:
        if element.name == 'table' and 'wikitable' in element.get('class', []):
            for row in element.find_all('tr'):
                cells = row.find_all(['th', 'td'])
                cell_text = [cell.get_text(strip=True) for cell in cells]
                text = '\t'.join(cell_text)
        else:
            text = element.get_text(strip=False)
        texts.append(text)
    corpus.append('\n'.join(texts))

# Create a DataFrame with columns for texts in corpus and their corresponding URL sources
scraped_df = pd.DataFrame({'text': corpus, 'source': urls})
(scraped_df)

Unnamed: 0,text,source
0,Contents\nPittsburgh\n\n\nPittsburghCityDownto...,https://en.wikipedia.org/wiki/Pittsburgh
1,Contents\nHistory of Pittsburgh\n\n\nThe histo...,https://en.wikipedia.org/wiki/History_of_Pitts...
2,Contents\nPittsburgh Steelers\n\n\nPittsburgh ...,https://en.wikipedia.org/wiki/Pittsburgh_Steelers
3,Contents\nPittsburgh Penguins\n\n\nPittsburgh ...,https://en.wikipedia.org/wiki/Pittsburgh_Penguins
4,Contents\nPittsburgh Pirates\n\n\nPittsburgh P...,https://en.wikipedia.org/wiki/Pittsburgh_Pirates
5,Contents\nPittsburgh Panthers football\n\n\nPi...,https://en.wikipedia.org/wiki/Pittsburgh_Panth...
6,Pittsburgh\nOur editors will review what you’v...,https://www.britannica.com/place/Pittsburgh
7,Carnegie Mellon University\nOur editors will r...,https://www.britannica.com/topic/Carnegie-Mell...


# Handle PDFs

In [78]:
import os
from pypdf import PdfReader

# Step 1: Specify the folder path
folder_path = "./PDF Documents"

# Step 2: List all PDF files in the folder
pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
corpus = []

# Step 3: Iterate through each PDF file
for pdf_file in pdf_files:
    pdf_path = os.path.join(folder_path, pdf_file)
    reader = PdfReader(pdf_path)
    
    print(f"Extracting text from: {pdf_file}")
    full_document = ""
    # Step 4: Iterate through all pages
    for page in reader.pages:
        # Step 5: Extract text from each page
        text = page.extract_text()
        # Step 6: Print the extracted text
        full_document += text
    corpus.append(full_document)
# Create a DataFrame with columns for texts in corpus and their corresponding PDF file names
pdf_df = pd.DataFrame({'text': corpus, 'source': pdf_files})
(pdf_df)

Extracting text from: 9627_UF_Regulations.pdf
Extracting text from: 9625_Parking_Tax_Regulations.pdf
Extracting text from: 9626_Payroll_Tax_Regulations.pdf
Extracting text from: 9622_Amusement_Tax_Regulations.pdf
Extracting text from: 9624_Local_Services_Tax_Regulations.pdf
Extracting text from: 9623_ISP_Tax_Regulations.pdf
Extracting text from: 23255_2024_Operating_Budget.pdf


Unnamed: 0,text,source
0,CI\nTY OF PITTSBURGH \nNO\nN-RESIDENT \nSPORT...,9627_UF_Regulations.pdf
1,CITY OF PITTSBURGH \nPARKING TAX REGULATION...,9625_Parking_Tax_Regulations.pdf
2,CITY OF PITTSBURGH \nPAYROLL TAX \nREGULATIONS...,9626_Payroll_Tax_Regulations.pdf
3,\nCITY OF PITTSBURGH \nAMUSEMENT TAX \nREGULA...,9622_Amusement_Tax_Regulations.pdf
4,CITY OF PITTSBURGH \nLOCAL SERVICES TAX \nREG...,9624_Local_Services_Tax_Regulations.pdf
5,-1-CITY OF PITTSBURGH \nTHE INSTITUTIO N AND S...,9623_ISP_Tax_Regulations.pdf
6,"CITY OF PITTSBURGH\nEd Gainey, Mayor\nDeputy M...",23255_2024_Operating_Budget.pdf


# Handle Nested Pages

In [95]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm

def fetch_and_parse(url, verify=True):
    response = requests.get(url, verify=verify)
    return BeautifulSoup(response.content, 'html.parser')

def extract_text(soup):
    text_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    return "\n".join(element.get_text(strip=True) for element in text_elements)

def scrape_entire_website(url, required_section, verify=True):

    all_links = set()
    new_links = set([url])
    corpus = []
    links = []
    while len(new_links) > 0:
        print(f"Processing {len(new_links)} new links...")
        for link in tqdm(new_links):
            # Get all unique links on the current page
            try:
                main_soup = fetch_and_parse(link, verify=verify)
            except Exception as e:
                print(f"Failed to fetch {link}. Error: {e}")
                continue
            next_links = main_soup.find_all('a', href=True)
            unique_links = set(urljoin(link, a['href']) for a in next_links if 'href' in a.attrs)
            unique_links = {link for link in unique_links if required_section in link}
            unique_links = {link for link in unique_links if not ".pdf" in link}

            # Extract and print text from the current page
            text = extract_text(main_soup)
            corpus.append(text)
            links.append(link)
        all_links.update(new_links)
        new_links = unique_links - all_links
    # Create a DataFrame with columns for texts in corpus and their corresponding PDF file names
    return pd.DataFrame({'text': corpus, 'source': links})

In [None]:
print("Scraping Visit Pittsburgh Website")
visit_pgh_df = scrape_entire_website("https://www.visitpittsburgh.com/", "visitpittsburgh.com")

In [96]:
print("Scraping Pittsburgh PA Government Website")
gov_site_df = scrape_entire_website("https://pittsburghpa.gov", "pittsburghpa.gov", verify=False)

Scraping Pittsburgh PA Government Website
Processing 1 new links...


100%|██████████| 1/1 [00:00<00:00,  3.86it/s]


Processing 102 new links...


100%|██████████| 102/102 [00:55<00:00,  1.84it/s]


Processing 30 new links...


 17%|█▋        | 5/30 [00:02<00:09,  2.76it/s]

Failed to fetch https://webstats.pittsburghpa.gov/. Error: HTTPSConnectionPool(host='webstats.pittsburghpa.gov', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f921112d310>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))


100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Processing 2 new links...


100%|██████████| 2/2 [00:00<00:00,  3.22it/s]

Failed to fetch mailto:RealEstateTaxInfo@pittsburghpa.gov. Error: No connection adapters were found for 'mailto:RealEstateTaxInfo@pittsburghpa.gov'





In [86]:
print("Scraping Carnegie Mellon University Website")
cmu_df = scrape_entire_website("https://www.cmu.edu/about/index.html", "www.cmu.edu")

Scraping Carnegie Mellon University Website
Processing 1 new links...


100%|██████████| 1/1 [00:00<00:00,  8.90it/s]


Processing 45 new links...


100%|██████████| 45/45 [00:06<00:00,  6.76it/s]


Processing 275 new links...


100%|██████████| 275/275 [01:17<00:00,  3.54it/s]


Processing 64 new links...


100%|██████████| 64/64 [00:16<00:00,  3.77it/s]


Processing 1 new links...


100%|██████████| 1/1 [00:00<00:00,  3.43it/s]


Processing 81 new links...


100%|██████████| 81/81 [00:16<00:00,  4.85it/s]


Processing 108 new links...


100%|██████████| 108/108 [00:19<00:00,  5.43it/s]


# Combine DFs and export

In [102]:
# Combine all the dataframes
combined_df = pd.concat([scraped_df, pdf_df, visit_pgh_df, gov_site_df, cmu_df], ignore_index=True)

# export the combined dataframe to a csv file
combined_df.to_csv('general_info.csv', index=False)