In [2]:
import requests
from bs4 import BeautifulSoup

def fetch_page_text(url):
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad responses
        # Parse the page content
        soup = BeautifulSoup(response.content, 'html.parser')
        # Extract and return the text from the page
        return soup.get_text(separator='\n', strip=True)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def crawl_urls(url_list):
    results = {}
    for url in url_list:
        print(f"Fetching: {url}")
        text = fetch_page_text(url)
        if text:
            results[url] = text
    return results

if __name__ == "__main__":
    # Example list of Wikipedia URLs
    urls = [
        "https://en.wikipedia.org/wiki/Pittsburgh"
    ]
    
    # Start crawling the URLs
    crawled_data = crawl_urls(urls)
    
    # Print or process the results
    for url, text in crawled_data.items():
        print(f"\nText from {url}:\n{text}...")  # Print first 500 characters

Fetching: https://en.wikipedia.org/wiki/Pittsburgh

Text from https://en.wikipedia.org/wiki/Pittsburgh:
Pittsburgh - Wikipedia
Jump to content
Main menu
Main menu
move to sidebar
hide
Navigation
Main page
Contents
Current events
Random article
About Wikipedia
Contact us
Contribute
Help
Learn to edit
Community portal
Recent changes
Upload file
Search
Search
Donate
Appearance
Create account
Log in
Personal tools
Create account
Log in
Pages for logged out editors
learn more
Contributions
Talk
Contents
move to sidebar
hide
(Top)
1
Etymology
2
History
Toggle History subsection
2.1
Native Americans
2.2
18th century
2.3
19th century
2.4
20th century
2.5
21st century
3
Geography
Toggle Geography subsection
3.1
Cityscape
3.1.1
Areas
3.1.1.1
Golden Triangle
3.1.1.2
North Side
3.1.1.3
South Side
3.1.1.4
East End
3.1.1.5
West End
3.1.2
Ethnicities
3.1.3
Population densities
3.1.4
Images
3.2
Regional identity
3.3
Climate
3.3.1
Air quality
3.3.2
Water quality
4
Demographics
Toggle Demographics subse

In [2]:
pip install selenium beautifulsoup4 webdriver-manager

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver-manager
Successfully installed webdriver-manager-4.0.2
You should consider upgrading via the '/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
import csv
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm

# Function to fetch the page content using Selenium
def fetch_page_text_selenium(url):
    try:
        driver = webdriver.Chrome(driverPath) 
        # Initialize Chrome WebDriver
        driver.get(url)
        
        # Wait for the page to load (adjust time if needed)
        time.sleep(3)
        
        # Get page source and close the browser
        page_source = driver.page_source
        driver.quit()

        # Parse the page content using BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')

        # Extract and clean the text from the page
        page_text = soup.get_text(separator='\n', strip=True)
        return page_text

    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

# Function to read URLs from CSV and crawl each one
def crawl_urls_from_csv(csv_file_path, url_column_name):
    with open(csv_file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for index, row in enumerate(reader):
            url = row[url_column_name]
            print(f"Fetching: {url}")
            text = fetch_page_text_selenium(url)
            if text:
                # Save the crawled text to a file with the index as the filename
                output_file = f"/Users/alan/11711/nlp-from-scratch-assignment/data/160_entries/text_data/{index}.txt"
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(text)
                print(f"Saved content to {output_file}")

if __name__ == "__main__":
    csv_file_path = '/Users/alan/11711/nlp-from-scratch-assignment/data/160_entries/data_source/data_source.csv'
    url_column_name = 'Source URL'
    driverPath = '/Users/alan/Downloads/chromedriver-mac-arm64/chromedriver'


    # Start crawling the URLs from the CSV
    crawl_urls_from_csv(csv_file_path, url_column_name)

Fetching: https://www.cmu.edu/cfa/music/concerts-events/opera-events.html
Saved content to /Users/alan/11711/nlp-from-scratch-assignment/data/160_entries/text_data/0.txt
Fetching: https://www.cmu.edu/cfa/music/concerts-events/index.html
Saved content to /Users/alan/11711/nlp-from-scratch-assignment/data/160_entries/text_data/1.txt
Fetching: https://events.cmu.edu/all
Saved content to /Users/alan/11711/nlp-from-scratch-assignment/data/160_entries/text_data/2.txt
Fetching: https://pittsburghmusicals.com/season/
Saved content to /Users/alan/11711/nlp-from-scratch-assignment/data/160_entries/text_data/3.txt
Fetching: https://www.chambermusicpittsburgh.org/2024-2025-mainstage-live/
Saved content to /Users/alan/11711/nlp-from-scratch-assignment/data/160_entries/text_data/4.txt
Fetching: https://makemusicpittsburgh.org/
Saved content to /Users/alan/11711/nlp-from-scratch-assignment/data/160_entries/text_data/5.txt
Fetching: https://en.wikipedia.org/wiki/Pittsburgh_Symphony_Orchestra
Saved con

KeyboardInterrupt: 