In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def fetch_page_text(url, timeout=10):
    """Download ``url`` and return the text content of the page.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can wait indefinitely on an unresponsive
            host, which would stall the whole crawl on a single URL.

    Returns:
        The page text with one fragment per line (surrounding whitespace
        stripped), or ``None`` when the request fails, times out, or the
        server answers with an HTTP error status.
    """
    try:
        # Timeouts surface as requests.exceptions.Timeout, a subclass of
        # RequestException, so they are reported by the handler below.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx answers into exceptions
        # Parse the raw bytes so BeautifulSoup can detect the encoding itself
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text(separator='\n', strip=True)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def crawl_urls(url_list):
    """Fetch every URL in ``url_list`` and collect the pages that yielded text.

    Each URL is announced on stdout before it is requested. URLs whose fetch
    returns ``None`` or an empty string are left out of the result.

    Returns:
        A dict mapping each successfully fetched URL to its page text.
    """
    pages = {}
    for target in url_list:
        print(f"Fetching: {target}")
        body = fetch_page_text(target)
        if not body:
            # Nothing usable came back for this URL; leave it out.
            continue
        pages[target] = body
    return pages

def save_crawled_data(
    crawled_data,
    urls,
    output_dir="/Users/alan/11711/nlp-from-scratch-assignment/data/1010_160_entries/crawled/crawled_text_data",
):
    """Write the text of each URL to ``<output_dir>/<index>.txt``.

    Args:
        crawled_data: Dict mapping URL -> page text.
        urls: Ordered collection of URLs; the position of a URL in this
            collection becomes its file name.
        output_dir: Directory that receives the text files. It is created
            (including parents) when it does not exist yet. Defaults to the
            location this notebook originally wrote to.

    A URL that is absent from ``crawled_data`` still produces an (empty) file,
    so file numbering stays aligned with positions in ``urls``.
    """
    import os  # local import: this cell's import block does not bring in os

    # Create the target directory up front so the first write cannot fail
    # just because the folder is missing.
    os.makedirs(output_dir, exist_ok=True)

    for index, url in enumerate(urls):
        # Missing entries fall back to an empty string (see docstring).
        text = crawled_data.get(url, "")

        # Flatten the page onto a single line
        cleaned_text = text.replace('\n', ' ')

        output_file = os.path.join(output_dir, f"{index}.txt")
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(cleaned_text)

        print(f"Saved content from URL {url} to {output_file}")

if __name__ == "__main__":
    # CSV that lists every data source for the assignment.
    file_path = '/Users/alan/11711/nlp-from-scratch-assignment/data/1010_160_entries/raw/raw_csv_data/data_source.csv'
    data = pd.read_csv(file_path)

    # Keep only rows marked 'Webpage', then drop missing and repeated URLs.
    urls = data.loc[data['Select'] == 'Webpage', 'Source URL'].dropna().unique()

    # Download each page, then write one text file per URL.
    crawled_data = crawl_urls(urls)
    save_crawled_data(crawled_data, urls)
    print("Crawling complete!")

Fetching: https://en.wikipedia.org/wiki/Pittsburgh
Fetching: https://en.wikipedia.org/wiki/History_of_Pittsburgh
Fetching: https://pittsburghpa.gov/pittsburgh/pgh-about
Error fetching https://pittsburghpa.gov/pittsburgh/pgh-about: HTTPSConnectionPool(host='pittsburghpa.gov', port=443): Max retries exceeded with url: /pittsburgh/pgh-about (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1123)')))
Fetching: https://www.britannica.com/technology/steel/Testing-of-properties
Fetching: https://pittsburghpa.gov/events/index.html
Error fetching https://pittsburghpa.gov/events/index.html: HTTPSConnectionPool(host='pittsburghpa.gov', port=443): Max retries exceeded with url: /events/index.html (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1123)')))
Fetching: https://www.visitpittsbu

In [2]:
# Install the browser-automation dependencies into the running kernel's environment.
%pip install selenium beautifulsoup4 webdriver-manager

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver-manager
Successfully installed webdriver-manager-4.0.2
You should consider upgrading via the '/opt/anaconda3/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import csv
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm

# Function to fetch the page content using Selenium
def fetch_page_text_selenium(url, driver_path=None, wait_seconds=3):
    """Load ``url`` in Chrome via Selenium and return the rendered page text.

    Args:
        url: Address of the page to open.
        driver_path: Location of the chromedriver executable. When omitted,
            the module-level ``driverPath`` variable is used, as before.
        wait_seconds: Fixed pause after navigation that gives the page time
            to finish loading before its source is read.

    Returns:
        The page text with one fragment per line (surrounding whitespace
        stripped), or ``None`` if anything goes wrong; the error is printed.
    """
    try:
        if driver_path is None:
            # Resolved inside the try block so an undefined global is reported
            # like any other failure instead of escaping as a NameError.
            driver_path = driverPath

        # Selenium 4 takes the driver location through a Service object;
        # passing the path positionally is not accepted by newer 4.x releases.
        driver = webdriver.Chrome(service=Service(driver_path))
        try:
            driver.get(url)

            # Wait for the page to load (adjust wait_seconds if needed)
            if wait_seconds > 0:
                time.sleep(wait_seconds)

            page_source = driver.page_source
        finally:
            # Always close the browser, even when navigation fails, so a
            # failed URL does not leave a Chrome process behind.
            driver.quit()

        # Parse the page content using BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')

        # Extract and clean the text from the page
        return soup.get_text(separator='\n', strip=True)

    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

# Function to read URLs from CSV and crawl each one
def crawl_urls_from_csv(
    csv_file_path,
    url_column_name,
    output_dir="/Users/alan/11711/nlp-from-scratch-assignment/data/1010_160_entries/crawled/crawled_text_data",
):
    """Crawl every URL listed in a CSV column and save each page as text.

    Args:
        csv_file_path: Path of the CSV file to read (UTF-8, with a header row).
        url_column_name: Header of the column that holds the URLs. A missing
            column raises ``KeyError``.
        output_dir: Directory that receives ``<row index>.txt`` files. It is
            created (including parents) when it does not exist yet. Defaults
            to the location this notebook originally wrote to.

    Rows are numbered from 0 in file order, and that number is the output
    file name. Rows with a blank URL, or whose fetch yields no text, produce
    no file, so the numbering of the remaining files is unaffected.
    """
    import os  # local import: this cell's import block does not bring in os

    os.makedirs(output_dir, exist_ok=True)

    with open(csv_file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for index, row in enumerate(reader):
            # DictReader yields None for cells missing from a short row.
            url = (row[url_column_name] or "").strip()
            if not url:
                # Avoid launching a browser for a row with no address.
                print(f"Skipping row {index}: empty URL")
                continue

            print(f"Fetching: {url}")
            text = fetch_page_text_selenium(url)
            if text:
                # Save the crawled text to a file with the index as the filename
                output_file = os.path.join(output_dir, f"{index}.txt")
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(text)
                print(f"Saved content to {output_file}")

if __name__ == "__main__":
    # Local chromedriver binary; fetch_page_text_selenium reads this module-level name.
    driverPath = '/Users/alan/Downloads/chromedriver-mac-arm64/chromedriver'

    # Column holding the page addresses, and the CSV of data sources it lives in.
    url_column_name = 'Source URL'
    csv_file_path = '/Users/alan/11711/nlp-from-scratch-assignment/data/1010_160_entries/raw/raw_csv_data/data_source.csv'

    # Visit every listed URL and write the extracted text to disk.
    crawl_urls_from_csv(csv_file_path=csv_file_path, url_column_name=url_column_name)

Fetching: https://en.wikipedia.org/wiki/Pittsburgh


KeyboardInterrupt: 