In [3]:
import pandas as pd
import requests
from requests.exceptions import ConnectionError, Timeout, RequestException
from bs4 import BeautifulSoup
import warnings
from urllib3.exceptions import InsecureRequestWarning
import time
import random

# Suppress InsecureRequestWarnings: scrape_url sends its requests with
# verify=False, which would otherwise emit one warning per request.
warnings.filterwarnings('ignore', category=InsecureRequestWarning)

# Load the CSV file
# NOTE(review): `data` is loaded here but not used below while `urls` is
# hard-coded; re-enable the commented-out line to scrape every source URL.
file_path = '/Users/alan/11711/nlp-from-scratch-assignment/data/160_entries/data_source/data_source.csv'
data = pd.read_csv(file_path)

# Extract non-empty URLs from the 'Source URL' column
# urls = data['Source URL'].dropna().unique()
urls = ['https://en.wikipedia.org/wiki/Pittsburgh']

# Define a list of user-agents to simulate different browsers.
# The Chrome entries end in "Safari/537.36" (matching the AppleWebKit version),
# which is what real Chrome builds send; the truncated "Safari/537.3" token is
# a malformed User-Agent.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
]

# Define headers for requests.
# The User-Agent is picked once, when this cell runs, and then reused for
# every request made with `headers`.
headers = {'User-Agent': random.choice(user_agents)}

# Create a session to reuse connections
session = requests.Session()

# Function to scrape a web page and extract information
def scrape_url(url, retries=3, delay=5, timeout=10):
    """Fetch ``url`` and extract basic information from the returned HTML.

    The request goes through the module-level ``session`` with the
    module-level ``headers``.

    Args:
        url: Address of the page to scrape.
        retries: Maximum number of request attempts.
        delay: Seconds to sleep between attempts.
        timeout: Per-request timeout in seconds, passed to ``session.get``.

    Returns:
        On success, a dict with the keys ``"Title"``, ``"Headings"``,
        ``"Paragraphs"``, ``"Links"``, ``"Description"`` and ``"Images"``.
        On failure, a string starting with ``"Failed to retrieve"`` (request
        errors) or ``"Parsing failed for"`` (any other error).
    """
    if retries < 1:
        # Without this guard no attempt is made and the function would fall
        # off the end, handing the caller a bare None.
        return f"Failed to retrieve {url}: no attempts made (retries={retries})"

    for attempt in range(1, retries + 1):
        try:
            # NOTE(security): verify=False disables TLS certificate validation,
            # so the response could come from an impersonating host. Kept to
            # preserve existing behaviour; prefer verify=True where possible.
            response = session.get(url, headers=headers, timeout=timeout, verify=False)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'lxml')
            return _extract_page_info(soup)

        except RequestException as e:
            # ConnectionError and Timeout are RequestException subclasses, so
            # this single clause covers all of them.
            status = getattr(getattr(e, 'response', None), 'status_code', None)
            # Client errors such as 404 will not succeed on a retry; only
            # errors without an HTTP status, 5xx, 408 and 429 are retried.
            retryable = status is None or status >= 500 or status in (408, 429)
            if not retryable or attempt == retries:
                return f"Failed to retrieve {url}: {e}"
            print(f"Retrying {url}, attempt {attempt}/{retries} after {delay} seconds.")
            time.sleep(delay)
        except Exception as e:
            return f"Parsing failed for {url}: {e}"


def _extract_page_info(soup):
    """Collect title, headings, paragraphs, links, description and images from ``soup``."""
    # Extract title; .string is None when <title> is empty or contains nested
    # markup, so fall back instead of returning None.
    title_tag = soup.title
    if title_tag is not None and title_tag.string:
        title = title_tag.string
    else:
        title = "No title"

    # Extract headings
    headings = [h.get_text(strip=True) for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]

    # Extract paragraphs
    paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]

    # Extract links
    links = [a['href'] for a in soup.find_all('a', href=True)]

    # Extract meta description; .get avoids a KeyError when the tag exists
    # but has no content attribute.
    meta_description = soup.find('meta', attrs={'name': 'description'})
    if meta_description is not None:
        description = meta_description.get('content', "No meta description")
    else:
        description = "No meta description"

    # Extract images
    images = [img['src'] for img in soup.find_all('img', src=True)]

    return {
        "Title": title,
        "Headings": headings,
        "Paragraphs": paragraphs,
        "Links": links,
        "Description": description,
        "Images": images
    }

# Run the scraper over every URL, pairing each URL with whatever scrape_url
# returned for it (a dict of page info, or an error string).
scraped_data = [(url, scrape_url(url)) for url in urls]

# Tabulate the (url, result) pairs so they can be inspected as a DataFrame.
result_columns = ['URL', 'Website Information']
scraped_df = pd.DataFrame(scraped_data, columns=result_columns)

# Show the table.
print(scraped_df)


                                        URL  \
0  https://en.wikipedia.org/wiki/Pittsburgh   

                                 Website Information  
0  {'Title': 'Pittsburgh - Wikipedia', 'Headings'...  


In [5]:
# Show the full scraped record for the first row (row label 0).
scraped_df.loc[0, 'Website Information']

{'Title': 'Pittsburgh - Wikipedia',
 'Headings': ['Contents',
  'Pittsburgh',
  'Etymology',
  'History',
  'Native Americans',
  '18th century',
  '19th century',
  '20th century',
  '21st century',
  'Geography',
  'Cityscape',
  'Areas',
  'Golden Triangle',
  'North Side',
  'South Side',
  'East End',
  'West End',
  'Ethnicities',
  'Population densities',
  'Images',
  'Regional identity',
  'Climate',
  'Air quality',
  'Water quality',
  'Demographics',
  '2020 census',
  'Demographic changes',
  'Economy',
  'Arts and culture',
  'Entertainment',
  'Music',
  'Theatre',
  'Literature',
  'Food',
  'Local dialect',
  'Livability',
  'Sports',
  'Professional',
  'College',
  'Baseball',
  'Football',
  'Hockey',
  'Basketball',
  'Soccer',
  'Golf',
  'Professional wrestling',
  'Annual sporting events',
  'Government and politics',
  'Government',
  'Politics',
  'Law enforcement',
  'Crime',
  'Education',
  'Colleges and universities',
  'Primary education',
  'Media',
  'N