<a href="https://colab.research.google.com/github/Celgitembe/Celgitembe/blob/main/Web_Scraping_Checkpoint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1.1) Write a function to get and parse the HTML content of a Wikipedia page

import requests
from bs4 import BeautifulSoup

def get_wikipedia_content(url, timeout=10):
    """Download a page and return the plain text of its main content element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the element whose id is ``content``, or ``None`` when the
        request fails, the status code is not 200, or no such element exists.
        An error message is printed in every failure case.
    """
    try:
        # Without a timeout, requests may block forever on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error: Unable to fetch content from {url}. {e}")
        return None

    # Anything other than 200 is treated as a failed fetch.
    if response.status_code != 200:
        print(f"Error: Unable to fetch content from {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # find() returns None when the element is absent; guard before get_text().
    content = soup.find(id='content')
    if content is None:
        print(f"Error: No element with id 'content' found at {url}.")
        return None

    return content.get_text()


In [None]:
# 1.2) Write a function to extract the article title

from bs4 import BeautifulSoup

def extract_article_title(html_content):
    """Return the article title found in ``<h1 id="firstHeading">``.

    Args:
        html_content: HTML markup of the page.

    Returns:
        The heading text with surrounding whitespace stripped, or ``None``
        (after printing an error) when parsing fails or the heading is absent.
    """
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        heading = soup.find('h1', id='firstHeading')
    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None.
        print("Error extracting article title:", e)
        return None

    # find() yields None when no element matches; report that explicitly
    # instead of letting it surface as an AttributeError on None.
    if heading is None:
        print("Error extracting article title: no <h1 id='firstHeading'> element found")
        return None

    return heading.text.strip()



In [None]:
# 1.3) Write a function to extract the article text, paragraph by paragraph, together with the headings.
# Map each heading to its paragraphs in a dictionary.

from bs4 import BeautifulSoup

def extract_article_text(html_content):
    """Map every heading of an article to the paragraphs that follow it.

    Args:
        html_content: HTML markup of the page.

    Returns:
        A dict with two keys: ``"title"`` (text of ``<h1 id="firstHeading">``)
        and ``"headings_paragraphs_map"`` (heading text -> list of paragraph
        texts appearing after that heading and before the next one).
        Returns ``None`` (after printing an error) if anything goes wrong,
        including a missing ``firstHeading`` element.
    """
    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Extract the title of the article
        title = soup.find('h1', id='firstHeading').text.strip()

        # Search headings and paragraphs together so they come back in
        # document order. Collecting them separately and concatenating the
        # two lists would visit every heading before any paragraph and
        # attach all paragraphs to the last heading.
        elements = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'])

        headings_paragraphs_map = {}
        current_heading = None

        for element in elements:
            text = element.text.strip()
            if element.name != 'p':
                # A heading starts a new section.
                current_heading = text
            elif current_heading:
                # Paragraphs seen before any (non-empty) heading are skipped.
                headings_paragraphs_map.setdefault(current_heading, []).append(text)

        return {
            "title": title,
            "headings_paragraphs_map": headings_paragraphs_map
        }
    except Exception as e:
        print("Error extracting article text:", e)
        return None


In [None]:
#1.4) Write a function to collect every link that redirects to another Wikipedia page

from bs4 import BeautifulSoup

def collect_wikipedia_links(html_content):
    """Gather the hrefs of anchors that point at other Wikipedia articles.

    An href qualifies when it begins with ``/wiki/`` and contains no ``:``.

    Args:
        html_content: HTML markup of the page.

    Returns:
        A list of matching href strings in the order the anchors are found
        (duplicates kept), or ``None`` after printing an error on failure.
    """
    try:
        parsed = BeautifulSoup(html_content, 'html.parser')

        # Only anchors that actually carry an href attribute are considered.
        targets = (anchor['href'] for anchor in parsed.find_all('a', href=True))

        return [
            target
            for target in targets
            if target.startswith('/wiki/') and ':' not in target
        ]
    except Exception as err:
        print("Error collecting Wikipedia links:", err)
        return None


In [None]:
#1.5) Wrap all the previous functions into a single function that takes as parameters a Wikipedia link
#1.6) Test the last function on a Wikipedia page of your choice



import requests
from bs4 import BeautifulSoup

def scrape_wikipedia_page(url, timeout=10):
    """Fetch a Wikipedia page and extract its title, sections and wiki links.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``"title"`` (text of ``<h1 id="firstHeading">``),
        ``"headings_paragraphs_map"`` (heading text -> list of paragraph texts
        that follow it) and ``"wikipedia_links"`` (hrefs starting with
        ``/wiki/`` that contain no ``:``). Returns ``None`` after printing an
        error if the request or the extraction fails.
    """
    try:
        # Fetch the page; a timeout keeps a stalled server from hanging the call.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for 4xx or 5xx status codes
        html_content = response.text

        soup = BeautifulSoup(html_content, 'html.parser')

        # Extract the title of the article
        title = soup.find('h1', id='firstHeading').text.strip()

        # Search headings and paragraphs together so they come back in
        # document order. Concatenating two separate result lists would visit
        # every heading before any paragraph and attach all paragraphs to the
        # last heading.
        elements = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'])

        headings_paragraphs_map = {}
        current_heading = None

        for element in elements:
            text = element.text.strip()
            if element.name != 'p':
                # A heading starts a new section.
                current_heading = text
            elif current_heading:
                # Paragraphs seen before any (non-empty) heading are skipped.
                headings_paragraphs_map.setdefault(current_heading, []).append(text)

        # Keep only internal article links: '/wiki/' prefix and no ':'.
        wikipedia_links = [
            link['href']
            for link in soup.find_all('a', href=True)
            if link['href'].startswith('/wiki/') and ':' not in link['href']
        ]

        return {
            "title": title,
            "headings_paragraphs_map": headings_paragraphs_map,
            "wikipedia_links": wikipedia_links
        }
    except requests.exceptions.RequestException as e:
        print("Error fetching content:", e)
        return None
    except Exception as e:
        print("Error:", e)
        return None

# Example usage: scrape one article and print everything that was collected.
# NOTE: these statements run at module level, so executing this cell/file
# performs a live HTTP request.
url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
result = scrape_wikipedia_page(url)
# scrape_wikipedia_page returns None on failure, so only print on success.
if result:
    print("Title:", result["title"])
    print("Headings and their respective paragraphs:")
    # Each map entry is a heading followed by the list of its paragraph texts.
    for heading, paragraphs in result["headings_paragraphs_map"].items():
        print(f"Heading: {heading}")
        for paragraph in paragraphs:
            print(f"Paragraph: {paragraph}")
        print()  # blank line between sections
    print("Wikipedia Links:")
    # Links are the raw href values (relative '/wiki/...' paths).
    for link in result["wikipedia_links"]:
        print(link)