In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
url = "https://en.wikipedia.org/wiki/HIV"

# Send a GET request to the specified URL and store the response
response = requests.get(url)

# Check if the request was successful (status code 200 indicates success)
if response.status_code == 200:
    # If successful, print a success message
    print("Successfully fetched the webpage!")
else:
    # If not successful, print a failure message
    print("Failed to fetch the webpage.")

Successfully fetched the webpage!


In [3]:
soup = BeautifulSoup(response.content , 'html.parser')


In [4]:
soup.prettify()

'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-enabled vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" dir="ltr" lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <title>\n   HIV - Wikipedia\n  </title>\n  <script>\n   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled 

In [14]:
def get_title(soup):
    title = soup.find('title')
    if title is not None:
        return title.text.strip()  # strip() to remove leading/trailing whitespaces
    return "Title not found"

In [17]:
#Write a function to Extract article title
title = get_title(soup)
print(title)

HIV - Wikipedia


In [26]:
###Write a function to Extract article text for each paragraph with their respective headings. Map those headings to their respective paragraphs in the dictionary###
def extract_article_text(soup):
    article_text = {}

    # Find all headings (h1, h2, h3, etc.)
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

    # Iterate through headings and extract corresponding paragraphs
    for heading in headings:
        heading_text = heading.text.strip()
        paragraphs = []
        paragraph = heading.find_next('p')
        while paragraph:
            paragraphs.append(paragraph.text.strip())
            paragraph = paragraph.find_next('p')
        article_text[heading_text] = paragraphs

    return article_text

# Call the function and store the result in a variable
article_text = extract_article_text(soup)

# Print the result
for heading, paragraphs in article_text.items():
    print(f"**{heading}**")
    for paragraph in paragraphs:
        print(paragraph)
    print()

**Contents**


The human immunodeficiency viruses (HIV) are two species of Lentivirus (a subgroup of retrovirus) that infect humans. Over time, they cause acquired immunodeficiency syndrome (AIDS),[1][2] a condition in which progressive failure of the immune system allows life-threatening opportunistic infections and cancers to thrive.[3] Without treatment, the average survival time after infection with HIV is estimated to be 9 to 11 years, depending on the HIV subtype.[4]
In most cases, HIV is a sexually transmitted infection and occurs by contact with or transfer of blood, pre-ejaculate, semen, and vaginal fluids.[5][6] Non-sexual transmission can occur from an infected mother to her infant during pregnancy, during childbirth by exposure to her blood or vaginal fluid, and through breast milk.[7][8][9][10] Within these bodily fluids, HIV is present as both free virus particles and virus within infected immune cells.
Research has shown (for both same-sex and opposite-sex couples) that 

In [27]:
###Write a function to collect every link that redirects to another Wikipedia page###
def collect_wikipedia_links(soup):
    wikipedia_links = []

    # Find all links on the page
    links = soup.find_all('a', href=True)

    # Iterate through links and check if they point to another Wikipedia page
    for link in links:
        href = link['href']
        if href.startswith('/wiki/') and ':' not in href:
            wikipedia_links.append(f"https://en.wikipedia.org{href}")

    return wikipedia_links

# Call the function and store the result in a variable
wikipedia_links = collect_wikipedia_links(soup)

# Print the result
for link in wikipedia_links:
    print(link)

https://en.wikipedia.org/wiki/Main_Page
https://en.wikipedia.org/wiki/Main_Page
https://en.wikipedia.org/wiki/HIV
https://en.wikipedia.org/wiki/HIV
https://en.wikipedia.org/wiki/HIV
https://en.wikipedia.org/wiki/HIV/AIDS
https://en.wikipedia.org/wiki/HIV_(disambiguation)
https://en.wikipedia.org/wiki/AIDS_(computer_virus)
https://en.wikipedia.org/wiki/Scanning_electron_micrograph
https://en.wikipedia.org/wiki/Lymphocyte
https://en.wikipedia.org/wiki/Taxonomy_(biology)
https://en.wikipedia.org/wiki/Virus
https://en.wikipedia.org/wiki/Riboviria
https://en.wikipedia.org/wiki/Revtraviricetes
https://en.wikipedia.org/wiki/Revtraviricetes
https://en.wikipedia.org/wiki/Revtraviricetes
https://en.wikipedia.org/wiki/Ortervirales
https://en.wikipedia.org/wiki/Retrovirus
https://en.wikipedia.org/wiki/Orthoretrovirinae
https://en.wikipedia.org/wiki/Lentivirus
https://en.wikipedia.org/wiki/Subtypes_of_HIV#HIV-1
https://en.wikipedia.org/wiki/Subtypes_of_HIV#HIV-2
https://en.wikipedia.org/wiki/Bovine

In [28]:
def wikipedia_link_analyzer(wikipedia_link):
    response = requests.get(wikipedia_link)
    soup = BeautifulSoup(response.text, 'html.parser')

    title = soup.find('h1', id='firstHeading').text
    paragraphs = soup.find_all('p')
    text = '\n'.join([p.text for p in paragraphs])

    links = soup.find_all('a', href=True)
    wikipedia_links = []

    for link in links:
        href = link['href']
        if href.startswith('/wiki/') and ':' not in href:
            new_link = f"https://en.wikipedia.org{href}"
            response = requests.head(new_link, allow_redirects=True)
            if response.status_code == 200:
                wikipedia_links.append(new_link)

    return {
        'title': title,
        'text': text,
        'links': wikipedia_links
    }

# Example usage:
wikipedia_link = "https://en.wikipedia.org/wiki/HIV"
result = wikipedia_link_analyzer(wikipedia_link)
print("Title:", result['title'])
print("Text:", result['text'])
print("Links:", result['links'])

Title: HIV
Text: 



The human immunodeficiency viruses (HIV) are two species of Lentivirus (a subgroup of retrovirus) that infect humans. Over time, they cause acquired immunodeficiency syndrome (AIDS),[1][2] a condition in which progressive failure of the immune system allows life-threatening opportunistic infections and cancers to thrive.[3] Without treatment, the average survival time after infection with HIV is estimated to be 9 to 11 years, depending on the HIV subtype.[4]

In most cases, HIV is a sexually transmitted infection and occurs by contact with or transfer of blood, pre-ejaculate, semen, and vaginal fluids.[5][6] Non-sexual transmission can occur from an infected mother to her infant during pregnancy, during childbirth by exposure to her blood or vaginal fluid, and through breast milk.[7][8][9][10] Within these bodily fluids, HIV is present as both free virus particles and virus within infected immune cells.
Research has shown (for both same-sex and opposite-sex couples

In [29]:
import requests
from bs4 import BeautifulSoup

def wikipedia_page_analyzer(url):
    response = requests.get(url)
    if response.status_code == 200:
        print("Successfully fetched the webpage!")
    else:
        print("Failed to fetch the webpage.")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    def get_title(soup):
        title = soup.find('title')
        if title is not None:
            return title.text.strip()
        return "Title not found"

    def extract_article_text(soup):
        article_text = {}
        headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        for heading in headings:
            heading_text = heading.text.strip()
            paragraphs = []
            paragraph = heading.find_next('p')
            while paragraph:
                paragraphs.append(paragraph.text.strip())
                paragraph = paragraph.find_next('p')
            article_text[heading_text] = paragraphs
        return article_text

    def collect_wikipedia_links(soup):
        wikipedia_links = []
        links = soup.find_all('a', href=True)
        for link in links:
            href = link['href']
            if href.startswith('/wiki/') and ':' not in href:
                wikipedia_links.append(f"https://en.wikipedia.org{href}")
        return wikipedia_links

    title = get_title(soup)
    print("Title:", title)

    article_text = extract_article_text(soup)
    print("Article Text:")
    for heading, paragraphs in article_text.items():
        print(f"**{heading}**")
        for paragraph in paragraphs:
            print(paragraph)
        print()

    wikipedia_links = collect_wikipedia_links(soup)
    print("Wikipedia Links:")
    for link in wikipedia_links:
        print(link)

    return {
        'title': title,
        'article_text': article_text,
        'wikipedia_links': wikipedia_links
    }

# Example usage:
url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
result = wikipedia_page_analyzer(url)

Successfully fetched the webpage!
Title: Python (programming language) - Wikipedia
Article Text:
**Contents**

Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.[32]
Python is dynamically typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a "batteries included" language due to its comprehensive standard library.[33][34]
Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0.[35] Python 2.0 was released in 2000. Python 3.0, released in 2008, was a major revision not completely backward-compatible with earlier versions. Python 2.7.18, released in 2020, was the last release of Python 2.[36]
Python consistently ranks as one of the most popular programming la