# IMPORTING PACKAGES OR LIBRARIES

In [9]:
import requests  # Library for sending HTTP requests
from bs4 import BeautifulSoup  # Library for parsing HTML
from transformers import BartForConditionalGeneration, BartTokenizer  # Transformers library for BART model
import re  # Regular expressions library for text cleaning


# STEP1:WEB SCRAPPING 

In [10]:
def scrape_wikipedia(url):
    # Send an HTTP GET request to the given URL
    response = requests.get(url)
    
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Return the parsed HTML (BeautifulSoup object)
    return soup

# Function to extract headings and contents from the parsed HTML

In [11]:
def extract_headings_and_contents(soup):
    headings = []
    contents = []

    # Find all headings (h1, h2, h3, h4, h5, h6) in the HTML
    for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        # Clean the heading text and append it to the headings list
        headings.append(clean_text(heading.text.strip()))

    # Find all paragraphs (p) in the HTML
    for paragraph in soup.find_all('p'):
        # Clean the paragraph text and append it to the contents list
        contents.append(clean_text(paragraph.text))

    # Return the lists of headings and contents
    return headings, contents


# Step 2: Text Cleaning

In [12]:
def clean_text(text):
    # Remove numbers, spaces, symbols, and brackets using regular expressions
    cleaned_text = re.sub(r'\d', '', text)  # Remove numbers
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Remove extra spaces
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)  # Remove symbols and brackets
    
    # Strip any leading or trailing whitespace
    return cleaned_text.strip()


# Step 3: Text Summarization with Hugging Face Transformers (BART)

In [13]:
def summarize_with_bart(contents, ratio=0.2):
    # Specify the pre-trained BART model
    model_name = "facebook/bart-large-cnn"
    
    # Initialize BART tokenizer and model
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)
    
    # Initialize an empty list to store summaries
    summaries = []

    # Generate summaries for each content using the BART model
    for content in contents:
        # Tokenize and encode the content for input to the model
        inputs = tokenizer([content], max_length=1024, return_tensors="pt", truncation=True)
        
        # Generate the summary using the BART model
        summary_ids = model.generate(inputs["input_ids"], max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)
        
        # Decode the summary from token IDs to text
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        
        # Append the summary to the list
        summaries.append(summary)

    # Return the list of summaries
    return summaries

# Step 4: Main Function

In [14]:
def main():
    # Define the Wikipedia URL to scrape
    wikipedia_url = "https://en.wikipedia.org/wiki/Alexander_the_Great"
    
    # Scrape the Wikipedia page and get the parsed HTML
    soup = scrape_wikipedia(wikipedia_url)
    
    # Extract headings and contents from the parsed HTML
    headings, contents = extract_headings_and_contents(soup)

    # Summarize content using Hugging Face Transformers (BART)
    summaries_bart = summarize_with_bart(contents, ratio=0.5)  # You can adjust the ratio for summary length

    # Step 5: Print BART Summarized Content Only
    for i in range(len(headings)):
        print(headings[i])
        print( summaries_bart[i])
        print("="*50)  # Separator between sections

# Run the main function if the script is executed
if __name__ == "__main__":
    main()


Contents
CNN.com will feature iReporter photos in a weekly Travel Snapshots gallery. Please submit your best shots of the U.S. for next week. Visit CNN.com/Travel next Friday for a new gallery of snapshots from around the world. Please share your best photos of the world with CNN iReport.
Alexander the Great
Alexander III of Macedon was a king of the ancient Greek kingdom of Macedona. He succeeded his father Philip II to the throne in BC at the age of  25. He spent most of his ruling years conducting a lengthy military campaign throughout Western Asia and Egypt. He was undefeated in battle and is widely considered to be one of historys greatest and most successful military commanders.
Early life
Until the age of  Alexander was tutored by Aristotle. In BC shortly after his assumption of kingship over Macedon he reasserted control over Thrace and parts of Illyria. Alexander then led the League of Corinth and used his authority to launch the panHellenic project envisaged by his father.
Li

# In this cell we can run the BART method without given the content ratio 


In [15]:
# Step 4: Main Function
def main():
    wikipedia_url = "https://en.wikipedia.org/wiki/Alexander_the_Great"
    soup = scrape_wikipedia(wikipedia_url)
    headings, contents = extract_headings_and_contents(soup)

    # Summarize content using Hugging Face Transformers (BART)
    summaries_bart = []
    i = 0  # Initialize index
    while i < len(headings):
        current_heading = headings[i]
        next_heading = headings[i + 1] if i + 1 < len(headings) else None  # Get the next heading if it exists

        # Gather content for summarization
        content_to_summarize = []
        while i < len(headings) and headings[i] == current_heading:
            content_to_summarize.append(contents[i])
            i += 1

        # Include the content after the heading if there's another heading following
        if next_heading:
            content_to_summarize.extend(contents[i:i+3])  # Adjust this as needed to include more content

        # Summarize the gathered content
        summarized_content = " ".join(content_to_summarize)
        summaries_bart.append(summarized_content)

    # Step 5: Print BART Summarized Content Only
    for i in range(len(headings)):
        print(headings[i])
        print(summaries_bart[i])
        print("="*50)  # Separator between sections

if __name__ == "__main__":
    main()


Contents
 Alexander III of Macedon Ancient Greek Ἀλέξανδρος romanized Alexandros  July BC   June BC commonly known as Alexander the Greata was a king of the ancient Greek kingdom of Macedona He succeeded his father Philip II to the throne in BC at the age of  and spent most of his ruling years conducting a lengthy military campaign throughout Western Asia and Egypt By the age of  he had created one of the largest empires in history stretching from Greece to northwestern India He was undefeated in battle and is widely considered to be one of historys greatest and most successful military commanders Until the age of  Alexander was tutored by Aristotle In BC shortly after his assumption of kingship over Macedon he campaigned in the Balkans and reasserted control over Thrace and parts of Illyria before marching on the city of Thebes which was subsequently destroyed in battle Alexander then led the League of Corinth and used his authority to launch the panHellenic project envisaged by his f