In [1]:
import os
import requests
from bs4 import BeautifulSoup
import re

In [3]:
def scrape_article(url, timeout=10):
    """Download an article page and extract its title and a short summary.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the whole run.

    Returns:
        A ``(title, summary, url)`` tuple. ``title`` is the text of the first
        ``<h1>`` (or ``'No title found'``) with characters that are invalid in
        filenames removed; ``summary`` is the first three ``<p>`` texts joined
        by blank lines. On any request failure (connection error, timeout,
        HTTP error status, malformed URL) ``(None, None, None)`` is returned
        after printing the error.
    """
    try:
        # Send a GET request to the provided URL
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return None, None, None

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the title (look the heading up once instead of twice)
    heading = soup.find('h1')
    title = heading.get_text(strip=True) if heading else 'No title found'
    title = re.sub(r'[\\/*?:"<>|]', "", title)  # Clean the title for use as a filename

    # Extract paragraphs
    paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]

    # Create summary (first few paragraphs)
    summary = "\n\n".join(paragraphs[:3])  # Adjust the number of paragraphs for summary as needed

    return title, summary, url

def read_links(file_path):
    """Read a list of URLs from a text file, one URL per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped, so an empty line never turns into an empty URL. The file is
    decoded as UTF-8; a leading byte-order mark, if present, is discarded so
    it cannot end up glued to the first URL.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of non-empty link strings in file order. If the file cannot be
        opened or decoded, the error is printed and an empty list is returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            stripped = (line.strip() for line in file)
            # Built inside the ``with`` so the file is still open while read.
            return [link for link in stripped if link]
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file; ValueError: undecodable bytes
        # (UnicodeDecodeError) or an invalid path string.
        print(f"Error reading the file: {e}")
        return []

def write_article_to_file(title, summary, source):
    """Save an article summary to ``summary/<title>.txt``.

    The file contains the summary followed by a ``Source:`` line with the
    originating URL. The ``summary`` output directory is created on demand so
    a first run does not fail when the directory is missing.

    Args:
        title: Filename-safe article title; used as the file's base name.
        summary: Text to write.
        source: URL the article was scraped from.

    Nothing is written (a skip notice is printed instead) when ``title`` or
    ``summary`` is empty or ``None``.
    """
    if not (title and summary):
        print("Skipping article with insufficient data.")
        return

    # Make sure the output directory exists before opening the file.
    os.makedirs("summary", exist_ok=True)

    filename = f"summary/{title}.txt"
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(summary)
        file.write("\n\nSource: " + source)
    print(f"Article '{title}' saved as '{filename}'")

def main():
    """Scrape every URL listed in ``links.txt`` and save a summary for each."""
    input_file = 'links.txt'  # Replace with your input file path

    for url in read_links(input_file):
        # scrape_article yields (title, summary, source); pass it straight on.
        article = scrape_article(url)
        write_article_to_file(*article)

# Run the scraper only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


Article 'Royal Mail hires new boss who triggered strike threat at Heathrow' saved as 'summary/Royal Mail hires new boss who triggered strike threat at Heathrow.txt'
Article 'Rolls-Royce and Turkish Airlines celebrate ongoing strategic partnership' saved as 'summary/Rolls-Royce and Turkish Airlines celebrate ongoing strategic partnership.txt'
Article 'Royal Mail names senior Heathrow executive as next boss' saved as 'summary/Royal Mail names senior Heathrow executive as next boss.txt'
