In [1]:
import requests
from bs4 import BeautifulSoup
from typing import List
import json

class Comment:
    def __init__(self, comment_id: int, user: str, comment_text: str):
        self.comment_id = comment_id
        self.user = user
        self.comment_text = comment_text

    def __str__(self):
        return f"""
        Comment ID: {self.comment_id}
        User: {self.user}
        Comment:
        {self.comment_text}
        """

class ScrapedData:
    def __init__(self, 
                 aid: int, 
                 title: str, 
                 date: str, 
                 publisher: str, 
                 views: int, 
                 comments_count: int, 
                 content: str, 
                 comments: List[Comment]) -> None:
        self.aid = aid
        self.title = title
        self.date = date
        self.publisher = publisher
        self.views = views
        self.comments_count = comments_count
        self.content = content
        self.comments = comments

    def __str__(self) -> str:
        comments_str = "\n".join(str(comment) for comment in self.comments)
        return f"""
        Aid: {self.aid}
        Title: {self.title}
        Date: {self.date}
        Publisher: {self.publisher}
        Views: {self.views}
        Comments Count: {self.comments_count}
        Content:
        {self.content}
        Comments Section:
        {comments_str}
        """

def scrape_comments(soup: BeautifulSoup) -> List[Comment]:
    comments: List[Comment] = []
    comments_container = soup.find('div', id='comment_ul')
    if not comments_container:
        return comments

    comment_tags = comments_container.find_all(['dl', 'dI'], id=True)
    for comment_tag in comment_tags:
        comment_id_str = comment_tag.get('id', '').replace('comment_', '').split('_')[0]
        comment_id = int(comment_id_str) if comment_id_str.isdigit() else 0

        user_tag = comment_tag.find('a', class_='xi2')
        user = user_tag.text.strip() if user_tag else "Anonymous"

        comment_text_tag = comment_tag.find('dd')
        if comment_text_tag:
            quote_tags = comment_text_tag.find_all('div', class_='quote')
            for quote_tag in quote_tags:
                quote_tag.extract()
            comment_text = comment_text_tag.get_text(strip=True)
        else:
            comment_text = "No comment text"

        comments.append(Comment(comment_id=comment_id, user=user, comment_text=comment_text))
    return comments

def scrape_article(url: str, aid: int) -> ScrapedData:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    title = str(soup.find('title').text if soup.find('title') else "Unknown")

    date_tag = soup.find('p', class_='xg1')
    date = str(date_tag.text.split('|')[0].strip()) if date_tag else "Unknown"

    publisher_tag = date_tag.find('a') if date_tag else None
    publisher = str(publisher_tag.text if publisher_tag else "Unknown")

    views_tag = soup.find('em', id='_viewnum')
    views_str = views_tag.text if views_tag else "0"
    views = int(views_str.replace(',', '')) if views_str.isdigit() else 0

    comments_tag = soup.find('em', id='_commentnum')
    comments_count_str = comments_tag.text if comments_tag else "0"
    comments_count = int(comments_count_str.replace(',', '')) if comments_count_str.isdigit() else 0

    content_tag = soup.find('td', id='article_content')
    content = str(content_tag.get_text(strip=False)) if content_tag else ""

    comments = scrape_comments(soup)

    return ScrapedData(aid, title, date, publisher, views, comments_count, content, comments)

def fetch_articles_from_newsapi(api_key: str, query: str, page_size: int = 5):
    url = f"https://newsapi.org/v2/everything?q={query}&pageSize={page_size}&apiKey={api_key}"
    response = requests.get(url)
    data = response.json()

    if 'articles' not in data:
        raise ValueError("Error fetching articles from NewsAPI: " + data.get('message', 'Unknown error'))

    return [(article['url'], article['title']) for article in data['articles']]

def main():
    api_key = "3e0754c21dbb4aae92e463269d0830da"
    query = "technology"  # Example query to fetch tech-related articles

    articles = fetch_articles_from_newsapi(api_key, query)

    for i, (url, title) in enumerate(articles, start=1):
        print(f"Scraping article {i}: {title} ({url})")
        try:
            scraped_data = scrape_article(url, i)
            print(scraped_data)
            print("\n------------------------\n")
        except Exception as e:
            print(f"Error scraping {url}: {e}")

main()


Scraping article 1: [Removed] (https://removed.com)

        Aid: 1
        Title: removed.com
        Date: Unknown
        Publisher: Unknown
        Views: 0
        Comments Count: 0
        Content:
        
        Comments Section:
        
        

------------------------

Scraping article 2: Lasers Are Making It Easier to Find Buried Land Mines (https://www.wired.com/story/this-laser-system-can-locate-landmines-with-high-accuracy/)

        Aid: 2
        Title: Lasers Are Making It Easier to Find Buried Land Mines | WIRED
        Date: Unknown
        Publisher: Unknown
        Views: 0
        Comments Count: 0
        Content:
        
        Comments Section:
        
        

------------------------

Scraping article 3: [Removed] (https://removed.com)

        Aid: 3
        Title: removed.com
        Date: Unknown
        Publisher: Unknown
        Views: 0
        Comments Count: 0
        Content:
        
        Comments Section:
        
        

-------------