<a href="https://colab.research.google.com/github/Brynlai/Data-Engineering-Assignment-RDSY2S2/blob/alia/scraping_script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install requests beautifulsoup4



In [7]:
import requests
from bs4 import BeautifulSoup

# Class to represent a Scraped Comment
class Comment:
    """One reader comment scraped from an article page.

    The three values are stored exactly as given; nothing is validated
    or converted.
    """

    def __init__(self, comment_id, user, comment_text):
        self.comment_id = comment_id
        self.user = user
        self.comment_text = comment_text

    def __str__(self):
        """Render the comment as an indented, multi-line text block."""
        # Every line of the rendered block (including the final, otherwise
        # empty one) is indented by six spaces; the block opens with a newline.
        pad = " " * 6
        rows = (
            "",
            f"{pad}Comment ID: {self.comment_id}",
            f"{pad}User: {self.user}",
            f"{pad}Comment:",
            f"{pad}{self.comment_text}",
            pad,
        )
        return "\n".join(rows)

# Class to hold scraped data
class ScrapedData:
    """Everything scraped from a single article page.

    All constructor arguments are stored unchanged under attributes of
    the same name. ``comments`` is expected to be an iterable whose items
    can be converted with ``str()``.
    """

    def __init__(self, aid, title, date, publisher, views, comments_count, content, comments):
        # Article identity and header metadata.
        self.aid = aid
        self.title = title
        self.date = date
        self.publisher = publisher

        # Counters as they were extracted from the page.
        self.views = views
        self.comments_count = comments_count

        # Article body and its comments.
        self.content = content
        self.comments = comments

    def __str__(self):
        """Render the article and all of its comments as indented text."""
        # Each comment is rendered with its own str() and the results are
        # separated by a single newline.
        rendered_comments = "\n".join(map(str, self.comments))

        # Every line of the report (including the final, otherwise empty
        # one) is indented by six spaces; the report opens with a newline.
        pad = " " * 6
        rows = [
            "",
            f"{pad}Aid: {self.aid}",
            f"{pad}Title: {self.title}",
            f"{pad}Date: {self.date}",
            f"{pad}Publisher: {self.publisher}",
            f"{pad}Views: {self.views}",
            f"{pad}Comments Count: {self.comments_count}",
            f"{pad}Content:",
            f"{pad}{self.content}",
            f"{pad}Comments Section:",
            f"{pad}{rendered_comments}",
            pad,
        ]
        return "\n".join(rows)

# Function to scrape all comments
def scrape_comments(soup):
    """Collect every comment found in the page's comment container.

    Args:
        soup: Parsed page to search (the caller in this file passes a
            BeautifulSoup document).

    Returns:
        A list of Comment objects in the order the tags are returned by
        the search; an empty list when the page has no
        ``<div id="comment_ul">`` container.

    Note:
        Quoted-reply blocks (``<div class="quote">``) are removed from the
        parsed tree before the text is read, so ``soup`` is modified.
    """
    container = soup.find('div', id='comment_ul')
    if not container:
        return []

    results = []

    # Each comment is a <dl> (or <dI>) tag that carries an id attribute.
    for node in container.find_all(['dl', 'dI'], id=True):
        # Drop the "comment_" prefix, then keep what precedes the next "_".
        raw_id = node.get('id')
        comment_id = raw_id.replace('comment_', '').split('_')[0]

        # The author is the first <a class="xi2"> link, when there is one.
        author_link = node.find('a', class_='xi2')
        user = "Anonymous" if not author_link else author_link.text.strip()

        # The comment body lives in the first <dd> tag.
        body = node.find('dd')
        if not body:
            comment_text = "No comment text"
        else:
            # Strip quoted replies so only this comment's own text remains.
            for quoted in body.find_all('div', class_='quote'):
                quoted.extract()
            comment_text = body.get_text(strip=True)

        results.append(Comment(comment_id=comment_id, user=user, comment_text=comment_text))

    return results

# Function to scrape a single article, given its URL and article ID (aid)
def scrape_article(url, aid, timeout=10):
    """Download one article page and parse it into a ScrapedData record.

    Args:
        url: Full URL of the article page to fetch.
        aid: Article identifier; stored on the result unchanged.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without a timeout a stalled connection would block
            the whole scraping run indefinitely.

    Returns:
        A ScrapedData instance. Fields that cannot be located on the page
        fall back to defaults: "Unknown" for title, date and publisher,
        "0" for views and comment count, "" for content, and an empty
        list for comments.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract title; fall back like the other fields instead of raising
    # AttributeError when the page has no <title> tag.
    title_tag = soup.find('title')
    title = title_tag.text if title_tag else "Unknown"

    # Extract date: the text before the first "|" in the <p class="xg1"> line
    date_tag = soup.find('p', class_='xg1')
    date = date_tag.text.split('|')[0].strip() if date_tag else "Unknown"

    # Extract publisher: the first link inside that same line
    publisher_tag = date_tag.find('a') if date_tag else None
    publisher = publisher_tag.text if publisher_tag else "Unknown"

    # Extract views (kept as the page's text, not converted to int)
    views_tag = soup.find('em', id='_viewnum')
    views = views_tag.text if views_tag else "0"

    # Extract comments count (kept as text as well)
    comments_tag = soup.find('em', id='_commentnum')
    comments_count = comments_tag.text if comments_tag else "0"

    # Extract content
    content_tag = soup.find('td', id='article_content')
    content = content_tag.get_text(strip=True) if content_tag else ""

    # Extract comments
    comments = scrape_comments(soup)

    return ScrapedData(aid, title, date, publisher, views, comments_count, content, comments)

# Main function to orchestrate scraping process
def main():
    """Scrape a small sample of articles and print each one.

    A failure on one article is reported on stdout and does not stop the
    remaining articles from being processed.
    """
    base_url = "https://b.cari.com.my/portal.php?mod=view&aid="
    separator = "\n------------------------\n"

    # Sample of aids: 1-5 plus 20000. The full run should go up to 25000+;
    # aids are sequential, so a range covers them.
    aid_values = [*range(1, 6), 20000]

    for aid in aid_values:
        url = base_url + str(aid)
        try:
            # Printing stays inside the try so a rendering error is
            # reported the same way as a scraping error.
            print(scrape_article(url, aid))
            print(separator)
        except Exception as e:
            print(f"Error scraping {url}: {e}")

# Start scraping only when this code is executed directly (as a script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


      Aid: 1
      Title: Laporan polis terhadap Uncle Seekers didakwa hina Sultan Johor - CariDotMy 
      Date: 19-7-2012 05:16 PM
      Publisher: cmcadmin
      Views: 243700
      Comments Count: 1917
      Content:
      Post Last Edit by the_killer at 1-7-2012 17:14JOHOR BAHARU 1 Julai - Tiga puluh individu hari ini membuat laporan polis terhadap pengamal paranormal Syed Abdullah Hussein Al-Attas atau lebih dikenali sebagai Uncle Seekers berhubung penyiaran artikel dalam blognya yang dikatakan berunsur provokasi, hasutan dan mengaibkan Sultan Johor.Laporan polis berkenaan dibuat pada pukul 12.10 tengah hari di Balai Polis Sentral, di sini, bagi memohon pihak berkuasa menjalankan siasatan terhadap blog uncleseekers.blogspot.com yang turut berunsur mengaibkan Tunku Mahkota Johor, pemimpin dan penjawat awam kerajaan Johor.Wakil kesemua individu berkenaan, seorang peniaga, Khairil Azlishah Jalil, 32, ketika ditemui pemberita berkata pihaknya meminta blog berkenaan ditutup memandang