In [6]:
import sys
sys.path.append('/home/student/.local/lib/python3.10/site-packages')

from hdfs import InsecureClient






In [7]:
import requests
from bs4 import BeautifulSoup
from hdfs import InsecureClient

# Class to hold scraped data
class ScrapedData:
    def __init__(self, aid, title, content):
        self.aid = aid
        self.title = title
        self.content = content

    # String representation of the object
    def __str__(self):
        return f"""
        Aid: {self.aid}
        Title: {self.title}
        Content:
        {self.content}
        """

# Function to remove "-CariDotMy" from text and trim spaces
def remove_cari_dot_my(text):
    """
    Removes '-CariDotMy' from the given text and trims leading/trailing spaces.
    """
    return text.replace("- CariDotMy", "").strip()

# Scrape article from URL and aid
def scrape_article(url, aid):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract title
    title = soup.find('title').text if soup.find('title') else "Unknown"
    title = remove_cari_dot_my(title)  # Clean title

    # Extract content
    content_tag = soup.find('td', id='article_content')
    content = content_tag.get_text(strip=True) if content_tag else "No content available"

    return ScrapedData(aid, title, content)

# Function to save data to HDFS
def save_to_hdfs(client, data, hdfs_path):
    """
    Saves scraped data to HDFS as a JSON-like text.
    """
    with client.write(hdfs_path, encoding='utf-8', overwrite=True) as writer:
        for entry in data:
            writer.write(f"{entry}\n")  # Store each ScrapedData entry as a new line

# Main function to orchestrate scraping process
def main():
    base_url = "https://b.cari.com.my/portal.php?mod=view&aid="
    hdfs_client = InsecureClient('http://localhost:9870', user='hdfs')  # Update URL and user as needed
    hdfs_path = "/user/hdfs/scraped_articles.txt"  # HDFS file path

    # List of aids to scrape
    aid_values = list(range(1, 6))  # Should be until 25000+
    aid_values.append(20000)  # Example inclusion of a higher AID.

    scraped_data = []

    for aid in aid_values:
        url = f"{base_url}{aid}"  # Construct full URL

        try:
            article = scrape_article(url, aid)  # Scrape data
            scraped_data.append(article)
            print(article)  # Print scraped data for verification

        except Exception as e:
            print(f"Error scraping {url}: {e}")  # Handle exceptions

    # Save data to HDFS
    try:
        save_to_hdfs(hdfs_client, scraped_data, hdfs_path)
        print(f"Scraped data saved to HDFS at {hdfs_path}")
    except Exception as e:
        print(f"Error saving data to HDFS: {e}")

if __name__ == "__main__":
    main()


        Aid: 1
        Title: Laporan polis terhadap Uncle Seekers didakwa hina Sultan Johor
        Content:
        Post Last Edit by the_killer at 1-7-2012 17:14JOHOR BAHARU 1 Julai - Tiga puluh individu hari ini membuat laporan polis terhadap pengamal paranormal Syed Abdullah Hussein Al-Attas atau lebih dikenali sebagai Uncle Seekers berhubung penyiaran artikel dalam blognya yang dikatakan berunsur provokasi, hasutan dan mengaibkan Sultan Johor.Laporan polis berkenaan dibuat pada pukul 12.10 tengah hari di Balai Polis Sentral, di sini, bagi memohon pihak berkuasa menjalankan siasatan terhadap blog uncleseekers.blogspot.com yang turut berunsur mengaibkan Tunku Mahkota Johor, pemimpin dan penjawat awam kerajaan Johor.Wakil kesemua individu berkenaan, seorang peniaga, Khairil Azlishah Jalil, 32, ketika ditemui pemberita berkata pihaknya meminta blog berkenaan ditutup memandangkan terdapat dokumen-dokumen sulit yang dipaparkan pada blog tersebut."Sebagai rakyat Johor yang taat setia k

In [9]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder \
    .appName("Split Sentences into Words") \
    .getOrCreate()

# Path to the file in HDFS
hdfs_path = "hdfs:///user/hdfs/scraped_articles.txt"

# Read the data as an RDD
data_rdd = spark.sparkContext.textFile(hdfs_path)

# Split sentences into words
words_rdd = data_rdd.flatMap(lambda sentence: sentence.split())

# Collect and print the result (for demonstration purposes)
words = words_rdd.collect()
print(words)

# If you'd like to save the output back to HDFS
output_path = "hdfs:///user/hdfs/split_words"
words_rdd.saveAsTextFile(output_path)

# Stop the Spark session
spark.stop()


['Aid:', '1', 'Title:', 'Laporan', 'polis', 'terhadap', 'Uncle', 'Seekers', 'didakwa', 'hina', 'Sultan', 'Johor', 'Content:', 'Post', 'Last', 'Edit', 'by', 'the_killer', 'at', '1-7-2012', '17:14JOHOR', 'BAHARU', '1', 'Julai', '-', 'Tiga', 'puluh', 'individu', 'hari', 'ini', 'membuat', 'laporan', 'polis', 'terhadap', 'pengamal', 'paranormal', 'Syed', 'Abdullah', 'Hussein', 'Al-Attas', 'atau', 'lebih', 'dikenali', 'sebagai', 'Uncle', 'Seekers', 'berhubung', 'penyiaran', 'artikel', 'dalam', 'blognya', 'yang', 'dikatakan', 'berunsur', 'provokasi,', 'hasutan', 'dan', 'mengaibkan', 'Sultan', 'Johor.Laporan', 'polis', 'berkenaan', 'dibuat', 'pada', 'pukul', '12.10', 'tengah', 'hari', 'di', 'Balai', 'Polis', 'Sentral,', 'di', 'sini,', 'bagi', 'memohon', 'pihak', 'berkuasa', 'menjalankan', 'siasatan', 'terhadap', 'blog', 'uncleseekers.blogspot.com', 'yang', 'turut', 'berunsur', 'mengaibkan', 'Tunku', 'Mahkota', 'Johor,', 'pemimpin', 'dan', 'penjawat', 'awam', 'kerajaan', 'Johor.Wakil', 'kesemua