In [1]:
"""
Author: Lai ZhonPoa
"""
from UtilsProcessor import ScrapedDataProcessor, WordDetailsProcessor
from ForumScraper import ForumScraper
from UtilsCleaner import UtilsCleaner
from WordDetailsGenerator import WordDetailsGenerator 
import google.generativeai as genai
import redis

# Initialize SparkSession
from GlobalSparkSession import GlobalSparkSession
spark = GlobalSparkSession.get_instance()

# Initialize ScrapedDataProcessor
scraped_data_processor = ScrapedDataProcessor()

# Initialize ForumScraper
forum_scraper = ForumScraper(base_url="https://b.cari.com.my/portal.php?mod=view&aid=")

# Define a wrapper function to use with setup_udf
def scrape_data_udf(article_id: int):
    data = forum_scraper.scrape_data(article_id)
    return data

# Setup the UDF with the wrapper function
scraped_data_processor.setup_udf(scrape_data_udf)

  from .autonotebook import tqdm as notebook_tqdm
24/12/22 18:10:07 WARN Utils: Your hostname, R2D3. resolves to a loopback address: 127.0.1.1; using 172.28.168.56 instead (on interface eth0)
24/12/22 18:10:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/22 18:10:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/22 18:10:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# === 1. Data Collection and preparation ===
# === * cari.com.my and wikipedia api ===
# Define AID values
aid_values = list(range(100, 300))

# Process articles and comments
article_df, comments_df = scraped_data_processor.process_articles(aid_values)
scraped_data_processor.save_dataframes(article_df, comments_df)

Scraping article ID: 196Scraping article ID: 208                  (0 + 16) / 16]

Scraping article ID: 184
Scraping article ID: 220
Scraping article ID: 256
Scraping article ID: 136
Scraping article ID: 244
Scraping article ID: 172
Scraping article ID: 232
Scraping article ID: 280
Scraping article ID: 268
Scraping article ID: 148
Scraping article ID: 160
Scraping article ID: 124
Scraping article ID: 100
Scraping article ID: 112
Scraping article ID: 197
Scraping article ID: 281
Scraping article ID: 185
Scraping article ID: 257
Scraping article ID: 209
Scraping article ID: 221
Scraping article ID: 269
Scraping article ID: 233
Scraping article ID: 245
Scraping article ID: 173
Scraping article ID: 149
Scraping article ID: 137
Scraping article ID: 113
Scraping article ID: 161
Scraping article ID: 125
Scraping article ID: 101
Scraping article ID: 198
Scraping article ID: 186
Scraping article ID: 282
Scraping article ID: 258
Scraping article ID: 210
Scraping article ID: 222
Scraping article I

In [3]:
# Process words
scraped_combined_words_df = scraped_data_processor.process_words('assignData/articles_data_csv_test', 'assignData/comments_data_csv_test')
print(f"For cari.coom.my, number of words:",scraped_combined_words_df.show(5))

# Read CSV file produced by kafka_consumer_show.py
crawled_data_df = spark.read.option("inferSchema", "true").csv("assignData/wiki_word_data_csv_test", header=True)
crawled_data_df.show(20)
print("Start of Union")
# Combine scraped and crawled words
combined_words_df = scraped_combined_words_df.union(crawled_data_df)

# Save the combined DataFrame
scraped_data_processor.save_cleaned_words(combined_words_df, UtilsCleaner().process_words)

+----------+
|      Word|
+----------+
|   Menteri|
|Pendidikan|
|         :|
|     garis|
|   panduan|
+----------+
only showing top 5 rows

For cari.coom.my, number of words: None
+--------------+
|          Word|
+--------------+
|         Badan|
|        Bahasa|
|         Sabah|
|        BAHASA|
|         telah|
|    ditubuhkan|
|        secara|
|      rasminya|
|          pada|
|         tahun|
|         March|
|          Yang|
|      Dipertua|
|        BAHASA|
|         Encik|
|     Dzulkifli|
|         Abdul|
|         Hamid|
|         telah|
|memperkenalkan|
+--------------+
only showing top 20 rows

Start of Union


                                                                                

Distinct Cleaned Combined Words:


                                                                                

+--------------------+
|        Cleaned_Word|
+--------------------+
|              online|
|              hingga|
|               often|
|           consulate|
|            muscular|
|           tingkatan|
|               garis|
|              antara|
|               pecah|
|                film|
|           countries|
|           berangkat|
|emailprotectedpet...|
|            komposer|
|         mengelakkan|
|                grow|
|        jawatankuasa|
|              kering|
|            diplomat|
|             lesbian|
|              cerita|
|            ministry|
|                para|
|              berisi|
|             dicedok|
|           penyiaran|
|               warna|
|       tersebutvideo|
|             bermula|
|           seseorang|
|           mengalami|
|                  us|
|             species|
|               sebab|
|              filmby|
|            arowanas|
|                   x|
|           perhatian|
|               untuk|
|               orang|
|          

                                                                                

Distinct Cleaned Combined Words Count: 8082


                                                                                

In [4]:
# === 2. Lexicon Creation ===
# === 3. Lexicon Enrichment ===
# === * Definition, Antonym, Synonym, Tatabahasa, Sentiment ===
# Initialize WordDetailsProcessor
gemini_api = 'AIzaSyDh-BwGFasADkSzkHzvw86rxamepBZX1Ew'  # Replace with your actual Free Gemini API key: https://aistudio.google.com/apikey
word_details_processor = WordDetailsProcessor(gemini_api)

# Read and process clean words
clean_words_df = word_details_processor.read_clean_words('assignData/clean_words_data_csv_test')
clean_words_df = word_details_processor.add_row_number(clean_words_df)

# Batch process word details (Labelling): Word, Definition, Antonym, Synonym, Tatabahasa, Sentiment.
all_csv_data = word_details_processor.batch_process(clean_words_df, 80, WordDetailsGenerator.get_word_details)
word_details_processor.parse_and_save(all_csv_data, 'assignData/word_details_csv_test')

# Filter usable words
word_details_processor.filter_usable_words('assignData/word_details_csv_test', 'assignData/word_details_csv_cleaned_test')

add_row_number running
Start get_word_details
get_word_details completed. 
<Start>
text
"hingga","selama tempoh masa; sampai ke","dari","sampai","kata sendi nama","0.0"
"often","kerap kali; selalu","jarang","sering","kata keterangan","0.0"
"consulate","pejabat wakil kerajaan asing","kedutaan","tidak diketahui","kata nama","0.0"
"muscular","berkaitan otot yang kuat dan tegap","lemah","kuat","kata sifat","0.0"
"tingkatan","peringkat atau tahap","tidak diketahui","darjah","kata nama","0.0"
"garis","lukisan lurus dan nipis","lengkung","urat","kata nama","0.0"
"antara","di tengah-tengah atau di antara dua perkara","luar","di tengah","kata sendi nama","0.0"
"pecah","terbelah atau hancur menjadi beberapa bahagian","utuh","retak","kata kerja","-0.2"
"film","wayang gambar atau filem sinematografi","tidak diketahui","filem wayang","kata nama","0.0"
"countries","negara-negara di dunia","tidak diketahui","negara","kata nama","0.0"
"berangkat","mula perjalanan atau meninggalkan sesuatu tempat","tib

                                                                                