In [1]:
"""
Author: Lai ZhonPoa
"""
from UtilsProcessor import ScrapedDataProcessor, WordDetailsProcessor
from ForumScraper import ForumScraper
from UtilsCleaner import UtilsCleaner
from WordDetailsGenerator import WordDetailsGenerator 
import google.generativeai as genai
import redis

# Initialize SparkSession
from GlobalSparkSession import GlobalSparkSession
spark = GlobalSparkSession.get_instance()

# Initialize ScrapedDataProcessor
scraped_data_processor = ScrapedDataProcessor()

# Initialize ForumScraper
forum_scraper = ForumScraper(base_url="https://b.cari.com.my/portal.php?mod=view&aid=")

# Define a wrapper function to use with setup_udf
def scrape_data_udf(article_id: int):
    data = forum_scraper.scrape_data(article_id)
    return data

# Setup the UDF with the wrapper function
scraped_data_processor.setup_udf(scrape_data_udf)

  from .autonotebook import tqdm as notebook_tqdm
24/12/21 14:23:08 WARN Utils: Your hostname, R2D3. resolves to a loopback address: 127.0.1.1; using 172.28.168.56 instead (on interface eth0)
24/12/21 14:23:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/21 14:23:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/21 14:23:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# === 1. Data Collection and preparation ===
# === * cari.com.my and wikipedia api ===
# Define AID values
aid_values = list(range(100, 500))

# Process articles and comments
article_df, comments_df = scraped_data_processor.process_articles(aid_values)
scraped_data_processor.save_dataframes(article_df, comments_df)

Scraping AID: 232                                                 (0 + 16) / 16]
Scraping AID: 160
Scraping AID: 244
Scraping AID: 208
Scraping AID: 112
Scraping AID: 148
Scraping AID: 220
Scraping AID: 280
Scraping AID: 136
Scraping AID: 256
Scraping AID: 196
Scraping AID: 172Scraping AID: 100

Scraping AID: 268
Scraping AID: 124
Scraping AID: 245
Scraping AID: 233
Scraping AID: 184
Scraping AID: 149
Scraping AID: 209
Scraping AID: 221
Scraping AID: 246
Scraping AID: 197
Scraping AID: 269
Scraping AID: 281
Scraping AID: 257
Scraping AID: 185
Scraping AID: 161
Scraping AID: 234
Scraping AID: 113
Scraping AID: 101
Scraping AID: 173
Scraping AID: 125
Scraping AID: 137
Scraping AID: 210
Scraping AID: 270
Scraping AID: 150
Scraping AID: 222
Scraping AID: 247
Scraping AID: 198
Scraping AID: 258
Scraping AID: 235
Scraping AID: 282
Scraping AID: 186
Scraping AID: 114
Scraping AID: 162
Scraping AID: 211
Scraping AID: 138
Scraping AID: 126
Scraping AID: 174
Scraping AID: 102
Scraping AID: 271
S

In [3]:
# Process words
scraped_combined_words_df = scraped_data_processor.process_words('assignData/articles_data_csv_test', 'assignData/comments_data_csv_test')
print(f"For cari.coom.my, number of words:",scraped_combined_words_df.show(5))

# Read CSV file produced by kafka_consumer_show.py
crawled_data_df = spark.read.option("inferSchema", "true").csv("assignData/wiki_word_data_csv_test", header=True)
crawled_data_df.show(20)
print("Start of Union")
# Combine scraped and crawled words
combined_words_df = scraped_combined_words_df.union(crawled_data_df)

# Save the combined DataFrame
scraped_data_processor.save_cleaned_words(combined_words_df, UtilsCleaner().process_words)

+----------+
|      Word|
+----------+
|   Menteri|
|Pendidikan|
|         :|
|     garis|
|   panduan|
+----------+
only showing top 5 rows

For cari.coom.my, number of words: None
+--------------+
|          Word|
+--------------+
|         Badan|
|        Bahasa|
|         Sabah|
|        BAHASA|
|         telah|
|    ditubuhkan|
|        secara|
|      rasminya|
|          pada|
|         tahun|
|         March|
|          Yang|
|      Dipertua|
|        BAHASA|
|         Encik|
|     Dzulkifli|
|         Abdul|
|         Hamid|
|         telah|
|memperkenalkan|
+--------------+
only showing top 20 rows

Start of Union


                                                                                

Distinct Cleaned Combined Words:


                                                                                

+--------------------+
|        Cleaned_Word|
+--------------------+
|              online|
|              hingga|
|               often|
|           consulate|
|            muscular|
|           tingkatan|
|               garis|
|              antara|
|               pecah|
|                film|
|           countries|
|           berangkat|
|emailprotectedpet...|
|            komposer|
|         mengelakkan|
|                grow|
|        jawatankuasa|
|              kering|
|            diplomat|
|             lesbian|
|              cerita|
|            ministry|
|                para|
|              berisi|
|             dicedok|
|           penyiaran|
|               warna|
|       tersebutvideo|
|             bermula|
|           seseorang|
|           mengalami|
|                  us|
|             species|
|               sebab|
|              filmby|
|            arowanas|
|                   x|
|           perhatian|
|               untuk|
|               orang|
|          

                                                                                

Distinct Cleaned Combined Words Count: 7180


                                                                                

In [5]:
# === 2. Lexicon Creation ===
# === 3. Lexicon Enrichment ===
# === * Definition, Antonym, Synonym, Tatabahasa, Sentiment ===
# Initialize WordDetailsProcessor
gemini_api = 'AIzaSyArmrBA_urVXM5rJl7rh3xzNq40iRrKp-4'  # Replace with your actual Free Gemini API key: https://aistudio.google.com/apikey
word_details_processor = WordDetailsProcessor(gemini_api)

# Read and process clean words
clean_words_df = word_details_processor.read_clean_words('assignData/clean_words_data_csv_test')
clean_words_df = word_details_processor.add_row_number(clean_words_df)

# Batch process word details (Labelling): Word, Definition, Antonym, Synonym, Tatabahasa, Sentiment.
all_csv_data = word_details_processor.batch_process(clean_words_df, 80, WordDetailsGenerator.get_word_details)
word_details_processor.parse_and_save(all_csv_data, 'assignData/word_details_csv_test')

# Filter usable words
word_details_processor.filter_usable_words('assignData/word_details_csv_test', 'assignData/word_details_csv_cleaned_test')

add_row_number running
Start get_word_details
get_word_details completed. Returned:
"hingga","sehingga; sampai; setakat","tidak diketahui","sampai","kata hubung","0.0"
"often","sering; kerap","jarang","selalu","kata adjektif","0.0"
"consulate","kedutaan kecil; pejabat konsul","tidak diketahui","kedutaan","kata nama","0.0"
"muscular","bertenaga; berotot","lemah; nipis","kuat; gagah","kata adjektif","0.0"
"tingkatan","peringkat; tahap","tidak diketahui","darjah","kata nama","0.0"
"garis","garisan; sempadan; garisan","tidak diketahui","batas; garisan","kata nama","0.0"
"antara","di antara; di tengah-tengah","tidak diketahui","dengan","kata sendi nama","0.0"
"pecah","retak; hancur; putus","utuh; lengkap","pisah","kata kerja","0.0"
"film","filem; wayang gambar","tidak diketahui","filem","kata nama","0.0"
"countries","negara-negara","tidak diketahui","tanah besar","kata nama","0.0"
"berangkat","bertolak; pergi","kembali; pulang","pergi","kata kerja","0.0"
"emailprotectedpetaling","tidak dike

                                                                                