In [1]:
from neo4j import GraphDatabase
from WordDetailsGenerator import WordDetailsGenerator
from pyspark.sql.functions import udf, split, col, concat, regexp_replace, explode, row_number, monotonically_increasing_id
from typing import List
from GlobalSparkSession import GlobalSparkSession
from UtilsRedis import Redis_Utilities
spark = GlobalSparkSession.get_instance()

import redis
# Setup Redis client
redis_utilities = Redis_Utilities()

  from .autonotebook import tqdm as notebook_tqdm
24/12/22 20:15:52 WARN Utils: Your hostname, R2D3. resolves to a loopback address: 127.0.1.1; using 172.28.168.56 instead (on interface eth0)
24/12/22 20:15:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/22 20:15:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
def setup_neo4j_driver(uri, user, password):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    driver.verify_connectivity()
    return driver

def find_words_without_properties(tx):
    query = """
    MATCH (w:Word)
    WHERE w.definition IS NULL AND w.tatabahasa IS NULL
    RETURN w.word AS word
    """
    result = tx.run(query)
    return [record["word"] for record in result]

def insert_word_properties(tx, word, definition, tatabahasa, sentiment):
    query = """
    MATCH (w:Word {word: $word})
    SET w.definition = $definition, w.tatabahasa = $tatabahasa
    """
    tx.run(query, word=word, definition=definition, tatabahasa=tatabahasa)
    
    # Redis operations
    redis_utilities.store_sentiment(word, sentiment)
    redis_utilities.update_tatabahasa_count(tatabahasa)
    redis_utilities.update_sentiment_count(sentiment)

def parse_word_details(word_details_str):
    details_list = []
    rows = word_details_str.strip().split("\n")
    for row in rows:
        columns = row.split(",")
        if len(columns) == 6:
            word, definition, antonym, synonym, tatabahasa, sentiment = columns
            details_list.append({
                "word": word.strip('"'),
                "definition": definition.strip('"'),
                "antonym": antonym.strip('"'),
                "synonym": synonym.strip('"'),
                "tatabahasa": tatabahasa.strip('"'),
                "sentiment": float(sentiment.strip('"'))
            })
        else:
            print(f"Skipping malformed row: {row}")
    return details_list

def process_words_in_batches(spark, words_df, batch_size, gemini_api_key):
    # Add row number to DataFrame
    words_df = words_df.withColumn("row_number", monotonically_increasing_id())

    # Calculate number of batches
    total_rows = words_df.count()
    num_batches = (total_rows // batch_size) + 1

    for batch_id in range(num_batches):
        batch_df = words_df.filter((col("row_number") >= batch_id * batch_size) & (col("row_number") < (batch_id + 1) * batch_size))
        batch_words = batch_df.select("word").rdd.map(lambda row: row[0]).collect()
        word_details_str = WordDetailsGenerator.get_word_details(batch_words, gemini_api_key)
        word_details = parse_word_details(word_details_str)
        print(f"Word details for batch {batch_id + 1}:", word_details)
        for detail in word_details:
            print(f"\nWord Details: {detail['word']}, {detail['definition']}, {detail['tatabahasa']}, {detail['sentiment']}")
        
        # Insert word properties into Neo4j
        with driver.session() as session:
            for detail in word_details:
                session.write_transaction(insert_word_properties, detail["word"], detail["definition"], detail["tatabahasa"], detail["sentiment"])


In [3]:
# Configuration
neo4j_uri = "neo4j+s://75fb82ba.databases.neo4j.io"
neo4j_user = "neo4j"
neo4j_password = "E2znDHtP7x2Hs0B5_BM1tnglu6fTkM5YPTX18DkubIk" # Replace with your actual password
gemini_api_key = 'AIzaSyDh-BwGFasADkSzkHzvw86rxamepBZX1Ew'  # Replace with your actual Free Gemini API key: https://aistudio.google.com/apikey

# Setup Neo4j driver
driver = setup_neo4j_driver(neo4j_uri, neo4j_user, neo4j_password)

# Retrieve words without properties
with driver.session() as session:
    words_without_properties = session.execute_read(find_words_without_properties)
    words_df = spark.createDataFrame([(word,) for word in words_without_properties], ["word"])

# Process words in batches of 80 using PySpark
process_words_in_batches(spark, words_df, 80, gemini_api_key)

# Close the Neo4j driver
driver.close()
print("Completed updating Neo4j with word properties.")

                                                                                

Start get_word_details
get_word_details completed. 
<Start>
csv
"bahasa atasan","bahasa yang digunakan oleh golongan atasan atau berkuasa","bahasa rakyat","bahasa istana","kata nama","0.0"
"diri sendiri","merujuk kepada seseorang itu sendiri","orang lain","sendiri","kata ganti nama diri","0.0"
"berbeza bentuk","mempunyai bentuk yang berlainan","sama bentuk","bervariasi bentuk","kata sifat","0.0"
"bentuk tunggal","bentuk kata yang merujuk kepada satu sahaja","bentuk jamak","tunggal","kata nama","0.0"
"ketidakmampuan menganalisis","kekurangan kebolehan untuk menganalisis sesuatu","kemampuan menganalisis","tidak berupaya menganalisis","kata nama","-0.8"
"tidak guna","tidak bermanfaat atau berguna","berguna","tidak berguna","kata sifat","-0.7"
"kelemahannya","aspek negatif atau kekurangan sesuatu","kekuatannya","kelemahan","kata nama","-0.6"
"dunia kecil","dunia yang kecil atau sempit","dunia luas","dunia mini","kata nama","0.0"
"lebih kecil","mempunyai saiz yang lebih kurang daripada sesu

  session.write_transaction(insert_word_properties, detail["word"], detail["definition"], detail["tatabahasa"], detail["sentiment"])


Start get_word_details
get_word_details completed. 
<Start>
csv
"tidak sesuai","tidak menepati kehendak atau keperluan","sesuai","tidak kena","kata sifat","-0.5"
"disatukan","dijadikan satu atau bergabung","dipisahkan","digabungkan","kata kerja pasif","0.0"
"tidak bersetuju","menunjukkan perbezaan pendapat atau pandangan","bersetuju","tidak setuju","kata sifat","-0.7"


<Stop> Ended get_word_details
Skipping malformed row: csv
Word details for batch 2: [{'word': 'tidak sesuai', 'definition': 'tidak menepati kehendak atau keperluan', 'antonym': 'sesuai', 'synonym': 'tidak kena', 'tatabahasa': 'kata sifat', 'sentiment': -0.5}, {'word': 'disatukan', 'definition': 'dijadikan satu atau bergabung', 'antonym': 'dipisahkan', 'synonym': 'digabungkan', 'tatabahasa': 'kata kerja pasif', 'sentiment': 0.0}, {'word': 'tidak bersetuju', 'definition': 'menunjukkan perbezaan pendapat atau pandangan', 'antonym': 'bersetuju', 'synonym': 'tidak setuju', 'tatabahasa': 'kata sifat', 'sentiment': -0.7}]

Wor