In [None]:
"""
Author: Lai ZhonPoa
"""
# Bryan's individual work is not limited to BryanIndividual.ipynb
import os
from getpass import getpass

import matplotlib.pyplot as plt
import networkx as nx
from neo4j import GraphDatabase
from pyspark.sql.functions import explode, col

from GlobalSparkSession import GlobalSparkSession
from UtilsNeo4J import DataBaseHandler
from UtilsRedis import Redis_Utilities
spark = GlobalSparkSession.get_instance()

# Setup Neo4j driver and Redis client.
# The URI and user keep their previous values as defaults but can be overridden
# through the environment. The password is never stored in the notebook: it is
# read from NEO4J_PASSWORD, and prompted for interactively when that variable
# is unset or empty.
# NOTE(review): a password was previously committed in this cell; it should be
# rotated on the Neo4j side.
neo4j_uri = os.environ.get("NEO4J_URI", "neo4j+s://f2d488e8.databases.neo4j.io")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
redis_utils = Redis_Utilities()

db_handler = DataBaseHandler(neo4j_uri, neo4j_user, neo4j_password, redis_utils)

# Get the total number of unique entries in the lexicon
total_unique_entries = db_handler.get_total_unique_entries()
print(f"Total number of unique entries: {total_unique_entries}")

In [None]:
# Graph Visualization Functions
def fetch_synonyms(tx, limit=25):
    """Return words together with their distinct synonyms.

    Written as a transaction function: it receives the transaction object as
    its first argument (in this notebook it is passed to
    ``session.execute_read``).

    Args:
        tx: Object exposing ``run(query, **parameters)`` whose result provides
            ``values()``.
        limit: Maximum number of rows to return. Coerced with ``int`` and sent
            as the query parameter ``$limit`` rather than being formatted into
            the Cypher text, so the query string stays constant and the value
            cannot alter the statement.

    Returns:
        The value of ``result.values()``: one entry per returned row, holding
        the ``word`` and ``synonyms`` columns in that order.

    Raises:
        ValueError, TypeError: If ``limit`` cannot be converted to an integer.
    """
    query = """
    MATCH (w:Word)-[:SYNONYM]-(s:Word)
    RETURN w.word AS word, collect(DISTINCT s.word) AS synonyms
    LIMIT $limit
    """
    result = tx.run(query, limit=int(limit))
    return result.values()

def create_synonym_network_pyspark(spark, db_handler, limit=20):
    """Build an undirected synonym graph from Neo4j rows flattened with Spark.

    Args:
        spark: Spark session used to create the intermediate DataFrame.
        db_handler: Object exposing ``neo4j_driver``, from which a session is
            opened for the read transaction.
        limit: Maximum number of rows requested from ``fetch_synonyms``.

    Returns:
        A ``networkx.Graph`` with one edge per (word, synonym) pair. When the
        query returns no rows, an empty graph is returned.
    """
    with db_handler.neo4j_driver.session() as session:
        synonyms = session.execute_read(fetch_synonyms, limit)

    G = nx.Graph()

    # Spark cannot infer a schema from an empty dataset when only column names
    # are given, so an empty query result is handled before touching Spark.
    if not synonyms:
        return G

    # Convert to DataFrame
    synonyms_df = spark.createDataFrame(synonyms, ["word", "synonyms"])

    # Explode the synonyms list into one (word, synonym) row per synonym
    exploded_df = synonyms_df.select("word", explode(col("synonyms")).alias("synonym"))

    # Stream rows to the driver with toLocalIterator instead of collecting the
    # whole DataFrame at once
    G.add_edges_from((row.word, row.synonym) for row in exploded_df.toLocalIterator())

    return G

def visualize_network(G):
    """Draw ``G`` with a spring layout on a 20x10 figure and show it.

    Args:
        G: Graph to draw; node labels are rendered on the nodes.
    """
    layout = nx.spring_layout(G, k=0.55)
    draw_options = {
        "with_labels": True,
        "node_color": "lightblue",
        "edge_color": "gray",
        "node_size": 5000,
        "font_size": 13,
    }

    # The figure is created first so that nx.draw targets it as the current figure.
    plt.figure(figsize=(20, 10))
    nx.draw(G, layout, **draw_options)
    plt.title("Synonym Network")
    plt.show()

def identify_clusters(G):
    """Group the nodes of ``G`` into communities by greedy modularity.

    Args:
        G: Graph whose nodes are to be clustered.

    Returns:
        A dict mapping a 0-based community index (in the order returned by
        ``greedy_modularity_communities``) to the list of its member nodes.
    """
    communities = nx.community.greedy_modularity_communities(G)
    return dict(enumerate(map(list, communities)))

# Build the synonym graph from up to 30 query rows and render it.
G = create_synonym_network_pyspark(spark, db_handler, limit=30)
visualize_network(G)

# Detect communities in the graph and list the members of each one.
themes = identify_clusters(G)
for theme_id in themes:
    words = themes[theme_id]
    print(f"Theme {theme_id}: {', '.join(words)}")

In [None]:
from UtilsRedis import Redis_Utilities
import redis
def get_word_data(word_to_search):
    """Print the synonyms, antonyms and sentiment found for one word.

    Sentiment is read through a freshly created ``Redis_Utilities`` instance;
    synonyms and antonyms come from the notebook-level ``db_handler``.

    Args:
        word_to_search: The word to look up.
    """
    sentiment_store = Redis_Utilities()
    sentiment_data = sentiment_store.get_sentiment(word_to_search)

    # Both relation lists are fetched before anything is printed.
    synonyms, antonyms = (
        db_handler.get_synonyms(word_to_search),
        db_handler.get_antonyms(word_to_search),
    )

    print(f"Synonyms for '{word_to_search}': {', '.join(synonyms)}")
    print(f"Antonyms for '{word_to_search}': {', '.join(antonyms)}")
    print(f"Sentiment for '{word_to_search}':", sentiment_data)

# Look up a handful of sample words, in this order.
for sample_word in ("sedih", "gembira", "ibu", "hasil"):
    get_word_data(sample_word)

In [None]:
from pyspark.sql import Row
from pyspark.sql.functions import col
from UtilsRedis import Redis_Utilities

def analyze_word_frequencies(spark, num_rows=10):
    """
    Analyze and display word frequencies.

    Prints a preview of the frequency table, the most and least common words,
    and the words whose frequency is exactly 1.

    Args:
        spark: Spark session used to build the frequency DataFrame.
        num_rows (int): Number of rows to show in the preview and frequency lists.
    """
    # Read through the client created here, so the function does not depend
    # on a notebook-level variable defined in another cell.
    redis_client = Redis_Utilities()
    frequencies = redis_client.get_all_word_frequencies()

    # Spark cannot infer a schema from an empty dataset; report and stop.
    if not frequencies:
        print("No word frequencies found.")
        return

    # Convert the mapping to a PySpark DataFrame with named columns
    word_frequencies_df = spark.createDataFrame(
        [(word, int(freq)) for word, freq in frequencies.items()],
        ["Cleaned_Word", "Frequency"],
    )

    # Show the preview of word frequencies
    print(f"Preview of all word frequencies (first {num_rows} rows):")
    word_frequencies_df.show(num_rows)

    # Most common words
    most_common_words_df = word_frequencies_df.orderBy(col("Frequency").desc()).limit(num_rows)
    print("Most common words:")
    most_common_words_df.show()

    # Least common words
    least_common_words_df = word_frequencies_df.orderBy(col("Frequency").asc()).limit(num_rows)
    print("Least common words:")
    least_common_words_df.show()

    # Words used exactly once (all of them are collected to the driver)
    once_used_df = word_frequencies_df.filter(col("Frequency") == 1)
    once_used = [row["Cleaned_Word"] for row in once_used_df.collect()]
    print("\nWords used once:")
    print(", ".join(once_used))

# Run the frequency report, showing ten rows in the preview and in each ranking.
analyze_word_frequencies(spark=spark, num_rows=10)