In [0]:
# Configuration and Imports
from bertopic import BERTopic
from huggingface_hub import login
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from delta.tables import DeltaTable
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, ArrayType
import os

# Delta table locations used throughout this notebook.
# - preprocessed_delta_table_path: input table, read as the source of reviews.
# - predictions_delta_table_path: output table, read to find already-predicted
#   review ids and appended to with the new predictions.
preprocessed_delta_table_path = "/mnt/topic-prediction/delta/reviews/"  # Table with preprocessed data
predictions_delta_table_path = "/mnt/topic-prediction/delta/reviews_predictions/"  # Table to store predictions


In [0]:
# Read the preprocessed reviews that serve as input for topic prediction
raw_df = spark.read.format("delta").load(preprocessed_delta_table_path)

# The predictions table is only created by the first successful run, so
# detect whether it is there before trying to read it
predictions_table_exists = DeltaTable.isDeltaTable(spark, predictions_delta_table_path)

if not predictions_table_exists:
    print("Predictions table does not exist. All data will be considered unprocessed.")
else:
    # Only the distinct review ids are needed to tell which reviews are done
    predicted_df = (
        spark.read.format("delta")
        .load(predictions_delta_table_path)
        .select("review_id")
        .distinct()
    )
    print(f"Loaded existing predictions. Found {predicted_df.count()} distinct reviews already processed.")


In [0]:
# Work out which reviews still need a topic prediction.
if not predictions_table_exists:
    # Nothing has been predicted yet, so every review is a candidate
    candidate_df = raw_df
else:
    # Anti-join the distinct review ids against the ids that already have a
    # prediction, then keep only the rows of raw_df whose id survived
    raw_ids = raw_df.select("review_id").distinct()
    unprocessed_ids = raw_ids.join(predicted_df, on="review_id", how="left_anti")
    candidate_df = raw_df.join(unprocessed_ids, on="review_id", how="inner")

# Reviews without a score are excluded from prediction
unprocessed_df = candidate_df.filter(col("score").isNotNull())

# Optional: render the unprocessed reviews for a quick visual check
display(unprocessed_df)


In [0]:
# BERTopic works on in-memory Python data, so collect the reviews to pandas
unprocessed_pd = unprocessed_df.toPandas()

# Nothing left to predict: end the notebook run here with a success message
if len(unprocessed_pd.index) == 0:
    dbutils.notebook.exit("No new reviews to process. Notebook finished successfully.")


In [0]:
# The HuggingFace access token is kept in a secret scope, never in the notebook
HUGGINGFACE_TOKEN = dbutils.secrets.get(scope="hugging_face", key="login_token")

# Authenticate against the HuggingFace Hub before loading the model
login(token=HUGGINGFACE_TOKEN)

# Load the BERTopic model identified by its Hub repository id
model_repo_id = "DobreMihai/bertopic_ready_labeled"
loaded_model = BERTopic.load(model_repo_id)

print("Successfully loaded BERTopic model.")


In [0]:
# Predict a topic id for every review and attach the topic's custom name.
docs = unprocessed_pd['content'].tolist()
topics, probs = loaded_model.transform(docs)

# Lookup table: topic id -> custom topic label, renamed to the output column names
topic_representations = (
    loaded_model.get_topic_info()[['Topic', 'CustomName']]
    .rename(columns={'Topic': 'topic', 'CustomName': 'topic_name'})
)

# Attach the predicted topic ids on a copy, so the frame produced by the
# earlier cell is not mutated in place
reviews_with_topics_pd = unprocessed_pd.assign(topic=topics)

# Left merge: every review is kept even if its topic id has no entry in the
# lookup table (its topic_name is then missing). An inner merge would silently
# drop such reviews, so they would never reach the predictions table and would
# be re-predicted on every run.
# validate='many_to_one' makes pandas raise if the lookup table contains a
# duplicated topic id, which would otherwise duplicate reviews.
predicted_pd = reviews_with_topics_pd.merge(
    topic_representations,
    on='topic',
    how='left',
    validate='many_to_one',
)


In [0]:
# Prepare the pandas result for the trip back to Spark

# Target schema of the Spark DataFrame built from the pandas DataFrame;
# every field is nullable
SCHEMA = (
    StructType()
    .add("review_id", StringType(), True)
    .add("content", StringType(), True)
    .add("reviewCreatedVersion", StringType(), True)
    .add("score", IntegerType(), True)
    .add("review_timestamp", TimestampType(), True)
    .add("topic", IntegerType(), True)
    .add("topic_name", StringType(), True)
)

# Cast score to pandas' nullable integer dtype so it lines up with the
# IntegerType declared for it in SCHEMA
predicted_pd['score'] = predicted_pd['score'].astype('Int64')

In [0]:
# Inspect the final pandas DataFrame (reviews with topic and topic_name)
# before it is converted back to a Spark DataFrame
predicted_pd

In [0]:
# Convert the pandas DataFrame back to a Spark DataFrame.
# The columns are selected by SCHEMA field name first, so the conversion does
# not depend on the column order (or on extra columns) left behind by the
# upstream joins and merge; a missing column fails here with a clear KeyError
# instead of producing misaligned data.
predicted_df = spark.createDataFrame(predicted_pd[SCHEMA.fieldNames()], schema=SCHEMA)

# Change topic label for uncategorised reviews to "Uncategorised"
predicted_df = predicted_df.na.replace('-1_be_it_the_to', 'Uncategorised', 'topic_name')

# Save the predictions to the Delta table, creating it if it does not exist
predicted_df.write.format("delta").mode("append").save(predictions_delta_table_path)
print(f"Successfully saved predictions to Delta table: {predictions_delta_table_path}")
