In [None]:
import pandas as pd
from sqlalchemy import create_engine, inspect, text
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from dotenv import dotenv_values
import random
from tqdm import tqdm
import psycopg2.extras # Import psycopg2.extras for execute_values
import torch # Import torch to check for MPS/CUDA availability

# Import specific BERTopic sub-models for fine-tuning
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# --- Configuration and Setup ---

# Load environment variables from .env file
config = dotenv_values()

# PostgreSQL database connection details.
# .get() is used (instead of config[...]) so that a missing key yields None and
# reaches the descriptive validation error below rather than a bare KeyError.
pg_user = config.get('POSTGRES_USER')
pg_host = config.get('POSTGRES_HOST')
pg_port = config.get('POSTGRES_PORT')
pg_db = config.get('POSTGRES_DB')
# Fall back to "public" when the key is absent or set to an empty value.
pg_schema = config.get("POSTGRES_SCHEMA") or "public"
pg_pass = config.get('POSTGRES_PASS')

# Validate environment variables
_missing_pg_vars = [
    name
    for name, value in (
        ("POSTGRES_USER", pg_user),
        ("POSTGRES_HOST", pg_host),
        ("POSTGRES_PORT", pg_port),
        ("POSTGRES_DB", pg_db),
        ("POSTGRES_PASS", pg_pass),
    )
    if not value
]
if _missing_pg_vars:
    raise ValueError(
        "Missing one or more required PostgreSQL environment variables (POSTGRES_USER, POSTGRES_HOST, POSTGRES_PORT, POSTGRES_DB, POSTGRES_PASS)."
        f" Missing: {', '.join(_missing_pg_vars)}."
    )
# Warn only when the schema really was defaulted, not when "public" was set explicitly.
if not config.get("POSTGRES_SCHEMA"):
    print("WARNING: POSTGRES_SCHEMA not found in .env, defaulting to 'public'.")

# Constants for file paths, table names, and batch processing
CHUNK_SIZE = 10_000  # rows per transform/upsert batch
INPUT_DB_TABLE = "review_2019"
OUTPUT_DB_TABLE = "bertopic_analysis_results_update_4"
MODEL_DIR = "bertopic_model_update_4"
TOPIC_JSON_PATH = "topic_keywords_update_4.json"

# --- MODIFIED SAMPLE CONFIGURATION ---
# Configure how the training sample is selected. Choose exactly one option:

# Option 1: Random sampling
# Set to None to sample randomly from the full (deduplicated) dataset.
# The `TRAINING_SAMPLE_SIZE` constant below is used as the sample size.
# TRAINING_SAMPLE_SOURCE = None

# Option 2: Use a specific list of review_ids
# Uncomment the line below and replace with your list of review_ids.
# These review_ids must exist in INPUT_DB_TABLE.
# TRAINING_SAMPLE_SOURCE = ["review_abc_123", "review_def_456", "review_ghi_789"]

# Option 3: Load the sample from a CSV file
# Uncomment the line below and replace with the path to your CSV file.
# The CSV file MUST contain a column named "review_id". Any "text" column in
# the CSV is ignored: the text is taken from the database rows matching review_id.
# TRAINING_SAMPLE_SOURCE = "path/to/your_modified_sample.csv"

# Option 4: Load the sample from a specific database table
# This table MUST contain 'review_id', 'text' and 'business_id' columns
# (all three are selected when the table is loaded).
TRAINING_SAMPLE_SOURCE = "DB_TABLE"
TRAINING_DB_TABLE_NAME = "training_sample"  # <--- Specify your table name here (e.g., "review_training_subset")


# Sample size for random sampling (only used when TRAINING_SAMPLE_SOURCE is None).
# Kept defined at all times so that switching to Option 1 does not fail with a NameError.
TRAINING_SAMPLE_SIZE = 200_000

# --- Database Utility Functions ---

def get_database_connection():
    """
    Establishes and returns a SQLAlchemy database engine for PostgreSQL.

    The connection URL is assembled from the module-level settings read from
    the environment (pg_user, pg_pass, pg_host, pg_port, pg_db). It is built
    with ``URL.create`` rather than string formatting so that credentials
    containing characters such as '@', ':', '/' or '#' are escaped correctly.

    Returns:
        sqlalchemy.engine.base.Engine: A SQLAlchemy engine object.

    Raises:
        ValueError: If pg_port cannot be converted to an integer.
    """
    # Local import keeps this block self-contained; sqlalchemy is already a file dependency.
    from sqlalchemy.engine import URL

    url = URL.create(
        drivername="postgresql+psycopg2",
        username=pg_user,
        password=pg_pass,
        host=pg_host,
        port=int(pg_port),  # .env values are strings; URL.create expects an int
        database=pg_db,
    )
    # pool_pre_ping checks a pooled connection is alive before handing it out.
    return create_engine(url, pool_pre_ping=True)

def create_results_table(engine, schema_name, table_name):
    """
    Creates a PostgreSQL table to store the BERTopic analysis results if it doesn't already exist.

    The table has one row per review, keyed by ``review_id``, with the columns
    business_id, review_id, text, topic and probability.

    Args:
        engine (sqlalchemy.engine.base.Engine): The SQLAlchemy engine.
        schema_name (str): The name of the database schema.
        table_name (str): The name of the table to create.
    """
    inspector = inspect(engine)
    if inspector.has_table(table_name, schema=schema_name):
        print(f"Table '{schema_name}.{table_name}' already exists. Skipping creation.")
        return

    # SQL DDL statement to create the table.
    # Identifiers are double-quoted to match the quoting used by the SELECT and
    # upsert statements elsewhere in this file; otherwise PostgreSQL would fold
    # an unquoted mixed-case name to lower case and the later quoted statements
    # would target a different (non-existent) table.
    create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS "{schema_name}"."{table_name}" (
        business_id VARCHAR(255),
        review_id VARCHAR(255) PRIMARY KEY,
        text TEXT,
        topic INTEGER,
        probability NUMERIC(5, 4)
    );
    """
    # engine.begin() commits on success and rolls back on error.
    with engine.begin() as connection:
        connection.execute(text(create_table_sql))
    print(f"Successfully created table '{schema_name}.{table_name}' for results.")

def load_reviews_from_db(engine, schema, table):
    """
    Reads every row of a review table into a pandas DataFrame.

    Only the review_id, text and business_id columns are selected, in that order.

    Args:
        engine (sqlalchemy.engine.base.Engine): The SQLAlchemy engine.
        schema (str): The name of the database schema.
        table (str): The name of the table to load data from.

    Returns:
        pandas.DataFrame: A DataFrame containing the review data.
    """
    selected_columns = ", ".join(("review_id", "text", "business_id"))
    qualified_table = f'"{schema}"."{table}"'
    return pd.read_sql_query(f"SELECT {selected_columns} FROM {qualified_table}", engine)

def save_topic_info(model, path):
    """
    Writes the model's topic summary to disk as a JSON array of records.

    The summary is whatever ``model.get_topic_info()`` returns; it is
    serialized one record per row with a 2-space indent.

    Args:
        model (bertopic.BERTopic): The trained BERTopic model.
        path (str): The file path to save the JSON output.
    """
    model.get_topic_info().to_json(path_or_buf=path, indent=2, orient="records")

def write_batch_to_db(engine, df, table, schema):
    """
    Writes a batch of DataFrame rows to a specified database table using psycopg2.extras.execute_values
    for efficient bulk updates/inserts (upsert).

    Rows whose review_id already exists in the target table are updated in place;
    all other rows are inserted. An empty DataFrame is a no-op.

    Args:
        engine (sqlalchemy.engine.base.Engine): The SQLAlchemy engine.
        df (pandas.DataFrame): The DataFrame batch to write. Must contain 'review_id', 'topic', 'probability',
                               'business_id', and 'text'.
        table (str): The name of the target database table.
        schema (str): The name of the database schema.
    """
    if df.empty:
        return

    # Prepare data for bulk upsert: list of tuples (business_id, review_id, text, topic, probability).
    # Zipping the columns avoids building a Series per row as iterrows() does.
    # int()/float() convert numpy scalars to plain Python numbers for the driver.
    data_to_upsert = [
        (business_id, review_id, review_text, int(topic), float(probability))
        for business_id, review_id, review_text, topic, probability in zip(
            df["business_id"], df["review_id"], df["text"], df["topic"], df["probability"]
        )
    ]

    # SQL for bulk upsert (INSERT or UPDATE if conflict on review_id)
    upsert_sql = f"""
            INSERT INTO "{schema}"."{table}" (business_id, review_id, text, topic, probability)
            VALUES %s
            ON CONFLICT (review_id) DO UPDATE SET
                business_id = EXCLUDED.business_id,
                text = EXCLUDED.text,
                topic = EXCLUDED.topic,
                probability = EXCLUDED.probability;
        """

    # engine.begin() commits on success and rolls back if the upsert raises.
    with engine.begin() as conn:
        # Get the underlying psycopg2 connection for execute_values
        db_connection = conn.connection
        # The cursor is a context manager, so it is closed even if the upsert fails.
        with db_connection.cursor() as cursor:
            psycopg2.extras.execute_values(
                cursor,
                upsert_sql,
                data_to_upsert,
                template="(%s, %s, %s, %s, %s)",
                page_size=CHUNK_SIZE
            )
        print(f"Bulk upserted {len(df)} rows with topic results into {table}.")


# --- Main Script Execution ---

if __name__ == "__main__":
    # Pipeline: load reviews -> select a training sample -> fit BERTopic ->
    # save the model and topic summary -> assign topics to every review in
    # chunks and upsert the results into OUTPUT_DB_TABLE.
    engine = get_database_connection()
    # try/finally so the engine's pooled connections are released even when a
    # step fails or is interrupted (relevant in a long-lived notebook kernel).
    try:
        create_results_table(engine, pg_schema, OUTPUT_DB_TABLE)

        print("Loading all review data from DB...")
        df_full = load_reviews_from_db(engine, pg_schema, INPUT_DB_TABLE)

        print(f"Initial dataset size: {len(df_full)} rows.")
        df_full = df_full.drop_duplicates(subset='text', keep='first').copy()
        print(f"Dataset size after dropping exact text duplicates: {len(df_full)} rows.")

        # --- Select Training Data based on TRAINING_SAMPLE_SOURCE ---
        df_sample = pd.DataFrame()
        if TRAINING_SAMPLE_SOURCE is None:
            # Look the constant up defensively: it may be commented out in the
            # configuration section, which would otherwise surface as a NameError.
            training_sample_size = globals().get("TRAINING_SAMPLE_SIZE")
            if training_sample_size is None:
                raise ValueError("TRAINING_SAMPLE_SIZE must be defined in constants when TRAINING_SAMPLE_SOURCE is None.")
            print(f"Sampling {training_sample_size} reviews randomly for training...")
            if len(df_full) < training_sample_size:
                print(f"WARNING: Full dataset ({len(df_full)}) is smaller than requested sample size ({training_sample_size}). Using full dataset for training.")
                df_sample = df_full.copy()
            else:
                # Fixed random_state keeps the sample reproducible between runs.
                df_sample = df_full.sample(n=training_sample_size, random_state=42).copy()
        elif TRAINING_SAMPLE_SOURCE == "DB_TABLE":
            if 'TRAINING_DB_TABLE_NAME' not in locals() and 'TRAINING_DB_TABLE_NAME' not in globals():
                raise ValueError("TRAINING_DB_TABLE_NAME must be defined in constants when TRAINING_SAMPLE_SOURCE is 'DB_TABLE'.")
            print(f"Loading training data from database table '{pg_schema}.{TRAINING_DB_TABLE_NAME}'...")
            # Only the load itself is wrapped, so the validation errors below are
            # raised as-is instead of being caught and re-wrapped.
            try:
                df_sample = load_reviews_from_db(engine, pg_schema, TRAINING_DB_TABLE_NAME)
            except Exception as e:
                raise ValueError(f"Error loading training sample from database table '{TRAINING_DB_TABLE_NAME}': {e}") from e
            if df_sample.empty:
                raise ValueError(f"No data found in training table '{pg_schema}.{TRAINING_DB_TABLE_NAME}'.")
            if 'review_id' not in df_sample.columns or 'text' not in df_sample.columns:
                raise ValueError(f"Training table '{pg_schema}.{TRAINING_DB_TABLE_NAME}' must contain 'review_id' and 'text' columns.")
            original_sample_size = len(df_sample)
            df_sample = df_sample.drop_duplicates(subset='text', keep='first').copy()
            if len(df_sample) < original_sample_size:
                print(f"INFO: Removed {original_sample_size - len(df_sample)} duplicate texts from training sample loaded from DB table.")
        elif isinstance(TRAINING_SAMPLE_SOURCE, list):
            print("Using provided list of review_ids for training data...")
            df_sample = df_full[df_full["review_id"].isin(TRAINING_SAMPLE_SOURCE)].copy()
            if len(df_sample) != len(TRAINING_SAMPLE_SOURCE):
                print(f"WARNING: Not all provided review_ids ({len(TRAINING_SAMPLE_SOURCE)}) were found in the full dataset. Found {len(df_sample)}.")
        elif isinstance(TRAINING_SAMPLE_SOURCE, str):
            # Any string other than "DB_TABLE" is treated as a path to a CSV file.
            print(f"Loading training data from '{TRAINING_SAMPLE_SOURCE}' (CSV file)...")
            try:
                df_source_ids = pd.read_csv(TRAINING_SAMPLE_SOURCE)
            except FileNotFoundError as e:
                raise FileNotFoundError(f"Training sample file not found at '{TRAINING_SAMPLE_SOURCE}'") from e
            except Exception as e:
                raise ValueError(f"Error loading training sample from CSV: {e}") from e
            if "review_id" not in df_source_ids.columns:
                raise ValueError(f"CSV sample file '{TRAINING_SAMPLE_SOURCE}' must contain a 'review_id' column.")

            # The text used for training comes from the database rows, not from the CSV.
            df_sample = df_full[df_full["review_id"].isin(df_source_ids["review_id"])].copy()

            if len(df_sample) != len(df_source_ids):
                print(f"WARNING: Not all review_ids from '{TRAINING_SAMPLE_SOURCE}' ({len(df_source_ids)}) were found in the (deduplicated) database data. Using {len(df_sample)} IDs found.")
        else:
            raise ValueError("Invalid type for TRAINING_SAMPLE_SOURCE. Must be None, 'DB_TABLE', a list of review_ids, or a file path string.")

        if df_sample.empty:
            raise ValueError("No training data could be loaded. Please check your TRAINING_SAMPLE_SOURCE configuration and data.")

        sampled_texts = df_sample["text"].tolist()
        print(f"Training BERTopic model on {len(sampled_texts)} reviews.")

        # --- Determine the optimal device for SentenceTransformer ---
        embedding_device = 'cpu'
        if torch.cuda.is_available():
            embedding_device = 'cuda'
            print(f"CUDA is available! Using GPU: {torch.cuda.get_device_name(0)}")
        elif torch.backends.mps.is_available():  # Check for Apple Silicon MPS
            embedding_device = 'mps'
            print("MPS (Metal Performance Shaders) is available! Using Apple Silicon GPU.")
        else:
            print("No GPU (CUDA or MPS) detected. Falling back to CPU for embeddings.")

        # Load embedding model, explicitly telling it which device to use
        embedding_model = SentenceTransformer("intfloat/e5-large-v2", device=embedding_device)

        print("Training BERTopic model with customized settings...")
        # 1. Custom CountVectorizer for stop words and min_df
        custom_stop_words = [
            "a", "an", "the", "and", "or", "but",
            "if", "while", "is", "am", "are", "was", "were", "be", "been", "being",
            "do", "does", "did", "doing",
            "have", "has", "had", "having",
            "i", "me", "my", "myself",
            "we", "our", "ours", "ourselves",
            "you", "your", "yours", "yourself", "yourselves",
            "he", "him", "his", "himself",
            "she", "her", "hers", "herself",
            "it", "its", "itself",
            "they", "them", "their", "theirs", "themselves",
            "this", "that", "these", "those",
            "to", "in", "of","for","on","at","with",
            "as", "such", "too",
            "can", "will", "would", "should", "could",
            "just", "only", "also", "so", "than", "then", "there", "here",
            "what", "which", "who", "whom", "whose",
            "when", "where", "why", "how"
        ]

        vectorizer_model = CountVectorizer(stop_words=custom_stop_words, min_df=10, ngram_range=(1, 2))

        # 2. Custom HDBSCAN for clustering
        # Retaining the specific parameters for consistency and better topic formation
        hdbscan_model = HDBSCAN(min_cluster_size=50, prediction_data=True)

        # 3. Initialize BERTopic with all customized models and settings
        topic_model = BERTopic(
            embedding_model=embedding_model,
            vectorizer_model=vectorizer_model,
            hdbscan_model=hdbscan_model,
            nr_topics="auto",
            language="english",
            verbose=True
        )

        # Fit the model to the selected training data. This step trains the model.
        topics_train, probabilities_train = topic_model.fit_transform(sampled_texts)
        print(f"BERTopic model training complete. Found {len(topic_model.get_topics())} topics.")

        # Save the trained model and topic information
        print(f"Saving BERTopic model to {MODEL_DIR}...")
        topic_model.save(MODEL_DIR)
        print(f"Saving topic summary to {TOPIC_JSON_PATH}...")
        save_topic_info(topic_model, TOPIC_JSON_PATH)

        # --- Process ALL Data (including training data) in Chunks ---
        print("Processing all reviews (including training data) in chunks and writing to DB...")

        with tqdm(total=len(df_full), desc="Processing All Reviews", unit="reviews") as pbar_full_analysis:
            for start in range(0, len(df_full), CHUNK_SIZE):
                end = start + CHUNK_SIZE
                # .copy() so the new columns are added to the batch, not to a view of df_full.
                batch = df_full.iloc[start:end].copy()

                texts_batch = batch["text"].tolist()
                topics_batch, probs_batch = topic_model.transform(texts_batch)

                batch["topic"] = topics_batch
                batch["probability"] = probs_batch

                write_batch_to_db(engine, batch[["business_id", "review_id", "text", "topic", "probability"]],
                                  OUTPUT_DB_TABLE, pg_schema)

                pbar_full_analysis.update(len(batch))
                print(f"Processed and upserted {len(batch)} rows.")

        print("Finished topic modeling and database update for all reviews.")
    finally:
        # Close the database connection
        engine.dispose()

Successfully created table 'capstone_yelp.bertopic_analysis_results_update_4' for results.
Loading all review data from DB...
Initial dataset size: 907284 rows.
Dataset size after dropping exact text duplicates: 905331 rows.
Loading training data from database table 'capstone_yelp.training_sample'...
INFO: Removed 131 duplicate texts from training sample loaded from DB table.
Training BERTopic model on 199869 reviews.
MPS (Metal Performance Shaders) is available! Using Apple Silicon GPU.


Error while downloading from https://cdn-lfs.hf.co/repos/de/b9/deb90e29ca6ddad4c7f23805a5a02770ba9d3cc3054242541d4fc18a10fd5886/d741c1a688a6169af0ecb5a047c44645cd992c31e1bf431269f98bba9ae2911a?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1750092614&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc1MDA5MjYxNH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy9kZS9iOS9kZWI5MGUyOWNhNmRkYWQ0YzdmMjM4MDVhNWEwMjc3MGJhOWQzY2MzMDU0MjQyNTQxZDRmYzE4YTEwZmQ1ODg2L2Q3NDFjMWE2ODhhNjE2OWFmMGVjYjVhMDQ3YzQ0NjQ1Y2Q5OTJjMzFlMWJmNDMxMjY5Zjk4YmJhOWFlMjkxMWE%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=NrFVjI56rnvjN-Lt0kageqk8jCIPrFInLHo0s1qjwOb1vrV1wX%7EAkjfVQHvq5j3OknKI5mfPYylvn3twKMv3WE08J3Wx3d6h0quTrBblT4g3fSUZwolpLucgB7C0koMn7d7mE3aSx9WtyQzmcJV%7EhvBDhEHIlRKn-lwXxzgo6cHS41hRx8y5vEUJD-5hYMhbEHg71SlFpB8txjSpxB%7EwFO47afb%7EExZh1mJ0FKKNisbPOlTGQAfosa1DrKed7xyxTPg

Training BERTopic model with customized settings...


2025-06-16 18:33:48,706 - BERTopic - Embedding - Transforming documents to embeddings.
Batches:  15%|█▍        | 931/6246 [15:26:37<88:10:00, 59.72s/it]  
