Sentiment Analysis 
- RoBERTa
- Model: cardiffnlp/twitter-roberta-base-sentiment

In [None]:
import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import dotenv_values
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from tqdm import tqdm # Import tqdm for progress bars

# Load settings from the .env file into a plain dict
config = dotenv_values()

# --- Configuration Validation ---
# Fail fast and report every missing setting at once. A key that is absent, or
# that dotenv parsed without a value (None), counts as missing.
required_vars = ['POSTGRES_USER', 'POSTGRES_HOST', 'POSTGRES_PORT', 'POSTGRES_DB', 'POSTGRES_PASS']
missing_vars = [name for name in required_vars if config.get(name) is None]
if missing_vars:
    raise ValueError(
        f"Missing required environment variable(s): {', '.join(missing_vars)}. "
        "Please check your .env file."
    )

# Assign environment variables to script variables
pg_user = config['POSTGRES_USER']
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
# The schema is optional. Fall back to "public" when the key is absent OR set
# to an empty value, since an empty schema name would produce invalid SQL.
pg_schema = config.get("POSTGRES_SCHEMA") or "public"
pg_pass = config['POSTGRES_PASS']

# --- Constants ---
CHUNK_SIZE = 10_000 # Rows fetched per LIMIT/OFFSET page in main()
BATCH_SIZE = 16    # Texts per tokenizer/model forward pass in analyze_sentiment()
TABLE_NAME = "bertopic_analysis_results_update_2" # Table that is read from and updated in place
# Label names, indexed by the class id that analyze_sentiment() takes from the
# model output. NOTE(review): the order is assumed to match the model's class
# indices (0=negative, 1=neutral, 2=positive) - confirm against the model card.
sentiment_labels = ['negative', 'neutral', 'positive'] # Labels for sentiment output

# --- Model Loading ---
# Run on CUDA when PyTorch reports an available GPU, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using compute device: {device}")

# Tokenizer and classifier come from the same pre-trained checkpoint
sentiment_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
# Load the classifier, place it on the chosen device and switch it to
# evaluation mode in one expression
sentiment_model = (
    AutoModelForSequenceClassification
    .from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
    .to(device)
    .eval()
)

# --- Database Connection Function ---
def get_database_connection():
    """
    Create a SQLAlchemy engine for the configured PostgreSQL database.

    The connection URL is assembled with ``URL.create`` instead of string
    formatting so that credentials containing characters such as ``@``, ``:``,
    ``/`` or ``%`` are passed through intact rather than corrupting the URL.

    Returns:
        A SQLAlchemy engine created with ``pool_pre_ping=True``.
    """
    # Imported locally so this function does not depend on changes to the
    # file-level import block.
    from sqlalchemy.engine import URL

    url = URL.create(
        drivername="postgresql+psycopg2",
        username=pg_user,
        password=pg_pass,
        host=pg_host,
        # URL.create expects an integer port; the .env value is a string
        port=int(pg_port) if pg_port else None,
        database=pg_db,
    )
    return create_engine(url, pool_pre_ping=True)

# --- Database Schema Management Function ---
def ensure_sentiment_columns(engine):
    """
    Reset the sentiment columns on the target table.

    Looks up the table's columns in information_schema, drops
    'sentiment_label' and/or 'sentiment_score' when present, then adds both
    back (VARCHAR(10) and FLOAT). All statements run on one connection and are
    committed together at the end.
    """
    managed_columns = ('sentiment_label', 'sentiment_score')

    with engine.connect() as connection:
        # Collect the names of all columns currently defined on the table
        column_rows = connection.execute(
            text(f"""
            SELECT column_name
            FROM information_schema.columns
            WHERE table_schema = :schema_name AND table_name = :table_name
        """),
            {"schema_name": pg_schema, "table_name": TABLE_NAME},
        )
        present = {row[0] for row in column_rows}

        # One DROP COLUMN clause per managed column that already exists
        drop_clauses = [f'DROP COLUMN {name}' for name in managed_columns if name in present]
        if drop_clauses:
            joined_clauses = ', '.join(drop_clauses)
            connection.execute(
                text(f'ALTER TABLE "{pg_schema}"."{TABLE_NAME}" {joined_clauses};')
            )
            print(f"Deleted existing sentiment columns in '{TABLE_NAME}'.")

        # Re-create both columns, then commit the whole DDL sequence
        connection.execute(text(f"""
            ALTER TABLE "{pg_schema}"."{TABLE_NAME}"
            ADD COLUMN sentiment_label VARCHAR(10),
            ADD COLUMN sentiment_score FLOAT;
        """))
        connection.commit()
        print(f"Created new sentiment columns in '{TABLE_NAME}'.")

# --- Sentiment Analysis Function ---
def analyze_sentiment(texts):
    """
    Classify the sentiment of each input text with the pre-loaded model.

    Texts are tokenized and scored in batches of BATCH_SIZE. Inputs longer
    than 512 tokens are truncated by the tokenizer.

    Args:
        texts: Sequence of texts. ``None`` and NaN entries (e.g. NULL database
            values) are treated as empty strings so one bad row cannot abort
            the whole batch; other non-string values are converted with str().

    Returns:
        A tuple ``(labels, scores)`` of two lists aligned with ``texts``:
        the predicted label name from ``sentiment_labels`` and the softmax
        probability of that label as a Python float.
    """
    # The tokenizer rejects non-string items, so normalise the input first.
    # `value != value` is true only for NaN.
    clean_texts = [
        "" if value is None or (isinstance(value, float) and value != value) else str(value)
        for value in texts
    ]
    if not clean_texts:
        return [], []

    all_labels = []
    all_scores = []

    # leave=False removes this nested progress bar once the batches are done
    for start in tqdm(range(0, len(clean_texts), BATCH_SIZE), desc="Analyzing Sentiment Batches", leave=False):
        batch_texts = clean_texts[start:start + BATCH_SIZE]
        inputs = sentiment_tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
        # Input tensors must live on the same device as the model
        inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

        with torch.no_grad():  # inference only, no gradients needed
            outputs = sentiment_model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=1)
            # Highest class probability and its class index for each text
            scores, labels = torch.max(probs, dim=1)

        # .tolist() yields plain Python ints/floats rather than numpy scalars
        all_labels.extend(sentiment_labels[class_id] for class_id in labels.tolist())
        all_scores.extend(scores.tolist())

    return all_labels, all_scores

# --- Main Logic ---
def _fetch_review_chunk(engine, offset):
    """Return one page of (review_id, text) rows, ordered by review_id, as a DataFrame."""
    query = text(f"""
        SELECT review_id, text
        FROM "{pg_schema}"."{TABLE_NAME}"
        ORDER BY review_id
        LIMIT :limit OFFSET :offset
    """)
    with engine.connect() as conn:
        return pd.read_sql(query, conn, params={"limit": CHUNK_SIZE, "offset": offset})


def _store_chunk_sentiments(engine, review_ids, labels, scores):
    """Write one chunk's sentiment labels and scores back in a single transaction."""
    update_stmt = text(f"""
        UPDATE "{pg_schema}"."{TABLE_NAME}"
        SET sentiment_label = :label,
            sentiment_score = :score
        WHERE review_id = :rid
    """)
    params = [
        {"label": label, "score": float(score), "rid": rid}
        for rid, label, score in zip(review_ids, labels, scores)
    ]
    # Passing a list of parameter dicts makes SQLAlchemy run the statement in
    # executemany mode instead of one execute() call per row.
    with engine.begin() as conn:
        conn.execute(update_stmt, params)


def main():
    """
    Run the sentiment analysis pipeline over every row of TABLE_NAME.

    Steps:
    1. Create the database engine.
    2. Reset the sentiment columns.
    3. Count the rows to size the overall progress bar.
    4. Fetch rows in pages of CHUNK_SIZE, ordered by review_id.
    5. Analyze each page and write the results back, one transaction per page.

    Raises:
        Exception: any error raised during processing is reported on stdout
            and then re-raised, so a failed run is not mistaken for a
            successful one. The engine is disposed in either case.
    """
    engine = get_database_connection()

    try:
        ensure_sentiment_columns(engine)

        # Total row count, used only to size the main progress bar
        with engine.connect() as conn:
            total_rows_to_process = conn.execute(text(f"""
                SELECT COUNT(*) FROM "{pg_schema}"."{TABLE_NAME}"
            """)).scalar_one()

        print(f"Starting sentiment analysis for all {total_rows_to_process} rows in '{TABLE_NAME}'.")

        offset = 0
        total_processed_in_session = 0

        with tqdm(total=total_rows_to_process, desc="Overall Sentiment Analysis", position=0, unit="rows") as pbar_main:
            while True:
                df = _fetch_review_chunk(engine, offset)

                if df.empty:
                    # An empty page means the previous page was the last one
                    print("All reviews processed for sentiment in the database.")
                    break

                current_chunk_size = len(df)
                print(f"Processing chunk with offset {offset}, size {current_chunk_size} reviews...")

                sentiments, scores = analyze_sentiment(df["text"].tolist())

                # .tolist() converts the ids to native Python values
                _store_chunk_sentiments(engine, df["review_id"].tolist(), sentiments, scores)

                pbar_main.update(current_chunk_size)
                total_processed_in_session += current_chunk_size
                offset += CHUNK_SIZE

                print(f"Chunk processed. Total rows analyzed and updated in this session: {total_processed_in_session}")

    except Exception as e:
        # Report the error, then propagate it so the traceback and a non-zero
        # exit status are preserved
        print(f"An error occurred during sentiment analysis: {e}")
        raise
    finally:
        # Release pooled connections whether the run succeeded or failed
        engine.dispose()
        print("Database engine disposed.")

# Entry point: run the pipeline only when this file is executed directly,
# not when it is imported as a module
if __name__ == "__main__":
    main()