<a href="https://colab.research.google.com/github/AliHAlbaqali/KV6002-Group-Project/blob/main/Dynamic_Topic_Modeling_of_Uber_Reviews_(2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install bertopic

Collecting bertopic
  Using cached bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Us

In [4]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m81.9/126.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [5]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

# Text Preprocessing and NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Download necessary NLTK data (run once)
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
try:
    WordNetLemmatizer().lemmatize('test')
except LookupError:
    nltk.download('wordnet')
try:
    nltk.data.find('punkt')
except LookupError:
    nltk.download('punkt')


# BERTopic
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Visualization (BERTopic uses Plotly)
import plotly.io as pio
pio.renderers.default = "browser" # Or 'notebook' if in Jupyter, 'colab' for Colab

# For reproducibility
SEED = 42
np.random.seed(SEED)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# --- 1. Data Loading and Preprocessing ---
def load_and_preprocess_data(filepath="/content/Uber.csv"):
    """
    Loads data from a CSV file, preprocesses text, and extracts timestamps.
    Assumes CSV has 'Tweet' and 'Datetime' columns.
    """
    print("Loading data...")
    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: The file {filepath} was not found. Please ensure it's in the correct path.")
        return None, None, None

    # Ensure 'Tweet' and 'Datetime' columns exist
    if 'Tweet' not in df.columns or 'Datetime' not in df.columns:
        print("Error: CSV must contain 'Tweet' and 'Datetime' columns.")
        print(f"Available columns: {df.columns.tolist()}")
        # Attempt to find likely candidates if common names are used
        tweet_col_candidates = [col for col in df.columns if 'text' in col.lower() or 'tweet' in col.lower()]
        date_col_candidates = [col for col in df.columns if 'date' in col.lower() or 'time' in col.lower()]

        if not tweet_col_candidates or not date_col_candidates:
            print("Could not automatically identify tweet and datetime columns. Please rename them to 'Tweet' and 'Datetime'.")
            return None, None, None

        # Use the first candidate found
        tweet_col = tweet_col_candidates[0]
        date_col = date_col_candidates[0]
        print(f"Attempting to use '{tweet_col}' as Tweet column and '{date_col}' as Datetime column.")
        df.rename(columns={tweet_col: 'Tweet', date_col: 'Datetime'}, inplace=True)


    # Drop rows with missing tweets
    df.dropna(subset=['Tweet'], inplace=True)
    df.reset_index(drop=True, inplace=True)

    print("Preprocessing text data...")
    # Convert 'Tweet' to string to handle potential non-string data
    df['cleaned_tweet'] = df['Tweet'].astype(str).apply(preprocess_text)

    print("Processing timestamps...")
    # Convert 'Datetime' to datetime objects, handling potential errors
    df['timestamp'] = pd.to_datetime(df['Datetime'], errors='coerce')
    df.dropna(subset=['timestamp'], inplace=True) # Remove rows where date conversion failed

    # Sort by timestamp for dynamic modeling
    df.sort_values('timestamp', inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Filter out empty strings after cleaning
    df = df[df['cleaned_tweet'].str.strip().astype(bool)]
    df.reset_index(drop=True, inplace=True)

    if df.empty:
        print("No valid data remaining after preprocessing. Check your CSV content and column names.")
        return None, None, None

    print(f"Data loaded and preprocessed. Shape: {df.shape}")
    return df['cleaned_tweet'].tolist(), df['timestamp'].tolist(), df

In [8]:
def preprocess_text(text):
    """
    Cleans and preprocesses a single text string.
    """
    text = text.lower()  # Lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions and hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers

    # Tokenization (simple split)
    tokens = text.split()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization (optional, can be slow)
    # lemmatizer = WordNetLemmatizer()
    # tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return " ".join(tokens)

In [9]:

# --- 2. Sentiment Analysis ---
def get_sentiment_scores(texts):
    """
    Calculates sentiment scores for a list of texts using VADER.
    """
    print("Calculating sentiment scores...")
    analyzer = SentimentIntensityAnalyzer()
    sentiments = []
    for text in texts:
        vs = analyzer.polarity_scores(text)
        sentiments.append(vs['compound']) # Using compound score
    return sentiments

In [10]:
# --- 3. BERTopic Model Training ---
def train_bertopic_model(docs, timestamps, embedding_model_name="all-MiniLM-L6-v2", min_topic_size=10):
    """
    Trains a BERTopic model.
    """
    print(f"Training BERTopic model with embedding model: {embedding_model_name}...")

    # Using a simple CountVectorizer for c-TF-IDF, can be customized
    vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

    # Using a specific sentence transformer model
    embedding_model = SentenceTransformer(embedding_model_name)

    topic_model = BERTopic(
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model,
        language="english",
        calculate_probabilities=True,
        verbose=True,
        min_topic_size=min_topic_size,
        # nr_topics="auto" # or a specific number
    )

    # Fit BERTopic model
    # Note: If timestamps are not perfectly aligned or have gaps, dynamic modeling might be affected.
    # Ensure timestamps are sorted and docs correspond to these timestamps.
    try:
        topics, probabilities = topic_model.fit_transform(docs, embeddings=embedding_model.encode(docs))
        print("BERTopic model training complete.")
        return topic_model, topics, probabilities
    except Exception as e:
        print(f"Error during BERTopic training: {e}")
        print("This might be due to insufficient data after preprocessing or issues with the embedding model.")
        print(f"Number of documents provided for training: {len(docs)}")
        if len(docs) < min_topic_size:
            print(f"Consider reducing 'min_topic_size' (current: {min_topic_size}) or providing more data.")
        return None, None, None

In [11]:
#--- 4. Dynamic Topic Modeling ---
def dynamic_topic_modeling(topic_model, docs, timestamps):
    """
    Performs dynamic topic modeling and generates visualizations.
    """
    if topic_model is None or not docs or not timestamps:
        print("Skipping dynamic topic modeling due to previous errors or no data.")
        return

    print("Performing dynamic topic modeling...")
    try:
        # Ensure timestamps are in a format BERTopic expects (e.g., Unix epoch or datetime)
        # If they are already datetime objects, it should be fine.
        topics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=20) # Adjust nr_bins as needed

        print("Visualizing topics over time...")
        fig_dynamic = topic_model.visualize_topics_over_time(topics_over_time)
        fig_dynamic.write_html("uber_topics_over_time.html")
        print("Saved dynamic topics visualization to uber_topics_over_time.html")

    except Exception as e:
        print(f"Error during dynamic topic modeling: {e}")
        print("This could be due to issues with timestamp data or insufficient data for binning.")


In [13]:
# --- 5. Incremental Learning (Online/Mini-batch) ---
def incremental_learning_demo(topic_model, all_docs, all_timestamps, batch_size_ratio=0.2):
    """
    Demonstrates incremental learning with BERTopic.
    """
    if topic_model is None or not all_docs or len(all_docs) < 2: # Need at least 2 docs for a split
        print("Skipping incremental learning demo due to previous errors or insufficient data.")
        return

    print("\n--- Incremental Learning Demo ---")

    # Split data into initial training and new batches
    # Ensure there's enough data for at least one batch
    if len(all_docs) * batch_size_ratio < 1:
        print("Not enough data for incremental learning demo with current batch_size_ratio.")
        return

    # For demonstration, we'll re-train a base model on a subset, then update.
    # In a real scenario, you'd save your initial model and load it.

    initial_training_size = int(len(all_docs) * (1 - batch_size_ratio * 2)) # Keep some for two updates
    if initial_training_size < topic_model.min_topic_size: # Ensure enough for initial training
        initial_training_size = topic_model.min_topic_size * 2 # Heuristic
        if initial_training_size >= len(all_docs):
            print("Not enough data to meaningfully split for incremental learning demo.")
            return

    initial_docs = all_docs[:initial_training_size]
    initial_timestamps = all_timestamps[:initial_training_size] # Timestamps for the initial set

    print(f"Training a base model on {len(initial_docs)} documents for incremental demo...")
    base_topic_model, _, _ = train_bertopic_model(initial_docs, initial_timestamps, min_topic_size=max(5, int(topic_model.min_topic_size/2))) # smaller min for demo

    if not base_topic_model:
        print("Failed to train base model for incremental learning demo.")
        return

    print("Base model topics:")
    try:
        print(base_topic_model.get_topic_info().head())
    except Exception as e:
        print(f"Could not get topic info for base model: {e}")

    # Simulate new data arriving (two batches for demo)
    remaining_docs = all_docs[initial_training_size:]
    remaining_timestamps = all_timestamps[initial_training_size:]

    if not remaining_docs:
        print("No remaining documents for incremental updates.")
        return

    batch_size = int(len(remaining_docs) / 2)
    if batch_size == 0 and len(remaining_docs) > 0: # if only a few docs left, make one batch
        batch_size = len(remaining_docs)

    if batch_size == 0:
        print("Not enough remaining documents for a batch update.")
        return

    new_docs_batch1 = remaining_docs[:batch_size]
    # new_timestamps_batch1 = remaining_timestamps[:batch_size] # Timestamps for batch 1

    if new_docs_batch1:
        print(f"\nUpdating model with first batch of {len(new_docs_batch1)} new documents...")
        try:
            base_topic_model.partial_fit(new_docs_batch1)
            print("Model updated with first batch. New topics overview:")
            print(base_topic_model.get_topic_info().head())
        except Exception as e:
            print(f"Error during first incremental update: {e}")
    else:
        print("Skipping first batch update: no documents in batch.")

    new_docs_batch2 = remaining_docs[batch_size:]
    # new_timestamps_batch2 = remaining_timestamps[batch_size:] # Timestamps for batch 2

    if new_docs_batch2:
        print(f"\nUpdating model with second batch of {len(new_docs_batch2)} new documents...")
        try:
            base_topic_model.partial_fit(new_docs_batch2)
            print("Model updated with second batch. Final topics overview:")
            print(base_topic_model.get_topic_info().head())
        except Exception as e:
            print(f"Error during second incremental update: {e}")
    else:
        print("Skipping second batch update: no documents in batch.")


In [14]:
# --- 5. Incremental Learning (Online/Mini-batch) ---
def incremental_learning_demo(topic_model, all_docs, all_timestamps, batch_size_ratio=0.2):
    """
    Demonstrates incremental learning with BERTopic.
    """
    if topic_model is None or not all_docs or len(all_docs) < 2: # Need at least 2 docs for a split
        print("Skipping incremental learning demo due to previous errors or insufficient data.")
        return

    print("\n--- Incremental Learning Demo ---")

    # Split data into initial training and new batches
    # Ensure there's enough data for at least one batch
    if len(all_docs) * batch_size_ratio < 1:
        print("Not enough data for incremental learning demo with current batch_size_ratio.")
        return

    initial_training_size = int(len(all_docs) * (1 - batch_size_ratio * 2)) # Keep some for two updates
    if initial_training_size < topic_model.min_topic_size: # Ensure enough for initial training
        initial_training_size = topic_model.min_topic_size * 2 # Heuristic
        if initial_training_size >= len(all_docs):
            print("Not enough data to meaningfully split for incremental learning demo.")
            return

    initial_docs = all_docs[:initial_training_size]
    initial_timestamps = all_timestamps[:initial_training_size] # Timestamps for the initial set

    print(f"Training a base model on {len(initial_docs)} documents for incremental demo...")
    base_topic_model, _, _ = train_bertopic_model(initial_docs, initial_timestamps, min_topic_size=max(5, int(topic_model.min_topic_size/2))) # smaller min for demo

    if not base_topic_model:
        print("Failed to train base model for incremental learning demo.")
        return

    print("Base model topics:")
    try:
        print(base_topic_model.get_topic_info().head())
    except Exception as e:
        print(f"Could not get topic info for base model: {e}")


    # Simulate new data arriving (two batches for demo)
    remaining_docs = all_docs[initial_training_size:]
    remaining_timestamps = all_timestamps[initial_training_size:]

    if not remaining_docs:
        print("No remaining documents for incremental updates.")
        return

    batch_size = int(len(remaining_docs) / 2)
    if batch_size == 0 and len(remaining_docs) > 0: # if only a few docs left, make one batch
        batch_size = len(remaining_docs)

    if batch_size == 0:
        print("Not enough remaining documents for a batch update.")
        return

    new_docs_batch1 = remaining_docs[:batch_size]

    if new_docs_batch1:
        print(f"\nUpdating model with first batch of {len(new_docs_batch1)} new documents...")
        try:
            base_topic_model.partial_fit(new_docs_batch1)
            print("Model updated with first batch. New topics overview:")
            print(base_topic_model.get_topic_info().head())
        except Exception as e:
            print(f"Error during first incremental update: {e}")
    else:
        print("Skipping first batch update: no documents in batch.")

    new_docs_batch2 = remaining_docs[batch_size:]
    # new_timestamps_batch2 = remaining_timestamps[batch_size:] # Timestamps for batch 2

    if new_docs_batch2:
        print(f"\nUpdating model with second batch of {len(new_docs_batch2)} new documents...")
        try:
            base_topic_model.partial_fit(new_docs_batch2)
            print("Model updated with second batch. Final topics overview:")
            print(base_topic_model.get_topic_info().head())
        except Exception as e:
            print(f"Error during second incremental update: {e}")
    else:
        print("Skipping second batch update: no documents in batch.")


In [15]:
# --- 6. Main Application Execution ---
if __name__ == "__main__":
    # --- SYSTEM Configuration ---
    DATA_FILEPATH = "/content/Uber.csv"
    SAMPLE_SIZE = 2000
    MIN_TOPIC_SIZE_MAIN = 15 # Min documents to form a topic in the main model

    # --- Load and Preprocess Data ---
    all_tweets, all_timestamps, df_full = load_and_preprocess_data(DATA_FILEPATH)

    if all_tweets is None or not all_tweets:
        print("Exiting due to data loading/preprocessing issues.")
        exit()

    # --- Sample Data (Optional, for faster execution during development) ---
    if SAMPLE_SIZE is not None and SAMPLE_SIZE < len(all_tweets):
        print(f"\nUsing a sample of {SAMPLE_SIZE} documents for this run.")
        indices = np.random.choice(len(all_tweets), SAMPLE_SIZE, replace=False)
        indices.sort() # Keep chronological order for sampled data

        sampled_tweets = [all_tweets[i] for i in indices]
        sampled_timestamps = [all_timestamps[i] for i in indices]
        df = df_full.iloc[indices].copy() # Use .copy() to avoid SettingWithCopyWarning

        docs_to_process = sampled_tweets
        timestamps_to_process = sampled_timestamps
    else:
        print(f"\nUsing the full dataset of {len(all_tweets)} documents.")
        docs_to_process = all_tweets
        timestamps_to_process = all_timestamps
        df = df_full.copy()

    if not docs_to_process:
        print("No documents to process after sampling/filtering. Exiting.")
        exit()

    # --- Sentiment Analysis ---
    df['sentiment_score'] = get_sentiment_scores(docs_to_process)
    print("\nSentiment scores calculated and added to DataFrame:")
    print(df[['cleaned_tweet', 'sentiment_score']].head())

    # --- Initial BERTopic Model Training ---
    topic_model, topics, probabilities = train_bertopic_model(docs_to_process, timestamps_to_process, min_topic_size=MIN_TOPIC_SIZE_MAIN)

    if topic_model:
        print("\n--- Initial Model Results ---")
        try:
            print("Top topics found by initial model:")
            print(topic_model.get_topic_info().head(10))

            # Visualize topics
            print("\nVisualizing topics (hierarchy)...")
            fig_hierarchy = topic_model.visualize_hierarchy()
            fig_hierarchy.write_html("uber_topic_hierarchy.html")
            print("Saved topic hierarchy to uber_topic_hierarchy.html")

            print("\nVisualizing topic terms (bar chart)...")
            fig_terms = topic_model.visualize_barchart(top_n_topics=10) # Visualize top 10 topics
            fig_terms.write_html("uber_topic_barchart.html")
            print("Saved topic barchart to uber_topic_barchart.html")

        except Exception as e:
            print(f"Error during visualization or saving of initial model: {e}")
            if "already contains the key" in str(e).lower():
                 print(" Try restarting the kernel.")


        # --- Dynamic Topic Modeling (on the same data used for initial training) ---
        dynamic_topic_modeling(topic_model, docs_to_process, timestamps_to_process)

        # --- Sentiment per Topic ---
        # This is a basic aggregation.
        if topics is not None:
            print("\nAggregating sentiment per topic...")
            df['topic_id'] = topics
            # Exclude outlier topic (-1) for meaningful sentiment aggregation
            sentiment_per_topic = df[df['topic_id'] != -1].groupby('topic_id')['sentiment_score'].mean().sort_values(ascending=False)
            print("\nAverage sentiment score per topic (higher is more positive):")
            print(sentiment_per_topic)

            topic_info_df = topic_model.get_topic_info()
            enriched_topic_info = pd.merge(topic_info_df, sentiment_per_topic.rename('avg_sentiment'), left_on='Topic', right_index=True, how='left')
            print("\nTopic info with average sentiment:")
            print(enriched_topic_info[['Topic', 'Name', 'Count', 'avg_sentiment']].head(15))
        else:
            print("Skipping sentiment per topic as topics were not generated.")


        # --- Incremental Learning Demonstration ---
        incremental_learning_demo(topic_model, all_tweets, all_timestamps, batch_size_ratio=0.1) # Use 10% for each batch update in demo

    else:
        print("BERTopic model training failed.")

    print("\n--- Script Finished ---")

Loading data...
Error: CSV must contain 'Tweet' and 'Datetime' columns.
Available columns: ['Unnamed: 0', 'id', 'conversation_id', 'created_at', 'date', 'timezone', 'place', 'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str', 'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video', 'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url', 'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt', 'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src', 'trans_dest']
Attempting to use 'tweet' as Tweet column and 'date' as Datetime column.
Preprocessing text data...
Processing timestamps...
Data loaded and preprocessed. Shape: (9962, 41)

Using a sample of 2000 documents for this run.
Calculating sentiment scores...

Sentiment scores calculated and added to DataFrame:
                                        cleaned_tweet  sentiment_score
0                        uber files dai su parliamone           0.0000
3   call action assault

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2025-05-16 05:21:33,390 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-16 05:21:49,885 - BERTopic - Dimensionality - Completed ✓
2025-05-16 05:21:49,887 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-16 05:21:50,095 - BERTopic - Cluster - Completed ✓
2025-05-16 05:21:50,102 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-16 05:21:50,320 - BERTopic - Representation - Completed ✓


BERTopic model training complete.

--- Initial Model Results ---
Top topics found by initial model:
   Topic  Count                                               Name  \
0     -1    603                                -1_uber_le_pour_les   
1      0    453                                  0_que_el_uber_los   
2      1    117                     1_uber_driver_uber driver_like   
3      2     90                   2_kroes_voor_neelie_neelie kroes   
4      3     77                                 3_taxis_les_et_qui   
5      4     61                 4_uber files_files_lobbying_macron   
6      5     56  5_secretly helped_helped_helped uber_reveals p...   
7      6     56             6_laws duped_duped_duped police_police   
8      7     44             7_deal secret_deal_secret_secret entre   
9      8     39                              8_people_amp_uber_use   

                                      Representation  \
0  [uber, le, pour, les, des, et, la, macron, pas...   
1   [que, el, ube

20it [00:02,  9.05it/s]


Visualizing topics over time...
Saved dynamic topics visualization to uber_topics_over_time.html

Aggregating sentiment per topic...

Average sentiment score per topic (higher is more positive):
topic_id
23    0.099561
1     0.081453
8     0.077787
19    0.038182
3     0.025132
18    0.019627
0     0.018279
10    0.015700
9     0.000000
17   -0.024181
11   -0.028269
7    -0.028911
24   -0.034040
2    -0.042931
21   -0.059460
4    -0.132041
5    -0.161520
22   -0.342532
15   -0.366989
12   -0.381359
13   -0.499585
20   -0.561705
14   -0.617411
16   -0.623193
6    -0.810641
Name: sentiment_score, dtype: float64

Topic info with average sentiment:
    Topic                                               Name  Count  \
0      -1                                -1_uber_le_pour_les    603   
1       0                                  0_que_el_uber_los    453   
2       1                     1_uber_driver_uber driver_like    117   
3       2                   2_kroes_voor_neelie_neelie kroes   

2025-05-16 05:25:44,105 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-16 05:26:22,158 - BERTopic - Dimensionality - Completed ✓
2025-05-16 05:26:22,159 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-16 05:26:36,469 - BERTopic - Cluster - Completed ✓
2025-05-16 05:26:36,475 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-16 05:26:37,288 - BERTopic - Representation - Completed ✓


BERTopic model training complete.
Base model topics:
   Topic  Count                             Name  \
0     -1   2616                 -1_la_que_en_les   
1      0    161            0_jamais_pas_pour_est   
2      1    158  1_uber uber_uber_uber evil_fuck   
3      2    133                  2_um_com_eu_não   
4      3    132      3_el_lo_papeles_los papeles   

                                      Representation  \
0   [la, que, en, les, et, uber, le, pour, des, pas]   
1  [jamais, pas, pour, est, qui, vous, le, ça, ub...   
2  [uber uber, uber, uber evil, fuck, evil, right...   
3  [um, com, eu, não, pra, ele, uma, cara, que, u...   
4  [el, lo, papeles, los papeles, el uber, papele...   

                                 Representative_Docs  
0  [révélations sur les liens entre uber et macro...  
1  [pour le dire autrement si vous êtes un travai...  
2  [today weirdest uber ride made feel like im si...  
3  [não existe uber caro quando eu quero ir embor...  
4  [tengo una bonita a

In [18]:
!pip install streamlit



In [28]:
!pip install streamlit_plotly_events



In [31]:
# prompt: create an app using streamlit to display the analysis and visualisation

import streamlit as st
import plotly.graph_objects as go
import plotly.express as px

# Assuming df and topic_model are available globally from the notebook execution
# In a real Streamlit app, you would load or generate these objects here.
# For this example, we'll assume they exist after running the previous code block.

st.set_page_config(layout="wide")

st.title('Uber Tweet Analysis')

if 'df' not in locals() or 'topic_model' not in locals() or df is None or topic_model is None:
    st.error("Data or BERTopic model not available. Please ensure the previous code block runs successfully.")
else:
    st.header("Data Overview")
    st.write(f"Total number of tweets processed: {len(df_full)}")
    st.write(f"Number of tweets used for analysis (after sampling/filtering): {len(df)}")
    st.dataframe(df[['Tweet', 'cleaned_tweet', 'timestamp', 'sentiment_score', 'topic_id']].head())

    st.header("Sentiment Analysis")
    st.write("Distribution of Sentiment Scores:")
    fig_sentiment_dist = px.histogram(df, x='sentiment_score', title='Distribution of Compound Sentiment Scores')
    st.plotly_chart(fig_sentiment_dist, use_container_width=True)

    # Basic sentiment classification
    df['sentiment_category'] = df['sentiment_score'].apply(lambda x: 'Positive' if x >= 0.05 else ('Negative' if x <= -0.05 else 'Neutral'))
    st.write("Sentiment Category Distribution:")
    fig_sentiment_cat = px.pie(df, names='sentiment_category', title='Sentiment Category Distribution')
    st.plotly_chart(fig_sentiment_cat, use_container_width=True)


    st.header("Topic Modeling (BERTopic)")

    # Topic Info
    if 'enriched_topic_info' in locals() and enriched_topic_info is not None:
        st.subheader("Topic Information")
        st.dataframe(enriched_topic_info[['Topic', 'Count', 'Name', 'Representation', 'avg_sentiment']].head(20))
    else:
         st.warning("Enriched topic info not available. Run the full analysis script first.")

    # Visualize Topics Hierarchy (if file exists)
    st.subheader("Topic Hierarchy")
    try:
        with open("uber_topic_hierarchy.html", "r") as f:
            hierarchy_html = f.read()
        st.components.v1.html(hierarchy_html, height=600, scrolling=True)
    except FileNotFoundError:
        st.warning("Topic hierarchy visualization file 'uber_topic_hierarchy.html' not found. Run the full analysis script first.")

    # Visualize Topic Terms (if file exists)
    st.subheader("Topic Terms (Bar Chart)")
    try:
        with open("uber_topic_barchart.html", "r") as f:
            barchart_html = f.read()
        st.components.v1.html(barchart_html, height=600, scrolling=True)
    except FileNotFoundError:
        st.warning("Topic terms visualization file 'uber_topic_barchart.html' not found. Run the full analysis script first.")

    # Visualize Topics Over Time (if file exists)
    st.subheader("Topics Over Time")
    try:
        with open("uber_topics_over_time.html", "r") as f:
            over_time_html = f.read()
        st.components.v1.html(over_time_html, height=600, scrolling=True)
    except FileNotFoundError:
        st.warning("Topics over time visualization file 'uber_topics_over_time.html' not found. Run the full analysis script first.")


    # Optional: Display sample tweets for a selected topic
    st.subheader("Sample Tweets per Topic")
    # Ensure 'topic_id' column exists and is not None
    if 'topic_id' in df.columns and not df['topic_id'].isnull().all():
        topic_list = sorted(df['topic_id'].unique().tolist())
        selected_topic = st.selectbox("Select a Topic", topic_list)

        if selected_topic is not None:
            st.write(f"Sample Tweets for Topic {selected_topic}:")
            sample_tweets = df[df['topic_id'] == selected_topic].sample(min(10, len(df[df['topic_id'] == selected_topic]))) # Display max 10 samples
            for i, row in sample_tweets.iterrows():
                st.write(f"- {row['Tweet']}")
    else:
        st.info("Topic assignments are not available in the DataFrame.")


    # Optional: Sentiment by Time
    st.subheader("Sentiment Over Time")
    # Aggregate sentiment by a time interval (e.g., daily)
    if 'timestamp' in df.columns and 'sentiment_score' in df.columns:
        sentiment_over_time = df.set_index('timestamp')['sentiment_score'].resample('D').mean().reset_index()
        fig_sentiment_time = px.line(sentiment_over_time, x='timestamp', y='sentiment_score', title='Average Daily Sentiment Score Over Time')
        st.plotly_chart(fig_sentiment_time, use_container_width=True)
    else:
        st.info("Timestamp or sentiment data not available for time series analysis.")


# --- How to run this with Streamlit in Colab ---
# 1. Save the complete code block above as a Python file (e.g., app.py) in your Colab environment.
# 2. Ensure all necessary files (like Uber.csv) are accessible.
# 3. Run the preceding code block in your notebook to generate the dataframes and model objects
#    and save the HTML visualization files.
# 4. In a new code cell in Colab, run:
#    !streamlit run app.py & npx localtunnel --port 8501
# 5. Click the public URL provided by localtunnel to access your Streamlit app.




KeyError: "['sentiment_score', 'topic_id'] not in index"