<a href="https://colab.research.google.com/github/a-shahrabi/ClimateLens/blob/main/topic_visualizations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ClimateLens - Improved Topic Visualizations (MVP2)

# **Priority**: Intertopic Distance Map → Hierarchy → Barcharts
# **For**: Non-technical stakeholders (mental health professionals, researchers)

# **Addresses feedback**:
# - ✅ Minimum bubble sizes for mobile tapping
# - ✅ Click interaction (not hover-only)
# - ✅ Consistent X-axis scales
# - ✅ Clear labels and explanations

In [2]:
!pip install bertopic


Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Downloading bertopic-0.17.3-py3-none-any.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.17.3


In [3]:
# Setup
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from bertopic import BERTopic
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.metrics.pairwise import cosine_similarity
import json
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries loaded")

  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


✓ Libraries loaded


In [4]:
def load_model_and_data(model_path, csv_path):
    """
    Load BERTopic model and CSV data

    Args:
        model_path: Path to .safetensors folder
        csv_path: Path to CSV file

    Returns:
        topic_model, dataframe
    """
    print(f"Loading model from {model_path}...")
    topic_model = BERTopic.load(model_path)

    print(f"Loading data from {csv_path}...")
    df = pd.read_csv(csv_path)

    print(f"✓ Loaded {len(df)} documents with {len(topic_model.get_topic_info()) - 1} topics")
    return topic_model, df

print("✓ Data loading function ready")

✓ Data loading function ready


In [5]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Upload files directly
from google.colab import files
print("Upload your CSV file:")
uploaded = files.upload()

Upload your CSV file:


Saving filtered_anticonsumption_comments.csv to filtered_anticonsumption_comments.csv
Saving filtered_anticonsumption_comments.safetensors-20251113T022431Z-1-001.zip to filtered_anticonsumption_comments.safetensors-20251113T022431Z-1-001.zip


In [7]:
import os
print(os.listdir('.'))

['.config', 'filtered_anticonsumption_comments.csv', 'filtered_anticonsumption_comments.safetensors-20251113T022431Z-1-001.zip', 'drive', 'sample_data']


In [8]:
!unzip "filtered_anticonsumption_comments.safetensors-20251113T022431Z-1-001.zip"

Archive:  filtered_anticonsumption_comments.safetensors-20251113T022431Z-1-001.zip
  inflating: filtered_anticonsumption_comments.safetensors/topic_embeddings.safetensors  
  inflating: filtered_anticonsumption_comments.safetensors/config.json  
  inflating: filtered_anticonsumption_comments.safetensors/topics.json  


In [9]:
print(os.listdir('.'))

['.config', 'filtered_anticonsumption_comments.csv', 'filtered_anticonsumption_comments.safetensors-20251113T022431Z-1-001.zip', 'drive', 'filtered_anticonsumption_comments.safetensors', 'sample_data']


In [10]:
model_path = "filtered_anticonsumption_comments.safetensors"
csv_path = "filtered_anticonsumption_comments.csv"
topic_model, data = load_model_and_data(model_path, csv_path)



Loading model from filtered_anticonsumption_comments.safetensors...
Loading data from filtered_anticonsumption_comments.csv...
✓ Loaded 2736 documents with 29 topics


In [11]:
# 2. Priority #1 - Intertopic Distance Map

# **What it shows**: Topic bubbles where size = frequency, distance = similarity

# **Improvements**:
# - Minimum 30px diameter bubbles (mobile tap targets)
# - Click to see details (not hover)
# - Clear annotations for non-technical users

In [12]:
def create_intertopic_distance_map(topic_model, dataset_name="Dataset", width=1000, height=700):
    """
    Create improved Intertopic Distance Map

    Addresses feedback:
    - Minimum bubble size for mobile tapping
    - Click to show details (no hover-only)
    - Clear visual explanation
    """

    # Get topic information
    topic_info = topic_model.get_topic_info()
    topic_info = topic_info[topic_info['Topic'] != -1].reset_index(drop=True)

    if len(topic_info) == 0:
        raise ValueError("No topics found in model (all outliers)")

    print(f"Processing {len(topic_info)} topics...")

    # Get topic embeddings
    topic_ids = topic_info['Topic'].values
    embeddings = []
    for topic_id in topic_ids:
        embedding_idx = topic_id + 1
        embeddings.append(topic_model.topic_embeddings_[embedding_idx])
    embeddings = np.array(embeddings)

    # Reduce to 2D with t-SNE
    print("Reducing dimensions with t-SNE...")
    tsne = TSNE(
        n_components=2,
        random_state=42,
        metric='cosine',
        perplexity=min(30, len(embeddings) - 1)
    )
    coords_2d = tsne.fit_transform(embeddings)

    # Normalize to 0-100 range
    scaler = MinMaxScaler(feature_range=(5, 95))
    coords_2d = scaler.fit_transform(coords_2d)

    # Calculate bubble sizes with MINIMUM THRESHOLD
    counts = topic_info['Count'].values
    min_size = 15  # 30px diameter
    max_size = 50  # 100px diameter
    sizes = np.interp(counts, (counts.min(), counts.max()), (min_size, max_size))

    print(f"✓ Bubble sizes: {sizes.min():.1f} to {sizes.max():.1f}")

    # Prepare detailed information
    topic_details = []
    for idx, row in topic_info.iterrows():
        topic_num = row['Topic']
        topic_data = topic_model.get_topic(topic_num)
        top_words = [word for word, score in topic_data[:7]]

        detail = (
            f"<b>Topic {topic_num}</b><br>"
            f"<b>Size:</b> {row['Count']} documents<br>"
            f"<b>Keywords:</b> {', '.join(top_words)}"
        )
        topic_details.append(detail)

    # Create figure
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=coords_2d[:, 0],
        y=coords_2d[:, 1],
        mode='markers+text',
        marker=dict(
            size=sizes,
            color=topic_ids,
            colorscale='Viridis',
            showscale=True,
            colorbar=dict(
                title="Topic ID",
                thickness=15,
                len=0.7
            ),
            line=dict(width=2, color='white'),
            sizemode='diameter',
            opacity=0.85
        ),
        text=[str(t) for t in topic_ids],
        textfont=dict(size=11, color='white', family='Arial Black'),
        textposition="middle center",
        hovertext=topic_details,
        hoverinfo='text',
        customdata=np.column_stack((topic_ids, counts)),
        name='',
        hoverlabel=dict(
            bgcolor="white",
            font_size=12,
            font_family="Arial"
        )
    ))

    # Update layout
    fig.update_layout(
        title={
            'text': (
                f"<b>{dataset_name} - Intertopic Distance Map</b><br>"
                "<sub>Click bubbles to see topic keywords | "
                "Bubble size = how often discussed | "
                "Distance = similarity</sub>"
            ),
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 18}
        },
        xaxis=dict(
            title="<b>← Different Topics ——————— Similar Topics →</b>",
            showgrid=True,
            gridcolor='rgba(200,200,200,0.3)',
            zeroline=False,
            showticklabels=False
        ),
        yaxis=dict(
            title="<b>Topic Similarity Dimension 2</b>",
            showgrid=True,
            gridcolor='rgba(200,200,200,0.3)',
            zeroline=False,
            showticklabels=False
        ),
        width=width,
        height=height,
        hovermode='closest',
        plot_bgcolor='rgba(245,245,245,0.5)',
        showlegend=False,
        font=dict(size=12, family='Arial'),
        margin=dict(l=80, r=80, t=100, b=80)
    )

    # Add helpful annotations
    fig.add_annotation(
        text=(
            "<i> How to read this chart:</i><br>"
            "• <b>Larger bubbles</b> = more people talking about this topic<br>"
            "• <b>Bubbles close together</b> = topics discuss similar themes<br>"
            "• <b>Bubbles far apart</b> = topics are very different<br>"
            "• <b>Click any bubble</b> to see what keywords define that topic"
        ),
        xref="paper", yref="paper",
        x=0.02, y=0.98,
        showarrow=False,
        align='left',
        bgcolor='rgba(255,255,255,0.95)',
        bordercolor='#2c3e50',
        borderwidth=2,
        borderpad=10,
        font=dict(size=11, family='Arial')
    )

    # Add example interpretation
    largest_topic = topic_info.loc[topic_info['Count'].idxmax()]
    fig.add_annotation(
        text=(
            f"<b>Example:</b> Topic {largest_topic['Topic']} is the largest "
            f"({largest_topic['Count']} posts) - most discussed theme"
        ),
        xref="paper", yref="paper",
        x=0.5, y=-0.08,
        showarrow=False,
        font=dict(size=11, color='#34495e', family='Arial')
    )

    print(f"✓ Intertopic Distance Map created with {len(topic_info)} topics")
    return fig

print("✓ IDM function loaded")

✓ IDM function loaded


In [13]:
# Example usage - UNCOMMENT after loading data:
fig_idm = create_intertopic_distance_map(topic_model, dataset_name="Reddit Climate Discussions")
fig_idm.write_html("intertopic_distance_map_IMPROVED.html")
print("✓ Saved to intertopic_distance_map_IMPROVED.html")

Processing 29 topics...
Reducing dimensions with t-SNE...
✓ Bubble sizes: 15.0 to 50.0
✓ Intertopic Distance Map created with 29 topics
✓ Saved to intertopic_distance_map_IMPROVED.html


In [14]:
## 3. Priority #2 - Topic Hierarchy

# **What it shows**: Topics organized in tree structure (parent-child relationships)

# **Improvements**:
# - Clear visual hierarchy
# - Size proportional to frequency
# - Grouped by similarity

In [15]:
def create_topic_hierarchy_treemap(topic_model, dataset_name="Dataset", max_depth=2):
    """Create hierarchical treemap of topics"""

    topic_info = topic_model.get_topic_info()
    topic_info = topic_info[topic_info['Topic'] != -1].reset_index(drop=True)

    if len(topic_info) < 2:
        raise ValueError("Need at least 2 topics for hierarchy")

    print(f"Building hierarchy for {len(topic_info)} topics...")

    # Get embeddings
    topic_ids = topic_info['Topic'].values
    embeddings = np.array([topic_model.topic_embeddings_[tid + 1] for tid in topic_ids])

    # Calculate distances directly (FIX: avoid diagonal issues)
    from scipy.spatial.distance import pdist
    condensed_distances = pdist(embeddings, metric='cosine')
    linkage_matrix = linkage(condensed_distances, method='ward')

    # Create clusters
    n_clusters = min(5, len(topic_info) // 2)
    cluster_labels = fcluster(linkage_matrix, n_clusters, criterion='maxclust')

    # Build treemap data
    labels = []
    parents = []
    values = []
    colors = []
    hover_texts = []

    # Root
    labels.append("All Topics")
    parents.append("")
    values.append(0)
    colors.append(0)
    hover_texts.append(f"<b>All Topics</b><br>Total: {topic_info['Count'].sum()}")

    # Clusters
    cluster_info = {}
    for cluster_id in range(1, n_clusters + 1):
        cluster_mask = cluster_labels == cluster_id
        cluster_topics = topic_info[cluster_mask]
        cluster_size = cluster_topics['Count'].sum()

        cluster_name = f"Theme {cluster_id}"
        labels.append(cluster_name)
        parents.append("All Topics")
        values.append(cluster_size)
        colors.append(cluster_id)

        top_keywords = set()
        for tid in cluster_topics['Topic'].values:
            words = [w for w, s in topic_model.get_topic(tid)[:3]]
            top_keywords.update(words)

        hover_texts.append(
            f"<b>{cluster_name}</b><br>"
            f"Topics: {cluster_mask.sum()}<br>"
            f"Documents: {cluster_size}<br>"
            f"Themes: {', '.join(list(top_keywords)[:5])}"
        )
        cluster_info[cluster_id] = cluster_name

    # Topics
    for idx, row in topic_info.iterrows():
        topic_id = row['Topic']
        cluster_id = cluster_labels[idx]
        parent_name = cluster_info[cluster_id]

        topic_words = [w for w, s in topic_model.get_topic(topic_id)[:5]]

        labels.append(f"Topic {topic_id}")
        parents.append(parent_name)
        values.append(row['Count'])
        colors.append(cluster_id + 10)

        hover_texts.append(
            f"<b>Topic {topic_id}</b><br>"
            f"Documents: {row['Count']}<br>"
            f"Keywords: {', '.join(topic_words)}"
        )

    # Create figure
    fig = go.Figure(go.Treemap(
        labels=labels,
        parents=parents,
        values=values,
        marker=dict(
            colors=colors,
            colorscale='Viridis',
            cmid=n_clusters / 2,
            line=dict(width=2, color='white')
        ),
        text=labels,
        textposition='middle center',
        hovertext=hover_texts,
        hoverinfo='text',
        textfont=dict(size=12, color='white', family='Arial')
    ))

    fig.update_layout(
        title={
            'text': (
                f"<b>{dataset_name} - Topic Hierarchy</b><br>"
                "<sub>Size = frequency | Grouped by similarity | Click to zoom</sub>"
            ),
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 18}
        },
        width=1000,
        height=700,
        font=dict(size=12, family='Arial'),
        margin=dict(l=10, r=10, t=80, b=10)
    )

    fig.add_annotation(
        text=(
            "<i> Larger boxes = more documents | Same color = similar topics</i>"
        ),
        xref="paper", yref="paper",
        x=0.5, y=-0.05,
        showarrow=False,
        align='center',
        bgcolor='rgba(255,255,255,0.95)',
        bordercolor='#2c3e50',
        borderwidth=2,
        font=dict(size=11, family='Arial')
    )

    print(f"✓ Hierarchy created with {n_clusters} themes")
    return fig

print("✓ Hierarchy function fixed and loaded")

✓ Hierarchy function fixed and loaded


In [16]:
fig_hierarchy = create_topic_hierarchy_treemap(topic_model, dataset_name="Reddit Climate Discussions")
fig_hierarchy.write_html("topic_hierarchy_IMPROVED.html")
print("✓ Saved to topic_hierarchy_IMPROVED.html")

Building hierarchy for 29 topics...
✓ Hierarchy created with 5 themes
✓ Saved to topic_hierarchy_IMPROVED.html


In [17]:
## 3. Priority #2 - Topic Hierarchy

# **What it shows**: Topics organized in tree structure (parent-child relationships)

# **Improvements**:
# - Clear visual hierarchy
# - Size proportional to frequency
# - Grouped by similarity

In [21]:
def create_topic_hierarchy_treemap(topic_model, dataset_name="Dataset", max_depth=2):
    """
    Create hierarchical treemap of topics

    Uses topic similarity to build parent-child relationships.
    Size = document frequency, color = hierarchy level
    """

    # Get topic info
    topic_info = topic_model.get_topic_info()
    topic_info = topic_info[topic_info['Topic'] != -1].reset_index(drop=True)

    if len(topic_info) < 2:
        raise ValueError("Need at least 2 topics for hierarchy")

    print(f"Building hierarchy for {len(topic_info)} topics...")

    # Calculate topic similarity matrix
    topic_ids = topic_info['Topic'].values
    embeddings = np.array([topic_model.topic_embeddings_[tid + 1] for tid in topic_ids])

    from scipy.spatial.distance import pdist
    condensed_distances = pdist(embeddings, metric='cosine')
    linkage_matrix = linkage(condensed_distances, method='ward')

    # Create clusters
    n_clusters = min(5, len(topic_info) // 2)
    cluster_labels = fcluster(linkage_matrix, n_clusters, criterion='maxclust')

    # Prepare data for treemap
    labels = []
    parents = []
    values = []
    colors = []
    hover_texts = []

    # Root node
    labels.append("All Topics")
    parents.append("")
    values.append(0)
    colors.append(0)
    hover_texts.append(f"<b>All Topics</b><br>Total documents: {topic_info['Count'].sum()}")

    # Cluster nodes (Level 1)
    cluster_info = {}
    for cluster_id in range(1, n_clusters + 1):
        cluster_mask = cluster_labels == cluster_id
        cluster_topics = topic_info[cluster_mask]
        cluster_size = cluster_topics['Count'].sum()

        cluster_name = f"Theme {cluster_id}"
        labels.append(cluster_name)
        parents.append("All Topics")
        values.append(cluster_size)
        colors.append(cluster_id)

        # Get top keywords from cluster
        top_keywords = set()
        for tid in cluster_topics['Topic'].values:
            words = [w for w, s in topic_model.get_topic(tid)[:3]]
            top_keywords.update(words)

        hover_texts.append(
            f"<b>{cluster_name}</b><br>"
            f"Topics: {cluster_mask.sum()}<br>"
            f"Documents: {cluster_size}<br>"
            f"Main themes: {', '.join(list(top_keywords)[:5])}"
        )

        cluster_info[cluster_id] = cluster_name

    # Topic nodes (Level 2)
    for idx, row in topic_info.iterrows():
        topic_id = row['Topic']
        cluster_id = cluster_labels[idx]
        parent_name = cluster_info[cluster_id]

        topic_words = [w for w, s in topic_model.get_topic(topic_id)[:5]]

        topic_name = f"Topic {topic_id}"
        labels.append(topic_name)
        parents.append(parent_name)
        values.append(row['Count'])
        colors.append(cluster_id + 10)

        hover_texts.append(
            f"<b>Topic {topic_id}</b><br>"
            f"Documents: {row['Count']}<br>"
            f"Keywords: {', '.join(topic_words)}"
        )

    # Create treemap
    fig = go.Figure(go.Treemap(
        labels=labels,
        parents=parents,
        values=values,
        marker=dict(
            colors=colors,
            colorscale='Viridis',
            cmid=n_clusters / 2,
            line=dict(width=2, color='white')
        ),
        text=labels,
        textposition='middle center',
        hovertext=hover_texts,
        hoverinfo='text',
        textfont=dict(size=12, color='white', family='Arial')
    ))

    # Update layout
    fig.update_layout(
        title={
            'text': (
                f"<b>{dataset_name} - Topic Hierarchy</b><br>"
                "<sub>Size = frequency | Grouped by similarity | "
                "Click to zoom into themes</sub>"
            ),
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 18}
        },
        width=1000,
        height=700,
        font=dict(size=12, family='Arial'),
        margin=dict(l=10, r=10, t=80, b=10)
    )

    # Add interpretation
    fig.add_annotation(
        text=(
            "<i> How to read this chart:</i><br>"
            "• <b>Larger boxes</b> = more documents about this topic<br>"
            "• <b>Same color</b> = topics grouped by similarity<br>"
            "• <b>Click any box</b> to zoom in and see details"
        ),
        xref="paper", yref="paper",
        x=0.5, y=-0.05,
        showarrow=False,
        align='center',
        bgcolor='rgba(255,255,255,0.95)',
        bordercolor='#2c3e50',
        borderwidth=2,
        font=dict(size=11, family='Arial')
    )

    print(f"✓ Topic Hierarchy created with {n_clusters} main themes")
    return fig

print("✓ Hierarchy function loaded")

✓ Hierarchy function loaded


In [22]:

fig_hierarchy = create_topic_hierarchy_treemap(topic_model, dataset_name="Reddit Climate Discussions")
fig_hierarchy.write_html("topic_hierarchy_IMPROVED.html")
print("✓ Saved to topic_hierarchy_IMPROVED.html")

Building hierarchy for 29 topics...
✓ Topic Hierarchy created with 5 main themes
✓ Saved to topic_hierarchy_IMPROVED.html


In [23]:
## 4. Priority #3 - Topic Barcharts

# **What it shows**: Keyword importance scores for each topic

# **Improvements**:
# - Consistent X-axis across all topics (for comparison)
# - Sorted in descending order
# - Clear units (importance scores)

In [24]:
def create_topic_barchart_dashboard(topic_model, dataset_name="Dataset", n_topics_display=5):
    """
    Create bar chart dashboard showing keyword importance

    Shows multiple topics at once with consistent X-axis for comparison.
    """

    # Get topic info
    topic_info = topic_model.get_topic_info()
    topic_info = topic_info[topic_info['Topic'] != -1].reset_index(drop=True)

    # Select top N topics by count
    top_topics = topic_info.nlargest(n_topics_display, 'Count')

    print(f"Creating barcharts for top {len(top_topics)} topics...")

    # Calculate global max score for consistent X-axis
    all_scores = []
    for topic_id in topic_info['Topic'].values:
        scores = [score for word, score in topic_model.get_topic(topic_id)]
        all_scores.extend(scores)
    global_max_score = max(all_scores)

    print(f"✓ Global max score: {global_max_score:.4f}")

    # Create subplots
    fig = make_subplots(
        rows=len(top_topics),
        cols=1,
        subplot_titles=[
            f"<b>Topic {row['Topic']}</b> ({row['Count']} documents)"
            for _, row in top_topics.iterrows()
        ],
        vertical_spacing=0.05,
        specs=[[{"type": "bar"}] for _ in range(len(top_topics))]
    )

    # Add bar charts for each topic
    for idx, (_, topic_row) in enumerate(top_topics.iterrows()):
        topic_id = topic_row['Topic']

        # Get keywords and scores (top 10)
        topic_data = topic_model.get_topic(topic_id)[:10]
        keywords = [word for word, score in topic_data]
        scores = [score for word, score in topic_data]

        # Sort in descending order (reverse for horizontal bars)
        sorted_pairs = sorted(zip(keywords, scores), key=lambda x: x[1])
        keywords = [k for k, s in sorted_pairs]
        scores = [s for k, s in sorted_pairs]

        # Add trace
        fig.add_trace(
            go.Bar(
                y=keywords,
                x=scores,
                orientation='h',
                marker=dict(
                    color=scores,
                    colorscale='Blues',
                    showscale=False
                ),
                text=[f'{s:.4f}' for s in scores],
                textposition='outside',
                showlegend=False,
                hovertemplate='<b>%{y}</b><br>Importance: %{x:.4f}<extra></extra>',
                name=f'Topic {topic_id}'
            ),
            row=idx+1,
            col=1
        )

        # Consistent X-axis range
        fig.update_xaxes(
            range=[0, global_max_score * 1.05],
            title_text="Importance Score" if idx == len(top_topics) - 1 else "",
            showgrid=True,
            gridcolor='rgba(200,200,200,0.3)',
            row=idx+1,
            col=1
        )

        fig.update_yaxes(
            tickfont=dict(size=10),
            row=idx+1,
            col=1
        )

    # Update layout
    fig.update_layout(
        title={
            'text': (
                f"<b>{dataset_name} - Topic Keyword Importance</b><br>"
                "<sub>Showing top topics by frequency | "
                "X-axis consistent across all charts for comparison</sub>"
            ),
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 18}
        },
        height=300 * len(top_topics),
        width=900,
        showlegend=False,
        plot_bgcolor='white',
        font=dict(size=11, family='Arial'),
        margin=dict(l=150, r=50, t=100, b=50)
    )

    # Add interpretation
    fig.add_annotation(
        text=(
            "<i> How to read this chart:</i><br>"
            "• <b>Longer bars</b> = keyword is more important for defining the topic<br>"
            "• <b>All charts use same scale</b> = you can compare keyword strength across topics<br>"
            "• <b>Keywords sorted</b> from most to least important (top to bottom)"
        ),
        xref="paper", yref="paper",
        x=0.5, y=-0.02,
        showarrow=False,
        align='center',
        bgcolor='rgba(255,255,255,0.95)',
        bordercolor='#2c3e50',
        borderwidth=2,
        borderpad=8,
        font=dict(size=10, family='Arial')
    )

    print(f"✓ Bar chart dashboard created")
    return fig

print("✓ Barchart function loaded")

✓ Barchart function loaded


In [25]:
# Example usage - UNCOMMENT after loading data:
# fig_barchart = create_topic_barchart_dashboard(topic_model, dataset_name="Reddit Climate Discussions", n_topics_display=5)
# fig_barchart.write_html("topic_barcharts_IMPROVED.html")
# print("✓ Saved to topic_barcharts_IMPROVED.html")

In [26]:
# UNCOMMENT AND EDIT PATHS:

# Load data

dataset_name = "Reddit Climate Anxiety"


# Create all visualizations
print("\n" + "="*60)
print("CREATING INTERTOPIC DISTANCE MAP (Priority #1)")
print("="*60)
fig_idm = create_intertopic_distance_map(topic_model, dataset_name=dataset_name)
fig_idm.write_html("IDM_IMPROVED.html")
print("✓ Saved to IDM_IMPROVED.html")

print("\n" + "="*60)
print("CREATING TOPIC HIERARCHY (Priority #2)")
print("="*60)
fig_hierarchy = create_topic_hierarchy_treemap(topic_model, dataset_name=dataset_name)
fig_hierarchy.write_html("Hierarchy_IMPROVED.html")
print("✓ Saved to Hierarchy_IMPROVED.html")

print("\n" + "="*60)
print("CREATING TOPIC BARCHARTS (Priority #3)")
print("="*60)
fig_barchart = create_topic_barchart_dashboard(topic_model, dataset_name=dataset_name, n_topics_display=5)
fig_barchart.write_html("Barcharts_IMPROVED.html")
print("✓ Saved to Barcharts_IMPROVED.html")

print("\n" + "="*60)
print(" ALL VISUALIZATIONS COMPLETE!")
print("="*60)
print("Next: Download HTMLs from Files panel (left sidebar)")



CREATING INTERTOPIC DISTANCE MAP (Priority #1)
Processing 29 topics...
Reducing dimensions with t-SNE...
✓ Bubble sizes: 15.0 to 50.0
✓ Intertopic Distance Map created with 29 topics
✓ Saved to IDM_IMPROVED.html

CREATING TOPIC HIERARCHY (Priority #2)
Building hierarchy for 29 topics...
✓ Topic Hierarchy created with 5 main themes
✓ Saved to Hierarchy_IMPROVED.html

CREATING TOPIC BARCHARTS (Priority #3)
Creating barcharts for top 5 topics...
✓ Global max score: 0.0189
✓ Bar chart dashboard created
✓ Saved to Barcharts_IMPROVED.html

 ALL VISUALIZATIONS COMPLETE!
Next: Download HTMLs from Files panel (left sidebar)


In [27]:
## 6. Validation (Optional)

# Check model quality before generating visualizations.

In [28]:
def validate_visualizations(topic_model):
    """Run quality checks"""
    print("Running validation...\n")

    topic_info = topic_model.get_topic_info()
    n_topics = len(topic_info[topic_info['Topic'] != -1])

    checks = 0
    if n_topics >= 2:
        print(f" {n_topics} topics")
        checks += 1

    if topic_model.topic_embeddings_ is not None:
        print(" Embeddings available")
        checks += 1

    print(f"\nScore: {checks}/2")
    return checks >= 2

print("✓ Validation loaded")

✓ Validation loaded


In [32]:
# Upload files directly
from google.colab import files
print("Upload your CSV file:")
uploaded = files.upload()

Upload your CSV file:


Saving climate_twitter_sample.csv to climate_twitter_sample.csv
Saving climate_twitter_sample.safetensors-20251113T031338Z-1-001.zip to climate_twitter_sample.safetensors-20251113T031338Z-1-001.zip


In [33]:
!unzip climate_twitter_sample*.zip

Archive:  climate_twitter_sample.safetensors-20251113T031338Z-1-001.zip
  inflating: climate_twitter_sample.safetensors/topic_embeddings.safetensors  
  inflating: climate_twitter_sample.safetensors/topics.json  
  inflating: climate_twitter_sample.safetensors/config.json  


In [34]:
model_path = "climate_twitter_sample.safetensors"
csv_path = "climate_twitter_sample.csv"
dataset_name = "Twitter Climate Discussions"
topic_model, data = load_model_and_data(model_path, csv_path)



Loading model from climate_twitter_sample.safetensors...
Loading data from climate_twitter_sample.csv...
✓ Loaded 2736 documents with 29 topics


In [35]:
fig_idm = create_intertopic_distance_map(topic_model, dataset_name)
fig_idm.write_html("Twitter_IDM.html")

fig_hierarchy = create_topic_hierarchy_treemap(topic_model, dataset_name)
fig_hierarchy.write_html("Twitter_Hierarchy.html")

fig_barchart = create_topic_barchart_dashboard(topic_model, dataset_name, 5)
fig_barchart.write_html("Twitter_Barcharts.html")

print("✓ Done")

Processing 29 topics...
Reducing dimensions with t-SNE...
✓ Bubble sizes: 15.0 to 50.0
✓ Intertopic Distance Map created with 29 topics
Building hierarchy for 29 topics...
✓ Topic Hierarchy created with 5 main themes
Creating barcharts for top 5 topics...
✓ Global max score: 0.2336
✓ Bar chart dashboard created
✓ Done
