In [2]:
import os

# Topic modeling & clustering
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

# Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Data handling
import pandas as pd
from pathlib import Path

# Wordcloud & plots
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Optional: nicer plots
plt.rcParams["figure.figsize"] = (10, 6)

# Avoid tokenizer parallelism warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [3]:
import pandas as pd
import re

# 1) Load the CSV
path = r"C:\Users\John DeForest\Desktop\dWrangl\PFAS project phoebe\lobster_clean2.csv"
df = pd.read_csv(path)

# 2) Clean column names: replace spaces/tabs with "_" and strip non-alphanumeric/underscore
df.columns = df.columns.str.replace(r"[ \t]+", "_", regex=True)
df.columns = df.columns.str.replace(r"[^A-Za-z0-9_]+", "_", regex=True)

# 3) Specific renames (matching your R code)
df = df.rename(columns={
    "_BINARY_Y_N": "ZBINARY_Y_N",
    "_CAT_WHY_YES___how_did_they_deal_with_it_categories_": (
        "ZCAT_WHY_YES___how_did_they_deal_with_it_categories_"
    )
})

# (Optional) sanity check
print(df.shape)
print(df.columns.tolist())


(34, 81)
['Timestamp', 'Name', 'Gender', 'Age', 'Town', 'Lobster_Zone', 'Do_you_fish_inshore_or_offshore_', 'Position_on_vessel', 'Type_of_License', 'Number_of_years_lobstering', 'As_a_lobsterman__are_you_concerned_about_pollution__If_yes__what_types_of_pollution_concern_you_most_', 'non_typical', 'subtractnt', 'new', 'As_a_lobsterman__are_you_concerned_about_pollution__If_yes__what_types_of_pollution_concern_you_most_2_MERGED', 'Cleaned_Concat', 'Added_from_next_Q', 'Final_Q1', '_Are_you_aware_of_any_sources_of_pollution_near_your_fishing_grounds_', 'non_typical_1', 'base', 'Cleaned', 'Reconcat', 'Have_you_previously_heard_of_PFAS__also_known_as__forever_chemicals_', '_Do_you_know_if_PFAS_is_an_issue_in_Maine_', '_Do_you_know_if_PFAS_is_an_issue_in_Maine__BINARY', 'When_you_think_about_pollution_in_fisheries__do_you_think_about_PFAS_as_a_pollutant_', '_Do_you_know_if_PFAS_have_been_detected_in_lobsters_', 'Have_you_heard_any_news__research__or_state_communications_about_PFAS_in_lobste

In [4]:
# ---- Choose which column is your text variable ----
# For now we're calling it VAR1; later you'll swap in the real name
TEXT_COL = "When_problems_hit_the_lobster_fishery__does_the_community_usually_come_together_to_deal_with_them__If_so__how__Do_you_think_PFAS_issues_would_be_dealt_with_similarly_or_differently_from_other_challenges__In_what_ways_"  # TODO: replace with the actual column name

# Drop missing values in VAR1 and convert to strings
df_text = df.dropna(subset=[TEXT_COL]).copy()
df_text[TEXT_COL] = df_text[TEXT_COL].astype(str)
print("Rows with non-missing VAR1:", df_text.shape[0])
# Final documents list for BERTopic / wordcloud
documents = df_text[TEXT_COL].tolist()
print("Number of documents:", len(documents))
print("Example doc:", documents[0][:200])


Rows with non-missing VAR1: 34
Number of documents: 34
Example doc: There's some discourse with strong opinions, but eventually yes. The fishery is unique as there's a lot of collaboration between lobstermen and policy makers. MLA works with a lot of people. Lobsterme


In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

# 1) Vectorizer
vectorizer_model = CountVectorizer(
    ngram_range=(1, 3),
    stop_words="english",
    min_df=1  # you can lower this (e.g. 2) if you have fewer responses
)

# 2) UMAP
umap_model = UMAP(
    n_neighbors=20,
    n_components=5,
    metric="cosine",
    random_state=42
)

# 3) HDBSCAN
hdbscan_model = HDBSCAN(
    min_cluster_size=3,
    min_samples=2,
    metric="euclidean",
    prediction_data=True
)

# 4) Representation model
rep_model = KeyBERTInspired()

# 5) BERTopic model
topic_model = BERTopic(
    embedding_model="all-mpnet-base-v2",  # sentence-transformers model
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    seed_topic_list=None,
    representation_model=rep_model,
    verbose=True,
)

# 6) Fit the model on your lobster text
topics, probs = topic_model.fit_transform(documents)


2025-11-17 15:07:12,466 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 2/2 [00:04<00:00,  2.48s/it]
2025-11-17 15:07:18,637 - BERTopic - Embedding - Completed ✓
2025-11-17 15:07:18,637 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-17 15:07:18,702 - BERTopic - Dimensionality - Completed ✓
2025-11-17 15:07:18,703 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-17 15:07:18,710 - BERTopic - Cluster - Completed ✓
2025-11-17 15:07:18,713 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-17 15:07:20,531 - BERTopic - Representation - Completed ✓


In [25]:
import os

# Directory where you want to save results
save_dir = r"C:\Users\John DeForest\Desktop\dWrangl\PFAS project phoebe\tmres"
os.makedirs(save_dir, exist_ok=True)

# Model path
model_path = os.path.join(save_dir, "topic_model_python")

# Save the model
topic_model.save(model_path)
print("Model saved to:", model_path)

topic_info = topic_model.get_topic_info()
print(topic_info.head())
print("Number of topics (excluding -1 outlier):", (topic_info.Topic != -1).sum())





Model saved to: C:\Users\John DeForest\Desktop\dWrangl\PFAS project phoebe\tmres\topic_model_python
   Topic  Count                                               Name  \
0     -1     12        -1_pfas_pfas different_know pfas_pfas issue   
1      0      9  0_yes fishery organizations_background controv...   
2      1      7  1_pfas dealt differently_pfas_boat races pfas_...   
3      2      6  2_collaboration lobstermen policy_collaboratio...   

                                      Representation  \
0  [pfas, pfas different, know pfas, pfas issue, ...   
1  [yes fishery organizations, background controv...   
2  [pfas dealt differently, pfas, boat races pfas...   
3  [collaboration lobstermen policy, collaboratio...   

                                 Representative_Docs  
0  [More often than not. When a fishermen dies ev...  
1  [Fishermen are fairly resourceful and pretty g...  
2  [The community comes together with different f...  
3  [100% the community comes together. In every 

In [26]:
topic_info = topic_model.get_topic_info()
print(topic_info)
import pandas as pd

pd.Series(topics).value_counts()


   Topic  Count                                               Name  \
0     -1     12        -1_pfas_pfas different_know pfas_pfas issue   
1      0      9  0_yes fishery organizations_background controv...   
2      1      7  1_pfas dealt differently_pfas_boat races pfas_...   
3      2      6  2_collaboration lobstermen policy_collaboratio...   

                                      Representation  \
0  [pfas, pfas different, know pfas, pfas issue, ...   
1  [yes fishery organizations, background controv...   
2  [pfas dealt differently, pfas, boat races pfas...   
3  [collaboration lobstermen policy, collaboratio...   

                                 Representative_Docs  
0  [More often than not. When a fishermen dies ev...  
1  [Fishermen are fairly resourceful and pretty g...  
2  [The community comes together with different f...  
3  [100% the community comes together. In every w...  


-1    12
 0     9
 1     7
 2     6
Name: count, dtype: int64

In [27]:
topic_info = topic_model.get_topic_info()
print(topic_info)
# Make sure indices line up
df_text = df_text.reset_index(drop=True)

# Attach topic assignments
df_topics_docs = df_text.copy()
df_topics_docs["Topic"] = topics

# Quick peek: a few docs per topic
for t in sorted(set(topics)):
    if t == -1:
        continue  # skip outliers
    print("\n=== Topic", t, "===")
    print(topic_model.get_topic(t))  # top words for this topic
    print(df_topics_docs.loc[df_topics_docs["Topic"] == t, TEXT_COL].head(5))



   Topic  Count                                               Name  \
0     -1     12        -1_pfas_pfas different_know pfas_pfas issue   
1      0      9  0_yes fishery organizations_background controv...   
2      1      7  1_pfas dealt differently_pfas_boat races pfas_...   
3      2      6  2_collaboration lobstermen policy_collaboratio...   

                                      Representation  \
0  [pfas, pfas different, know pfas, pfas issue, ...   
1  [yes fishery organizations, background controv...   
2  [pfas dealt differently, pfas, boat races pfas...   
3  [collaboration lobstermen policy, collaboratio...   

                                 Representative_Docs  
0  [More often than not. When a fishermen dies ev...  
1  [Fishermen are fairly resourceful and pretty g...  
2  [The community comes together with different f...  
3  [100% the community comes together. In every w...  

=== Topic 0 ===
[('yes fishery organizations', np.float32(0.5618259)), ('background controve

In [35]:
import pandas as pd
import os

# Get topic info
topic_info = topic_model.get_topic_info()
print(topic_info)

# Option A: exclude outlier topic -1 (what we did before)
valid_topic_ids = topic_info[topic_info.Topic != -1].Topic.tolist()

# Option B: include -1 as well for debugging:
# valid_topic_ids = topic_info.Topic.tolist()

topics_data = []

for topic_id in valid_topic_ids:
    topic = topic_model.get_topic(topic_id)
    if topic is None:
        continue
    for word, weight in topic:
        topics_data.append((topic_id, word, weight))

topics_df = pd.DataFrame(topics_data, columns=["Topic", "Word", "Weight"])
print(topics_df.head())
print("Number of rows in topics_df:", len(topics_df))

csv_path = os.path.join(save_dir, "topic_keywords_2.csv")
topics_df.to_csv(csv_path, index=False)
print("Topic keyword CSV saved to:", csv_path)


   Topic  Count                                               Name  \
0     -1     12        -1_pfas_pfas different_know pfas_pfas issue   
1      0      9  0_yes fishery organizations_background controv...   
2      1      7  1_pfas dealt differently_pfas_boat races pfas_...   
3      2      6  2_collaboration lobstermen policy_collaboratio...   

                                      Representation  \
0  [pfas, pfas different, know pfas, pfas issue, ...   
1  [yes fishery organizations, background controv...   
2  [pfas dealt differently, pfas, boat races pfas...   
3  [collaboration lobstermen policy, collaboratio...   

                                 Representative_Docs  
0  [More often than not. When a fishermen dies ev...  
1  [Fishermen are fairly resourceful and pretty g...  
2  [The community comes together with different f...  
3  [100% the community comes together. In every w...  
   Topic                           Word    Weight
0      0      yes fishery organizations  0.

In [29]:
from bertopic import BERTopic

model_path = os.path.join(save_dir, "topic_model_python")
topic_model = BERTopic.load(model_path)
print("Model loaded from:", model_path)


Model loaded from: C:\Users\John DeForest\Desktop\dWrangl\PFAS project phoebe\tmres\topic_model_python


In [31]:
import pandas as pd

# Get topic info and exclude the outlier topic -1
topic_info = topic_model.get_topic_info()
valid_topic_ids = topic_info[topic_info.Topic != -1].Topic.tolist()

topics_data = []

for topic_id in valid_topic_ids:
    # Each topic is a list of (word, weight) tuples
    topic = topic_model.get_topic(topic_id)
    for word, weight in topic:
        topics_data.append((topic_id, word, weight))

topics_df = pd.DataFrame(topics_data, columns=["Topic", "Word", "Weight"])
print(topics_df.head())

# Save to CSV in your tmres folder
csv_path = os.path.join(save_dir, "topic_keywords_2.csv")
topics_df.to_csv(csv_path, index=False)
print("Topic keyword CSV saved to:", csv_path)


   Topic                           Word    Weight
0      0      yes fishery organizations  0.561826
1      0  background controversial pfas  0.464222
2      0   change frustrating fishermen  0.426346
3      0                        fishery  0.402387
4      0           pfas dealt similarly  0.392740
Topic keyword CSV saved to: C:\Users\John DeForest\Desktop\dWrangl\PFAS project phoebe\tmres\topic_keywords_2.csv


In [None]:
#TODO FIX BROKEN
# Simple topic overview
topic_model.visualize_topics()


  eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(


TypeError: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.

In [33]:
# Hierarchical topic visualization
topic_model.visualize_hierarchy()


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [34]:
# Hierarchical topic structure + tree
hierarchical_topics = topic_model.hierarchical_topics(documents)
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)


100%|██████████| 2/2 [00:02<00:00,  1.17s/it]

.
├─■──pfas dealt differently_pfas_boat races pfas_unaware uneducated pfas_assistance pfas ── Topic: 1
└─yes fishery_lobstermen_fishery_pfas dealt similarly_pfas
     ├─■──yes fishery organizations_background controversial pfas_change frustrating fishermen_fishery_pfas de ── Topic: 0
     └─■──collaboration lobstermen policy_collaboration lobstermen_come fishermen rally_advocate pfas dealt_pf ── Topic: 2






In [36]:
#by age: 
import numpy as np

# Make sure Age is numeric; coerce errors to NaN
df_text["Age"] = pd.to_numeric(df_text["Age"], errors="coerce")

# Keep only rows with non-missing Age
df_age = df_text.dropna(subset=["Age"]).reset_index(drop=True)

# Documents and "time" variable (here: Age)
documents_age = df_age[TEXT_COL].astype(str).tolist()
ages = df_age["Age"].astype(float).tolist()

print(f"Number of documents with Age: {len(documents_age)}")
print(f"Example Age values: {ages[:10]}")


Number of documents with Age: 34
Example Age values: [24.0, 24.0, 28.0, 73.0, 26.0, 52.0, 27.0, 24.0, 26.0, 33.0]


In [37]:
# Use the existing trained topic_model
updated_topics, _ = topic_model.transform(documents_age)


Batches: 100%|██████████| 2/2 [00:07<00:00,  3.62s/it]
2025-11-17 15:16:00,363 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-11-17 15:16:00,366 - BERTopic - Dimensionality - Completed ✓
2025-11-17 15:16:00,369 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-11-17 15:16:00,373 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2025-11-17 15:16:00,383 - BERTopic - Probabilities - Completed ✓
2025-11-17 15:16:00,386 - BERTopic - Cluster - Completed ✓


In [38]:
topics_over_time = topic_model.topics_over_time(
    documents_age,
    ages,
    updated_topics
)

fig = topic_model.visualize_topics_over_time(
    topics_over_time,
    title="Topics Across Age"
)
fig


23it [00:27,  1.20s/it]


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [40]:
#NOW NEXT COL FOR TOPIC MODEL
# Text column for this run
TEXT_COL = "If_PFAS_was_detected_in_some_areas__but_not_in_other_areas__how_would_that_affect_the_social_dynamics_of_the_fishery_How_territorial_would_other_lobstermen_be_Do_you_think_lobstermen_would_be_open_to_allowing_lobstermen_whose_fishing_grounds_have_been"

# Drop missing values and convert to string
df_text = df.dropna(subset=[TEXT_COL]).copy()
df_text[TEXT_COL] = df_text[TEXT_COL].astype(str)

print("Rows with non-missing text:", df_text.shape[0])

# Final documents list for BERTopic
documents = df_text[TEXT_COL].tolist()
print("Number of documents:", len(documents))
print("Example doc:", documents[0][:200])
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

# 1) Vectorizer
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    min_df=1
)

# 2) UMAP (slightly more permissive)
umap_model = UMAP(
    n_neighbors=10,
    n_components=2,
    metric="cosine",
    random_state=42
)

# 3) HDBSCAN (allow small clusters)
hdbscan_model = HDBSCAN(
    min_cluster_size=3,
    min_samples=1,
    metric="euclidean",
    prediction_data=True
)

# 4) Representation model
rep_model = KeyBERTInspired()

# 5) BERTopic model
topic_model = BERTopic(
    embedding_model="all-mpnet-base-v2",
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    seed_topic_list=None,
    representation_model=rep_model,
    verbose=True,
)

# 6) Fit the model on this specific question column
topics, probs = topic_model.fit_transform(documents)


2025-11-17 15:29:49,116 - BERTopic - Embedding - Transforming documents to embeddings.


Rows with non-missing text: 34
Number of documents: 34
Example doc: Lobstermen would not want to share territory. would be difficult because you have permitting for your zone. 


Batches: 100%|██████████| 2/2 [00:04<00:00,  2.23s/it]
2025-11-17 15:29:54,846 - BERTopic - Embedding - Completed ✓
2025-11-17 15:29:54,847 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-17 15:29:54,964 - BERTopic - Dimensionality - Completed ✓
2025-11-17 15:29:54,967 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-17 15:29:54,980 - BERTopic - Cluster - Completed ✓
2025-11-17 15:29:54,989 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-17 15:29:57,401 - BERTopic - Representation - Completed ✓


In [41]:
import os

save_dir = r"C:\Users\John DeForest\Desktop\dWrangl\PFAS project phoebe\tmres"
os.makedirs(save_dir, exist_ok=True)

model_path = os.path.join(save_dir, "topic_model_python")
topic_model.save(model_path)
print("Model saved to:", model_path)

topic_info = topic_model.get_topic_info()
print(topic_info)
print("Number of topics (excluding -1 outlier):", (topic_info.Topic != -1).sum())




Model saved to: C:\Users\John DeForest\Desktop\dWrangl\PFAS project phoebe\tmres\topic_model_python
   Topic  Count                                               Name  \
0      0     13  0_lobstermen territorial_lobstermen want_idea ...   
1      1     11  1_lobstermen_areas lobsters_lobstering_fished ...   
2      2      7  2_accommodate fishermen_fish areas_declining t...   
3      3      3  3_territory fish_ocean people_fish lot_need te...   

                                      Representation  \
0  [lobstermen territorial, lobstermen want, idea...   
1  [lobstermen, areas lobsters, lobstering, fishe...   
2  [accommodate fishermen, fish areas, declining ...   
3  [territory fish, ocean people, fish lot, need ...   

                                 Representative_Docs  
0  [Lobstermen would not be open to sharing terri...  
1  [A lot of people wouldn't move traps at all ev...  
2  [If guys move gear into a new area, then that ...  
3  [In the casco Bay Area everyone gets along we

In [42]:
# Make indices line up
df_text = df_text.reset_index(drop=True)

df_topics_docs = df_text.copy()
df_topics_docs["Topic"] = topics

# Peek a few responses per topic
for t in sorted(set(topics)):
    if t == -1:
        continue
    print("\n=== Topic", t, "===")
    print(topic_model.get_topic(t))  # top words
    print(df_topics_docs.loc[df_topics_docs["Topic"] == t, TEXT_COL].head(5))



=== Topic 0 ===
[('lobstermen territorial', np.float32(0.73900753)), ('lobstermen want', np.float32(0.580724)), ('idea lobstermen', np.float32(0.5513441)), ('congested lobstermen', np.float32(0.5468248)), ('sharing territory', np.float32(0.53033584)), ('closed lobstermen', np.float32(0.47580245)), ('lobstermen', np.float32(0.4694628)), ('lobstermen open', np.float32(0.46236873)), ('establish territory', np.float32(0.39327785)), ('thinks lobsters', np.float32(0.39134493))]
0    Lobstermen would not want to share territory. ...
1    Lobstermen would not be open to sharing territ...
4    Lobstermen are very territorial, and would not...
6    Nope. Not willingly. Lobstermen are very compe...
7    cause a lot of social and physical issues. lob...
Name: If_PFAS_was_detected_in_some_areas__but_not_in_other_areas__how_would_that_affect_the_social_dynamics_of_the_fishery_How_territorial_would_other_lobstermen_be_Do_you_think_lobstermen_would_be_open_to_allowing_lobstermen_whose_fishing_grounds

In [43]:
import pandas as pd

topic_info = topic_model.get_topic_info()
print(topic_info)

# Exclude outlier topic -1
valid_topic_ids = topic_info[topic_info.Topic != -1].Topic.tolist()

topics_data = []

for topic_id in valid_topic_ids:
    topic = topic_model.get_topic(topic_id)
    if topic is None:
        continue
    for word, weight in topic:
        topics_data.append((topic_id, word, weight))

topics_df = pd.DataFrame(topics_data, columns=["Topic", "Word", "Weight"])
print(topics_df.head())
print("Number of rows in topics_df:", len(topics_df))

csv_path = os.path.join(save_dir, "topic_keywords_3.csv")
topics_df.to_csv(csv_path, index=False)
print("Topic keyword CSV saved to:", csv_path)


   Topic  Count                                               Name  \
0      0     13  0_lobstermen territorial_lobstermen want_idea ...   
1      1     11  1_lobstermen_areas lobsters_lobstering_fished ...   
2      2      7  2_accommodate fishermen_fish areas_declining t...   
3      3      3  3_territory fish_ocean people_fish lot_need te...   

                                      Representation  \
0  [lobstermen territorial, lobstermen want, idea...   
1  [lobstermen, areas lobsters, lobstering, fishe...   
2  [accommodate fishermen, fish areas, declining ...   
3  [territory fish, ocean people, fish lot, need ...   

                                 Representative_Docs  
0  [Lobstermen would not be open to sharing terri...  
1  [A lot of people wouldn't move traps at all ev...  
2  [If guys move gear into a new area, then that ...  
3  [In the casco Bay Area everyone gets along wel...  
   Topic                    Word    Weight
0      0  lobstermen territorial  0.739008
1      

In [44]:
# Simple topic overview (interactive Plotly)
topic_model.visualize_topics()


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [45]:
# Hierarchical topic structure
hierarchical_topics = topic_model.hierarchical_topics(documents)
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

# Hierarchical visualization
topic_model.visualize_hierarchy()


100%|██████████| 3/3 [00:04<00:00,  1.53s/it]

.
├─■──territory fish_ocean people_fish lot_need territory_people fish ── Topic: 3
└─lobstermen territorial_lobstermen want_idea lobstermen_lobstermen open_lobstermen
     ├─■──accommodate fishermen_fish areas_declining territory_established territory_territory ── Topic: 2
     └─lobstermen territorial_lobstermen want_idea lobstermen_lobstermen open_willingly lobstermen
          ├─■──lobstermen_areas lobsters_lobstering_fished area_fish area ── Topic: 1
          └─■──lobstermen territorial_lobstermen want_idea lobstermen_congested lobstermen_sharing territory ── Topic: 0






ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [46]:
import numpy as np

# Ensure numeric Age
df_text["Age"] = pd.to_numeric(df_text["Age"], errors="coerce")

df_age = df_text.dropna(subset=["Age"]).reset_index(drop=True)

documents_age = df_age[TEXT_COL].astype(str).tolist()
ages = df_age["Age"].astype(float).tolist()

print(f"Number of documents with Age: {len(documents_age)}")
print(f"Example Age values: {ages[:10]}")

updated_topics, _ = topic_model.transform(documents_age)

topics_over_time = topic_model.topics_over_time(
    documents_age,
    ages,
    updated_topics
)

fig = topic_model.visualize_topics_over_time(
    topics_over_time,
    title="Topics Across Age"
)
fig


Number of documents with Age: 34
Example Age values: [24.0, 24.0, 28.0, 73.0, 26.0, 52.0, 27.0, 24.0, 26.0, 33.0]


Batches: 100%|██████████| 2/2 [00:04<00:00,  2.15s/it]
2025-11-17 15:31:08,858 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-11-17 15:31:08,860 - BERTopic - Dimensionality - Completed ✓
2025-11-17 15:31:08,862 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-11-17 15:31:08,866 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2025-11-17 15:31:08,876 - BERTopic - Probabilities - Completed ✓
2025-11-17 15:31:08,878 - BERTopic - Cluster - Completed ✓
23it [00:21,  1.08it/s]


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed