In [1]:
# Cell 1: Data Loading and Cleaning (Refactored)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import random
import sqlite3

# --- NLTK resource download (if necessary) ---
try:
    nltk.data.find('corpora/stopwords')
except nltk.downloader.DownloadError:
    nltk.download('stopwords', quiet=True)
try:
    nltk.data.find('tokenizers/punkt')
except nltk.downloader.DownloadError:
    nltk.download('punkt', quiet=True)

# ==============================================================================
# --- HYPERPARAMETERS AND CONSTANTS ---
# ==============================================================================
MODEL_PATH = "aging_topics.bertopic"
DB_NAME = "fightaging_articles.db"

# --- Sampling Control ---
# True: Uses only a portion of the data for quick testing.
# False: Uses the full dataset for final analysis.
use_sample = False
SAMPLE_DAYS = 730  # 2 years. Only used if use_sample = True.

# ==============================================================================
# --- HELPER FUNCTIONS ---
# ==============================================================================
def load_data_from_db(db_name="fightaging_articles.db"):
    """Loads all articles from the SQLite database into a DataFrame."""
    try:
        conn = sqlite3.connect(db_name)
        df = pd.read_sql_query("SELECT * FROM articles", conn)
        conn.close()
        df['publish_date'] = pd.to_datetime(df['publish_date'])
        print(f"✅ Successfully loaded {len(df)} articles from '{db_name}'.")
        return df
    except Exception as e:
        print(f"❌ Could not load data from the database. Error: {e}")
        return pd.DataFrame()

def simple_clean_text(text):
    """Cleans the text: lowercase, removes punctuation/numbers, etc."""
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Only keeps letters and spaces
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if len(word) > 2]
    return " ".join(filtered_tokens)

# ==============================================================================
# --- MAIN LOADING AND PREPROCESSING LOGIC (REFACTORED) ---
# ==============================================================================

# 1. Load data if it's not already in memory
# This structure ensures `articles_df` is always defined, satisfying the linter.
if 'articles_df' not in locals() or not isinstance(articles_df, pd.DataFrame) or articles_df.empty:
    print("DataFrame not found or is empty. Loading from database...")
    articles_df = load_data_from_db(DB_NAME)
else:
    print(f"Using pre-existing 'articles_df' with {len(articles_df)} rows.")

# Initialize final variables to ensure they exist even if processing fails
documents, timestamps = [], []

# 2. Proceed with processing only if the DataFrame has data
if not articles_df.empty:
    try:
        print("\nStarting data preprocessing...")
        processed_df = articles_df.copy() # Work on a copy to preserve the original
        
        # Text and date preprocessing
        processed_df['full_text'] = processed_df['title'].astype(str) + ' ' + processed_df['body'].astype(str)
        processed_df['cleaned_text'] = processed_df['full_text'].apply(simple_clean_text)
        processed_df.dropna(subset=['cleaned_text', 'publish_date'], inplace=True)
        processed_df.sort_values(by='publish_date', inplace=True, ascending=True)

        # Data selection (Sample vs. Full)
        if use_sample:
            print(f"\n--- 🧪 SAMPLE MODE ACTIVATED ---")
            start_date = processed_df['publish_date'].min()
            end_date = start_date + pd.Timedelta(days=SAMPLE_DAYS)
            df_final = processed_df[processed_df['publish_date'] <= end_date]
            print(f"Sample date range: {df_final['publish_date'].min().date()} to {df_final['publish_date'].max().date()}")
        else:
            print(f"\n--- 🚀 FULL DATASET MODE ---")
            df_final = processed_df
            print(f"Full date range: {df_final['publish_date'].min().date()} to {df_final['publish_date'].max().date()}")

        # List preparation for the model
        documents = df_final['cleaned_text'].tolist()
        timestamps = df_final['publish_date'].tolist()

        if not documents:
            print("❌ ERROR: The list of documents is empty after filtering.")
        else:
            print(f"\n✅ Data ready for modeling: {len(documents)} documents.")

    except Exception as e:
        print(f"❌ An error occurred during preprocessing: {e}")
else:
    print("DataFrame is empty. Cannot continue with processing.")

  from .autonotebook import tqdm as notebook_tqdm


DataFrame not found or is empty. Loading from database...
✅ Successfully loaded 18753 articles from 'fightaging_articles.db'.

Starting data preprocessing...

--- 🚀 FULL DATASET MODE ---
Full date range: 2002-11-01 to 2025-09-19

✅ Data ready for modeling: 18753 documents.


In [2]:
# Cell 2: Topic Training and Assignment

from sentence_transformers import SentenceTransformer
import os # Added for os.path.exists()
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired


# Only execute if Cell 1 loaded data correctly
if 'documents' in locals() and documents:
    # --- Confirmation of data size ---
    print(f"\n▶️ Starting topic modeling process on {len(documents)} documents.")

    # --- a. Configuration and Paths ---
    if use_sample:
        MODEL_PATH = "aging_topics_sample.bertopic"
        print(f"--- 🧪 SAMPLE MODE: Model will be saved to '{MODEL_PATH}' ---")
    else:
        MODEL_PATH = "aging_topics_full.bertopic"
        print(f"--- 🚀 FULL MODE: Model will be saved to '{MODEL_PATH}' ---")

    # --- b. Pre-calculation of Embeddings ---
    print("\nGenerating document embeddings (this may take a while)...")
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedding_model.encode(documents, show_progress_bar=True)
    print(f"✅ Embeddings generated with shape: {embeddings.shape}")

    # --- c. Load or Train the Model ---
    if os.path.exists(MODEL_PATH):
        print(f"\nLoading existing model from '{MODEL_PATH}'...")
        topic_model = BERTopic.load(MODEL_PATH)
        print("✅ Model loaded successfully.")
    else:
        print("\nModel not found. Training a new BERTopic model...")
        vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")
        keybert_representation = KeyBERTInspired()
        representation_model = {"KeyBERT_Phrases": keybert_representation}
        
        topic_model = BERTopic(
            embedding_model=embedding_model,
            language="english",
            verbose=True,
            vectorizer_model=vectorizer_model,
            representation_model=representation_model,
        )
        topic_model.fit(documents, embeddings) # Use fit() since we transform later
        topic_model.save(MODEL_PATH)
        print(f"\n✅ New model trained and saved to '{MODEL_PATH}'!")

    # --- d. Topic Assignment ---
    print("\nAssigning topics to documents...")
    topics, _ = topic_model.transform(documents, embeddings)
    print("✅ Topics assigned successfully.")

else:
    print("❌ No data to train the model. Run Cell 1 first.")


▶️ Starting topic modeling process on 18753 documents.
--- 🚀 FULL MODE: Model will be saved to 'aging_topics_full.bertopic' ---

Generating document embeddings (this may take a while)...


Batches: 100%|██████████| 587/587 [00:37<00:00, 15.57it/s]


✅ Embeddings generated with shape: (18753, 384)

Loading existing model from 'aging_topics_full.bertopic'...
✅ Model loaded successfully.

Assigning topics to documents...
✅ Topics assigned successfully.


In [3]:
# Cell 3: Synchronization and Loading/Calculation of Results

import numpy as np
import pandas as pd
import os

if 'topic_model' in locals() and 'documents' in locals() and 'embeddings' in locals():
    # --- a. Define File Paths ---
    RESULTS_DIR = "inference_results"
    os.makedirs(RESULTS_DIR, exist_ok=True) # Creates the directory if it doesn't exist

    # Dynamic paths based on whether we are using the sample or the full dataset
    if use_sample:
        topics_path = os.path.join(RESULTS_DIR, "topics_sample.npy")
        time_path = os.path.join(RESULTS_DIR, "topics_over_time_sample.csv")
    else:
        topics_path = os.path.join(RESULTS_DIR, "topics_full.npy")
        time_path = os.path.join(RESULTS_DIR, "topics_over_time_full.csv")

    # --- b. Loading or Calculation Logic ---
    # Check if the results files already exist
    if os.path.exists(topics_path) and os.path.exists(time_path):
        # --- FAST TRACK: Load from disk ---
        # ✅ Loading pre-calculated results from '{RESULTS_DIR}'...
        print(f"✅ Loading pre-calculated results from '{RESULTS_DIR}'...")
        
        topics = np.load(topics_path)
        topics_over_time = pd.read_csv(time_path, parse_dates=['Timestamp'])
        
        # Results loaded successfully!
        print("Results loaded successfully!")

    else:
        # --- SLOW TRACK: Calculate and then Save ---
        # ❌ Pre-calculated results not found. Calculating now (this may take a while)...
        print("❌ Pre-calculated results not found. Calculating now (this may take a while)...")
        
        # 1. Synchronize topics (uses the GPU)
        # Assigning topics to documents...
        print("Assigning topics to documents...")
        topics, _ = topic_model.transform(documents, embeddings)
        
        # 2. Calculate temporal evolution
        # Calculating topic evolution over time...
        print("Calculating topic evolution over time...")
        topics_over_time = topic_model.topics_over_time(
            docs=documents, 
            timestamps=timestamps, 
            topics=topics,
            nr_bins=20
        )
        # ✅ Calculations completed.
        print("✅ Calculations completed.")
        
        # 3. Save the new results for next time
        # Saving new results to '{RESULTS_DIR}'...
        print(f"Saving new results to '{RESULTS_DIR}'...")
        np.save(topics_path, topics)
        topics_over_time.to_csv(time_path, index=False)
        # ✅ Results saved for future sessions!
        print("✅ Results saved for future sessions!")

    # At the end of this cell, 'topics' and 'topics_over_time' will always be available
    # \n--- Results ready for analysis and visualization. ---
    print("\n--- Results ready for analysis and visualization. ---")

else:
    # ❌ The model, documents, or embeddings are not available. Run previous cells first.
    print("❌ The model, documents, or embeddings are not available. Run previous cells first.")

✅ Loading pre-calculated results from 'inference_results'...
Results loaded successfully!

--- Results ready for analysis and visualization. ---


In [4]:
# Cell 4: Topic Frequency Chart

if 'topic_model' in locals():
    # --- b. Bar Chart of the Main Topics ---
    # --- Topic Frequency Chart ---
    print("\n--- Topic Frequency Chart ---")
    # Displays the 15 most frequent topics (excluding the -1 "outliers" topic)
    display(topic_model.visualize_barchart(top_n_topics=30))
else:
    # ❌ The model is not available. Run previous cells first.
    print("❌ The model is not available. Run previous cells first.")


--- Topic Frequency Chart ---


In [5]:
# Cell 5: Temporal Evolution Graph (with Caching)

import pandas as pd
import os

if 'topic_model' in locals() and 'topics' in locals():
    # --- a. Define file paths for caching ---
    RESULTS_DIR = "inference_results"
    os.makedirs(RESULTS_DIR, exist_ok=True) # Create directory if it doesn't exist

    # Dynamic path based on whether we are using the sample or full dataset
    if use_sample:
        time_path = os.path.join(RESULTS_DIR, "topics_over_time_sample.csv")
    else:
        time_path = os.path.join(RESULTS_DIR, "topics_over_time_full.csv")

    # --- b. Load pre-calculated results or calculate them if they don't exist ---
    if os.path.exists(time_path):
        # --- FAST PATH: Load from disk ---
        print(f"✅ Loading pre-calculated temporal evolution data from:\n{time_path}")
        topics_over_time = pd.read_csv(time_path, parse_dates=['Timestamp'])
    else:
        # --- SLOW PATH: Calculate and then save ---
        print("Calculating topics over time... (This can be very slow, but only runs once)")
        topics_over_time = topic_model.topics_over_time(
            docs=documents, 
            timestamps=timestamps, 
            topics=topics,
            nr_bins=20
        )
        
        # Save the results for next time
        try:
            topics_over_time.to_csv(time_path, index=False)
            print(f"✅ Results saved to '{time_path}' for future sessions.")
        except Exception as e:
            print(f"❌ Error saving results: {e}")

    # --- c. Display the visualization ---
    print("\n--- Temporal Evolution Graph ---")
    display(topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10))

else:
    print("❌ The model or 'topics' variable is not available. Please run the previous cells first.")

✅ Loading pre-calculated temporal evolution data from:
inference_results/topics_over_time_full.csv

--- Temporal Evolution Graph ---


In [6]:
# Cell 6: Saving the Temporal Analysis Result

# Ensure that the variable 'topics_over_time' exists after the lengthy calculation
if 'topics_over_time' in locals() and not topics_over_time.empty:
    
    # --- Define the saving path ---
    # We use the same logic as before to have separate files for the sample and the full dataset
    RESULTS_DIR = "inference_results"
    os.makedirs(RESULTS_DIR, exist_ok=True)

    if use_sample:
        time_path = os.path.join(RESULTS_DIR, "topics_over_time_sample.csv")
    else:
        time_path = os.path.join(RESULTS_DIR, "topics_over_time_full.csv")

    # --- Save the DataFrame to a CSV file ---
    try:
        topics_over_time.to_csv(time_path, index=False)
        # ✅ Success! The result of the temporal analysis has been saved to:
        print(f"✅ Success! The result of the temporal analysis has been saved to:")
        print(time_path)
    except Exception as e:
        # ❌ An error occurred while trying to save the file: {e}
        print(f"❌ An error occurred while trying to save the file: {e}")

else:
    # ❌ The variable 'topics_over_time' was not found or is empty. Nothing was saved.
    print("❌ The variable 'topics_over_time' was not found or is empty. Nothing was saved.")

✅ Success! The result of the temporal analysis has been saved to:
inference_results/topics_over_time_full.csv


In [7]:
# Cell 7: Detailed Analysis of a Specific Topic (Corrected)

if 'topic_model' in locals() and 'documents' in locals():
    # --- a. Detailed Analysis of a Specific Topic by ID ---
    topic_info_df = topic_model.get_topic_info()
    
    if len(topic_info_df) > 1:
        # --- INTERACTIVE PARAMETER ---
        # Change this ID to explore any topic that interests you.
        # By default, it inspects the most frequent topic.
        topic_id_to_inspect = topic_info_df.iloc[1]['Topic']
        # -----------------------------
        
        # \n--- 🔎 Detailed Analysis of Topic ID: {topic_id_to_inspect} ---
        print(f"\n--- 🔎 Detailed Analysis of Topic ID: {topic_id_to_inspect} ---")
        topic_name = topic_info_df.loc[topic_info_df['Topic'] == topic_id_to_inspect, 'Name'].iloc[0]
        # Name: {topic_name}
        print(f"Name: {topic_name}")
        
        # \nKeywords:
        print("\nKeywords:")
        print([word for word, score in topic_model.get_topic(topic_id_to_inspect)])
        
        # \nMost representative documents for this topic:
        print("\nMost representative documents for this topic:")
        
        # ========================== START OF CORRECTION ==========================
        # 1. We get the INDICES of the representative documents.
        representative_doc_indices = topic_model.get_representative_docs(topic_id_to_inspect)
        
        # 2. We use the indices to look up the TEXT in the original 'documents' list.
        for doc_index in representative_doc_indices:
            # We get the full text of the document
            doc_text = documents[doc_index]
            # Now we can slice the text (string)
            print(f"- {doc_text[:150]}...")
        # =========================== END OF CORRECTION ===========================

    else:
        # \n[WARNING]: No topics found to analyze (only outliers).
        print("\n[WARNING]: No topics found to analyze (only outliers).")

else:
    # ❌ The model or the 'documents' variable are not available. Run previous cells first.
    print("❌ The model or the 'documents' variable are not available. Run previous cells first.")


--- 🔎 Detailed Analysis of Topic ID: 0 ---
Name: 0_mitochondrial_mitochondria_mtdna_mitochondrial dna

Keywords:
['mitochondrial', 'mitochondria', 'mtdna', 'mitochondrial dna', 'dna', 'oxidative', 'mitophagy', 'ros', 'damage', 'mitochondrial function']

Most representative documents for this topic:
- senescent cells accumulate iron while resisting the consequent ferroptosis ferroptosis form programmed cell death driven iron accumulation and involvi...
- cryonics cryonics the only present option offering chance much longer life the future that open older and seriously ill people the many individuals wh...
- calorie restriction explained calorie restriction caloric restriction usually abbreviated strategy proven extend healthy average and maximum life span...
- what antiaging antiaging can difficult topic address war currently fought over the meaning the term research and medicine and brand for products energ...
- the importance activism activism very important persistent and vocal advo

In [8]:
# Cell 8: Data Preparation for N-Gram Analysis (Corrected)
import pandas as pd
import matplotlib.pyplot as plt

# Ensure the necessary variables from previous cells exist
if 'topic_model' in locals() and 'topics_over_time' in locals() and not topics_over_time.empty:

    # ==============================================================================
    # --- 1. Prepare the Base Dataframe 📊 ---
    # ==============================================================================
    # --- Preparing base dataframe for N-gram analysis ---
    print("--- Preparing base dataframe for N-gram analysis ---")
    
    df_ngrams = topics_over_time.copy()
    df_ngrams['Year'] = df_ngrams['Timestamp'].dt.year

    def filter_ngrams(words_string):
        """Filters a word string to return only bigrams and trigrams."""
        if not isinstance(words_string, str):
            return []
        word_list = words_string.split(', ')
        ngrams = [word.strip() for word in word_list if ' ' in word.strip()]
        return ngrams

    # Apply function to extract n-grams
    df_ngrams['N-Grams_List'] = df_ngrams['Words'].apply(filter_ngrams)
    
    # Enrich the table with topic names from the main model
    topic_info = topic_model.get_topic_info()[['Topic', 'Name']]
    
    # Perform the merge, which can create 'Name_x' and 'Name_y'
    df_ngrams = pd.merge(df_ngrams, topic_info, on='Topic')
    
    # --- FIX IS HERE ---
    # Check for the 'Name_x' column created by the merge and rename it back to 'Name'
    if 'Name_x' in df_ngrams.columns:
        # Detected 'Name_x' column from merge. Standardizing to 'Name'.
        print("Detected 'Name_x' column from merge. Standardizing to 'Name'.")
        # Drop the redundant 'Name_y' column if it exists
        if 'Name_y' in df_ngrams.columns:
            df_ngrams.drop(columns=['Name_y'], inplace=True)
        # Rename 'Name_x' to the consistent 'Name'
        df_ngrams.rename(columns={'Name_x': 'Name'}, inplace=True)
    
    # Filter out rows not useful for visualization
    df_ngrams_filtered = df_ngrams[(df_ngrams['N-Grams_List'].astype(bool)) & (df_ngrams['Topic'] != -1)]
    
    # Sort values for processing
    df_ngrams_prepared = df_ngrams_filtered.sort_values(by=['Year', 'Frequency'], ascending=[True, False])

    # ✅ Data preparation complete. You can now re-run Cell 9.
    print("✅ Data preparation complete. You can now re-run Cell 9.")
    # A useful debug print to confirm the columns are correct
    # Columns in the prepared dataframe: ['Timestamp', 'Frequency', 'Words', 'Topic', 'Year', 'N-Grams_List', 'Name']
    print("\nColumns in the prepared dataframe:", df_ngrams_prepared.columns.tolist())

else:
    # The model or the 'topics_over_time' variable is not available. Please run the previous cells first.
    print("The model or the 'topics_over_time' variable is not available. Please run the previous cells first.")

--- Preparing base dataframe for N-gram analysis ---
Detected 'Name_x' column from merge. Standardizing to 'Name'.
✅ Data preparation complete. You can now re-run Cell 9.

Columns in the prepared dataframe: ['Topic', 'Words', 'Frequency', 'Timestamp', 'Name', 'Year', 'N-Grams_List']


In [9]:
# Cell 9: Display Top Topics Per Year (with Full Output)

# Ensure the prepared dataframe from the previous cell exists
if 'df_ngrams_prepared' in locals():
    # ==============================================================================
    # --- 2. Display Top Topics Per Year Summary Table 📋 ---
    # ==============================================================================
    print("--- Summary Table: Top Topics and their N-Grams per Year ---")
    
    # Group by year and take the top entries for each year
    top_15_summary_table = df_ngrams_prepared.groupby('Year').head(15)
    display(top_15_summary_table[['Year', 'Topic', 'Name', 'N-Grams_List', 'Frequency']])

    # # --- Temporarily set pandas display options to show all rows ---
    # with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    #     # Display the resulting summary table, now including Topic ID and the N-Grams
    #     display(top_15_summary_table[['Year', 'Topic', 'Name', 'N-Grams_List', 'Frequency']])

else:
    print("The prepared dataframe ('df_ngrams_prepared') is not available. Please run the previous cell first.")

--- Summary Table: Top Topics and their N-Grams per Year ---


Unnamed: 0,Year,Topic,Name,N-Grams_List,Frequency
30,2002,51,51_cloning_ban_therapeutic cloning_thera...,"[theraputic cloning, therapeutic cloning]",32
5,2002,5,5_stem_stem cells_cells_embryonic,"[stem cells, stem cell, adult stem]",22
10,2002,13,13_stem cell research_cell research_stem...,"[stem cell research, cell research, stem cell]",21
29,2002,50,50_calorie_calorie restriction_restricti...,[calorie restriction],21
34,2002,60,60_council_bioethics_kass_leon kass,"[council bioethics, presidents council]",17
...,...,...,...,...,...
3195,2024,33,33_reprogramming_partial_partial reprogr...,"[partial reprogramming, yamanaka factors]",9
3184,2024,19,19_immune_immunosenescence_cells_immunit...,"[immune resilience, immune aging]",8
3309,2024,229,229_epigenetic_physical_clocks_fitness,[physical activity],8
3177,2024,10,10_exercise_brain_physical_cognitive,[physical activity],7


In [10]:
# Cell 10: Generating Readable Topic Names (Directly from the Table)

import pandas as pd
from transformers import pipeline
from tqdm.auto import tqdm
import re

# Initialize tqdm for pandas operations (.progress_apply)
tqdm.pandas(desc="Interpreting Names")

# Ensure the summary table from previous cells exists
if 'top_15_summary_table' in locals():
    
    # 1. Load the local summarization model (Flan-T5)
    try:
        topic_namer = pipeline("summarization", model="google/flan-t5-small", device=-1)
        print("✅ Flan-T5 model loaded successfully.")
    except Exception as e:
        print(f"❌ Error loading the language model: {e}")
        topic_namer = None

    if topic_namer:
        # 2. Define a function to "translate" the 'Name' column string
        def interpret_name_from_string(name_string):
            """Cleans the 'Name' column string and generates a descriptive title."""
            if not isinstance(name_string, str):
                return "Invalid Name"
            
            # Clean the string: remove leading number, replace underscores
            # e.g., "5_stem_cells_cells_embryonic" -> "stem cells cells embryonic"
            keywords = re.sub(r'^\d+_', '', name_string).replace('_', ' ')
            
            # Create a clear prompt for the model
            prompt = f"Generate a short, 3-5 word descriptive title for a category defined by these keywords: {keywords}"
            
            # Generate the name
            result = topic_namer(prompt, max_length=15, min_length=3, do_sample=False)
            return result[0]['summary_text'].strip()

        # 3. Apply the function to the 'Name' column to create the new column
        print("\nGenerating interpreted names directly from the 'Name' column...")
        top_15_summary_table['Interpreted_Name'] = top_15_summary_table['Name'].progress_apply(interpret_name_from_string)
        print("✅ New 'Interpreted_Name' column added successfully!")

else:
    print("❌ The 'top_15_summary_table' DataFrame is not available. Run previous cells first.")

Device set to use cpu


✅ Flan-T5 model loaded successfully.

Generating interpreted names directly from the 'Name' column...


Interpreting Names:   0%|          | 0/300 [00:00<?, ?it/s]Both `max_new_tokens` (=256) and `max_length`(=15) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Interpreting Names:   1%|          | 2/300 [00:00<00:43,  6.78it/s]Both `max_new_tokens` (=256) and `max_length`(=15) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Interpreting Names:   1%|          | 3/300 [00:00<00:46,  6.41it/s]Both `max_new_tokens` (=256) and `max_length`(=15) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Interpreting Names:   1%|▏         | 4/300 [00:00<00:46,  6.

✅ New 'Interpreted_Name' column added successfully!





In [11]:
# 4. Display the final, synchronized result
print("\n--- Summary Table with Interpreted Names ---")
# display(top_15_summary_table[[
#     'Year', 
#     'Topic', 
#     'Name',
#     'Interpreted_Name', 
#     'N-Grams_List', 
#     'Frequency'
# ]])

# # --- Temporarily set pandas display options to show all rows ---
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    # Display the resulting summary table, now including Topic ID and the N-Grams
    display(top_15_summary_table)



--- Summary Table with Interpreted Names ---


Unnamed: 0,Topic,Words,Frequency,Timestamp,Name,Year,N-Grams_List,Interpreted_Name
30,51,"cloning, theraputic cloning, theraputic, ban, ...",32,2002-10-23 15:24:28.800,51_cloning_ban_therapeutic cloning_thera...,2002,"[theraputic cloning, therapeutic cloning]",Thera cloning ban
5,5,"stem, stem cells, stem cell, adult, adult stem",22,2002-10-23 15:24:28.800,5_stem_stem cells_cells_embryonic,2002,"[stem cells, stem cell, adult stem]",embryonic stem cell cell
10,13,"stem cell research, cell research, stem cell, ...",21,2002-10-23 15:24:28.800,13_stem cell research_cell research_stem...,2002,"[stem cell research, cell research, stem cell]",Stem cell research stem
29,50,"calorie, calorie restriction, restriction, die...",21,2002-10-23 15:24:28.800,50_calorie_calorie restriction_restricti...,2002,[calorie restriction],calorie restriction restricti - wikipedia
34,60,"council, kass, bioethics, council bioethics, p...",17,2002-10-23 15:24:28.800,60_council_bioethics_kass_leon kass,2002,"[council bioethics, presidents council]",council bioethics kass leon
21,32,"antiaging, antiaging medicine, medicine, marke...",16,2002-10-23 15:24:28.800,32_antiaging_marketplace_antiaging medic...,2002,[antiaging medicine],antiaging marketplace medics
71,153,"european, ban, stem cell research, cell resear...",12,2002-10-23 15:24:28.800,153_stem cell research_cell research_ban...,2002,"[stem cell research, cell research, european p...",ban on stem cell research
8,11,"heart, stem, stem cells, stem cell, heart damage",10,2002-10-23 15:24:28.800,11_heart_stem_stem cells_cells,2002,"[stem cells, stem cell, heart damage]",heart stem stem cells cells
25,41,"alzheimers, alzheimers research, article, rese...",10,2002-10-23 15:24:28.800,41_alzheimers_alzheimers disease_brain_d...,2002,[alzheimers research],alzheimers disease brain d
40,67,"life extension, extension, healthy life extens...",10,2002-10-23 15:24:28.800,67_life extension_extension_life_healthy...,2002,"[life extension, healthy life extension, healt...",Life Extension Extension Life Healthy


In [12]:
# Cell 10: Faceted Graph for Maximum Clarity

import plotly.express as px
import pandas as pd

if 'df_ngrams_prepared' in locals():
    print(f"\n--- Generating Faceted Plot: Top 5 Topics per Year ---")

    # 1. Prepare data
    top_topics_per_year_5 = df_ngrams_prepared.groupby('Year').head(5).reset_index(drop=True)
    
    # IMPORTANT: Ensure 'Year' is treated as a category for proper sorting and display
    top_topics_per_year_5['Year'] = top_topics_per_year_5['Year'].astype(str)

    # 2. Create the Faceted Plotly Express Bar Chart
    fig = px.bar(
        top_topics_per_year_5,
        x='Name',           # Put the Topic Name on the x-axis now
        y='Frequency',
        color='Name',       # Still color by name for consistency
        hover_data=['Topic', 'N-Grams_List', 'Frequency'],
        
        # ==============================================================================
        facet_col='Year',         # Create a new column of plots for each year
        facet_col_wrap=4,         # Wrap to a new row after every 4 years
        # ==============================================================================
        
        title='Top 5 Most Frequent Topics per Year',
        labels={'Frequency': 'Frequency', 'Name': 'Topic Name'},
    )
    
    # 3. Customize layout
    # Remove x-axis labels since they are redundant with the color legend
    fig.update_xaxes(showticklabels=False) 
    fig.update_layout(
        height=800, # May need a taller figure to accommodate the rows
        width=1400,
        title_x=0.5,
        legend_title_text='Topic Name'
    )
    
    # Display the interactive plot
    fig.show()
    
    print("✅ Faceted graph generated. Each year has its own subplot for clarity.")
        
else:
    print("The prepared dataframe ('df_ngrams_prepared') is not available. Please run the previous cells first.")


--- Generating Faceted Plot: Top 5 Topics per Year ---


✅ Faceted graph generated. Each year has its own subplot for clarity.


In [13]:
# # Celda 8: Reducción y Refinamiento de Tópicos
# #
# # EJECUTA ESTA CELDA SÓLO SI DESPUÉS DEL ANÁLISIS INICIAL CONSIDERAS
# # QUE EL NÚMERO DE TÓPICOS ES DEMASIADO ALTO Y QUIERES UNA VERSIÓN MÁS RESUMIDA.
# #
# # NOTA: Esta operación modifica el objeto 'topic_model'. Si quieres volver al
# # modelo original, simplemente vuelve a ejecutar la "Celda 2".

# if 'topic_model' in locals():
#     # --- Parámetro de ajuste ---
#     # Define a cuántos tópicos quieres reducir el modelo.
#     desired_topics = 30 
    
#     print(f"--- 🔬 Herramienta de Refinamiento ---")
#     print(f"Reduciendo el número de tópicos a {desired_topics}...")
    
#     # --- Lógica de Reducción ---
#     # Este comando es muy rápido. No re-entrena, solo fusiona los tópicos existentes.
#     topic_model.reduce_topics(documents, nr_topics=desired_topics)
    
#     print("\n✅ ¡Reducción completada!")
#     print("Para ver los resultados, vuelve a ejecutar las celdas de análisis y visualización (Celda 4 y 5).")
    
#     # Muestra un resumen rápido de los nuevos tópicos
#     topic_info_df = topic_model.get_topic_info()
#     display(topic_info_df.head(10))

# else:
#     print("❌ El modelo no ha sido cargado o entrenado. Ejecuta la Celda 2 primero.")