# 03 - Topic Modeling Training

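This notebook trains LDA, LSA and pLSA topic models on the preprocessed data for a range of topic counts, compares them by coherence score, and writes a Markdown report for the best-scoring models.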

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
from gensim import corpora, models
from gensim.models import CoherenceModel, LdaModel, TfidfModel
from sklearn.decomposition import TruncatedSVD, NMF
from gensim.matutils import corpus2csc
from concurrent.futures import ProcessPoolExecutor, as_completed
import joblib
import os
import warnings
import ast
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib
from joblib import Parallel, delayed
from plotly import express as px
from pprint import pformat

from src.topic_model_utils import evaluate_topic_models

In [None]:
# Suppress warnings (only the UserWarning category is silenced).
warnings.filterwarnings("ignore", category=UserWarning)
# NOTE: the former `nltk.download('stopwords')` call was removed from this cell.
# `nltk` is never imported in this notebook, so the call raised NameError on a
# fresh kernel and stopped the directories below from being created. The data
# loaded here is already preprocessed and no stopword list is used afterwards.

# Create necessary directories (no error if they already exist)
os.makedirs('../models', exist_ok=True)
os.makedirs('../data/processed', exist_ok=True)


# Load and preprocess data
def load_and_preprocess_data(path='../data/processed/data_processed.json'):
    """Load the already-preprocessed documents from a JSON Lines file.

    Args:
        path: Location of the JSON Lines file (one JSON record per line).
            Defaults to the project's processed dataset, so existing
            zero-argument calls keep working.

    Returns:
        pandas.DataFrame with one row per record in the file.
    """
    print("Loading and preprocessing data...")
    df = pd.read_json(path, lines=True)
    # No additional preprocessing is applied here; the frame is returned as read.
    return df


## 2. Topic Modeling Training

In [None]:
def evaluate_models_parallel(num_topics_range, corpus, corpus_tfidf_sparse, dictionary, processed_texts, save_dir=None, save_models=False):
    """Run ``evaluate_topic_models`` for every topic count using joblib workers.

    Each topic count in ``num_topics_range`` becomes one job; jobs are spread
    over all available cores (``n_jobs=-1``) with a preference for processes,
    and a tqdm bar tracks how many have finished.

    Args:
        num_topics_range: Sized iterable of topic counts (``len()`` is used
            for the progress-bar total).
        corpus, corpus_tfidf_sparse, dictionary, processed_texts, save_dir,
            save_models: Forwarded unchanged to ``evaluate_topic_models``.

    Returns:
        DataFrame with columns ``num_topics`` and ``results``, ordered by
        ``num_topics`` with a fresh 0..n-1 index.
    """
    jobs = (
        delayed(evaluate_topic_models)(
            k, corpus, corpus_tfidf_sparse, dictionary, processed_texts, save_dir, save_models
        )
        for k in num_topics_range
    )
    progress_bar = tqdm(total=len(num_topics_range), desc="Evaluating models")
    with tqdm_joblib(progress_bar):
        worker_outputs = Parallel(n_jobs=-1, prefer="processes")(jobs)

    table = pd.DataFrame(worker_outputs, columns=["num_topics", "results"])
    table = table.sort_values(by="num_topics")
    return table.reset_index(drop=True)


def evaluate_models_serial(num_topics_range, corpus, corpus_tfidf_sparse, dictionary, processed_texts, save_dir=None, save_models=False):
    """Run ``evaluate_topic_models`` for every topic count, one after another.

    Args:
        num_topics_range: Iterable of topic counts to evaluate.
        corpus, corpus_tfidf_sparse, dictionary, processed_texts, save_dir,
            save_models: Forwarded unchanged to ``evaluate_topic_models``.

    Returns:
        DataFrame with columns ``num_topics`` and ``results``, ordered by
        ``num_topics`` with a fresh 0..n-1 index.
    """
    progress = tqdm(num_topics_range, desc="Evaluating models")
    rows = [
        evaluate_topic_models(k, corpus, corpus_tfidf_sparse, dictionary, processed_texts, save_dir, save_models)
        for k in progress
    ]
    table = pd.DataFrame(rows, columns=["num_topics", "results"])
    return table.sort_values(by="num_topics").reset_index(drop=True)

In [None]:
# Load the preprocessed documents. The loader was defined above but never
# called, so `df` did not exist on a fresh kernel (Restart & Run All failed here).
df = load_and_preprocess_data()

# Creating dictionary and corpus: token <-> id mapping, then one bag-of-words
# vector per document.
dictionary = corpora.Dictionary(df["processed_text"])
corpus = [dictionary.doc2bow(text) for text in df["processed_text"]]

# Transforming to TF-IDF for LSA and pLSA
tfidf_model = TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]

In [None]:
# Save all necessary components.
# The dictionary and TF-IDF model built in the previous cell are persisted
# as-is. Rebuilding them here (as this cell used to) repeated the same work
# and meant the saved TF-IDF model was a different object from the one that
# produced `corpus_tfidf`.
dictionary.save("../data/processed/dictionary.dict")
joblib.dump(dictionary, "../data/processed/dictionary.joblib")

# Save the TF-IDF model
joblib.dump(tfidf_model, "../data/processed/tfidf_model.pkl")

In [None]:
# Converting corpus_tfidf to sparse matrix.
# corpus2csc yields terms x documents; the transpose gives documents x terms.
corpus_tfidf_sparse = corpus2csc(corpus_tfidf).T

# Defining topic range: 3 to 15 topics inclusive
num_topics_range = range(3, 16)

# Evaluating models serially.
# The analysis cells further down load per-topic-count models from disk, so the
# models must be saved during this run; previously save_models kept its default
# of False and nothing was requested to be written.
# NOTE(review): assumes evaluate_topic_models writes `<kind>_model_<k>.pkl`
# files into `save_dir` when `save_models` is True - confirm against
# src/topic_model_utils.py.
coherence_df = evaluate_models_serial(
    num_topics_range,
    corpus,
    corpus_tfidf_sparse,
    dictionary,
    df["processed_text"],
    save_dir="../models",
    save_models=True,
)

# Printing results
print(coherence_df)

In [None]:
# Save coherence results as a CSV file (row index omitted).
# The 'results' column is written as text; presumably it holds dicts at this
# point, since it is expanded with json_normalize further down.
coherence_df.to_csv("../data/processed/coherence_results.csv", index=False)

def safe_literal_eval(val):
    """Parse ``val`` with ``ast.literal_eval`` when it is a string.

    Anything that is not a ``str`` is handed back untouched, so the function
    can be applied to a column that mixes parsed and unparsed values.
    """
    return ast.literal_eval(val) if isinstance(val, str) else val

# Parse any 'results' entries that are strings back into Python objects;
# entries that are not strings are left exactly as they are.
coherence_df['results'] = coherence_df['results'].apply(safe_literal_eval)


## 3. Analysis of Topic Models and Coherence Value

In [None]:
# Expand the dict into separate columns.
# Guarded so that re-running this cell after 'results' has already been
# expanded (and dropped) does not raise a KeyError.
if 'results' in coherence_df.columns:
    results_expanded = pd.json_normalize(coherence_df['results'])
    # Column-wise concat aligns on the index, which is a plain 0..n-1 range on both sides.
    coherence_df = pd.concat([coherence_df.drop(columns=['results']), results_expanded], axis=1)

print(coherence_df)
print(coherence_df.iloc[0])
print(coherence_df.iloc[0]['coherence_lda'])

In [None]:
# --- Reshape to long format: one row per (num_topics, model) pair ---
coherence_df_long = pd.melt(
    coherence_df,
    id_vars=["num_topics"],
    value_vars=["coherence_lda", "coherence_lsa", "coherence_plsa"],
    var_name="model",
    value_name="coherence"
)

# Knee plot: one line per model; the layout is applied in the same chained
# expression (update_layout returns the figure it was called on).
fig = px.line(
    coherence_df_long,
    x="num_topics",
    y="coherence",
    color="model",
    markers=True,
    title="Coherence Value vs Number of Topics",
    labels={"num_topics": "Number of Topics", "coherence": "Coherence Score", "model": "Model"}
).update_layout(
    xaxis_title="Number of Topics",
    yaxis_title="Coherence Score",
    yaxis=dict(range=[0, 1]),  # fixed axis so every model shares the same scale
    legend_title="Model",
    template="plotly_white"
)
fig.show()


#### **Analysis of Topic Model Coherence Trends**  

- **LDA**: Peaks at **4 topics (0.5169)**, then declines. More topics lead to **topic fragmentation** and reduced coherence.  
- **LSA**: Peaks at **3 topics (0.6477)**, then declines. Struggles with **dimensionality reduction loss** at higher topic counts.  
- **pLSA**: Steady increase from **2 topics (0.5094) to 15 topics (0.7071)**. Performs better with **more topics**, leveraging **soft topic assignments**.  
- **Comparison**:
  - **LDA & LSA**: Best with **fewer topics**, decline as topics increase.  
  - **pLSA**: **Improves with more topics**, capturing complex structures better.  
- **Conclusion**: pLSA outperforms LDA and LSA at higher topic counts, while LDA/LSA are more effective at lower topic numbers.

In [None]:
# --- Per-row mean and max-min spread across the three coherence scores ---
score_columns = ['coherence_lda', 'coherence_lsa', 'coherence_plsa']
model_scores = coherence_df[score_columns]
coherence_df['average_coherence'] = model_scores.mean(axis=1)
coherence_df['spread'] = model_scores.max(axis=1) - model_scores.min(axis=1)

# --- Select Best Models ---
# Row with the highest coherence for each individual technique
best_lda, best_lsa, best_plsa = (
    coherence_df.loc[coherence_df[column].idxmax()] for column in score_columns
)

# Row whose three models have the highest mean coherence
best_group = coherence_df.loc[coherence_df['average_coherence'].idxmax()]

# Row whose three models score most alike (smallest spread)
best_group_spread = coherence_df.loc[coherence_df['spread'].idxmin()]

# --- Create Summary DataFrames ---
best_individual_df = pd.DataFrame({
    'Technique': ['LDA', 'LSA', 'pLSA'],
    'Best_Num_Topics': [row['num_topics'] for row in (best_lda, best_lsa, best_plsa)],
    'Best_Coherence': [
        row[column]
        for row, column in zip((best_lda, best_lsa, best_plsa), score_columns)
    ]
})


def _group_summary(strategy, row, metric_label, metric_column):
    """Build the one-row summary frame for a selected topic count."""
    return pd.DataFrame({
        'Strategy': [strategy],
        'Num_Topics': [row['num_topics']],
        metric_label: [row[metric_column]],
        'LDA_Coherence': [row['coherence_lda']],
        'LSA_Coherence': [row['coherence_lsa']],
        'PLSA_Coherence': [row['coherence_plsa']]
    })


best_group_df = _group_summary('Best Average Group', best_group, 'Average_Coherence', 'average_coherence')
best_group_spread_df = _group_summary('Minimum Spread Group', best_group_spread, 'Spread', 'spread')

# --- Load Models ---
# Directories searched for saved models, in priority order: '../models' (the
# folder this notebook creates, next to '../data'), then the bare 'models'
# folder this cell originally read from, kept as a fallback for older runs.
MODEL_DIRS = ("../models", "models")


def _load_topic_model(kind, num_topics):
    """Load ``<kind>_model_<num_topics>.pkl`` from the first directory containing it.

    Args:
        kind: File-name prefix of the model family ('lda', 'lsa' or 'plsa').
        num_topics: Topic count the model was trained with.

    Returns:
        The object stored in the file.

    Raises:
        FileNotFoundError: If the file is in none of ``MODEL_DIRS``.
    """
    filename = f"{kind}_model_{num_topics}.pkl"
    for directory in MODEL_DIRS:
        candidate = os.path.join(directory, filename)
        if os.path.exists(candidate):
            # joblib.load unpickles the file: only load models produced by this project.
            return joblib.load(candidate)
    raise FileNotFoundError(
        f"{filename} not found in any of {MODEL_DIRS}; "
        "re-run the evaluation with model saving enabled."
    )


# Load individual best models
num_topics_lda = int(best_lda['num_topics'])
best_lda_model = _load_topic_model("lda", num_topics_lda)

num_topics_lsa = int(best_lsa['num_topics'])
best_lsa_model = _load_topic_model("lsa", num_topics_lsa)

num_topics_plsa = int(best_plsa['num_topics'])
best_plsa_model = _load_topic_model("plsa", num_topics_plsa)

# Load best group models (by average coherence)
num_topics_group = int(best_group['num_topics'])
best_group_lda = _load_topic_model("lda", num_topics_group)
best_group_lsa = _load_topic_model("lsa", num_topics_group)
best_group_plsa = _load_topic_model("plsa", num_topics_group)

# Load the minimum-spread group models (may be the same topic count as above)
num_topics_group_spread = int(best_group_spread['num_topics'])
best_group_spread_lda = _load_topic_model("lda", num_topics_group_spread)
best_group_spread_lsa = _load_topic_model("lsa", num_topics_group_spread)
best_group_spread_plsa = _load_topic_model("plsa", num_topics_group_spread)

# --- Display Results ---
# Each summary table is printed under its own heading.
for heading, table in (
    ("Best Individual Models:", best_individual_df),
    ("\nBest Group Model (Average Coherence):", best_group_df),
    ("\nBest Group Model (Minimum Spread):", best_group_spread_df),
):
    print(heading)
    print(table)


In [None]:
# Rebuild the coherence line chart (same figure as the earlier knee plot).
fig = px.line(
    coherence_df_long,
    x="num_topics",
    y="coherence",
    color="model",
    markers=True,
    title="Coherence Value vs Number of Topics",
    labels={"num_topics": "Number of Topics", "coherence": "Coherence Score", "model": "Model"}
).update_layout(
    xaxis_title="Number of Topics",
    yaxis_title="Coherence Score",
    yaxis=dict(range=[0, 1]),
    legend_title="Model",
    template="plotly_white"
)

# Mark the two selected topic counts with dashed vertical lines.
# num_topics_group and num_topics_group_spread come from the model-selection cell.
for marker_x, marker_color, marker_text, marker_position in (
    (num_topics_group, "black", "Best Group (Avg)", "top left"),
    (num_topics_group_spread, "orange", "Best Group (Spread)", "top right"),
):
    fig.add_vline(
        x=marker_x,
        line=dict(dash="dash", color=marker_color),
        annotation_text=marker_text,
        annotation_position=marker_position
    )

fig.show()


In [None]:
# --- Helper Functions ---
def get_topic_words(model, model_type, dictionary, n_words=10):
    """Extract the top words for each topic for different model types.

    Args:
        model: For 'LDA', an object exposing ``num_topics`` and
            ``show_topic(idx, topn=...)``; for 'LSA'/'pLSA', an object exposing
            a ``components_`` array with one row of term weights per topic.
        model_type: One of 'LDA', 'LSA' or 'pLSA'.
        dictionary: Mapping from term id to token; used only for 'LSA'/'pLSA'.
        n_words: Maximum number of words returned per topic.

    Returns:
        A list with one list of words per topic, highest weight first. Term ids
        missing from ``dictionary`` appear as ``"<UNK:id>"``. An unrecognised
        ``model_type`` yields an empty list.
    """
    topics = []
    if model_type == 'LDA':
        for idx in range(model.num_topics):
            topics.append([word for word, _ in model.show_topic(idx, topn=n_words)])
    elif model_type in ['LSA', 'pLSA']:
        for topic in model.components_:
            # Reverse first, then truncate. The previous `argsort()[-n_words:]`
            # returned every term when n_words == 0, because `[-0:]` is the whole array.
            top_indices = topic.argsort()[::-1][:n_words]
            tokens = []
            for i in top_indices:
                try:
                    tokens.append(dictionary[i])
                except KeyError:
                    # Keep the position but flag the id that has no token.
                    tokens.append(f"<UNK:{i}>")
            topics.append(tokens)
    return topics


def print_topics(topics, model_name):
    """Print a header for ``model_name`` followed by one numbered line per topic.

    Topics are numbered from 1 and each topic's words are comma-separated.
    """
    print(f"\nTópicos para {model_name}:")
    for number, words in enumerate(topics, start=1):
        joined_words = ", ".join(words)
        print(f"Tópico {number}: {joined_words}")

def classify_documents(model, model_type, corpus, corpus_tfidf_sparse):
    """Assign each document to its highest-scoring topic.

    Args:
        model: For 'LDA', an object exposing ``get_document_topics(doc)``
            returning ``(topic_id, score)`` pairs; for 'LSA'/'pLSA', an object
            exposing ``transform(matrix)``.
        model_type: One of 'LDA', 'LSA' or 'pLSA'.
        corpus: Documents passed one at a time to the LDA model (unused otherwise).
        corpus_tfidf_sparse: Matrix passed to ``transform`` (unused for LDA).

    Returns:
        For 'LDA', a list of topic ids; for 'LSA'/'pLSA', the result of
        ``argmax(axis=1)`` on the transformed matrix; for any other
        ``model_type``, an empty list.
    """
    if model_type == 'LDA':
        return [
            max(model.get_document_topics(bow), key=lambda pair: pair[1])[0]
            for bow in corpus
        ]
    if model_type in ('LSA', 'pLSA'):
        return model.transform(corpus_tfidf_sparse).argmax(axis=1)
    return []

## 4. Analysis Report Generation

In [None]:
# Report settings. These names were used below without ever being defined,
# which raised NameError on a fresh kernel.
n_top_words = 10   # keywords listed per topic (same as get_topic_words' default)
sample_docs = 3    # number of example documents shown per technique

# Written next to the other project data ('../data' is created at the top of
# the notebook); the bare 'data/' folder is not guaranteed to exist.
REPORT_PATH = "../data/analise.md"


def _write_technique_section(md, technique, model, label, num_topics):
    """Write one technique's report section to the open file ``md``.

    The section holds the keywords of every topic, the number of documents per
    topic, and a few example documents with their assigned topic.

    Args:
        md: Writable text file object.
        technique: 'LDA', 'LSA' or 'pLSA'.
        model: Trained model for that technique.
        label: Selection strategy shown in the section heading.
        num_topics: Topic count shown in the section heading.
    """
    md.write(f"\n### {technique} ({label} – {num_topics} tópicos)\n")

    topics = get_topic_words(model, technique, dictionary, n_top_words)
    md.write(f"**Palavras-chave ({technique}):**\n")
    for i, topic in enumerate(topics):
        md.write(f"- Tópico {i+1}: {', '.join(topic[:n_top_words])}\n")

    # Document classification: LDA takes the bag-of-words corpus, the other
    # two techniques take the TF-IDF matrix.
    if technique == 'LDA':
        doc_topics = classify_documents(model, technique, corpus, None)
    else:
        doc_topics = classify_documents(model, technique, None, corpus_tfidf_sparse)

    topic_dist = pd.Series(doc_topics).value_counts().sort_index()
    md.write(f"\n**Distribuição de tópicos ({technique}):**\n")
    md.write("```\n" + topic_dist.to_string() + "\n```\n")

    md.write(f"\n**Exemplo de classificação de documentos ({technique}):**\n")
    # Never ask for more examples than there are documents.
    for i in range(min(sample_docs, len(df), len(doc_topics))):
        md.write(f"\n**Documento {i+1}:**\n")
        md.write("Texto original:\n")
        md.write("```\n" + pformat(df['julgado'].iloc[i]) + "\n```\n")
        md.write(f"Tópico atribuído: {doc_topics[i] + 1}\n")
        md.write(f"Palavras-chave: {', '.join(topics[doc_topics[i]][:n_top_words])}\n")


# Opens (or creates) the Markdown file for writing
with open(REPORT_PATH, "w", encoding="utf-8") as md:
    md.write("# Análise dos Modelos de Tópicos\n\n")

    # === Best individual models: each technique at its own best topic count ===
    md.write("## ANÁLISE DOS MELHORES MODELOS INDIVIDUAIS\n")
    for technique, model in zip(
        ['LDA', 'LSA', 'pLSA'],
        [best_lda_model, best_lsa_model, best_plsa_model]
    ):
        best_num_topics = int(best_individual_df.loc[best_individual_df['Technique'] == technique, 'Best_Num_Topics'].values[0])
        _write_technique_section(md, technique, model, "Best Individual", best_num_topics)

    # === Best group by average coherence: all techniques at one topic count ===
    md.write("\n## ANÁLISE DO MELHOR GRUPO (AVERAGE COHERENCE)\n")
    for technique, model in zip(
        ['LDA', 'LSA', 'pLSA'],
        [best_group_lda, best_group_lsa, best_group_plsa]
    ):
        _write_technique_section(md, technique, model, "Best Group Average", num_topics_group)

    # === Best group by minimum spread: all techniques at one topic count ===
    md.write("\n## ANÁLISE DO MELHOR GRUPO (MINIMUM SPREAD)\n")
    for technique, model in zip(
        ['LDA', 'LSA', 'pLSA'],
        [best_group_spread_lda, best_group_spread_lsa, best_group_spread_plsa]
    ):
        _write_technique_section(md, technique, model, "Best Group Minimum Spread", num_topics_group_spread)
