In [None]:
!pip install pandas gensim pyLDAvis matplotlib openpyxl

import pandas as pd
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt

In [None]:
# NOTE: this LDA configuration ran successfully end to end.

# Settings
NUM_TOPICS = 40  # Number of topics to model
INPUT_FILE = "noticiaspt_processado.xlsx"  # Path to the input workbook
COLUMN_NAME = "Transcrito_Limpo"  # Column holding the pre-processed text
OUTPUT_VISUALIZATION = "lda_visualization.html"  # File the visualization is saved to

# Step 1: Load the data
data = pd.read_excel(INPUT_FILE)
# Rows with a missing text are dropped, so entry i of `texts` (and row i of the
# topic table built below) is the i-th non-null row of `data`, not sheet row i.
# astype(str) guards against cells that Excel stored as numbers, which would
# otherwise fail on .split().
texts = data[COLUMN_NAME].dropna().astype(str).tolist()

# Step 2: Build the dictionary and the bag-of-words corpus
# Tokenise once and reuse the tokens for both the dictionary and the corpus.
tokenized_texts = [text.split() for text in texts]
dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_texts]

# Step 3: Train the LDA model
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    random_state=42,  # fixed seed so repeated runs give the same topics
    passes=10,
    minimum_probability=0
)

# Step 4: Compute the topic distribution of every document
topic_distribution = []

for doc in corpus:
    # Start from all zeros and place each probability at the topic id reported
    # by gensim. get_document_topics may return fewer than NUM_TOPICS pairs
    # (topics under its probability floor are discarded), so the position of a
    # pair in the returned list is not a reliable topic index.
    full_distribution = [0.0] * NUM_TOPICS
    for topic_id, prob in lda_model.get_document_topics(doc, minimum_probability=0):
        # Keep only the real part if a complex value ever shows up.
        full_distribution[topic_id] = (
            float(prob.real) if isinstance(prob, complex) else float(prob)
        )
    topic_distribution.append(full_distribution)

# Step 5: Build a DataFrame with one row per document and one column per topic
df_topic_distribution = pd.DataFrame(
    topic_distribution,
    columns=[f'Tópico {i+1}' for i in range(NUM_TOPICS)]  # 1-based column labels
)

# Optional: save the DataFrame to a CSV file
df_topic_distribution.to_csv("distribuicao_topicos.csv", index=False)

# Step 6: Visualize the topics with pyLDAvis
pyLDAvis.enable_notebook()  # Enables inline rendering in Jupyter notebooks
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(lda_vis, OUTPUT_VISUALIZATION)

print(f"Visualização dos tópicos salva em: {OUTPUT_VISUALIZATION}")

In [None]:
import pandas as pd

# Assuming lda_model, dictionary, and corpus are already defined from previous code

def get_topic_word_lists(lda_model, num_words=10):
    """
    Collect the highest-ranked words of every topic in a trained LDA model.

    Args:
        lda_model: Trained model exposing ``num_topics`` and
            ``show_topic(topic_id, topn=...)`` returning (word, weight) pairs.
        num_words: How many top words to keep for each topic.

    Returns:
        dict mapping the 1-based topic number to the list of that topic's
        top words, in the order ``show_topic`` returned them.
    """
    return {
        # Topic ids are 0-based in the model; keys are shifted to start at 1.
        topic_idx + 1: [
            term for term, _weight in lda_model.show_topic(topic_idx, topn=num_words)
        ]
        for topic_idx in range(lda_model.num_topics)
    }

topic_word_lists = get_topic_word_lists(lda_model)

# Convert to a DataFrame for easier export to Excel: after the transpose there
# is one column per topic and one row per word position.
topic_words_df = pd.DataFrame.from_dict(topic_word_lists, orient='index').transpose()
topic_words_df.index.name = 'Topic Word'

# Save to Excel
topic_words_df.to_excel('topic_word_lists.xlsx')

# Offer the file as a browser download when running on Google Colab. The
# google.colab package is not importable elsewhere, so the import is guarded
# to keep this cell runnable in a plain Jupyter environment.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('topic_word_lists.xlsx')
else:
    print("Saved topic_word_lists.xlsx (not running on Google Colab; download skipped)")