In [1]:
import torch
import numpy as np
import pandas as pd
import pickle
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO
from pyro.infer.autoguide import AutoDelta

In [3]:
# --- Load data ---
# Per-document metadata; joined row-by-row with the inferred topic weights
# when the results are saved.
df = pd.read_csv("lda_df.csv")

# NOTE(security): unpickling executes arbitrary code, so only load a
# vectorizer file that was produced by a trusted pipeline.
with open("lda_vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

# np.load on an .npz returns a lazy NpzFile that keeps the archive open;
# the context manager closes it once the array has been read into memory.
with np.load("BoW_X_Array.npz") as bow_archive:
    X_array = bow_archive["arr_0"]

# Presumably a bag-of-words count matrix (documents x vocabulary) -- confirm
# against the notebook that produced the file. Cast to float for Pyro.
X_tensor = torch.tensor(X_array, dtype=torch.float)

In [4]:
# --- Model hyperparameters ---
# Number of latent topics; tune freely.
K = 15
# Rows of the input matrix index documents, columns index vocabulary entries.
num_docs, vocab_size = X_tensor.size()


# --- Define Pyro LDA model ---
def lda_model(data):
    """Generative model for LDA over a matrix of per-document word counts.

    Each of the ``K`` topics draws a word distribution from a flat Dirichlet
    over the vocabulary, each document draws topic proportions from a flat
    Dirichlet over topics, and a document's observed counts are Multinomial
    under the resulting mixture of topic-word distributions.

    Args:
        data: 2-D tensor of shape ``(num_docs, vocab_size)``; observed at the
            ``"doc_words"`` sample site.
    """
    # Take the sizes from the argument rather than from notebook globals so the
    # model stays correct when it is called on a different matrix.
    num_docs, vocab_size = data.shape

    with pyro.plate("topics", K):
        topic_words = pyro.sample(
            "topic_words",
            dist.Dirichlet(torch.ones(vocab_size, device=data.device)),
        )

    with pyro.plate("documents", num_docs):
        doc_topics = pyro.sample(
            "doc_topics", dist.Dirichlet(torch.ones(K, device=data.device))
        )

        # (num_docs, K) @ (K, vocab_size) -> per-document word probabilities.
        word_dists = torch.matmul(doc_topics, topic_words)
        # Renormalise to absorb floating-point drift from the matmul.
        word_dists = word_dists / word_dists.sum(dim=1, keepdim=True)

        # Multinomial takes one total_count for the whole batch. Its log_prob
        # is computed from the observed counts themselves, but argument
        # validation rejects rows whose counts sum to more than total_count,
        # so use the longest document instead of a hard-coded 100.
        total_count = int(data.sum(dim=-1).max())
        pyro.sample(
            "doc_words",
            dist.Multinomial(total_count=total_count, probs=word_dists),
            obs=data,
        )

In [6]:
# --- Set up SVI ---
# Learned values are kept in Pyro's global param store, so re-running this
# cell would otherwise resume from the previous run's parameters instead of
# starting fresh. Clear the store and fix the seed so that a re-run of this
# cell and a full Restart -> Run All produce the same training trajectory.
pyro.clear_param_store()
pyro.set_rng_seed(0)

# AutoDelta learns a point (MAP) estimate for every latent site of the model.
guide = AutoDelta(lda_model)
optimizer = pyro.optim.Adam({"lr": 0.01})
svi = SVI(lda_model, guide, optimizer, loss=Trace_ELBO())

In [9]:
# --- Training loop ---
# Each svi.step call performs one optimisation step on the full matrix and
# returns that step's loss.
num_steps = 100
for step in range(num_steps):
    loss = svi.step(X_tensor)
    if step % 10:
        continue  # only report every tenth step
    print(f"[step {step}] loss = {loss:.2f}")

[step 0] loss = -645081.38
[step 10] loss = -649994.19
[step 20] loss = -654594.69
[step 30] loss = -659034.31
[step 40] loss = -663410.34
[step 50] loss = -667763.56
[step 60] loss = -672065.88
[step 70] loss = -676251.25
[step 80] loss = -680276.12
[step 90] loss = -684130.19


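Il training procede come previsto: la loss diminuisce in modo regolare a ogni stampa. Il valore parte da circa -645k perché Pyro somma in un'unica quantità i contributi di tutti i documenti e di tutte le parole, senza normalizzare per il numero di documenti; il segno negativo non indica un errore, perché le densità continue come la Dirichlet possono avere log-densità positiva. Nota: dopo 100 step la loss sta ancora scendendo di circa 4k ogni 10 step, quindi l'ottimizzazione non è ancora arrivata a convergenza.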

Fammi sapere se vuoi normalizzare, loggare o tracciare il progresso del training visivamente.

In [10]:
# --- Extract results ---
# Calling the trained guide yields a mapping from sample-site name to its
# learned value; pull out the two latent sites defined in the model.
posterior = guide()
topic_words, doc_topics = posterior["topic_words"], posterior["doc_topics"]

In [11]:
# --- OPTIONAL: Save topic vectors per doc ---
# One column per topic, one row per document. .cpu() is a no-op for CPU
# tensors and keeps the conversion valid if training ran on an accelerator.
topic_df = pd.DataFrame(
    doc_topics.detach().cpu().numpy(), columns=[f"topic_{i}" for i in range(K)]
)

# pd.concat(axis=1) aligns on the index and pads with NaN, so a row-count
# mismatch between the metadata and the topic matrix would be written out
# silently; fail loudly instead.
if len(df) != len(topic_df):
    raise ValueError(
        f"df has {len(df)} rows but doc_topics has {len(topic_df)}; "
        "cannot attach topic weights row-by-row"
    )
df_with_topics = pd.concat([df.reset_index(drop=True), topic_df], axis=1)
df_with_topics.to_csv("lda_output_with_topics.csv", index=False)