In [1]:
import torch
import numpy as np
import pandas as pd
import pickle
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO
from pyro.infer.autoguide import AutoDelta

In [3]:
# --- Load data ---
# Document-level metadata; later cells concatenate per-document topic weights
# onto this frame row by row.
df = pd.read_csv("lda_df.csv")

# SECURITY NOTE(review): pickle.load executes arbitrary code stored in the
# file. Only load a vectorizer pickle that was produced by this project.
with open("lda_vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

# np.load on an .npz archive returns a lazy NpzFile that keeps the underlying
# file open; the context manager closes it as soon as the array has been read.
with np.load("BoW_X_Array.npz") as bow_archive:
    X_array = bow_archive["arr_0"]

# Float copy of the count matrix, used as the observed data for the Pyro model.
X_tensor = torch.tensor(X_array, dtype=torch.float)

In [12]:
# --- Set model parameters ---
K = 30  # number of latent topics used by lda_model below
num_docs, vocab_size = X_tensor.shape


# --- Define Pyro LDA model ---
def lda_model(data):
    """Bag-of-words topic model with flat Dirichlet priors.

    Each of the ``K`` topics is a distribution over the vocabulary and each
    document is a distribution over topics; a document's word distribution is
    the topic mixture ``doc_topics @ topic_words``.

    Args:
        data: 2-D tensor of word counts, one row per document and one column
            per vocabulary term. Its shape determines the plate sizes.

    The number of topics is read from the notebook-level ``K`` at call time.
    """
    # Take the sizes from the argument instead of notebook globals so the model
    # always agrees with the data it is actually given.
    n_docs, n_terms = data.shape

    with pyro.plate("topics", K):
        topic_words = pyro.sample("topic_words", dist.Dirichlet(torch.ones(n_terms)))

    with pyro.plate("documents", n_docs):
        doc_topics = pyro.sample("doc_topics", dist.Dirichlet(torch.ones(K)))

        # (n_docs, K) @ (K, n_terms) -> per-document word probabilities.
        word_dists = torch.matmul(doc_topics, topic_words)
        # Re-normalise to guard against floating-point drift in the row sums.
        word_dists = word_dists / word_dists.sum(dim=1, keepdim=True)

        # Documents have different lengths. A hard-coded total_count (it was
        # 100) can make argument validation reject longer documents, so use the
        # longest document's length as the upper bound instead.
        total_count = int(data.sum(-1).max())
        pyro.sample(
            "doc_words",
            dist.Multinomial(total_count=total_count, probs=word_dists),
            obs=data,
        )

In [6]:
# --- Set up SVI ---
# Start from an empty parameter store: AutoDelta parameters left over from a
# previous run (possibly with a different number of topics) would otherwise be
# reused when this cell is re-executed.
pyro.clear_param_store()
# Fix the RNG so the guide's initialisation, and therefore the loss trace, is
# reproducible across kernel restarts.
pyro.set_rng_seed(0)

# AutoDelta learns a point estimate for every latent site of the model.
guide = AutoDelta(lda_model)
optimizer = pyro.optim.Adam({"lr": 0.01})
svi = SVI(lda_model, guide, optimizer, loss=Trace_ELBO())

In [9]:
# --- Training loop ---
# Run a fixed number of optimisation steps on the full count matrix and report
# the loss returned by every tenth step.
num_steps = 100
for step in range(num_steps):
    loss = svi.step(X_tensor)
    if step % 10:
        continue  # only steps 0, 10, 20, ... are logged
    print(f"[step {step}] loss = {loss:.2f}")

[step 0] loss = -645081.38
[step 10] loss = -649994.19
[step 20] loss = -654594.69
[step 30] loss = -659034.31
[step 40] loss = -663410.34
[step 50] loss = -667763.56
[step 60] loss = -672065.88
[step 70] loss = -676251.25
[step 80] loss = -680276.12
[step 90] loss = -684130.19


Il tuo training è assolutamente in linea con le aspettative. Il loss parte da -645k perché stai modellando tanti documenti e parole contemporaneamente, e Pyro somma tutto.

Fammi sapere se vuoi normalizzare, loggare o tracciare il progresso del training visivamente.

In [10]:
# --- Extract results ---
# Calling the trained guide yields a mapping from sample-site name to its
# current estimate; pull out the two latent sites defined in the model.
posterior = guide()
topic_words, doc_topics = (
    posterior[site_name] for site_name in ("topic_words", "doc_topics")
)

In [11]:
# --- OPTIONAL: Save topic vectors per doc ---
doc_topic_matrix = doc_topics.detach().numpy()

# Take the topic count from the tensor itself: the notebook-level K is rebound
# by the K-sweep cells, so it may no longer match the fitted model.
n_rows, n_topics = doc_topic_matrix.shape

# pd.concat(axis=1) aligns on the index and silently pads with NaN when the row
# counts differ, so fail loudly instead of writing a misaligned CSV.
assert n_rows == len(df), (
    f"doc_topics has {n_rows} rows but df has {len(df)}; "
    "was df filtered after the count matrix was built?"
)

topic_df = pd.DataFrame(
    doc_topic_matrix, columns=[f"topic_{i}" for i in range(n_topics)]
)
df_with_topics = pd.concat([df.reset_index(drop=True), topic_df], axis=1)
df_with_topics.to_csv("lda_output_with_topics.csv", index=False)

### Topic Number Selection (K)

Since the number of topics `K` in **Latent Dirichlet Allocation (LDA)** must be specified in advance, we ran the Pyro-based LDA model across multiple candidate values (e.g. K = 5, 10, 15, ..., 30) and monitored the Evidence Lower Bound (ELBO) loss during training. 

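For each value of `K`, we trained the model for a (smaller) fixed number of steps and recorded the final loss. The model with the lowest ELBO loss was selected as the best configuration. This approach allows us to balance model complexity and fit without relying on manual inspection or external coherence metrics. **Caveat:** because the guide is `AutoDelta` (a point estimate), the reported loss is the negative log joint density at that point rather than a bound on the marginal likelihood, and in the runs recorded below it decreases steadily as `K` grows. The lowest loss therefore favors the largest `K` tried, so the selected `K` should be cross-checked with a held-out likelihood or a topic-coherence score.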


In [13]:
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO
from pyro.infer.autoguide import AutoDelta
import torch
import pandas as pd

# Sweep over candidate topic counts, train each model for a fixed number of
# steps, and keep the point estimates of the run with the lowest final loss.
#
# NOTE(review): with an AutoDelta guide the loss is evaluated at a point
# estimate, and in the recorded output it keeps decreasing as K grows, so
# "lowest loss" tends to pick the largest K. Confirm the choice with a
# held-out metric before relying on it.
K_values = [5, 10, 15, 20, 25, 30]
best_loss = float("inf")
best_K = None
best_outputs = {}

# Loop invariants, hoisted out of the per-K loop.
num_docs, vocab_size = X_tensor.shape
num_steps = 100

for K in K_values:
    print(f"\n--- Training LDA with K = {K} ---")

    def lda_model(data):
        """LDA with flat Dirichlet priors; the topic count is the loop's current K."""
        with pyro.plate("topics", K):
            topic_words = pyro.sample(
                "topic_words", dist.Dirichlet(torch.ones(vocab_size))
            )

        with pyro.plate("documents", num_docs):
            doc_topics = pyro.sample("doc_topics", dist.Dirichlet(torch.ones(K)))
            word_dists = torch.matmul(doc_topics, topic_words)
            word_dists = word_dists / word_dists.sum(dim=1, keepdim=True)
            # Use the longest document as the count bound; a hard-coded 100 can
            # make argument validation reject longer documents.
            total_count = int(data.sum(-1).max())
            pyro.sample(
                "doc_words",
                dist.Multinomial(total_count=total_count, probs=word_dists),
                obs=data,
            )

    # Fresh parameters and the same seed for every K, so runs are independent
    # of each other and reproducible.
    pyro.clear_param_store()
    pyro.set_rng_seed(0)
    guide = AutoDelta(lda_model)
    svi = SVI(lda_model, guide, pyro.optim.Adam({"lr": 0.01}), loss=Trace_ELBO())

    loss = None
    for step in range(num_steps):
        loss = svi.step(X_tensor)
        if step % 10 == 0:
            print(f"[K={K} | step {step}] loss = {loss:.2f}")

    print(f"[K={K}] final loss = {loss:.2f}")

    if loss < best_loss:
        print(f"✅ New best K = {K}")
        best_loss = loss
        best_K = K
        posterior = guide()
        # Detach and clone: the parameter store is cleared on the next
        # iteration, so keep independent copies of the winning estimates.
        best_outputs = {
            "K": K,
            "loss": loss,
            "doc_topics": posterior["doc_topics"].detach().clone(),
            "topic_words": posterior["topic_words"].detach().clone(),
        }

# A NaN loss never compares as "less than", so no run may have been selected.
if best_K is None:
    raise RuntimeError("No K produced a finite loss; nothing to save.")

# Save the best output
print(f"\n🎯 Best K = {best_K} with loss = {best_loss:.2f}")
doc_topics_df = pd.DataFrame(
    best_outputs["doc_topics"].numpy(), columns=[f"topic_{i}" for i in range(best_K)]
)
df_with_topics = pd.concat([df.reset_index(drop=True), doc_topics_df], axis=1)
df_with_topics.to_csv(f"best_lda_doc_topics_K{best_K}.csv", index=False)
torch.save(best_outputs["topic_words"], f"best_topic_words_K{best_K}.pt")


--- Training LDA with K = 5 ---
[K=5 | step 0] loss = 51524.04
[K=5 | step 10] loss = 45766.19
[K=5 | step 20] loss = 40301.16
[K=5 | step 30] loss = 35131.79
[K=5 | step 40] loss = 30212.57
[K=5 | step 50] loss = 25451.38
[K=5 | step 60] loss = 20725.76
[K=5 | step 70] loss = 15940.26
[K=5 | step 80] loss = 11106.19
[K=5 | step 90] loss = 6364.44
[K=5] final loss = 2336.94
✅ New best K = 5

--- Training LDA with K = 10 ---
[K=10 | step 0] loss = -281346.40
[K=10 | step 10] loss = -287104.27
[K=10 | step 20] loss = -292569.77
[K=10 | step 30] loss = -297743.02
[K=10 | step 40] loss = -302670.43
[K=10 | step 50] loss = -307437.62
[K=10 | step 60] loss = -312155.18
[K=10 | step 70] loss = -316923.46
[K=10 | step 80] loss = -321781.40
[K=10 | step 90] loss = -326667.27
[K=10] final loss = -330968.90
✅ New best K = 10

--- Training LDA with K = 15 ---
[K=15 | step 0] loss = -629626.97
[K=15 | step 10] loss = -635384.88
[K=15 | step 20] loss = -640849.72
[K=15 | step 30] loss = -646009.62


In [15]:
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO
from pyro.infer.autoguide import AutoDelta
import torch
import pandas as pd

# Second sweep over larger topic counts, with a short 10-step budget per K.
#
# NOTE(review): the "best" run here is chosen only among these K values and
# after 10 steps, so its loss is not comparable with sweeps that trained for a
# different number of steps. In the recorded output the loss also keeps
# decreasing as K grows, so "lowest loss" tends to pick the largest K.
K_values = [40, 50, 60, 70, 80, 90]
best_loss = float("inf")
best_K = None
best_outputs = {}

# Loop invariants, hoisted out of the per-K loop.
num_docs, vocab_size = X_tensor.shape
num_steps = 10

for K in K_values:
    print(f"\n--- Training LDA with K = {K} ---")

    def lda_model(data):
        """LDA with flat Dirichlet priors; the topic count is the loop's current K."""
        with pyro.plate("topics", K):
            topic_words = pyro.sample(
                "topic_words", dist.Dirichlet(torch.ones(vocab_size))
            )

        with pyro.plate("documents", num_docs):
            doc_topics = pyro.sample("doc_topics", dist.Dirichlet(torch.ones(K)))
            word_dists = torch.matmul(doc_topics, topic_words)
            word_dists = word_dists / word_dists.sum(dim=1, keepdim=True)
            # Use the longest document as the count bound; a hard-coded 100 can
            # make argument validation reject longer documents.
            total_count = int(data.sum(-1).max())
            pyro.sample(
                "doc_words",
                dist.Multinomial(total_count=total_count, probs=word_dists),
                obs=data,
            )

    # Fresh parameters and the same seed for every K, so runs are independent
    # of each other and reproducible.
    pyro.clear_param_store()
    pyro.set_rng_seed(0)
    guide = AutoDelta(lda_model)
    svi = SVI(lda_model, guide, pyro.optim.Adam({"lr": 0.01}), loss=Trace_ELBO())

    loss = None
    for step in range(num_steps):
        loss = svi.step(X_tensor)
        # Every step is logged (the former `step % 1 == 0` check was always true).
        print(f"[K={K} | step {step}] loss = {loss:.2f}")

    print(f"[K={K}] final loss = {loss:.2f}")

    if loss < best_loss:
        print(f"✅ New best K = {K}")
        best_loss = loss
        best_K = K
        posterior = guide()
        # Detach and clone: the parameter store is cleared on the next
        # iteration, so keep independent copies of the winning estimates.
        best_outputs = {
            "K": K,
            "loss": loss,
            "doc_topics": posterior["doc_topics"].detach().clone(),
            "topic_words": posterior["topic_words"].detach().clone(),
        }

# A NaN loss never compares as "less than", so no run may have been selected.
if best_K is None:
    raise RuntimeError("No K produced a finite loss; nothing to save.")

# Save the best output
print(f"\n🎯 Best K = {best_K} with loss = {best_loss:.2f}")
doc_topics_df = pd.DataFrame(
    best_outputs["doc_topics"].numpy(), columns=[f"topic_{i}" for i in range(best_K)]
)
df_with_topics = pd.concat([df.reset_index(drop=True), doc_topics_df], axis=1)
df_with_topics.to_csv(f"best_lda_doc_topics_K{best_K}.csv", index=False)
torch.save(best_outputs["topic_words"], f"best_topic_words_K{best_K}.pt")


--- Training LDA with K = 40 ---
[K=40 | step 0] loss = -2479647.09
[K=40 | step 1] loss = -2480235.25
[K=40 | step 2] loss = -2480820.69
[K=40 | step 3] loss = -2481403.41
[K=40 | step 4] loss = -2481983.34
[K=40 | step 5] loss = -2482560.62
[K=40 | step 6] loss = -2483135.16
[K=40 | step 7] loss = -2483707.03
[K=40 | step 8] loss = -2484276.09
[K=40 | step 9] loss = -2484842.38
[K=40] final loss = -2484842.38
✅ New best K = 40

--- Training LDA with K = 50 ---
[K=50 | step 0] loss = -3249508.47
[K=50 | step 1] loss = -3250096.62
[K=50 | step 2] loss = -3250682.06
[K=50 | step 3] loss = -3251264.78
[K=50 | step 4] loss = -3251844.69
[K=50 | step 5] loss = -3252421.91
[K=50 | step 6] loss = -3252996.44
[K=50 | step 7] loss = -3253568.19
[K=50 | step 8] loss = -3254137.19
[K=50 | step 9] loss = -3254703.28
[K=50] final loss = -3254703.28
✅ New best K = 50

--- Training LDA with K = 60 ---
[K=60 | step 0] loss = -4030704.59
[K=60 | step 1] loss = -4031292.75
[K=60 | step 2] loss = -4031

In [None]:
# Load the document-term matrix. The context manager closes the .npz archive
# once the array has been read (np.load otherwise leaves the file open).
with np.load("BoW_X_Array.npz") as bow_archive:
    X_array = bow_archive["arr_0"]
X_tensor = torch.tensor(X_array, dtype=torch.float)

num_docs, vocab_size = X_tensor.shape

# Range of K values to test
K_values = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90]
results = []

# A topic counts as "active" when its weight summed over all documents exceeds
# this value.
# NOTE(review): in the recorded results every topic is active for every K, so
# after only 10 steps this threshold does not discriminate — consider training
# longer or raising it.
threshold = 5.0

for K in K_values:
    print(f"\n--- Training LDA with K = {K} ---")

    def lda_model(data):
        """LDA with flat Dirichlet priors; the topic count is the loop's current K."""
        with pyro.plate("topics", K):
            topic_words = pyro.sample(
                "topic_words", dist.Dirichlet(torch.ones(vocab_size))
            )
        with pyro.plate("documents", num_docs):
            doc_topics = pyro.sample("doc_topics", dist.Dirichlet(torch.ones(K)))
            word_dists = torch.matmul(doc_topics, topic_words)
            word_dists = word_dists / word_dists.sum(dim=1, keepdim=True)
            # Use the longest document as the count bound; a hard-coded 100 can
            # make argument validation reject longer documents.
            total_count = int(data.sum(-1).max())
            pyro.sample(
                "doc_words",
                dist.Multinomial(total_count=total_count, probs=word_dists),
                obs=data,
            )

    # Fresh parameters and the same seed for every K, so the rows of the
    # comparison table are reproducible.
    pyro.clear_param_store()
    pyro.set_rng_seed(0)
    guide = AutoDelta(lda_model)
    svi = SVI(lda_model, guide, pyro.optim.Adam({"lr": 0.01}), loss=Trace_ELBO())

    # Only 10 training steps
    for step in range(10):
        loss = svi.step(X_tensor)

    # Extract the results: total weight each topic receives across documents.
    posterior = guide()
    doc_topics = posterior["doc_topics"]
    topic_usage = doc_topics.sum(dim=0).detach().numpy()
    num_active_topics = (topic_usage > threshold).sum()

    results.append(
        {"K": K, "Final Loss": float(loss), "Active Topics": int(num_active_topics)}
    )

# Show the results
results_df = pd.DataFrame(results)
print("\n📊 Risultati confronto K:")
print(results_df)


--- Training LDA with K = 5 ---

--- Training LDA with K = 10 ---

--- Training LDA with K = 15 ---

--- Training LDA with K = 20 ---

--- Training LDA with K = 25 ---

--- Training LDA with K = 30 ---

--- Training LDA with K = 35 ---

--- Training LDA with K = 40 ---

--- Training LDA with K = 45 ---

--- Training LDA with K = 50 ---

--- Training LDA with K = 55 ---

--- Training LDA with K = 60 ---

--- Training LDA with K = 65 ---

--- Training LDA with K = 70 ---

--- Training LDA with K = 75 ---

--- Training LDA with K = 80 ---

--- Training LDA with K = 85 ---

--- Training LDA with K = 90 ---

📊 Risultati confronto K:


ImportError: Missing optional dependency 'tabulate'.  Use pip or conda to install tabulate.

In [18]:
results_df

Unnamed: 0,K,Final Loss,Active Topics
0,5,46329.35,5
1,10,-286541.1,10
2,15,-634821.7,15
3,20,-992905.2,20
4,25,-1358210.0,25
5,30,-1729240.0,30
6,35,-2105015.0,35
7,40,-2484842.0,40
8,45,-2868204.0,45
9,50,-3254703.0,50
