In [35]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from ctm_dataloader import create_dataloader
import pandas as pd

In [37]:
news_data = pd.read_csv("newsgroups_data.csv")
print("Shape of dataset:", news_data.shape)

news_data = news_data.drop(columns=["Unnamed: 0"])

documents = news_data.content
target_labels = news_data.target
target_names = news_data.target_names

news_data.head()

Shape of dataset: (11314, 4)


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [39]:
# Hyperparameters
num_epochs = 10
num_topics = 5
batch_size = 64
rho_size = 5
lr = 0.001

In [48]:
# Create dataloader
train_loader, vocab_size = create_dataloader(documents, batch_size)

for idx, data in enumerate(train_loader):
    print("Shape of data:", data.shape)
    break

Printing vocabulary:
['also' 'article' 'believe' 'call' 'come' 'drive' 'even' 'file' 'find'
 'first']
Shape of the bag-of-words model: (11314, 50)
Shape of data: torch.Size([64, 50])


In [51]:
class CTM(nn.Module):
    def __init__(self, num_topics, vocab_size, rho_size):
        super(CTM, self).__init__()
        self.num_topics = num_topics
        self.vocab_size = vocab_size
        self.rho_size = rho_size

        # Parameters for document-topic distribution
        self.alpha = nn.Parameter(torch.randn(num_topics))

        # Parameters for topic-word distribution
        self.beta = nn.Parameter(torch.randn(num_topics, vocab_size))

        # Fixed parameters for the Gaussian distribution of the correlation matrix
        self.mu = nn.Parameter(torch.zeros(num_topics))
        self.sigma = nn.Parameter(torch.ones(num_topics, num_topics))

    def forward(self, bow):
        # Calculate document-topic distribution using the softmax function
        theta = F.softmax(self.alpha, dim=0)

        # Calculate topic-word distribution using the softmax function
        phi = F.softmax(self.beta, dim=1)

        # Sample correlation matrix from a Gaussian distribution
        rho = torch.randn_like(self.mu) * self.sigma + self.mu
        sigma = torch.mm(rho, rho.t())

        # Calculate the document-topic distribution for each document in the batch
        # doc_topic_dist = torch.mm(torch.mm(bow, phi.t()), torch.diag(theta))
        doc_topic_dist = torch.mm(bow, torch.mm(theta.diag(), phi).t()).type(torch.LongTensor)

        return doc_topic_dist, theta, phi, sigma

    def ctm_loss(self, doc_topic_dist, bow, theta, phi, sigma):
        # Reconstruction loss
        recon_loss = -torch.sum(bow * torch.log(doc_topic_dist + 1e-9))

        # Regularization terms
        alpha_reg = -0.5 * torch.sum(theta * theta)
        beta_reg = -0.5 * torch.sum(phi * phi)
        rho_reg = -0.5 * torch.sum(sigma * sigma)

        # Total loss
        total_loss = recon_loss + alpha_reg + beta_reg + rho_reg

        return total_loss

    def train_ctm(self, model, dataloader, optimizer, num_epochs):
        model.train()

        for epoch in range(num_epochs):
            total_loss = 0.0

            for idx, bow in enumerate(dataloader):
                optimizer.zero_grad()

                # Forward pass
                print(f'Forward pass number {idx + 1}')
                doc_topic_dist, theta, phi, sigma = model(bow)

                # Calculate loss
                loss = self.ctm_loss(doc_topic_dist, bow, theta, phi, sigma)

                # Backward pass
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")

In [52]:
model = CTM(num_topics, vocab_size, rho_size)

# Hyper parameters for the model
optimizer = optim.Adam(model.parameters(), lr = lr)

In [53]:
# Model training
model.train_ctm(model, train_loader, optimizer, num_epochs)

Forward pass number 0


RuntimeError: expected scalar type Long but found Float

In [66]:
alpha = nn.Parameter(torch.randn(num_topics))
beta = nn.Parameter(torch.randn(num_topics, vocab_size))

theta = F.softmax(alpha, dim=0)
phi = F.softmax(beta, dim=1)
bow = torch.randn(64, vocab_size)

torch.mm(bow, torch.mm(theta.diag(), phi).t())

tensor([[-6.2794e-02, -5.9271e-02,  7.9860e-02, -5.3417e-02, -5.0754e-02],
        [-1.5863e-02, -7.2526e-02, -1.0251e-01, -1.6956e-02, -1.5346e-03],
        [-1.7118e-03, -4.6495e-02, -6.8183e-02,  1.3756e-02,  3.6790e-02],
        [ 1.4866e-02,  8.1127e-02, -8.0783e-02, -2.6623e-02, -5.9851e-02],
        [-5.8799e-02,  9.6068e-03, -5.7787e-03, -1.8282e-02,  3.1468e-02],
        [ 2.1975e-03, -4.4036e-02,  3.9203e-03, -2.6338e-02, -6.7144e-03],
        [-6.2016e-02, -4.2894e-02, -2.4982e-01, -1.9733e-03, -1.4236e-02],
        [-1.4666e-02, -5.3935e-02, -1.7326e-01, -1.0494e-02, -1.9817e-02],
        [ 6.0686e-02, -8.4007e-03, -1.2442e-01, -3.4783e-03, -8.7673e-04],
        [-2.8180e-02, -1.0351e-01,  7.0084e-02,  1.6225e-02,  1.2218e-02],
        [ 6.5545e-03, -6.3201e-02, -2.2912e-02, -3.2435e-03,  2.7094e-02],
        [ 3.6090e-02, -5.2901e-02, -1.0103e-01, -2.6638e-02,  4.1363e-02],
        [-1.1474e-02, -6.2607e-03, -1.2875e-01, -4.2854e-02, -1.2747e-02],
        [ 1.6749e-02, -7.