In [5]:
# GLOBAL VARIABLES. BE SURE NOT TO OVERWRITE THEM
# NOTE(review): a later cell in this notebook rebinds V to a list of words - confirm that is intended.
D = 50 # Number of documents
V = 100 # Size of the vocabulary
M = 10 # Maximum number of repetitions of the same word in a document -> This should be sampled (FA)
k = 5 # Number of topics

## IMPORTANT: Please set a fixed random seed in **EVERY** cell that uses a random function, so that the result does **NOT** change between runs.

# 1. ARTIFICIAL DATA

### Task:

You must implement an algorithm that generates an artificial *corpus* and also returns a graph G and a correlation matrix Sigma.

In [16]:
# OUTPUT:
class SIM:  # I'm using a class as a namespace - SIM = Simulation
    """Container for the matrices of the simulated corpus.

    The attributes can be set directly on the class (namespace style) or on an
    instance created with ``SIM(D, V, k)``. The ``update_*`` methods need an
    instance, because they read the number of topics from ``self.k``.

    Topic labels stored in ``Z`` are expected to be ``1..k``, matching the
    ``range(1, k + 1)`` loops below.
    NOTE(review): 0 is presumably the "unused slot" marker in Z - confirm.
    """

    W = None # matrix of DxV where Wdn is counter of appearances of the word n in document d
    Z = None # matrix of DxVxM where Zdnm is the topic index from which the m-th appearance of the word n on doc d is drawn
    B = None # matrix of kxV where Bz is the parameter vector of the distribution for the z-th topic
    C = None # matrix of kxV where Cz is the count vec of sampled topics over each word for all docs
    E = None # matrix of Dxk where Ed is the count vec of sampled drawings for topic z over all words for each doc
    Theta = None  # This is just a transformation of H
    H = None # H_d is eta_d
    G = None  # Adjacency Matrix (Check also python package "networkx" for graph objects!)
    K = None  # Precision matrix of G
    Sigma = None # Inverse of K

    def __init__(self, D, V, k):
        """Store the problem sizes and allocate the word-count matrix.

        Args:
            D: number of documents.
            V: size of the vocabulary.
            k: number of topics.
        """
        self.D = D
        self.V = V
        self.k = k
        self.W = np.zeros((D, V))
        # Z is deliberately not allocated here: it shouldn't be created before W is sampled.

    # Generations
    def sample_Z(self):
        """Sample the topic assignments Z (multinomial drawing). Not implemented yet.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("sample_Z (multinomial drawing) is not implemented yet")

    def sample_W(self):
        """Sample the word counts W (multinomial drawing). Not implemented yet.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("sample_W (multinomial drawing) is not implemented yet")

    # Transformations
    def update_Theta(self):
        """Set ``Theta`` to the row-wise softmax of ``H``.

        Each row of ``H`` (one eta vector) is mapped to probabilities that sum to 1.
        """
        eta = np.asarray(self.H, dtype=float)
        # Subtracting the row maximum leaves the softmax unchanged but avoids overflow in exp.
        weights = np.exp(eta - eta.max(axis=-1, keepdims=True))
        self.Theta = weights / weights.sum(axis=-1, keepdims=True)

    def _topic_word_counts(self):
        """Return a k x V integer matrix: how often each topic label occurs per word in ``Z``."""
        assignments = np.asarray(self.Z)
        counts = np.zeros((self.k, assignments.shape[1]), dtype=int)
        for topic in range(1, self.k + 1):
            # Labels are 1-based, rows are 0-based.
            counts[topic - 1] = np.sum(assignments == topic, axis=2).sum(axis=0)
        return counts

    def update_E(self):
        """Set ``E`` (D x k): per document, the number of entries of ``Z`` equal to each topic label."""
        assignments = np.asarray(self.Z)
        counts = np.zeros((assignments.shape[0], self.k), dtype=int)
        for topic in range(1, self.k + 1):
            # Labels are 1-based, columns are 0-based.
            counts[:, topic - 1] = np.sum(assignments == topic, axis=2).sum(axis=1)
        self.E = counts

    def update_C(self):
        """Set ``C`` (k x V): per topic, the number of entries of ``Z`` equal to that label for each word."""
        self.C = self._topic_word_counts()

    def update_B(self):
        """Set ``B`` (k x V) to the row-normalised topic/word counts derived from ``Z``.

        NOTE(review): normalising the counts is an interpretation of "parameter
        vector of the distribution for the z-th topic" - confirm this is the
        intended update. A topic with no assignments gets a uniform row so that
        every row of ``B`` remains a valid distribution.
        """
        counts = self._topic_word_counts().astype(float)
        totals = counts.sum(axis=1, keepdims=True)
        has_mass = totals > 0
        safe_totals = np.where(has_mass, totals, 1.0)  # avoid dividing by zero
        self.B = np.where(has_mass, counts / safe_totals, 1.0 / counts.shape[1])

In [17]:
# What follows is just an example...

In [18]:
import numpy as np

In [19]:
# Example Sigma: a k x k matrix of uniform draws in [0, 1), reproducible through the fixed seed.
# NOTE(review): the draws are not symmetric, so this looks like a placeholder rather than a
# valid covariance matrix - confirm before using it downstream.
np.random.seed(123)
SIM.Sigma = np.random.rand(k, k)
SIM.Sigma

array([[0.69646919, 0.28613933, 0.22685145, 0.55131477, 0.71946897],
       [0.42310646, 0.9807642 , 0.68482974, 0.4809319 , 0.39211752],
       [0.34317802, 0.72904971, 0.43857224, 0.0596779 , 0.39804426],
       [0.73799541, 0.18249173, 0.17545176, 0.53155137, 0.53182759],
       [0.63440096, 0.84943179, 0.72445532, 0.61102351, 0.72244338]])

In [20]:
'''
Step 1 (Anas):
- Beta -> Set some convenient alpha to have an informative sample (maybe needs some playing), get matrix B
- G -> Generate the graph to generate K to generate Sigma
- Eta -> Matrix H from the multivariate Normal to generate matrix Theta

Generate around 3 functions with the following outputs:
1. B
2. (G, K, Sigma)
3. (H, Theta)
'''

SyntaxError: invalid syntax (572798405.py, line 1)

In [None]:
'''
Step 2 (Francesca):
- Matrix Z -> Will come from the multinomial given H/Theta
- Matrix W -> Will come from the multinomial given Z, B
- Matrices C, E -> Will come from Z
'''

### Original Data Generating Algorithm by Kanthavel


In [1]:
# Imports
import pymc3 as pm
import numpy as np
# import numpy.linalg
from matplotlib import pyplot as plt

In [2]:
# Toy vocabulary: 18 words, kept both as a list (V) and as a NumPy array (V_arr) for mask indexing.
V = [
    'dog', 'aunt', 'cat', 'square', 'house', 'root',
    'mouse', 'cow', 'palm', 'tree', 'mom', 'sun',
    'moon', 'father', 'spoon', 'circle', 'mug', 'glass',
]
V_arr = np.asarray(V)

In [3]:
def build_topic_distribution(seed=1234, vocabulary_size=None):
    """Draw a random probability vector over the vocabulary.

    Uniform values in [0, 1) are drawn and divided by their sum, so the result
    is non-negative and sums to 1.

    Side effect: reseeds NumPy's global random state with ``seed``.

    Args:
        seed: seed passed to ``np.random.seed`` before drawing.
        vocabulary_size: number of entries in the returned vector. When omitted,
            falls back to ``len(V)`` of the module-level vocabulary ``V``;
            passing it explicitly avoids depending on that global.

    Returns:
        1-D array of length ``vocabulary_size`` summing to 1.

    Raises:
        ValueError: if ``vocabulary_size`` is smaller than 1.
    """
    if vocabulary_size is None:
        vocabulary_size = len(V)
    if vocabulary_size < 1:
        raise ValueError("vocabulary_size must be at least 1")
    np.random.seed(seed)
    distribution = np.random.random(vocabulary_size)
    return distribution / distribution.sum()

In [4]:
build_topic_distribution()

array([0.01973417, 0.0641021 , 0.04510347, 0.08092336, 0.08036872,
       0.02808795, 0.02848688, 0.08262492, 0.0987267 , 0.09025611,
       0.0368695 , 0.05162255, 0.07042403, 0.07343683, 0.03815064,
       0.05782566, 0.0518377 , 0.0014187 ])

In [5]:
# Betas: one word distribution per topic, built with seeds 1..5 in this order.
_topic_names = ('red', 'blue', 'green', 'pink', 'yellow')
topics = {
    topic_name: build_topic_distribution(seed=topic_seed)
    for topic_seed, topic_name in enumerate(_topic_names, start=1)
}

In [6]:
topics['red']

array([5.95139825e-02, 1.02798842e-01, 1.63226420e-05, 4.31464413e-02,
       2.09438049e-02, 1.31778118e-02, 2.65815397e-02, 4.93156112e-02,
       5.66234210e-02, 7.68955339e-02, 5.98240254e-02, 9.77889437e-02,
       2.91777592e-02, 1.25317765e-01, 3.90853414e-03, 9.56836599e-02,
       5.95543411e-02, 7.97316600e-02])

In [7]:
topics['red'].sum()

1.0

In [8]:
# Number of topics, taken from the topics dictionary.
topics_number = len(topics)
topics_number

5

In [9]:
# Mu: one uniform draw per topic, rescaled so the entries sum to 1.
np.random.seed(1984)
_raw_topic_mean = np.random.random(len(topics))
topic_mean = _raw_topic_mean / _raw_topic_mean.sum()
topic_mean

array([0.00775911, 0.2647408 , 0.04999273, 0.427745  , 0.24976236])

In [10]:
# Covariance built as A . A^T from a random square matrix A, which makes it symmetric.
np.random.seed(12)
_covariance_factor = np.random.random((topics_number, topics_number))
topic_covariance = _covariance_factor.dot(_covariance_factor.T)
topic_covariance

array([[0.9258647 , 1.32977215, 1.1960876 , 1.02902203, 0.2354462 ],
       [1.32977215, 2.59107874, 1.65456663, 1.74944266, 0.87948846],
       [1.1960876 , 1.65456663, 2.06661948, 1.5962034 , 0.45713552],
       [1.02902203, 1.74944266, 1.5962034 , 1.42787331, 0.61474357],
       [0.2354462 , 0.87948846, 0.45713552, 0.61474357, 0.71282345]])

In [11]:
def transform_proportions(eta):
    """Map eta to topic proportions with a softmax along the last axis.

    Args:
        eta: array-like of shape (k,), (1, k) or (n, k); each row is one eta vector.

    Returns:
        Array of non-negative proportions in which every row sums to 1, with
        length-1 axes removed (so a (1, k) input yields a (k,) output).
    """
    eta = np.asarray(eta, dtype=float)
    # Subtracting the row maximum does not change the softmax but keeps exp from overflowing.
    theta = np.exp(eta - eta.max(axis=-1, keepdims=True))
    # Normalise per row so that batched input yields one distribution per eta vector.
    theta /= theta.sum(axis=-1, keepdims=True)
    return theta.squeeze()

In [24]:
# Correct with this: https://numpy.org/doc/stable/reference/random/index.html#random-quick-start

# Building a document:
# Number of word tokens drawn for the document
N = 10

seed = 1979
np.random.seed(seed)

# Eta: a single draw from the multivariate normal (in LDA this is a Dirichlet)
topic_proportions = np.random.multivariate_normal(topic_mean, topic_covariance, 1)

# Theta: eta mapped to topic probabilities
topic_normalized_proportions = transform_proportions(topic_proportions)

# BoW: one counter per vocabulary word
document = np.zeros(len(V))

# Topic names in dictionary order; built once because it does not change inside the loop.
topic_names = np.array(list(topics.keys()))

for n in range(N):
    # One-hot topic draw; the position of the single 1 selects the topic.
    topic_draw = np.random.multinomial(1, pvals=topic_normalized_proportions).squeeze()
    assigned_topic = topic_names[topic_draw.argmax()]
    # One-hot word draw from the chosen topic's word distribution.
    word_mask = np.random.multinomial(1, pvals=topics[assigned_topic]).squeeze()
    word = V_arr[word_mask.argmax()]
    document += word_mask
    print(f'{n}-th word drawn from topic {assigned_topic} is {word}')
document

0-th word drawn from topic green is square
1-th word drawn from topic green is glass
2-th word drawn from topic blue is house
3-th word drawn from topic green is tree
4-th word drawn from topic green is glass
5-th word drawn from topic green is glass
6-th word drawn from topic blue is sun
7-th word drawn from topic blue is palm
8-th word drawn from topic red is father
9-th word drawn from topic blue is father


array([0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 2., 0., 0., 0.,
       3.])

# 2 SAMPLER

# 2.1 Hamiltonian MC Sampling

### Task:

You must implement a function that receives matrices $E_i$, $K_i$ and vector $\mu$ and generates the next $H_{i+1}$.

In [None]:
def Hamiltonian_MC_Sampling(E, K, mu):
    """Placeholder for the Hamiltonian MC step.

    Per the task description, this should receive the matrices E_i and K_i and
    the vector mu and generate the next H_{i+1}.

    TODO: not implemented yet - the arguments are ignored and None is returned.
    """
    return

# 2.2 MCMC Sampling

### Task:

You must implement a function that receives matrices $W$, $\Theta_{i+1}$ and $B_i$ and generates the next $Z_{i+1}$ and $B_{i+1}$.

In [21]:
def MCMC_Sampling(W, Theta, B):
    """Placeholder for the MCMC step.

    Per the task description, this should receive the matrices W, Theta_{i+1}
    and B_i and generate the next Z_{i+1} and B_{i+1}.

    TODO: not implemented yet - the arguments are ignored and None is returned.
    """
    return

# 2.3 BDMCMC Sampling

### Task:

You must implement a function that receives matrices $W$, $Z_{i+1}$ and $H_{i+1}$ and generates the next $G_{i+1}$ and $K_{i+1}$.

In [21]:
def BDMCMC_Sampling(W, Z, H):
    """Placeholder for the BDMCMC step.

    Per the task description, this should receive the matrices W, Z_{i+1} and
    H_{i+1} and generate the next G_{i+1} and K_{i+1}.

    TODO: not implemented yet - the arguments are ignored and None is returned.
    """
    return