In [17]:
# GLOBAL VARIABLES. BE SURE NOT TO OVERWRITE THEM
# D: number of documents
# V: size of the vocabulary
# M: maximum number of repetitions of the same word in a document
# k: number of topics
D, V, M, k = 50, 100, 10, 5

## IMPORTANT: Please use static random seeds in **EVERY** cell where you use a random function, so that the result does **NOT** change at every run.

# 1. ARTIFICIAL DATA

### Task:

You must implement an algorithm that generates an artificial *corpus* and also returns a graph G and a covariance matrix Sigma (the inverse of the precision matrix K).

In [18]:
import numpy as np

In [19]:
# OUTPUT:
class SIM:  # I'm using a class as a namespace - SIM = Simulation
    """Container for one simulated corpus and the latent quantities behind it.

    Topics and words are indexed from 0.  All sizes are taken from the
    constructor arguments, never from module-level globals.

    Every ``generate_*`` / ``update_*`` method prints a short status line and
    returns 0 on success or 1 when a required input is missing.  Stochastic
    methods accept an optional ``seed``; when given, the draw uses its own
    ``RandomState`` and leaves the global numpy generator untouched.
    """

    W = None  # D x V int matrix; W[d, n] counts the appearances of word n in document d
    Z = None  # D x V x M int array; Z[d, n, m] is the topic index of the m-th appearance
    #           of word n in document d, or EMPTY_SLOT when there is no such appearance
    B = None  # k x V matrix; B[z] is the word distribution of topic z
    C = None  # k x V int matrix; C[z, n] counts assignments of topic z to word n over all docs
    E = None  # D x k int matrix; E[d, z] counts assignments of topic z in document d
    H = None  # D x k matrix; H[d] is eta_d
    Theta = None  # D x k matrix; row-wise softmax of H
    G = None  # k x k adjacency matrix (check also python package "networkx" for graph objects!)
    K = None  # k x k precision matrix whose off-diagonal support is G
    Sigma = None  # Inverse of K

    EMPTY_SLOT = -1  # Marker for unused positions of Z (valid topic indices start at 0)

    def __init__(self, D, V, k):
        """Allocate zero matrices for D documents, V vocabulary words and k topics."""
        self.num_docs = D
        self.vocab_size = V
        self.num_topics = k
        self.W = np.zeros((D, V), dtype=int)
        self.B = np.zeros((k, V))
        self.C = np.zeros((k, V), dtype=int)
        self.E = np.zeros((D, k), dtype=int)
        self.H = np.zeros((D, k))
        self.Theta = np.zeros((D, k))
        self.G = np.zeros((k, k), dtype=int)
        self.K = np.zeros((k, k))
        self.Sigma = np.zeros((k, k))
        # Z is not created here: its last dimension depends on W (see generate_Z)

    @staticmethod
    def _rng(seed):
        """Return a private RandomState for `seed`, or the global numpy generator if None."""
        return np.random if seed is None else np.random.RandomState(seed)

    def get_M(self):
        """Return the largest word count found in W."""
        # Ref: https://numpy.org/doc/stable/reference/generated/numpy.matrix.max.html
        return self.W.max()

    # Generations
    def generate_W_from_M(self, M, seed=None):
        """Fill W with independent uniform integer counts in 0..M (M included)."""
        if M <= 0:
            print('Error: Input 0 for the chosen method')
            return 1
        rng = self._rng(seed)
        # numpy's randint excludes the upper bound, hence M + 1
        self.W = rng.randint(0, int(M) + 1, size=(self.num_docs, self.vocab_size))
        print('Success: W generated from M')
        return 0

    def generate_W_from_Z(self, seed=None):
        """Redraw W from the topic assignments in Z and the topic-word matrix B.

        Every assigned slot of Z[d] is treated as one token of document d and
        its word is drawn from B[topic].  The number of tokens per document is
        preserved, but the per-word counts change, so the word axis of Z no
        longer lines up with the new W.
        """
        if self.Z is None or not np.any(self.Z >= 0):
            print('Error: Input 0 for the chosen method')
            return 1
        used_topics = np.unique(self.Z[self.Z >= 0])
        # A row of B that does not sum to 1 is not a usable distribution
        if not np.allclose(self.B[used_topics].sum(axis=1), 1.0):
            print('Error: Input 0 for the chosen method')
            return 1
        rng = self._rng(seed)
        # Ref https://numpy.org/doc/stable/reference/random/generated/numpy.random.multinomial.html
        new_W = np.zeros((self.num_docs, self.vocab_size), dtype=int)
        for d in range(self.num_docs):
            assigned = self.Z[d][self.Z[d] >= 0]
            topic_counts = np.bincount(assigned, minlength=self.num_topics)
            for topic in np.flatnonzero(topic_counts):
                # One multinomial with c trials equals c single-word draws summed up
                new_W[d] += rng.multinomial(topic_counts[topic], self.B[topic])
        self.W = new_W
        print('Success: W generated from Z')
        return 0

    def generate_Z(self, seed=None):
        """Draw a topic from Theta[d] for every word occurrence counted in W."""
        if np.sum(self.Theta) == 0:
            print('Error: Theta matrix 0')
            return 1
        rng = self._rng(seed)
        counts = np.asarray(self.W).astype(int)
        max_repetitions = int(counts.max()) if counts.size else 0
        Z = np.full((self.num_docs, self.vocab_size, max_repetitions),
                    self.EMPTY_SLOT, dtype=int)
        for d in range(self.num_docs):
            for n in range(self.vocab_size):
                repetitions = counts[d, n]
                if repetitions > 0:
                    Z[d, n, :repetitions] = rng.choice(
                        self.num_topics, size=repetitions, p=self.Theta[d])
        self.Z = Z
        print('Success: Z generated from Theta')
        return 0

    # Transformations
    def update_Theta(self):
        """Set Theta to the row-wise softmax of H."""
        # Subtracting the row maximum leaves the softmax unchanged but avoids overflow
        shifted = self.H - self.H.max(axis=1, keepdims=True)
        exp_H = np.exp(shifted)
        self.Theta = exp_H / exp_H.sum(axis=1, keepdims=True)
        print('Success: Theta transformed from H')
        return 0

    def update_E(self):
        """Recount E[d, z]: number of slots of document d assigned to topic z."""
        if self.Z is None:
            print('Error: Z has not been generated')
            return 1
        for topic in range(self.num_topics):
            self.E[:, topic] = np.sum(self.Z == topic, axis=(1, 2))
        print('Success: E transformed from Z')
        return 0

    def update_C(self):
        """Recount C[z, n]: number of slots of word n assigned to topic z over all docs."""
        if self.Z is None:
            print('Error: Z has not been generated')
            return 1
        for topic in range(self.num_topics):
            self.C[topic, :] = np.sum(self.Z == topic, axis=(0, 2))
        print('Success: C transformed from Z')
        return 0

    def update_B(self):
        """Set each row of B to the normalised row of C.

        A topic without any count keeps its current row instead of becoming NaN.
        """
        totals = self.C.sum(axis=1)
        for topic in range(self.num_topics):
            if totals[topic] > 0:
                self.B[topic] = self.C[topic] / totals[topic]
        print('Success: B transformed from C')
        return 0

    def update_Sigma(self):
        """Set Sigma to the inverse of K (raises LinAlgError if K is singular)."""
        self.Sigma = np.linalg.inv(self.K)
        print('Success: Sigma transformed from K')
        return 0

    # Initializing with real data
    # def save_W()

    # Priors (Anas)
    @staticmethod
    def build_topic_distribution(seed=1234, vocabulary_size=None):
        """Return a random probability vector over the vocabulary.

        When `vocabulary_size` is None the module-level `V` is used, as before.
        """
        if vocabulary_size is None:
            vocabulary_size = V
        distribution = np.random.RandomState(seed).random_sample(vocabulary_size)
        return distribution / distribution.sum()

    def sample_B(self, alpha):
        """Fill B with one random word distribution per topic.

        `alpha` is currently unused.  Each topic gets its own seed (1234 + topic
        index); a shared seed would make all rows of B identical.
        """
        b = np.empty((self.num_topics, self.vocab_size))
        for i in range(self.num_topics):
            b[i, :] = self.build_topic_distribution(
                seed=1234 + i, vocabulary_size=self.vocab_size)
        self.B = b
        return None

    def sample_GK(self, gamma, seed=None):
        """Sample the graph G, build a precision matrix K supported on it, refresh Sigma.

        G is a symmetric 0/1 adjacency matrix with an empty diagonal whose edges
        are independent Bernoulli(gamma) draws.  K has non-zero off-diagonal
        entries exactly on the edges of G (magnitude uniform in [0.1, 0.9],
        random sign) and a diagonal chosen to be strictly dominant, which makes
        K symmetric positive definite and therefore invertible.
        """
        rng = self._rng(seed)
        size = self.num_topics
        # Bernoulli draws; only the strict upper triangle is kept and then mirrored
        upper = np.triu(rng.binomial(1, gamma, size=(size, size)), 1)
        self.G = upper + upper.T
        weights = rng.uniform(0.1, 0.9, size=(size, size)) * rng.choice([-1.0, 1.0], size=(size, size))
        off_diagonal = np.triu(self.G * weights, 1)
        off_diagonal = off_diagonal + off_diagonal.T
        self.K = off_diagonal + np.diag(np.abs(off_diagonal).sum(axis=1) + 1.0)
        self.update_Sigma()

    def sample_H(self, alpha, seed=None):
        """Draw one eta_d ~ N(0, Sigma) per document into H and refresh Theta.

        `alpha` is currently unused.
        """
        rng = self._rng(seed)
        mu = np.zeros(self.num_topics)
        self.H = rng.multivariate_normal(mu, self.Sigma, self.num_docs)
        self.update_Theta()


Step 1 (Anas):
- Beta -> Set some convenient alpha to have an informative sample (maybe needs some playing), get matrix B
- G -> Generate the graph to generate K to generate Sigma
- Eta -> Matrix H from the multivariate Normal to generate matrix Theta

Generate around 3 functions with the following outputs:
1. B
2. (G, K, Sigma)
3. (H, Theta)

Step 2 (Francesca):
- Matrix Z -> Will come from the multinomial given H/Theta
- Matrix W -> Will come from the multinomial given Z, B
- Matrices C, E -> Will come from Z

### Original Data Generating Algorithm by Kanthavel


In [21]:
# Imports
import pymc3 as pm
import numpy as np
# import numpy.linalg
from matplotlib import pyplot as plt

In [22]:
# Toy vocabulary of 18 words and its array form (used for boolean-mask lookups).
# NOTE(review): this rebinds the global `V`, which the first cell defines as the
# vocabulary size and asks not to overwrite - confirm whether a separate name is wanted.
V = (
    'dog aunt cat square house root mouse cow palm '
    'tree mom sun moon father spoon circle mug glass'
).split()
V_arr = np.array(V)

In [23]:
def build_topic_distribution(seed=1234, vocabulary_size=None):
    """Return a reproducible random probability vector over the vocabulary.

    Parameters
    ----------
    seed : int
        Seed of the private generator used for the draw.
    vocabulary_size : int, optional
        Length of the returned vector; defaults to ``len(V)``.

    Returns
    -------
    numpy.ndarray
        Non-negative vector of length `vocabulary_size` that sums to 1.

    The draw uses its own ``RandomState`` so the global numpy generator is not
    reseeded as a side effect; the values equal those produced by
    ``np.random.seed(seed)`` followed by ``np.random.random(vocabulary_size)``.
    """
    if vocabulary_size is None:
        vocabulary_size = len(V)
    distribution = np.random.RandomState(seed).random_sample(vocabulary_size)
    return distribution / distribution.sum()

In [24]:
build_topic_distribution()

array([0.01973417, 0.0641021 , 0.04510347, 0.08092336, 0.08036872,
       0.02808795, 0.02848688, 0.08262492, 0.0987267 , 0.09025611,
       0.0368695 , 0.05162255, 0.07042403, 0.07343683, 0.03815064,
       0.05782566, 0.0518377 , 0.0014187 ])

In [25]:
# Betas: one word distribution per named topic, seeded 1..5 in the listed order
topics = {
    topic_name: build_topic_distribution(seed=topic_seed)
    for topic_seed, topic_name in enumerate(
        ('red', 'blue', 'green', 'pink', 'yellow'), start=1)
}

In [26]:
topics['red']

array([5.95139825e-02, 1.02798842e-01, 1.63226420e-05, 4.31464413e-02,
       2.09438049e-02, 1.31778118e-02, 2.65815397e-02, 4.93156112e-02,
       5.66234210e-02, 7.68955339e-02, 5.98240254e-02, 9.77889437e-02,
       2.91777592e-02, 1.25317765e-01, 3.90853414e-03, 9.56836599e-02,
       5.95543411e-02, 7.97316600e-02])

In [27]:
topics['red'].sum()

1.0

In [28]:
# Number of topics, taken from the size of the `topics` dict (displayed below)
topics_number = len(topics)
topics_number

5

In [29]:
# Mu: random positive vector with one entry per topic, rescaled to sum to 1
np.random.seed(1984)
topic_mean = np.random.random(len(topics))
topic_mean = topic_mean / topic_mean.sum()
topic_mean

array([0.00775911, 0.2647408 , 0.04999273, 0.427745  , 0.24976236])

In [30]:
# Sigma: a random square matrix times its own transpose, so the result is symmetric
np.random.seed(12)
topic_covariance = np.random.random((topics_number,) * 2)
topic_covariance = topic_covariance.dot(topic_covariance.T)
topic_covariance

array([[0.9258647 , 1.32977215, 1.1960876 , 1.02902203, 0.2354462 ],
       [1.32977215, 2.59107874, 1.65456663, 1.74944266, 0.87948846],
       [1.1960876 , 1.65456663, 2.06661948, 1.5962034 , 0.45713552],
       [1.02902203, 1.74944266, 1.5962034 , 1.42787331, 0.61474357],
       [0.2354462 , 0.87948846, 0.45713552, 0.61474357, 0.71282345]])

In [31]:
def transform_proportions(eta):
    """Map eta to topic proportions theta with a softmax over the last axis.

    Parameters
    ----------
    eta : array_like
        A single vector of length k, or an array whose last axis has length k
        (one eta per row).

    Returns
    -------
    numpy.ndarray
        Array of proportions with singleton dimensions squeezed out; every
        vector along the last axis is non-negative and sums to 1.
    """
    eta = np.atleast_1d(np.asarray(eta, dtype=float))
    # Subtracting the maximum does not change the softmax but prevents exp() overflow
    theta = np.exp(eta - eta.max(axis=-1, keepdims=True))
    # Normalise each eta separately so that several rows can be transformed at once
    theta /= theta.sum(axis=-1, keepdims=True)
    return theta.squeeze()

In [32]:
# Correct with this: https://numpy.org/doc/stable/reference/random/index.html#random-quick-start

# Building a document:
# N words are drawn one at a time; each draw first picks a topic and then a word
# from that topic's distribution. The order of the random draws below determines
# the printed output, so statements must not be reordered.
N = 10

# Fixed seed so that the document is the same on every run
seed = 1979
np.random.seed(seed)

# Eta: a single draw (array with one row) from the multivariate normal
topic_proportions = np.random.multivariate_normal(topic_mean, topic_covariance, 1)  # (in LDA this is a Dirichlet)

# Theta: eta mapped to proportions by transform_proportions
topic_normalized_proportions = transform_proportions(topic_proportions)

# BoW: one counter per vocabulary word, all starting at zero
document = np.zeros(len(V))

for n in range(N):
    # One-hot draw over topics, converted to a boolean mask
    topic_assignment = np.random.multinomial(1, pvals=topic_normalized_proportions).squeeze().astype(bool)
    # The mask selects the name (dict key) of the drawn topic
    assigned_topic = np.array(list(topics.keys()))[topic_assignment][0]
    assigned_topic_distribution = topics[assigned_topic]
    # One-hot draw over the vocabulary from the drawn topic's word distribution
    word_mask = np.random.multinomial(1, pvals=assigned_topic_distribution).squeeze()
    word = V_arr[word_mask.astype(bool)][0]
    # Adding the one-hot vector increments the counter of the drawn word
    document += word_mask
    print(f'{n}-th word drawn from topic {assigned_topic} is {word}')
document

0-th word drawn from topic yellow is spoon
1-th word drawn from topic yellow is mouse
2-th word drawn from topic blue is tree
3-th word drawn from topic blue is cat
4-th word drawn from topic green is moon
5-th word drawn from topic blue is mug
6-th word drawn from topic pink is circle
7-th word drawn from topic green is dog
8-th word drawn from topic pink is circle
9-th word drawn from topic green is sun


array([1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 1., 2., 1.,
       0.])

# 2 SAMPLER

# 2.1 Hamiltonian MC Sampling

### Task:

You must implement a function that receives matrices $E_i$, $K_i$ and vector $\mu$ and generates the next $H_{i+1}$.


- $E$: matrix of $D \times k$, where $E_d$ is the $k$-dim vector counting, for document $d$, how many word occurrences were assigned to each topic $z$

- $K$: matrix of $k \times k$, the precision matrix associated with the graph $G$

- $\mu = 0$

- $H$: matrix of $D \times k$, where $H_d = \eta_d$ is the $k$-dim vector of topic prevalences for document $d$

In [4]:
import numpy as np
import numpy.linalg

In [5]:
def sampled_distribution_kernel(eta, K, E):
    """Unnormalised conditional density of eta for one document.

    Evaluates

        exp(-0.5 * eta' K eta + E . eta) / (sum_z exp(eta_z)) ** sum(E)

    i.e. the zero-mean Gaussian kernel with precision K times the softmax
    proportions raised to the topic counts E.  The normaliser is raised to the
    total count ``sum(E)`` (one softmax denominator per counted draw), not to
    the number of topics.

    Parameters
    ----------
    eta : array_like, shape (k,)
    K : array_like, shape (k, k)
        Precision matrix.
    E : array_like, shape (k,)
        Topic counts of the document.

    Returns
    -------
    float
        The kernel value.  It is assembled in log space and exponentiated once,
        so large counts or large eta do not produce inf / inf.
    """
    eta = np.asarray(eta, dtype=float)
    E = np.asarray(E, dtype=float)
    # Stable log(sum(exp(eta)))
    shift = eta.max()
    log_sum_exp = shift + np.log(np.sum(np.exp(eta - shift)))
    log_kernel = -0.5 * eta.dot(np.dot(K, eta)) + E.dot(eta) - E.sum() * log_sum_exp
    return np.exp(log_kernel)

In [6]:
def _log_eta_kernel(eta, K, E_d):
    """Log of the unnormalised conditional density of eta given the counts E_d."""
    shift = eta.max()
    log_sum_exp = shift + np.log(np.sum(np.exp(eta - shift)))
    # Each counted draw contributes one softmax denominator, hence the factor E_d.sum()
    return -0.5 * eta.dot(K.dot(eta)) + E_d.dot(eta) - E_d.sum() * log_sum_exp


def H_sampler(E, Sigma, H_current=None, burn_in=100, seed=None):
    """Sample the next H, one eta per document, with a random-walk Metropolis chain.

    For each document a chain of ``burn_in + 1`` steps is run from the current
    eta.  Proposals come from a normal centred at the current state with
    covariance `Sigma` (a symmetric proposal), and the final state is kept.

    Parameters
    ----------
    E : array_like, shape (D, k)
        Topic counts per document.
    Sigma : array_like, shape (k, k)
        Covariance matrix; its inverse is used as the precision matrix K.
    H_current : array_like, shape (D, k), optional
        Starting etas; defaults to zeros.  It is not modified.
    burn_in : int
        Number of steps discarded before the kept one.
    seed : int, optional
        Passed to ``np.random.seed`` so the run is reproducible.

    Returns
    -------
    numpy.ndarray, shape (D, k)
        The sampled H.

    The acceptance test is carried out on log densities: a ratio of raw
    densities overflows or underflows for realistic counts and turns into NaN,
    which ``min(1, nan)`` silently treats as "always accept".
    """
    np.random.seed(seed)

    E = np.asarray(E, dtype=float)
    Sigma = np.asarray(Sigma, dtype=float)
    K = np.linalg.inv(Sigma)

    D, k = E.shape  # Number of documents, Number of topics

    if H_current is None:
        H_current = np.zeros((D, k))
    H_current = np.asarray(H_current, dtype=float)

    H_sampled = np.zeros((D, k))

    for d in range(D):  # Iterating over each document
        eta_current = H_current[d]
        E_d = E[d]
        log_p_current = _log_eta_kernel(eta_current, K, E_d)
        for iteration in range(burn_in + 1):

            # Sampling proposed eta from multivariate normal (q "proposal density")
            eta_prop = np.random.multivariate_normal(eta_current, Sigma)
            log_p_prop = _log_eta_kernel(eta_prop, K, E_d)

            # Log acceptance ratio; values >= 0 correspond to acceptance probability 1
            log_alpha = log_p_prop - log_p_current

            # The uniform is only drawn when the proposal is not accepted outright
            if log_alpha >= 0 or np.random.uniform(0.0, 1.0) < np.exp(log_alpha):
                eta_current = eta_prop
                log_p_current = log_p_prop

        H_sampled[d] = eta_current

    return H_sampled

# 2.2 MCMC Sampling

### Task:

You must implement a function that receives matrices $W$, $\Theta_{i+1}$ and $B_i$ and generates the next $Z_{i+1}$ and $B_{i+1}$.

In [None]:
def MCMC_Sampling(W, Theta, B):
    """Placeholder for the Z / B sampling step; not implemented yet.

    The arguments are accepted but not used, and the function returns None.
    """
    return

# 2.3 BDMCMC Sampling

### Task:

You must implement a function that receives matrices $W$, $Z_{i+1}$ and $H_{i+1}$ and generates the next $G_{i+1}$ and $K_{i+1}$.

In [40]:
def BDMCMC_Sampling(W, Z, H):
    """Placeholder for the G / K sampling step; the update calls are still disabled.

    NOTE(review): `G` and `K` are neither parameters nor assigned in this
    function, so the return statement presumably relies on module-level names -
    confirm where they are meant to come from.
    """
    # update_G(W, Z, H, K, G)
    # update_K()
    return G, K

In [39]:
gamma = 0.05  # Prior probability of including any single edge in G


def update_G(W, Z, H, K, G, b=3, S=None):
    """Apply one birth/death update to the adjacency matrix G, in place.

    For every node pair the graph obtained by flipping that edge is scored
    against the current graph.  The score ratio, capped at 1, is the death rate
    of an existing edge or the birth rate of a missing one.  Rates are divided
    by their total, and an edge is removed (added) when its normalised death
    (birth) rate exceeds 0.5.

    Parameters
    ----------
    W, Z : unused, kept for interface compatibility.
    H : numpy.ndarray, shape (n, k)
        One eta per row.
    K : numpy.ndarray, shape (k, k)
        Current precision matrix; must have a positive determinant.
    G : numpy.ndarray, shape (k, k)
        Symmetric 0/1 adjacency matrix; modified in place.
    b : number, optional
        Degrees-of-freedom term of the prior on K.
    S : numpy.ndarray, shape (k, k), optional
        Scale matrix of the prior on K; defaults to the identity.

    Returns
    -------
    numpy.ndarray
        The same `G` object after the update.

    Raises
    ------
    ValueError
        If the determinant of K is not positive.

    Notes
    -----
    The edge-inclusion probability is read from the module-level `gamma`.
    K is not recomputed for the candidate graph, so the K-dependent terms cancel
    and only the edge-count prior currently distinguishes the candidates.
    NOTE(review): the exponent (b + n - 2) of the K term is kept as originally
    written; a G-Wishart density would usually carry (b + n - 2) / 2 - confirm.
    NOTE(review): `n` and `D` in the original formulas are both taken to be
    the number of rows of H - confirm.
    """
    N = G.shape[0]
    n_obs = H.shape[0]
    if S is None:
        S = np.eye(N)

    sign, log_det_K = np.linalg.slogdet(K)
    if sign <= 0:
        raise ValueError('K must have a positive determinant')

    HtH = np.matmul(np.transpose(H), H)
    # Graph-independent log terms, computed once: log Pr(H | K) and log Pr(K | G)
    log_PrHK = (n_obs / 2) * log_det_K - 0.5 * np.trace(np.matmul(K, HtH))
    log_PrK_G = (b + n_obs - 2) * log_det_K - 0.5 * np.trace(np.matmul(S + HtH, K))
    log_edge_odds = np.log(gamma / (1 - gamma))

    def log_PrKG_H(graph):
        # log Pr(G) is proportional to the number of undirected edges
        n_edges = np.count_nonzero(np.triu(graph, 1))
        return log_PrHK + log_PrK_G + n_edges * log_edge_odds

    delta_K = 0  # total death rate
    beta_K = 0   # total birth rate
    death_rates = np.zeros((N, N))
    birth_rates = np.zeros((N, N))

    log_init = log_PrKG_H(G)

    for i in range(N):
        for j in range(i + 1, N):
            edge_present = bool(G[i, j])
            G_loop = G.copy()
            G_loop[i, j] = G_loop[j, i] = 0 if edge_present else 1
            # technically, we should compute K_loop here...
            log_ratio = log_PrKG_H(G_loop) - log_init
            # exp(min(0, x)) equals min(1, exp(x)) without overflowing
            rate = np.exp(min(0.0, log_ratio))

            if edge_present:
                death_rates[i, j] = death_rates[j, i] = rate
                delta_K += rate
            else:
                birth_rates[i, j] = birth_rates[j, i] = rate
                beta_K += rate

    total_rate = beta_K + delta_K
    if total_rate == 0:
        # No node pairs (or all rates vanished): nothing to normalise, G is unchanged
        return G

    pr_death = death_rates / total_rate
    pr_birth = birth_rates / total_rate

    for i in range(N):
        for j in range(i + 1, N):
            if pr_death[i, j] > 0.5:
                G[i, j], G[j, i] = 0, 0
            elif pr_birth[i, j] > 0.5:
                G[i, j], G[j, i] = 1, 1

    return G
