In [5]:
# GLOBAL VARIABLES. BE SURE NOT TO OVERWRITE THEM
# NOTE(review): the vocabulary cell of the data-generating example further down
# rebinds `V` to a list of words, so after that cell runs `V` is no longer the integer below.
D = 50 # Number of documents in the corpus
V = 100 # Size of the vocabulary (number of distinct words)
M = 10 # Maximum number of repetitions of the same word in a document
k = 5 # Number of topics

## IMPORTANT: Please use static random seeds in **EVERY** cell where you use a random function, so that the result does **NOT** change at every run.

# 1. ARTIFICIAL DATA

### Task:

You must implement an algorithm that generates an artificial *corpus* and also returns a graph G and a correlation matrix Sigma.

In [None]:
import numpy as np

In [16]:
# OUTPUT:
class SIM:  # I'm using a class as a namespace - SIM = Simulation
    """State of the simulated corpus: every matrix of the generative model.

    An instance is created with the corpus dimensions (D documents, V words,
    k topics); its methods then fill in or transform the matrices listed
    below. The implemented methods share one convention: they print a
    'Success: ...' or 'Error: ...' message and return 0 on success and 1 on
    failure. `sample_GK` and `sample_H` are not implemented yet and raise
    NotImplementedError.

    Randomness comes from the global `np.random` state, so call
    `np.random.seed(...)` before any `generate_*` / `sample_*` method to get
    reproducible results.
    """

    # Filler stored in the unused slots of Z. Topic indexes start at 0, so 0
    # cannot be used to mean "no appearance".
    NO_TOPIC = -1

    W = None  # matrix of DxV where W[d, n] counts the appearances of word n in document d
    Z = None  # matrix of DxVx(max count in W) where Z[d, n, m] is the topic index (0..k-1) from which the m-th appearance of word n in doc d is drawn; unused slots hold NO_TOPIC
    B = None  # matrix of kxV where B[z] is the parameter vector of the word distribution of topic z
    C = None  # matrix of kxV where C[z, n] counts, over all docs, the appearances of word n assigned to topic z
    E = None  # matrix of Dxk where E[d, z] counts, over all words of doc d, the appearances assigned to topic z
    H = None  # matrix of Dxk where H[d] is eta_d
    Theta = None  # matrix of Dxk, the row-wise softmax of H
    G = None  # Adjacency Matrix (Check also python package "networkx" for graph objects!)
    K = None  # Precision matrix of G
    Sigma = None  # Inverse of K

    def __init__(self, D, V, k):
        """Allocate zero-filled matrices for D documents, V words and k topics."""
        # Keep the dimensions on the instance so that no method depends on
        # notebook-level globals, which other cells may rebind.
        self.n_docs = D
        self.n_words = V
        self.n_topics = k
        # W holds counts and is used as a loop bound, hence the integer dtype.
        self.W = np.zeros((D, V), dtype=int)
        self.B = np.zeros((k, V))
        self.C = np.zeros((k, V))
        self.E = np.zeros((D, k))
        self.H = np.zeros((D, k))
        self.Theta = np.zeros((D, k))
        self.G = np.zeros((k, k))
        self.K = np.zeros((k, k))
        self.Sigma = np.zeros((k, k))
        # Z shouldn't be created before W is sampled (depends if generated or read)
        self.Z = None

    # Generations
    def generate_W(self, method, M):
        """Fill W, either uniformly at random or from the topic assignments.

        Parameters:
            method: 'from_M' draws every W[d, n] uniformly from {0, ..., M};
                'from_Z' rebuilds W by drawing one word from B[z] for every
                topic assignment z stored in Z (M is ignored in this case).
            M: maximum number of repetitions of a word in a document
                (inclusive); must be positive for 'from_M'.

        Returns:
            0 on success, 1 when the method is unknown or its inputs are
            missing (M <= 0, Z not generated yet, or B still all zeros).
        """
        if method == 'from_M' and M > 0:
            # np.random.randint excludes the upper bound, so M + 1 makes M
            # itself a possible count.
            self.W = np.random.randint(0, M + 1, size=(self.n_docs, self.n_words))
            print('Success: W generated from M')
            return 0
        elif method == 'from_Z' and self.Z is not None and np.sum(self.B) > 0:
            assignments = np.asarray(self.Z).astype(int)
            # Start from zero: accumulating onto the previous W would mix the
            # old counts with the new draws.
            counts = np.zeros((self.n_docs, self.n_words), dtype=int)
            for d in range(self.n_docs):
                assigned = assignments[d][assignments[d] != self.NO_TOPIC]
                tokens_per_topic = np.bincount(assigned, minlength=self.n_topics)
                for topic, n_tokens in enumerate(tokens_per_topic):
                    if n_tokens:
                        # One multinomial with n_tokens trials has the same
                        # distribution as n_tokens single-word draws from B[topic].
                        counts[d] += np.random.multinomial(n_tokens, self.B[topic])
            self.W = counts
            print('Success: W generated from Z')
            return 0
        else:
            print('Error: Input 0 for the chosen method')
            return 1

    def generate_Z(self):
        """Draw a topic for every word appearance counted in W.

        For each document d, word n and appearance m < W[d, n], a topic index
        is drawn from the categorical distribution Theta[d] and stored in
        Z[d, n, m]. Z is (re)allocated here with its last axis as long as the
        largest count in W; slots without an appearance keep NO_TOPIC.

        Returns:
            0 on success, 1 when Theta is still all zeros.
        """
        if np.sum(self.Theta) == 0:
            print('Error: Theta matrix 0')
            return 1
        counts = np.asarray(self.W).astype(int)
        depth = int(counts.max()) if counts.size else 0
        self.Z = np.full((self.n_docs, self.n_words, depth), self.NO_TOPIC, dtype=int)
        for d in range(self.n_docs):
            for n in range(self.n_words):
                repetitions = int(counts[d, n])
                if repetitions > 0:
                    # Each row of `draws` is a canonical vector over the k
                    # topics; argmax recovers the index of the drawn topic.
                    draws = np.random.multinomial(1, self.Theta[d], size=repetitions)
                    self.Z[d, n, :repetitions] = draws.argmax(axis=1)
        print('Success: Z generated from Theta')
        return 0

    # Transformations
    def update_Theta(self):
        """Set Theta to the row-wise softmax of H. Returns 0."""
        scores = np.asarray(self.H, dtype=float)
        # Subtracting the row maximum leaves the softmax unchanged but keeps
        # np.exp from overflowing for large entries of H.
        weights = np.exp(scores - scores.max(axis=1, keepdims=True))
        self.Theta = weights / weights.sum(axis=1, keepdims=True)
        print('Success: Theta transformed from H')
        return 0

    def update_E(self):
        """Recount E from Z: E[d, z] = number of appearances in doc d assigned to topic z.

        Returns:
            0 on success, 1 when Z has not been generated yet.
        """
        if self.Z is None:
            print('Error: Z has not been generated')
            return 1
        for topic in range(self.n_topics):
            # Sum over words and appearances, leaving one count per document.
            self.E[:, topic] = np.sum(self.Z == topic, axis=(1, 2))
        print('Success: E transformed from Z')
        return 0

    def update_C(self):
        """Recount C from Z: C[z, n] = number of appearances of word n assigned to topic z.

        Returns:
            0 on success, 1 when Z has not been generated yet.
        """
        if self.Z is None:
            print('Error: Z has not been generated')
            return 1
        for topic in range(self.n_topics):
            # Sum over documents and appearances, leaving one count per word.
            self.C[topic, :] = np.sum(self.Z == topic, axis=(0, 2))
        print('Success: C transformed from Z')
        return 0

    def update_B(self):
        """Set every row of B to the corresponding row of C normalised to sum to 1.

        Returns:
            0 on success, 1 (leaving B untouched) when some topic has no
            counts in C, since its row cannot be normalised.
        """
        # Note this is the transformation from C
        totals = np.sum(self.C, axis=1, keepdims=True)
        if np.any(totals == 0):
            print('Error: C has a topic without counts')
            return 1
        self.B = self.C / totals
        print('Success: B transformed from C')
        return 0

    def update_Sigma(self):
        """Set Sigma to the inverse of K.

        Returns:
            0 on success, 1 (leaving Sigma untouched) when K is singular.
        """
        try:
            self.Sigma = np.linalg.inv(self.K)
        except np.linalg.LinAlgError:
            print('Error: K is singular')
            return 1
        print('Success: Sigma transformed from K')
        return 0

    # Initializing with real data
    # def save_W()

    # Priors (Anas)
    def sample_B(self, alpha):
        """Sample every row of B from a Dirichlet distribution over the vocabulary.

        Parameters:
            alpha: Dirichlet concentration, either one positive scalar shared
                by all words or a vector with one positive entry per word.

        Returns:
            0 on success, 1 when alpha is not strictly positive.
        """
        # Dirichlet
        concentration = np.broadcast_to(np.asarray(alpha, dtype=float), (self.n_words,))
        if np.any(concentration <= 0):
            print('Error: alpha must be strictly positive')
            return 1
        self.B = np.random.dirichlet(concentration, size=self.n_topics)
        print('Success: B sampled from Dirichlet')
        return 0

    def sample_GK(self, gamma):
        """Sample the graph G and the precision matrix K (not implemented yet).

        Planned steps: Bernoulli draws for G, a G-Wishart draw for K given G,
        then `self.update_Sigma()`.
        """
        # Bernoulli for G
        # G-Wishart for K given G
        raise NotImplementedError('sample_GK is not implemented yet')

    def sample_H(self, alpha):
        """Sample H from a multivariate normal (not implemented yet).

        Planned steps: a multivariate normal draw for H, then
        `self.update_Theta()`.
        """
        # Multivariate Normal
        raise NotImplementedError('sample_H is not implemented yet')
        

In [17]:
# Planning notes for the artificial-data task (who implements which step).
# This bare string is not assigned to anything; it only documents the plan.
'''
Step 1 (Anas):
- Beta -> Set some convenient alpha to have an informative sample (maybe needs some playing), get matrix B
- G -> Generate the graph to generate K to generate Sigma
- Eta -> Matrix H from the multivariate Normal to generate matrix Theta

Generate around 3 functions with the following outputs:
1. B
2. (G, K, Sigma)
3. (H, Theta)

Step 2 (Francesca):
- Matrix Z -> Will come from the multinomial given H/Theta
- Matrix W -> Will come from the multinomial given Z, B
- Matrices C, E -> Will come from Z
'''

### Original Data Generating Algorithm by Kanthavel


In [1]:
# Imports
import pymc3 as pm
import numpy as np
# import numpy.linalg
from matplotlib import pyplot as plt

In [2]:
# Toy vocabulary of the original data-generating example.
# NOTE: this rebinds the notebook-level name `V` to a list of words.
V = (
    'dog aunt cat square house root mouse cow palm '
    'tree mom sun moon father spoon circle mug glass'
).split()
# Array copy of the vocabulary, so words can be selected with boolean masks.
V_arr = np.asarray(V)

In [3]:
def build_topic_distribution(seed=1234, vocabulary_size=None):
    """Build a reproducible random probability vector over the vocabulary.

    Parameters:
        seed: seed of the private random generator used for the draw.
        vocabulary_size: length of the returned vector; defaults to the
            length of the notebook-level vocabulary `V`.

    Returns:
        1-D array of `vocabulary_size` non-negative weights that sum to 1.
    """
    if vocabulary_size is None:
        vocabulary_size = len(V)
    # A private RandomState yields the same numbers as seeding the global
    # generator with `seed`, but does not reset the global generator as a
    # side effect of calling this function.
    rng = np.random.RandomState(seed)
    distribution = rng.random_sample(vocabulary_size)
    return distribution / distribution.sum()

In [4]:
build_topic_distribution()

array([0.01973417, 0.0641021 , 0.04510347, 0.08092336, 0.08036872,
       0.02808795, 0.02848688, 0.08262492, 0.0987267 , 0.09025611,
       0.0368695 , 0.05162255, 0.07042403, 0.07343683, 0.03815064,
       0.05782566, 0.0518377 , 0.0014187 ])

In [5]:
# Betas: one word distribution per topic, keyed by a colour name.
# The topics get the seeds 1, 2, 3, ... in the order listed here.
topics = {
    name: build_topic_distribution(seed=topic_seed)
    for topic_seed, name in enumerate(('red', 'blue', 'green', 'pink', 'yellow'), start=1)
}

In [6]:
topics['red']

array([5.95139825e-02, 1.02798842e-01, 1.63226420e-05, 4.31464413e-02,
       2.09438049e-02, 1.31778118e-02, 2.65815397e-02, 4.93156112e-02,
       5.66234210e-02, 7.68955339e-02, 5.98240254e-02, 9.77889437e-02,
       2.91777592e-02, 1.25317765e-01, 3.90853414e-03, 9.56836599e-02,
       5.95543411e-02, 7.97316600e-02])

In [7]:
topics['red'].sum()

1.0

In [8]:
topics_number = len(topics)
topics_number

5

In [9]:
# Mu: a random positive vector with one entry per topic, rescaled to sum to 1.
np.random.seed(1984)
topic_mean = np.random.random(size=len(topics))
topic_mean = topic_mean / topic_mean.sum()
topic_mean

array([0.00775911, 0.2647408 , 0.04999273, 0.427745  , 0.24976236])

In [10]:
# Covariance between topics: a random square matrix multiplied by its own
# transpose, which makes the result symmetric.
np.random.seed(12)
topic_covariance = np.random.random(size=(topics_number, topics_number))
topic_covariance = topic_covariance @ topic_covariance.T
topic_covariance

array([[0.9258647 , 1.32977215, 1.1960876 , 1.02902203, 0.2354462 ],
       [1.32977215, 2.59107874, 1.65456663, 1.74944266, 0.87948846],
       [1.1960876 , 1.65456663, 2.06661948, 1.5962034 , 0.45713552],
       [1.02902203, 1.74944266, 1.5962034 , 1.42787331, 0.61474357],
       [0.2354462 , 0.87948846, 0.45713552, 0.61474357, 0.71282345]])

In [11]:
def transform_proportions(eta):
    """Turn unnormalised topic scores into proportions with the softmax.

    Parameters:
        eta: array of scores whose last axis runs over the topics, e.g. a
            single vector of k scores, a (1, k) draw or a (D, k) matrix.

    Returns:
        Array of proportions that sum to 1 along the topic axis, with all
        length-1 axes removed (so a (1, k) input gives a (k,) output).
    """
    eta = np.atleast_1d(np.asarray(eta, dtype=float))
    if eta.size == 0:
        return eta.squeeze()
    # Subtracting the maximum leaves the softmax unchanged but keeps np.exp
    # from overflowing (which would produce NaN) for large scores.
    theta = np.exp(eta - eta.max(axis=-1, keepdims=True))
    # Normalise along the topic axis only, so every row of a matrix input
    # becomes its own probability vector.
    theta /= theta.sum(axis=-1, keepdims=True)
    return theta.squeeze()

In [24]:
# TODO: migrate the random calls below to the Generator API described here:
# https://numpy.org/doc/stable/reference/random/index.html#random-quick-start

# Building a document:
# N is the number of words drawn for the document.
N = 10

# Fixed seed so that the draws below are the same on every run.
seed = 1979
np.random.seed(seed)

# Eta: one draw from the multivariate normal, returned with shape (1, number of topics)
topic_proportions = np.random.multivariate_normal(topic_mean, topic_covariance, 1)  # (in LDA this is a Dirichlet)

# Theta: eta mapped to proportions that sum to 1
topic_normalized_proportions = transform_proportions(topic_proportions)

# BoW: one counter per word of the vocabulary, all starting at zero
document = np.zeros(len(V))

for n in range(N):
    # One-hot draw of the topic for this word, converted to a boolean mask
    topic_assignment = np.random.multinomial(1, pvals=topic_normalized_proportions).squeeze().astype(bool)
    # Name of the topic selected by the mask (relies on the key order of `topics`)
    assigned_topic = np.array(list(topics.keys()))[topic_assignment][0]
    assigned_topic_distribution = topics[assigned_topic]
    # One-hot draw of the word from the word distribution of the chosen topic
    word_mask = np.random.multinomial(1, pvals=assigned_topic_distribution).squeeze()
    word = V_arr[word_mask.astype(bool)][0]
    # Add the drawn word to the bag-of-words counts
    document += word_mask
    print(f'{n}-th word drawn from topic {assigned_topic} is {word}')
document

0-th word drawn from topic green is square
1-th word drawn from topic green is glass
2-th word drawn from topic blue is house
3-th word drawn from topic green is tree
4-th word drawn from topic green is glass
5-th word drawn from topic green is glass
6-th word drawn from topic blue is sun
7-th word drawn from topic blue is palm
8-th word drawn from topic red is father
9-th word drawn from topic blue is father


array([0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 2., 0., 0., 0.,
       3.])

# 2 SAMPLER

# 2.1 Hamiltonian MC Sampling

### Task:

You must implement a function that receives matrices $E_i$, $K_i$ and vector $\mu$ and generates the next $H_{i+1}$.


- $E$: matrix of size $D \times k$, where $E_d$ is the $k$-dimensional vector of counts of sampled drawings for each topic $z$ over all words of document $d$

- $K$: matrix of size $k \times k$, the precision matrix associated with the graph $G$

- $\mu = 0$

- $H$: matrix of size $D \times k$, where $H_d = \eta_d$ is the $k$-dimensional vector of topic prevalences of document $d$

In [1]:
# import cmdstanpy
# cmdstanpy.install_cmdstan()

Installing CmdStan version: 2.28.2
Install directory: /home/kanthavel/.cmdstan
CmdStan version 2.28.2 already installed
deleting tmpfiles dir: /tmp/tmppunv_5p8
done



True

In [3]:
import os
import numpy as np
# Point cmdstanpy at CmdStan 2.28.2 in the current user's home directory
# instead of a path hard-coded to one machine. An existing CMDSTAN setting is
# respected, and the variable is only set when that directory really exists.
# NOTE(review): when CMDSTAN stays unset, cmdstanpy is expected to fall back
# to its own lookup of the installation - confirm.
if 'CMDSTAN' not in os.environ:
    _default_cmdstan = os.path.join(os.path.expanduser('~'), '.cmdstan', 'cmdstan-2.28.2')
    if os.path.isdir(_default_cmdstan):
        os.environ['CMDSTAN'] = _default_cmdstan
from cmdstanpy import cmdstan_path, CmdStanModel

import arviz as az
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Stan program for a zero-mean multivariate normal of dimension `dim`,
# parameterised by the Cholesky factor `cov_chol` of its covariance matrix.
normal_code = """
    data {
        int<lower=0> dim;
        matrix[dim, dim] cov_chol;
    }
    
    parameters {
        vector[dim] x;
    }
    
    model {
        vector[dim] mu = rep_vector(0, dim);
        x ~ multi_normal_cholesky(mu, cov_chol);
    }
"""




# Save the Stan program next to the notebook, then build a model from that file.
stan_file = "./multi_normal.stan"

with open(stan_file, "w") as f:
    # The trailing newline matches what print() would have appended.
    f.write(normal_code + "\n")

stan_model = CmdStanModel(stan_file=stan_file)

INFO:cmdstanpy:compiling stan program, exe file: /media/kanthavel/data/code/Bayesian_Statistics/Project/multi_normal
INFO:cmdstanpy:compiler options: stanc_options=None, cpp_options=None
INFO:cmdstanpy:compiled model file: /media/kanthavel/data/code/Bayesian_Statistics/Project/multi_normal


In [5]:
# Dimension of the test distribution.
d = 2
# Covariance with entries 0.99 ** |i - j|: ones on the diagonal, and a
# correlation that decays with the distance between the two indexes.
sigma = 0.99 ** np.abs(np.subtract.outer(np.arange(d), np.arange(d)))
# Lower-triangular Cholesky factor of sigma.
sigma_chol = np.linalg.cholesky(sigma)

In [6]:
# Data block of the Stan program. `dim` is read from the Cholesky factor
# itself so that the two entries can never disagree.
normal_data = {
    "dim": sigma_chol.shape[0],
    "cov_chol": sigma_chol
}

# Fixed seed so that the sampler returns the same draws on every run, as the
# notebook requires for every cell that uses randomness.
stan_fit = stan_model.sample(data=normal_data, chains=4, parallel_chains=4,
                             iter_warmup=1000, iter_sampling=5000, seed=1234)

# Convert the fit to an ArviZ object for later inspection.
cmdstanpy_data = az.from_cmdstanpy(stan_fit)

INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:start chain 2
INFO:cmdstanpy:start chain 3
INFO:cmdstanpy:start chain 4
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:finish chain 3
INFO:cmdstanpy:finish chain 2
INFO:cmdstanpy:finish chain 4


In [None]:
# Stan program for the H update: every row H[d] (eta_d) has a multivariate
# normal prior with mean mu and covariance Sigma, and the topic counts E[d]
# are multinomial with probabilities softmax(H[d]).
# E is declared as an integer array because Stan's multinomial needs integer
# counts, and Sigma is factorised once in `transformed data`.
# NOTE: this rebinds `normal_code`, replacing the program of the earlier cell.
normal_code = """
    data {
        int<lower=0> D;
        int<lower=0> k;
        array[D, k] int<lower=0> E;
        cov_matrix[k] Sigma;
        vector[k] mu;
    }

    transformed data {
        matrix[k, k] Sigma_chol = cholesky_decompose(Sigma);
    }

    parameters {
        matrix[D, k] H;
    }

    model {
        for (d in 1:D) {
            H[d] ~ multi_normal_cholesky(mu, Sigma_chol);
            E[d] ~ multinomial(softmax(H[d]'));
        }
    }
"""

In [None]:
def Hamiltonian_MC_Sampling(E, K, mu):
    """Placeholder for the Hamiltonian MC step of the sampler.

    According to the task description it should receive the matrices E_i and
    K_i and the vector mu, and generate the next H_{i+1}.
    Not implemented yet: it ignores its arguments and returns None.
    """
    
    return None

# 2.2 MCMC Sampling

### Task:

You must implement a function that receives matrices $W$, $\Theta_{i+1}$ and $B_i$ and generates the next $Z_{i+1}$ and $B_{i+1}$.

In [21]:
def MCMC_Sampling(W, Theta, B):
    """Placeholder for the MCMC step of the sampler.

    According to the task description it should receive the matrices W,
    Theta_{i+1} and B_i, and generate the next Z_{i+1} and B_{i+1}.
    Not implemented yet: it ignores its arguments and returns None.
    """
    return None

# 2.3 BDMCMC Sampling

### Task:

You must implement a function that receives matrices $W$, $Z_{i+1}$ and $H_{i+1}$ and generates the next $G_{i+1}$ and $K_{i+1}$.

In [21]:
def BDMCMC_Sampling(W, Z, H):
    """Placeholder for the BDMCMC step of the sampler.

    According to the task description it should receive the matrices W,
    Z_{i+1} and H_{i+1}, and generate the next G_{i+1} and K_{i+1}.
    Not implemented yet: it ignores its arguments and returns None.
    """
    return None