# Dummy training version 

### 0- Import libraries

In [13]:
import torch
import json
import numpy as np
from scipy.special import logit
from fcts.lbm_nmar import LBM_NMAR
from fcts.lbfgs import FullBatchLBFGS
from fcts.utils import reparametrized_expanded_params, inv_softplus, shrink_simplex

### 1- Load torch parameters

In [14]:
# Compute-device selection for the whole notebook.
# (for Mac) %env PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
device = 'mps' #put 'cuda' or 'cpu'
device2 = 'mps' #put None or 'cuda'

# Probe the backend that was actually requested, so that choosing 'cuda' is
# validated against CUDA and choosing 'mps' against MPS. 'cpu' (and any other
# device type) is accepted as-is.
_requested_backend = torch.device(device).type
if _requested_backend == 'cuda':
    _backend_available = torch.cuda.is_available()
elif _requested_backend == 'mps':
    _backend_available = torch.backends.mps.is_available()
else:
    _backend_available = True

if not _backend_available:
    # Fall back to CPU and drop the secondary device, naming the backend that
    # is really missing.
    print(f"{_requested_backend} is not available. Algorithm will use cpu")
    device, device2 = torch.device('cpu'), None

### 2- Load parliament datasets

In [15]:
# votes: matrix gathering the votes of each politician on each law
# (1: positive, 0: missing/abstention, -1: negative)
votes = np.loadtxt("data_parliament/votes.txt", delimiter=";").astype(int)

# deputes: family name, name, political group.
# The file is opened in a `with` block so the handle is closed after parsing.
with open('data_parliament/deputes.json', 'r') as deputes_file:
    deputes = json.load(deputes_file)

# texts: political group demanding the law, title of demand, date,
# type (type of vote, type of majority, name of type of vote)
with open('data_parliament/texts.json', 'r') as texts_file:
    texts = json.load(texts_file)

### 3- Train one iteration by hand

#### a - Parameter initialization

Dataset shape

In [16]:
# n1: number of rows of the vote matrix, n2: number of columns.
n1, n2 = votes.shape

# Report both dimensions, one line each.
for label, size in (
    ("row length (number of deputes): ", n1),
    ("col length (number of laws): ", n2),
):
    print(label, size)

row length (number of deputes):  576
col length (number of laws):  1256


Define a number of row and column clusters

In [17]:
# Number of row clusters (value to be chosen by the reader).
nq = 3

# Number of column clusters (value to be chosen by the reader).
nl = 5

Initialization of parameters: 

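$\gamma = (\nu_a, \rho_a, \nu_b, \rho_b, \nu_c, \rho_c, \nu_d, \rho_d, \tau_1, \tau_2)$ (in the code below, the column-side indices $c$ and $d$ are named `p` and `q`)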

In [None]:
def _random_simplex_rows(n_rows, n_clusters):
    """Draw one random probability vector of length `n_clusters` per row.

    For each row, `n_clusters - 1` uniform cut points are sorted, padded with
    0 on the left and 1 on the right, and differenced, so every row is
    non-negative and sums to 1. Returns an array of shape (n_rows, n_clusters).
    """
    cut_points = np.sort(np.random.uniform(size=(n_rows, n_clusters - 1)), axis=1)
    padded = np.concatenate(
        (np.zeros((n_rows, 1)), cut_points, np.ones((n_rows, 1))),
        axis=1,
    )
    return np.diff(padded, axis=1)


# ν: mean of the restricted variational distribution q_{gamma}.
# Row-side terms are column vectors of size n1, column-side terms are row
# vectors of size n2; all are drawn uniformly in [-0.5, 0.5).
nu_a, nu_b = (np.random.uniform(-0.5, 0.5, (n1, 1)) for _ in range(2))
nu_p, nu_q = (np.random.uniform(-0.5, 0.5, (1, n2)) for _ in range(2))

# ρ: same shapes as the matching ν, all entries initialised to 1e-5.
# NOTE(review): the original note also called these "means"; since they go
# through inv_softplus when gamma is assembled they are presumably positive
# spread parameters of q_{gamma} -- TODO confirm.
rho_a, rho_b = (np.full((n1, 1), 1e-5) for _ in range(2))
rho_p, rho_q = (np.full((1, n2), 1e-5) for _ in range(2))

# τ: probability of each row (resp. column) to be in cluster k ∈ {1,...,nq}
# (resp. l ∈ {1,...,nl}).
tau_1 = _random_simplex_rows(n1, nq)  # size (n1, nq)
tau_2 = _random_simplex_rows(n2, nl)  # size (n2, nl)

In [None]:
# Pack the variational parameters into one flat vector, in the order
# (nu, rho) for a, b, p, q, followed by tau_1 and tau_2.
# rho goes through inv_softplus and tau through shrink_simplex + logit
# (presumably to obtain unconstrained values -- helpers live in fcts.utils).
gamma_pieces = []
for mean_part, spread_part in (
    (nu_a, rho_a),
    (nu_b, rho_b),
    (nu_p, rho_p),
    (nu_q, rho_q),
):
    gamma_pieces.append(mean_part.flatten())
    gamma_pieces.append(inv_softplus(spread_part.flatten()))

for membership in (tau_1, tau_2):
    gamma_pieces.append(logit(shrink_simplex(membership).flatten()))

gamma = np.concatenate(gamma_pieces)

# Just-in-case checks: gamma is 1-D and has the expected number of entries.
expected_gamma_size = 4 * n1 + 4 * n2 + (n1 * (nq - 1)) + (n2 * (nl - 1))
assert gamma.ndim == 1
assert gamma.shape[0] == expected_gamma_size

$\theta = (\mu, \sigma_a^2, \sigma_b^2, \sigma_c^2, \sigma_d^2, \alpha_1, \alpha_2, \pi)$ (in the code below, the column-side indices $c$ and $d$ are named `p` and `q`)

In [18]:
# μ: expit(μ) is the global missingness rate; drawn uniformly in [-4.5, -3.5).
mu_un = np.random.uniform(-4.5, -3.5)

# σ^2: variances of latent variables A, B, C, D, each drawn in [0.4, 0.7).
sigma_sq_a, sigma_sq_b, sigma_sq_p, sigma_sq_q = (
    np.random.uniform(0.4, 0.7) for _ in range(4)
)

# α: uniform probability of each row cluster (column vector of size nq)
# and of each column cluster (row vector of size nl).
alpha_1 = np.full((nq, 1), 1 / nq)
alpha_2 = np.full((1, nl), 1 / nl)

# π_{k,l}: one value per (row cluster k, column cluster l) pair, size (nq, nl),
# drawn uniformly in [0.2, 0.8).
# NOTE(review): described originally as "probability of being in cluster
# k & l" -- confirm the exact meaning against the model definition.
pi = np.random.uniform(0.2, 0.8, size=(nq, nl))

In [19]:
# Pack the model parameters into one flat vector, in the order
# mu, the four variances, alpha_1, alpha_2, pi.
# Variances go through inv_softplus; alpha goes through shrink_simplex + logit
# and pi through logit (presumably to obtain unconstrained values).
theta_scalars = [(mu_un,)]
for variance in (sigma_sq_a, sigma_sq_b, sigma_sq_p, sigma_sq_q):
    theta_scalars.append((inv_softplus(variance),))

theta_proportions = [
    logit(shrink_simplex(alpha_1.T).flatten()),
    logit(shrink_simplex(alpha_2).flatten()),
]

theta = np.concatenate(theta_scalars + theta_proportions + [logit(pi.flatten())])

# Just-in-case checks: theta is 1-D and has the expected number of entries.
expected_theta_size = 5 + nq - 1 + nl - 1 + nq * nl
assert theta.ndim == 1
assert theta.shape[0] == expected_theta_size

In [22]:
# Single float32 tensor holding gamma followed by theta, placed on `device`
# and tracked by autograd.
initial_values = np.concatenate((gamma, theta))
vector_of_parameters = torch.tensor(
    initial_values,
    dtype=torch.float32,
    device=device,
    requires_grad=True,
)

Optimization parameters

In [23]:
# Stopping rules of the L-BFGS loops.
max_iter = 100               # maximum number of L-BFGS iterations per step
norm_grad_tol = 1e-4         # stop when the gradient norm falls below this
loglike_dist_tol = 1e-4      # stop when the criterion changes by less than this

# L-BFGS optimiser settings.
initial_learning_rate = 1.0  # passed as `lr` to the optimiser
hessian_history_size = 100   # passed as `history_size` to the optimiser
divide_by_line_search = 2    # passed as "eta" in the line-search options

# Not referenced by the training cells shown in this notebook.
loglike_diff_breaking_cond = 1e-3

#### b- Model creation

In [24]:
# Build the model from the flat parameter tensor, the vote matrix and the
# problem dimensions (rows, columns, row clusters, column clusters).
dimensions = (n1, n2, nq, nl)
model = LBM_NMAR(
    vector_of_parameters,
    votes,
    dimensions,
    device=device,
    device2=device2,
)

*(For more details check the file: 1.2-Model_LBM_MNAR)*

#### c- Train model 

##### Variational EM 

**VE step:** Find the variational parameter $\gamma^{(t+1)}$ 

$\gamma^{(t+1)} = \arg\max_{\gamma} J(\gamma, \theta^{(t)})$

In [25]:
# VE step: run L-BFGS on the variational parameters only, until the gradient
# norm or the change of the criterion falls below its tolerance (or max_iter).
print("-" * 80, "\nStart training LBM MNAR", "\n", "-" * 80)
print("VE step")
print(f"""  LBFGS iter  | criteria |""")

# Declaration of optimization variables for gamma optimization
line_search = "Armijo"
optimizer = FullBatchLBFGS(
    [model.variationnal_params],
    lr=initial_learning_rate,
    history_size=hessian_history_size,
    line_search=line_search,
    debug=True,
)

# Initial criterion value and gradient, needed by the first optimizer step.
optimizer.zero_grad()
obj = model()
obj.backward()
f_old = obj.item()  # criterion value of the previous iteration


def closure():
    """Criterion evaluation used by the line search.

    Calls the model with no_grad=True (presumably a gradient-free evaluation
    -- confirm in LBM_NMAR). Defined once, since it does not change between
    iterations.
    """
    return model(no_grad=True)


for n_iter in range(max_iter):
    # Line-search settings; only "current_loss" changes between iterations.
    options = {
        "closure": closure,
        "current_loss": obj,
        "eta": divide_by_line_search,
        "max_ls": 150,
        "interpolate": False,
        "inplace": True,
        "ls_debug": False,
        "damping": False,
        "eps": 1e-2,
        "c1": 0.5,
        "c2": 0.95,
    }

    # One L-BFGS step with line search.
    obj, lr, backtracks, clos_evals, desc_dir, fail = optimizer.step(options=options)

    # Re-evaluate the criterion with gradients at the new point: the gradient
    # is needed for the stopping test and for the next optimizer step.
    optimizer.zero_grad()
    obj = model()
    obj.backward()
    grad = optimizer._gather_flat_grad()

    # Read the scalar once per iteration instead of on every use.
    f_new = obj.item()
    print(f""" {n_iter + 1}  | {f_new:.5f} |""")

    # NOTE(review): the parameters are float32 and the criterion is of order
    # 1e5, so a 1e-4 difference is presumably below float resolution; this
    # test then fires only when two iterations give the same value -- confirm.
    if torch.norm(grad) < norm_grad_tol or np.abs(f_new - f_old) < loglike_dist_tol:
        break
    f_old = f_new
    

-------------------------------------------------------------------------------- 
Start training LBM MNAR 
 --------------------------------------------------------------------------------
VE step
  LBFGS iter  | criteria |


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/python_arg_parser.cpp:1519.)
  p.data.add_(


 1  | 341133.21875 |
 2  | 325310.43750 |
 3  | 319513.18750 |
 4  | 316872.50000 |
 5  | 315784.68750 |
 6  | 315224.03125 |
 7  | 314697.93750 |
 8  | 314385.37500 |
 9  | 313947.43750 |
 10  | 313657.96875 |
 11  | 313154.18750 |
 12  | 312730.53125 |
 13  | 312020.43750 |
 14  | 311554.75000 |
 15  | 311061.93750 |
 16  | 310535.12500 |
 17  | 310296.81250 |
 18  | 310287.87500 |
 19  | 310283.56250 |
 20  | 310282.00000 |
 21  | 310281.46875 |
 22  | 310281.40625 |
 23  | 310281.34375 |
 24  | 310281.34375 |


**M step:** Find the model parameter $\theta^{(t+1)}$ 

$\theta^{(t+1)} = \arg\max_{\theta} J(\gamma^{(t+1)}, \theta)$

In [26]:
# M step: run L-BFGS on the model parameters only, until the gradient norm or
# the change of the criterion falls below its tolerance (or max_iter).
print("M step")
print(f"""  LBFGS iter  | criteria |""")

# Declaration of optimization variables for theta optimization
optimizer = FullBatchLBFGS(
    [model.model_params],
    lr=initial_learning_rate,
    history_size=hessian_history_size,
    line_search=line_search,
    debug=True,
)

# Initial criterion value and gradient, needed by the first optimizer step.
optimizer.zero_grad()
obj = model()
obj.backward()
f_old = obj.item()  # criterion value of the previous iteration


def closure():
    """Criterion evaluation used by the line search.

    Calls the model with no_grad=True (presumably a gradient-free evaluation
    -- confirm in LBM_NMAR). Defined once, since it does not change between
    iterations.
    """
    return model(no_grad=True)


for n_iter in range(max_iter):
    # Line-search settings; only "current_loss" changes between iterations.
    options = {
        "closure": closure,
        "current_loss": obj,
        "eta": divide_by_line_search,
        "max_ls": 150,
        "interpolate": False,
        "inplace": True,
        "ls_debug": False,
        "damping": False,
        "eps": 1e-2,
        "c1": 0.5,
        "c2": 0.95,
    }

    # One L-BFGS step with line search.
    obj, lr, backtracks, clos_evals, desc_dir, fail = optimizer.step(options=options)

    # Re-evaluate the criterion with gradients at the new point: the gradient
    # is needed for the stopping test and for the next optimizer step.
    optimizer.zero_grad()
    obj = model()
    obj.backward()
    grad = optimizer._gather_flat_grad()

    # Read the scalar once per iteration instead of on every use.
    f_new = obj.item()
    print(f""" {n_iter + 1}  | {f_new:.5f} |""")

    # NOTE(review): the parameters are float32 and the criterion is of order
    # 1e5, so a 1e-4 difference is presumably below float resolution; this
    # test then fires only when two iterations give the same value -- confirm.
    if torch.norm(grad) < norm_grad_tol or np.abs(f_new - f_old) < loglike_dist_tol:
        break
    f_old = f_new
    

M step
  LBFGS iter  | criteria |
 1  | 308739.12500 |
 2  | 308389.50000 |
 3  | 308304.93750 |
 4  | 308153.06250 |
 5  | 307825.87500 |
 6  | 307447.96875 |
 7  | 307007.06250 |
 8  | 306853.46875 |
 9  | 306823.65625 |
 10  | 306798.03125 |
 11  | 306734.00000 |
 12  | 306699.21875 |
 13  | 306671.78125 |
 14  | 306667.34375 |
 15  | 306656.75000 |
 16  | 306644.65625 |
 17  | 306637.96875 |
 18  | 306633.75000 |
 19  | 306632.28125 |
 20  | 306631.59375 |
 21  | 306631.09375 |
 22  | 306630.78125 |
 23  | 306630.62500 |
 24  | 306630.62500 |


Reparametrization

In [27]:
# Join the fitted variational and model parameters into one flat tensor, then
# expand it back into the individual named quantities. This rebinds the
# initialisation-time names (nu_a, ..., pi) to the values returned by
# reparametrized_expanded_params.
fitted_parameters = torch.cat((model.variationnal_params, model.model_params))

(
    nu_a, rho_a,
    nu_b, rho_b,
    nu_p, rho_p,
    nu_q, rho_q,
    tau_1, tau_2,
    mu_un,
    sigma_sq_a, sigma_sq_b, sigma_sq_p, sigma_sq_q,
    alpha_1, alpha_2,
    pi,
) = reparametrized_expanded_params(fitted_parameters, n1, n2, nq, nl, device)