# Poisson Matrix Factorization

Disclaimer: these are non-sparse (dense) implementations. They can be optimized.

In [None]:
# import libraries
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.special as sp
from scipy.stats import gamma, poisson
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.ticker import MaxNLocator
import matplotlib

import functions as fun

In [None]:
# Seeded Mersenne Twister pseudo-random number generator, so runs are reproducible.
prng = np.random.RandomState(10)
# Colormap used for the community colours.
# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` is available in both old and new releases.
cmap = plt.get_cmap('tab20')

### Pre-Processing

In [None]:
# load the network from its GML file
graph = nx.read_gml('football/football_net.gml')

# restrict the graph to its largest connected component
# (components are sorted from the largest to the smallest)
Gcc = sorted(nx.connected_components(graph), key=len, reverse=True)
largest_component = Gcc[0]
graph = graph.subgraph(largest_component)

n_nodes = graph.number_of_nodes()
n_edges = graph.number_of_edges()
print('nodes N =', n_nodes, '\nedges E =', n_edges)

In [None]:
# load the node covariates; the `conference` column defines the ground truth
df = pd.read_csv('football/football_cov.csv')
ground_lab = ['Atlantic Coast','Big East','Big Ten','Big Twelve','Conference USA','Big West','Mid-American',
              'Mountain West','Pacific Ten','Southeastern','Western Athletic','NotreDame','Navy','Connecticut',
              'CentralFlorida','Middle Tennessee State','LouisianaTech','LouisianaMonroe','LouisianaLafayette']

# community id (position of the label in ground_lab) -> row indices of its members
ground_truth = {
    label_id: list(df[df.conference == label].index)
    for label_id, label in enumerate(ground_lab)
}
K = len(ground_truth)

print('communities K =', K)

In [None]:
# preview the first two rows of the covariates table;
# the ground truth is given by the `conference` column
df.head(2)

In [None]:
# display the mapping: community id -> list of member row indices in df
ground_truth

In [None]:
# unfreeze the graph and make it directed: building a DiGraph from the
# subgraph yields a new graph object that can be modified afterwards
graph = nx.DiGraph(graph)
# report the size of the directed graph
print('nodes N =',graph.number_of_nodes(),'\nedges E =',graph.number_of_edges())

In [None]:
# row indices (in df) of the nodes belonging to communities 11..18,
# i.e. the communities having len=1
del_idx = [v for i in range(11, 19) for v in ground_truth[i]]

# remove those nodes from the graph; graph nodes are looked up by the `names` column
for v in del_idx:
    graph.remove_node(df.iloc[v]['names'])

# drop the same rows from the covariates and renumber the remaining ones
df.drop(del_idx, inplace=True)
df = df.reset_index()

# save adjacency matrix of the reduced graph
A = nx.to_numpy_array(graph)
print('nodes N =', graph.number_of_nodes(), '\nedges E =', graph.number_of_edges())
print(df.shape)

In [None]:
# keep only the labels of the communities that were not removed (len > 1)
ground_lab = ['Atlantic Coast','Big East','Big Ten','Big Twelve','Conference USA','Big West','Mid-American',
              'Mountain West','Pacific Ten','Southeastern','Western Athletic']

# rebuild the mapping community id -> member row indices on the re-indexed df
ground_truth = {
    label_id: list(df[df.conference == label].index)
    for label_id, label in enumerate(ground_lab)
}
K = len(ground_truth)

print('communities K =', K)

In [None]:
# display the updated mapping: community id -> list of member row indices in df
ground_truth

### Visualization of ground truth network

In [None]:
# invert ground_truth: member index -> id of the community it belongs to
com = {
    member: community
    for community, members in ground_truth.items()
    for member in members
}

In [None]:
# marker size of each node: its degree scaled by a factor of 17
node_size = [degree * 17 for _node, degree in graph.degree()]
# spring layout with a fixed seed, so the node positions are reproducible
position = nx.spring_layout(graph, iterations=200, seed=4)

In [None]:
# draw the network with the helper from functions.py, passing the ground-truth
# community of each node (`com`) and the colormap
plt.figure(figsize=(10,10))
fun.plot_net_hard(graph, position, node_size, com, plt, cmap)
plt.title('Ground Truth Partition')
plt.show()

### Exercise 1: Inference with EM

##### (a) Complete the functions of the class PMF_EM

In [None]:
class PMF_EM(object):
    
    def __init__(self, A, K=3):
        self.A = A                 # data
        self.K = K                 # number of communities
        self.N = self.A.shape[0]   # number of nodes

    def _init(self, prng):
        # random initialization 
        self.u = # YOUR CODE HERE
        self.v = # YOUR CODE HERE
        self.C = # YOUR CODE HERE
        
    def fit(self, prng, N_real=15, max_iter=100, tol=0.1, decision=2):
        maxL = - 1e12  # initialization of the maximum likelihood

        for r in range(N_real):
            # random initialization
            self._init(prng)
            
            # convergence local variables
            coincide, it = 0, 0
            convergence = False

            loglik_values = []  # keep track of the values of the loglik to plot
            loglik = - 1e12  # initialization of the loglik

            while not convergence and it < max_iter:
                self._em()
                it, loglik, coincide, convergence = self.check_for_convergence(it, loglik, coincide, convergence, tolerance=tol, decision=decision)
                loglik_values.append(loglik)
            print(f'Nreal = {r} - Loglikelihood = {fun.fl(loglik)} - Best Loglikelihood = {fun.fl(maxL)} - iterations = {it} - ')
    
            if maxL < loglik:
                u_f,v_f,C_f = self.update_optimal_parameters()
                maxL = loglik
                final_it = it
                best_loglik_values = list(loglik_values)
        
        return u_f, v_f, C_f, best_loglik_values

    def _em(self):
        # E-step
        # YOUR CODE HERE
        
        # M-step
        # YOUR CODE HERE

    def update_q(self):
        # YOUR CODE HERE
    
    def update_u(self, q):
        # YOUR CODE HERE

    def update_v(self, q):
        # YOUR CODE HERE

    def update_C(self, q):
        # YOUR CODE HERE
    
    def check_for_convergence(self, it, loglik, coincide, convergence, tolerance=0.1, decision=2):
        if it % 10 == 0:
            old_L = loglik
            loglik = self.Likelihood(EPS = 1e-12)
            if abs(loglik - old_L) < tolerance:
                coincide += 1
            else:
                coincide = 0
        if coincide > decision:
            convergence = True
        it += 1
        return it, loglik, coincide, convergence

    def Likelihood(self, EPS = 1e-12):
        # YOUR CODE HERE

    def update_optimal_parameters(self):
        u_f = np.copy(self.u)
        v_f = np.copy(self.v)
        C_f = np.copy(self.C)
        return u_f,v_f,C_f

In [None]:
# fit the EM model on the adjacency matrix A with K communities;
# fit returns the parameters and the log-likelihood trace of the best realization
pmf_em = PMF_EM(A, K=K)
u_em, v_em, C_em, best_loglik_values = pmf_em.fit(prng)

##### (b) Plot the log-likelihood values  [check the script functions.py]

In [None]:
# YOUR CODE HERE

##### (c) Plot the results: ground truth vs estimated overlapping partition vs estimated hard partition

In [None]:
# normalize the inferred memberships with the helper from functions.py
# (presumably a per-node normalization — see functions.py to confirm)
u_norm_em = fun.normalize_nonzero_membership(u_em)
v_norm_em = fun.normalize_nonzero_membership(v_em)

# hard assignment: for each row, the index of the largest normalized entry of u
q_em = np.argmax(u_norm_em, axis=1)  # extract hard communities

In [None]:
# one row of three panels: ground truth, soft EM partition, hard EM partition
# (the drawing call of each panel is left as an exercise)
plt.figure(figsize=(24,8))
plt.subplot(1,3,1)
# YOUR CODE HERE
plt.title('Ground Truth Partition')
plt.subplot(1,3,2)
# YOUR CODE HERE
plt.title('Estimated via EM (soft)')
plt.subplot(1,3,3)
# YOUR CODE HERE
plt.title('Estimated via EM (hard)')
plt.show()

### Exercise 2: Inference with VI

##### (a) Complete the functions of the class PMF_VI

In [None]:
class PMF_VI(object):
    
    def __init__(self, A, K=3):
        self.A = A                 # data
        self.K = K                 # number of communities
        self.N = self.A.shape[0]   # number of nodes

    def _init(self, prng):
        # priors
        self.a = 1
        self.b = 1
        self.c = 1
        self.d = 1
        
        # random initialization
        self.alpha_shp = # YOUR CODE HERE
        self.alpha_rte = # YOUR CODE HERE
        self.beta_shp = # YOUR CODE HERE
        self.beta_rte = # YOUR CODE HERE

    def fit(self, prng, N_real=15, max_iter=100, tol=0.1, decision=2):
        maxElbo = - 1e12  # initialization of the maximum elbo

        for r in range(N_real):
            # random initialization
            self._init(prng)

            # convergence local variables
            coincide, it = 0, 0
            convergence = False

            elbo_values = []  # keep track of the values of the elbo to plot
            elbo = - 1e12  # initialization of the loglik

            while not convergence and it < max_iter:
                self._cavi()
                
                Eu, Elogu = compute_expectations(self.alpha_shp, self.alpha_rte)
                Ev, Elogv = compute_expectations(self.beta_shp, self.beta_rte)

                it, elbo, coincide, convergence = self.check_for_convergence_cavi(Eu, Elogu, Ev, Elogv, it, elbo, coincide,   
                                                                          convergence, tolerance=tol, decision=decision)
                elbo_values.append(elbo)
            print(f'Nreal = {r} - ELBO = {fun.fl(elbo)} - Best ELBO = {fun.fl(maxElbo)} - iterations = {it} - ')

            if maxElbo < elbo:
                alpha_shp_f,alpha_rte_f,beta_shp_f,beta_rte_f = self.update_optimal_parameters()
                maxElbo = elbo
                final_it = it
                best_elbo_values = list(elbo_values)
        
        return alpha_shp_f, alpha_rte_f, beta_shp_f, beta_rte_f, best_elbo_values

    def _cavi(self):
        # YOUR CODE HERE

    def update_phi(self):
        # YOUR CODE HERE
    
    def update_alphas(self, phi_ij):
        # YOUR CODE HERE
        
    def update_betas(self, phi_ij):
        # YOUR CODE HERE
   
    def check_for_convergence_cavi(self, Eu, Elogu, Ev, Elogv, it, elbo, coincide, convergence, tolerance=0.1,decision=2):
        if it % 10 == 0:
            old_elbo = elbo
            elbo = self.Elbo(Eu, Elogu, Ev, Elogv)
            if abs(elbo - old_elbo) < tolerance:
                coincide += 1
            else:
                coincide = 0
        if coincide > decision:
            convergence = True
        it += 1
        return it, elbo, coincide, convergence

    def Elbo(self, Eu, Elogu, Ev, Elogv):
        # YOUR CODE HERE

    def update_optimal_parameters(self):
        alpha_shp = np.copy(self.alpha_shp)
        alpha_rte = np.copy(self.alpha_rte)
        beta_shp = np.copy(self.beta_shp)
        beta_rte = np.copy(self.beta_rte)
        return alpha_shp,alpha_rte,beta_shp,beta_rte
    
def compute_expectations(alpha, beta):
    '''
    Expectations of a Gamma-distributed variable x ~ Gam(alpha, beta).

    Returns the pair (E[x], E[log x]), computed as alpha / beta and
    psi(alpha) - log(beta) respectively.
    '''
    expected_x = alpha / beta
    expected_log_x = sp.psi(alpha) - np.log(beta)
    return expected_x, expected_log_x

# Exercise placeholder: term of the ELBO associated with a Gamma-distributed variable.
# NOTE(review): presumably (pa, pb) are the prior parameters and (qa, qb) the
# variational ones — confirm against the exercise text before implementing.
def gamma_elbo_term(pa, pb, qa, qb):
        # YOUR CODE HERE

In [None]:
# fit the VI model on the adjacency matrix A with K communities;
# fit returns the variational parameters and the ELBO trace of the best realization
pmf_vi = PMF_VI(A, K=K)
alpha_shp_vi, alpha_rte_vi, beta_shp_vi, beta_rte_vi, best_elbo_values = pmf_vi.fit(prng)

In [None]:
# E[x] and E[log x] of the fitted Gamma factors, from their shape/rate parameters
Eu_vi, Elogu_vi = compute_expectations(alpha_shp_vi,alpha_rte_vi)
Ev_vi, Elogv_vi = compute_expectations(beta_shp_vi,beta_rte_vi)

##### (b) Plot the ELBO values [check the script functions.py]

In [None]:
# YOUR CODE HERE

##### (c) Plot the results: ground truth vs estimated overlapping partition vs estimated hard partition

In [None]:
# normalize the expected memberships with the helper from functions.py
# (presumably a per-node normalization — see functions.py to confirm)
u_norm_vi = fun.normalize_nonzero_membership(Eu_vi)
v_norm_vi = fun.normalize_nonzero_membership(Ev_vi)

# hard assignment: for each row, the index of the largest normalized entry of u
q_vi = np.argmax(u_norm_vi, axis=1)  # extract hard communities

In [None]:
# one row of three panels: ground truth, soft VI partition, hard VI partition
# (the drawing call of each panel is left as an exercise)
plt.figure(figsize=(24,8))
plt.subplot(1,3,1)
# YOUR CODE HERE
plt.title('Ground Truth Partition')
plt.subplot(1,3,2)
# YOUR CODE HERE
# this cell shows the variational-inference results, so the titles say VI (not EM)
plt.title('Estimated via VI (soft)')
plt.subplot(1,3,3)
# YOUR CODE HERE
plt.title('Estimated via VI (hard)')
plt.show()