## Factor Analysis Model
### Assumptions about the Latent Variables and Model
### Constructing the Joint Distribution of Observed RV and Latent RV
### The Incomplete Log-likelihood with discussion about Jensen's Inequality
### Thc Complete Log-likelihood
#### Q-step
#### E-step
##### Maximizing w.r.t the mean of the observed input space
##### Maximizing w.r.t the loading factor matrix
$\Lambda$
##### Maximizing w.r.t to the covariance matrix of the uncertainity in the joint space of observed RV and latent variable 
$\psi$
#### Convergence Test
#### Interpretation of the Loading Factor

In [1]:
%matplotlib inline
import numpy as np 
import sklearn.preprocessing
import sklearn.datasets
import pandas as pd
import sklearn.model_selection
import numpy.random
import math
import sklearn.metrics
import sklearn.decomposition
numpy.random.seed(42)

In [2]:
X, y = sklearn.datasets.load_boston(return_X_y=True)

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=42)
standard = sklearn.preprocessing.StandardScaler()
#X_train = standard.fit_transform(X_train)
#y_train = standard.fit_transform(y_train.reshape(-1, 1))

training_data = np.c_[X_train, y_train]#All of the features are continuous, so, no need to use one-hot encoder and we can directly standard normalize the features of the data set

#X_test = standard.transform(X_test)
#y_test = standard.transform(y_test.reshape(-1, 1))

test_data = np.c_[X_test, y_test]
print(training_data.shape)
print(test_data.shape)

(379, 14)
(127, 14)


In [5]:
class FactorAnalysisModel(object):

    def __init__(self, X_train, G, initialization="random"):
        self.m = X_train.shape[0]
        self.n = X_train.shape[1]
        self.G = G
        self.X_train = X_train
        if initialization == "random":
            self.mean_x = (1/self.m) * (np.sum(self.X_train, axis=0)).reshape(-1, 1)#sum across all of the observations
            self.cov_x = np.cov(self.X_train.T)#I am just making sure it would have full rank, which is the objective of FAM
            self.loading_factor = np.zeros((self.n, self.G))
            np.fill_diagonal(self.loading_factor, 5)
            print(np.linalg.matrix_rank(self.loading_factor))
            self.cov_uncertainity = np.eye(self.n) * numpy.random.randn()
            self.mean_z_given_x = np.zeros((self.m, self.G))
            self.cov_z_given_x = np.eye(self.G) * numpy.random.randn()
        else:
            pass

    def computing_inv_cov_x(self):
        #Using Woodbury Identity
        inv_psi = np.linalg.inv(self.cov_uncertainity)
        t1 = np.linalg.pinv( np.add(np.eye(self.G), np.dot(self.loading_factor.T, np.dot(inv_psi, self.loading_factor))) )
        t2 = np.dot(self.loading_factor, np.dot(t1, self.loading_factor.T))
        t3 = np.dot(inv_psi, np.dot(t2, inv_psi))
        assert(t3.shape == inv_psi.shape)

        return np.subtract(inv_psi, t3)
        #return np.linalg.inv(np.add(self.cov_uncertainity, np.dot(self.loading_factor, self.loading_factor.T)))

    def E_step(self):
        #Estimate the covariance and mean of the conditional latent space
        temp = np.zeros((self.m, self.G))
        for i in range(0, self.m):
            temp[i, :] = (np.dot(self.loading_factor.T, np.dot(self.computing_inv_cov_x(), np.subtract(self.X_train[i, :].reshape(-1, 1), self.mean_x.reshape(-1, 1)).reshape(-1, 1)) ) ).reshape(1, -1)

        self.mean_z_given_x = temp
        self.cov_z_given_x = np.subtract( np.eye(self.G), np.dot( self.loading_factor.T,  np.dot(self.computing_inv_cov_x(), self.loading_factor)  ))
        assert(self.cov_z_given_x.shape == (self.G, self.G))
    
    def M_step(self):
        temp_LH = np.zeros((self.n, self.G))
        temp_RH = np.zeros((self.G, self.G))

        for i in range(0, self.m):
            #print((self.X_train[i, :].reshape(-1, 1) - self.mean_x.reshape(-1, 1)).shape)
            temp_LH += np.dot((self.X_train[i, :].reshape(-1, 1) - self.mean_x.reshape(-1, 1)).reshape(-1, 1), self.mean_z_given_x[i, :].reshape(1, -1))
            #print(temp_LH.shape)
            temp_RH += np.add(self.cov_z_given_x, np.dot(self.mean_z_given_x[i, :].reshape(-1, 1), self.mean_z_given_x[i, :].reshape(1, -1)))  

        loading_factor_new = np.dot(temp_LH, np.linalg.inv(temp_RH))
        #print(loading_factor_new.shape)
        #S = np.zeros((self.n, self.n))
        phi = np.zeros((self.n, self.n))
        for i in range(0, self.m):    
            phi += np.dot(loading_factor_new, np.dot(self.mean_z_given_x[i, :].reshape(-1, 1), (self.X_train[i, :].reshape(-1, 1) - self.mean_x.reshape(-1, 1)).reshape(1, -1)))
            
            """phi += ( np.dot(self.X_train[i, :].reshape(-1, 1), self.X_train[i, :].reshape(1, -1))
             - np.dot(np.dot(self.X_train[i, :].reshape(-1, 1), self.mean_z_given_x[i, :].reshape(1, -1)), loading_factor_new.T)- np.dot(loading_factor_new, np.dot(self.mean_z_given_x[i, :].reshape(-1, 1), self.X_train[i, :].reshape(1, -1)))
               + np.dot(loading_factor_new, np.dot( np.add(self.cov_z_given_x, np.dot(self.mean_z_given_x[i, :].reshape(-1, 1), self.mean_z_given_x[i, :].reshape(1, -1)) ), loading_factor_new.T) ) )"""
            #phi += np.dot(self.X_train[i, :].reshape(-1, 1), self.X_train[i, :].reshape(1, -1)) - np.dot(loading_factor_new, np.dot(self.mean_z_given_x[i, :].reshape(-1, 1), self.X_train[i, :].reshape(1, -1)))

            """S += np.dot((self.X_train[i, :].reshape(-1, 1) - self.mean_x.reshape(-1, 1)).reshape(-1, 1), (self.X_train[i, :].reshape(-1, 1) - self.mean_x.reshape(-1, 1)).reshape(1, -1))"""

        phi = np.subtract(self.cov_x, (1/self.m) * phi) 
        temp = np.eye(self.n)
        assert(temp.shape == phi.shape)
        #self.cov_uncertainity = (1/self.m) * np.multiply(temp, phi)
        self.cov_uncertainity =  np.multiply(temp, phi)

        self.loading_factor = loading_factor_new

    def computing_log_likelihood(self):
        temp = 0
        for i in range(0, self.m):

            temp += np.log( 
            (1/np.sqrt(np.linalg.det(2*np.pi*self.computing_inv_cov_x()))) *
            np.exp(
            (-1/2) * np.dot((self.X_train[i, :].reshape(-1, 1) - self.mean_x.reshape(-1, 1)).T, 
            np.dot(self.computing_inv_cov_x(), self.X_train[i, :].reshape(-1, 1) - self.mean_x.reshape(-1, 1)))
            )
            )
        
        return temp

    def fit(self, max_iterations, eps=1e-3):
        convergence_test = True
        count = 0 
        while((convergence_test == True) and (count < max_iterations)):
            self.E_step()#Update the soft latent values            
            log_likelihood_t = self.computing_log_likelihood()
            self.M_step()#Update the parameters of the condtional distribution of x given z
            log_likelihood_t_future = self.computing_log_likelihood()
            print(f"Number of iteration:{count}, max_iteration:{max_iterations}, past:{log_likelihood_t}, future:{log_likelihood_t_future}")
            count = count + 1
            if( (log_likelihood_t_future - log_likelihood_t) < eps and (count > 10)):
                print("We converged to the optimal value for the log-likelihood")
                convergence_test =False #We reached the parameters that maximize the log-likelihood, no adancement in the log-likelihood
        self.cov_x = np.add(self.cov_uncertainity, np.dot(self.loading_factor, self.loading_factor.T))#Update covariance matrix of the project input space
        return self.mean_x, self.loading_factor, self.cov_uncertainity
    
    def transform(self, X):
        temp = []
        print(np.linalg.matrix_rank(self.loading_factor))
        assert(np.linalg.matrix_rank(self.loading_factor) == self.G)
        projection= np.linalg.inv( np.dot(self.loading_factor.T, self.loading_factor))
        assert(projection.shape == (self.G, self.G))
        for i in range(0, X.shape[0]):
            temp.append(np.dot(projection, np.dot(self.loading_factor.T, (X[i, :].reshape(-1, 1) - self.mean_x).reshape(-1, 1) )))
        
        return np.array(temp)

In [6]:
###############################################
#There is a good chance that there is something wrong in the results, need to discover what is the issue in my code. I suspect there is something wrong in my initialization phase.
###############################################
q =4
model = FactorAnalysisModel(X_train, G=q, initialization="random")
mean_x, loading_factor, cov_uncertainity = model.fit(max_iterations=1000)
#loading_factor.shape

4
Number of iteration:0, max_iteration:1000, past:[[-inf]], future:[[-3811.48699451]]
Number of iteration:1, max_iteration:1000, past:[[-3811.48699451]], future:[[-3528.15078223]]
Number of iteration:2, max_iteration:1000, past:[[-3528.15078223]], future:[[-3001.21476933]]
Number of iteration:3, max_iteration:1000, past:[[-3001.21476933]], future:[[-2441.33479454]]
Number of iteration:4, max_iteration:1000, past:[[-2441.33479454]], future:[[-1959.11467202]]
Number of iteration:5, max_iteration:1000, past:[[-1959.11467202]], future:[[-1577.69708787]]
Number of iteration:6, max_iteration:1000, past:[[-1577.69708787]], future:[[-1287.1080311]]
Number of iteration:7, max_iteration:1000, past:[[-1287.1080311]], future:[[-1065.98576967]]
Number of iteration:8, max_iteration:1000, past:[[-1065.98576967]], future:[[-893.15983306]]
Number of iteration:9, max_iteration:1000, past:[[-893.15983306]], future:[[-754.14265641]]
Number of iteration:10, max_iteration:1000, past:[[-754.14265641]], futur

In [7]:
orthogonal_projection_data_onto_qSubspace = model.transform(X_train)
orthogonal_projection_data_onto_qSubspace[10, :] 

4


array([[-1.75843591],
       [ 5.68960239],
       [ 0.56836204],
       [ 3.6240142 ]])

In [8]:
empty_dictionary = {}
for i in range(0, q):
    empty_dictionary["unobs_var_" + str(i)] = loading_factor[:, i].tolist()
pd.DataFrame(data=empty_dictionary, index=["obs_var_"+str(i) for i in range(0, X_train.shape[1])])

Unnamed: 0,unobs_var_0,unobs_var_1,unobs_var_2,unobs_var_3
obs_var_0,6.667845,-0.248543,2.159935,-0.088554
obs_var_1,-2.647139,14.966404,-9.586004,-2.619961
obs_var_2,1.478685,-2.181959,5.31966,0.51778
obs_var_3,-0.023463,0.011112,0.0342,0.046844
obs_var_4,0.028085,-0.027067,0.093971,0.038972
obs_var_5,-0.096975,0.249967,-0.182682,0.093385
obs_var_6,5.98679,-12.719391,16.082667,8.429384
obs_var_7,-0.497238,0.918836,-1.377466,-0.707042
obs_var_8,4.725736,0.543958,5.99697,-2.475967
obs_var_9,76.872313,12.257698,134.049849,-44.607113


In [9]:
model = sklearn.decomposition.FactorAnalysis(n_components=4, random_state=0)
model.fit(X_train)
loading_factors = model.components_
loglike = model.loglike_
mean = model.mean_
transformed = model.transform(X_train)
print(transformed[0, :])
print(model.noise_variance_)


[-1.23236617  0.30758369 -1.43656498 -0.70399144]
[4.48235828e+01 1.11846806e+00 1.72547118e+01 6.97198149e-02
 4.78947510e-03 4.52447706e-01 1.46635416e+00 1.37973917e+00
 1.24118103e+01 1.07692721e+00 3.62546505e+00 1.00394704e+00
 3.02486304e+01]


In [10]:
pd.DataFrame(loading_factors.T)


Unnamed: 0,0,1,2,3
0,4.882882,-0.166284,-0.365222,-0.335824
1,-7.309185,-0.301091,19.282264,-10.269288
2,4.844053,-0.198577,-2.729536,-0.113635
3,0.000272,-0.011576,-0.023317,-0.01627
4,0.078058,0.000937,-0.054616,-0.016891
5,-0.205509,0.029813,0.126979,-0.073904
6,13.000962,-0.133949,-21.088251,-12.2561
7,-1.120167,0.003131,1.344535,0.095691
8,7.784843,-0.783732,-0.081635,0.388655
9,164.299462,-22.455679,1.897111,0.265951


In [11]:
temp = np.dot( np.linalg.inv(np.dot(loading_factors, loading_factors.T)), loading_factors)
print(temp.shape)
np.dot(temp, X_train[0,:] -mean)

(4, 13)


array([-1.23291283,  0.30700279, -1.40285944, -0.68994645])

In [12]:
loglike

[-31992.37941942195,
 -13931.879036722328,
 -13931.774530415927,
 -13931.7732158142]

### References 
* Chapter 2 and Chapter 12 from Bishop, C. (2006). Pattern Recognition and Machine Learning. Cambridge: Springer.
* Andrew Ng, Lec 13: (https://www.youtube.com/watch?v=LBtuYU-HfUg)
* Andrew Ng, Lec 14: (https://www.youtube.com/watch?v=ey2PE5xi9-A)
* McNicholas, P.D. (2016). Mixture Model-Based Classification. Boca Raton: Chapman &
Hall/CRC Press.
