import numpy as np
from scipy.optimize import fmin_l_bfgs_b
from sklearn.utils.optimize import newton_cg
from scipy.special import expit, exprel
from scipy.linalg import eigvalsh
from sklearn.utils.multiclass import check_classification_targets
from sklearn.linear_model.base import LinearClassifierMixin, BaseEstimator
from sklearn.utils import check_X_y
from scipy.linalg import solve_triangular
from sklearn.linear_model.logistic import (_logistic_loss_and_grad, _logistic_loss,
                                           _logistic_grad_hess)
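# NOTE: the private scikit-learn imports above follow the pre-0.22 module
# layout (sklearn.linear_model.base / sklearn.linear_model.logistic); newer
# releases renamed or removed these modules.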
class BayesianLogisticRegression(LinearClassifierMixin, BaseEstimator):
'''
Superclass for two different implementations of Bayesian Logistic Regression
'''
def __init__(self, n_iter, tol, fit_intercept, verbose):
self.n_iter = n_iter
self.tol = tol
self.fit_intercept = fit_intercept
self.verbose = verbose
def fit(self,X,y):
'''
Fits Bayesian Logistic Regression
Parameters
-----------
X: array-like of size (n_samples, n_features)
Training data, matrix of explanatory variables
y: array-like of size (n_samples, )
Target values
Returns
-------
self: object
self
'''
# preprocess data
X,y = check_X_y( X, y , dtype = np.float64)
check_classification_targets(y)
self.classes_ = np.unique(y)
n_classes = len(self.classes_)
# prepare for ovr if required
n_samples, n_features = X.shape
if self.fit_intercept:
X = self._add_intercept(X)
if n_classes < 2:
raise ValueError("Need samples of at least 2 classes")
if n_classes > 2:
self.coef_, self.sigma_ = [0]*n_classes,[0]*n_classes
self.intercept_ = [0]*n_classes
else:
self.coef_, self.sigma_, self.intercept_ = [0],[0],[0]
        # make classifier for each class (one-vs-the-rest)
for i in range(len(self.coef_)):
if n_classes == 2:
pos_class = self.classes_[1]
else:
pos_class = self.classes_[i]
mask = (y == pos_class)
y_bin = np.ones(y.shape, dtype=np.float64)
y_bin[~mask] = self._mask_val
coef_, sigma_ = self._fit(X,y_bin)
if self.fit_intercept:
self.intercept_[i],self.coef_[i] = self._get_intercept(coef_)
else:
self.coef_[i] = coef_
self.sigma_[i] = sigma_
self.coef_ = np.asarray(self.coef_)
return self
def predict_proba(self,X):
'''
Predicts probabilities of targets for test set
Parameters
----------
X: array-like of size [n_samples_test,n_features]
Matrix of explanatory variables (test set)
Returns
-------
        probs: numpy array of size [n_samples_test, n_classes]
           Estimated probabilities of target classes
'''
# construct separating hyperplane
scores = self.decision_function(X)
if self.fit_intercept:
X = self._add_intercept(X)
# probit approximation to predictive distribution
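        # p(y=1|x) is approximated by sigmoid( k(s) * mu ), where mu is the
        # decision score, s = x' Sigma x and k(s) = (1 + pi*s/8)**-0.5
        # (Bishop 2006, eq. 4.153)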
sigma = self._get_sigma(X)
ks = 1. / ( 1. + np.pi*sigma / 8)**0.5
probs = expit(scores.T*ks).T
# handle several class cases
if probs.shape[1] == 1:
probs = np.hstack([1 - probs, probs])
else:
probs /= np.reshape(np.sum(probs, axis = 1), (probs.shape[0],1))
return probs
def _add_intercept(self,X):
'''Adds intercept to data matrix'''
raise NotImplementedError
def _get_intercept(self,coef):
'''
Extracts value of intercept from coefficients
'''
raise NotImplementedError
def _get_sigma(self,X):
'''
Computes variance of predictive distribution (which is then used in
probit approximation of sigmoid)
'''
raise NotImplementedError
class EBLogisticRegression(BayesianLogisticRegression):
'''
    Implements Bayesian Logistic Regression with type II maximum likelihood
    (also known as Empirical Bayes); uses the Laplace (Gaussian) approximation
    of the evidence function.
Parameters
----------
n_iter: int, optional (DEFAULT = 50)
Maximum number of iterations before termination
    tol: float, optional (DEFAULT = 1e-3)
        If the absolute change in the precision parameter of the weights is
        below this threshold, the algorithm terminates.
solver: str, optional (DEFAULT = 'lbfgs_b')
Optimization method that is used for finding parameters of posterior
distribution ['lbfgs_b','newton_cg']
n_iter_solver: int, optional (DEFAULT = 15)
Maximum number of iterations before termination of solver
    tol_solver: float, optional (DEFAULT = 1e-3)
        Convergence threshold for the solver (used when estimating the
        posterior distribution)
    fit_intercept : bool, optional ( DEFAULT = True )
        If True, an intercept term is used in the model; if False, no
        intercept is used in calculations
alpha: float (DEFAULT = 1e-6)
Initial regularization parameter (precision of prior distribution)
    verbose : boolean, optional (DEFAULT = False)
        Verbose mode when fitting the model
Attributes
----------
    coef_ : array, shape = (n_classes, n_features)
        Coefficients of the regression model (mean of posterior distribution)
    sigma_ : array, shape = (n_classes, n_features)
        Eigenvalues of the covariance matrix of the Laplace approximation
        (one row per fitted class)
    alpha_ : float
        Precision parameter of the weight distribution
    intercept_ : array, shape = (n_classes,)
        Intercepts
References:
-----------
[1] Pattern Recognition and Machine Learning, Bishop (2006) (pages 293 - 294)
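
    Examples
    --------
    A minimal usage sketch on synthetic data (illustrative only):

    >>> import numpy as np
    >>> X = np.random.randn(100, 5)
    >>> y = (X[:, 0] > 0).astype(int)
    >>> clf = EBLogisticRegression(solver = 'lbfgs_b')
    >>> clf = clf.fit(X, y)
    >>> probs = clf.predict_proba(X)    # shape (100, 2)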
'''
def __init__(self, n_iter = 50, tol = 1e-3,solver = 'lbfgs_b',n_iter_solver = 15,
tol_solver = 1e-3, fit_intercept = True, alpha = 1e-6, verbose = False):
super(EBLogisticRegression,self).__init__(n_iter, tol, fit_intercept, verbose)
self.n_iter_solver = n_iter_solver
self.tol_solver = tol_solver
self.alpha = alpha
if solver not in ['lbfgs_b','newton_cg']:
raise ValueError(('Only "lbfgs_b" and "newton_cg" '
'solvers are implemented'))
self.solver = solver
# masking value (this is set for use in lbfgs_b and newton_cg)
self._mask_val = -1.
def _fit(self,X,y):
'''
Maximizes evidence function (type II maximum likelihood)
'''
# iterative evidence maximization
alpha = self.alpha
n_samples,n_features = X.shape
w0 = np.zeros(n_features)
for i in range(self.n_iter):
alpha0 = alpha
# find mean & covariance of Laplace approximation to posterior
w, d = self._posterior(X, y, alpha, w0)
mu_sq = np.sum(w**2)
            # use iterative fixed-point updates for the precision parameter
            # of the weights; note: for Bayesian logistic regression the
            # Gull-MacKay fixed-point updates and the Expectation-Maximization
            # algorithm give the same update rule
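            #     alpha_new = n_features / ( ||mu||^2 + trace(Sigma) )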
alpha = X.shape[1] / (mu_sq + np.sum(d))
# check convergence
delta_alpha = abs(alpha - alpha0)
if delta_alpha < self.tol or i==self.n_iter-1:
break
# after convergence we need to find updated MAP vector of parameters
# and covariance matrix of Laplace approximation
coef_, sigma_ = self._posterior(X, y, alpha , w)
self.alpha_ = alpha
return coef_, sigma_
def _add_intercept(self,X):
'''
        Adds intercept to the data matrix (the intercept column is not passed
        to lbfgs_b or newton_cg directly; it is only used in the Hessian)
'''
return np.hstack((X,np.ones([X.shape[0],1])))
def _get_intercept(self,coef):
'''
Returns intercept and coefficients
'''
return coef[-1], coef[:-1]
def _get_sigma(self,X):
''' Compute variance of predictive distribution'''
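        # NOTE: treats the stored eigenvalues as per-feature variances, i.e.
        # computes x' diag(sigma_) x; the eigenvectors of the covariance
        # are ignored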
return np.asarray([ np.sum(X**2*s,axis = 1) for s in self.sigma_])
def _posterior(self, X, Y, alpha0, w0):
'''
        Finds the MAP estimate of the weights and the Hessian of the negative
        log-posterior at the convergence point, using l_bfgs_b or newton_cg.
'''
n_samples,n_features = X.shape
        # with fit_intercept = True the last column of X is the intercept
        # column; the sklearn loss functions handle the intercept internally,
        # so it is stripped here (with fit_intercept = False there is nothing
        # to strip)
        Xd = X[:,:-1] if self.fit_intercept else X
        if self.solver == 'lbfgs_b':
            f = lambda w: _logistic_loss_and_grad(w, Xd, Y, alpha0)
            w = fmin_l_bfgs_b(f, x0 = w0, pgtol = self.tol_solver,
                              maxiter = self.n_iter_solver)[0]
        elif self.solver == 'newton_cg':
            f = _logistic_loss
            grad = lambda w, *args: _logistic_loss_and_grad(w, *args)[1]
            hess = _logistic_grad_hess
            args = (Xd, Y, alpha0)
            w = newton_cg(hess, f, grad, w0, args = args,
                          maxiter = self.n_iter_solver, tol = self.tol_solver)[0]
        else:
            raise NotImplementedError(('Only "lbfgs_b" and "newton_cg" '
                                       'solvers are implemented'))
        # calculate Hessian of the negative log-posterior at w
xw = np.dot(X,w)
s = expit(xw)
R = s * (1 - s)
Hess = np.dot(X.T*R,X)
Alpha = np.ones(n_features)*alpha0
if self.fit_intercept:
Alpha[-1] = np.finfo(np.float16).eps
np.fill_diagonal(Hess, np.diag(Hess) + Alpha)
e = eigvalsh(Hess)
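        # the posterior covariance is represented by the inverse eigenvalues
        # of the Hessian (variances along its eigen-directions)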
return w,1./e
#============== VB Logistic Regression (with Jaakkola-Jordan bound) =================
def lam(eps):
    ''' Calculates lambda(eps), used in the Jaakkola & Jordan local bound '''
eps = -abs(eps)
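    # lam(eps) = tanh(eps/2) / (4*eps); exprel gives a numerically stable
    # evaluation as eps -> 0, and eps = -abs(eps) avoids overflow in exp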
return 0.25 * exprel(eps) / (np.exp(eps) + 1)
class VBLogisticRegression(BayesianLogisticRegression):
'''
Variational Bayesian Logistic Regression with local variational approximation.
    Parameters
    ----------
n_iter: int, optional (DEFAULT = 50 )
Maximum number of iterations
    tol: float, optional (DEFAULT = 1e-3)
        Convergence threshold; if the change in coefficients is less than
        the threshold, the algorithm terminates
    fit_intercept: bool, optional ( DEFAULT = True )
        If True, uses a bias term in model fitting
    a: float, optional (DEFAULT = 1e-4)
        Shape parameter of the Gamma prior on the precision of the coefficients
    b: float, optional (DEFAULT = 1e-4)
        Rate parameter of the Gamma prior on the precision of the coefficients
verbose: bool, optional (DEFAULT = False)
Verbose mode
Attributes
----------
    coef_ : array, shape = (n_classes, n_features)
        Coefficients of the regression model (mean of posterior distribution)
    sigma_ : list of arrays, each of shape (n_features, n_features)
        Estimated covariance matrix of the weights for each fitted class
    intercept_: array, shape = (n_classes,)
        Intercepts
References:
-----------
    [1] Bishop (2006), Pattern Recognition and Machine Learning (Chapter 10)
    [2] Murphy (2012), Machine Learning: A Probabilistic Perspective (Chapter 21)
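
    Examples
    --------
    A minimal usage sketch on synthetic data (illustrative only):

    >>> import numpy as np
    >>> X = np.random.randn(100, 5)
    >>> y = (X[:, 0] > 0).astype(int)
    >>> clf = VBLogisticRegression(n_iter = 100)
    >>> clf = clf.fit(X, y)
    >>> y_hat = clf.predict(X)          # predicted labels, shape (100,)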
'''
def __init__(self, n_iter = 50, tol = 1e-3, fit_intercept = True,
                 a = 1e-4, b = 1e-4, verbose = False):
super(VBLogisticRegression,self).__init__(n_iter, tol, fit_intercept, verbose)
self.a = a
self.b = b
self._mask_val = 0.
def _fit(self,X,y):
'''
        Fits a single binary classifier (one class versus the rest)
'''
eps = 1
n_samples, n_features = X.shape
XY = np.dot( X.T , (y-0.5))
w0 = np.zeros(n_features)
# hyperparameters of q(alpha) (approximate distribution of precision
# parameter of weights)
a = self.a + 0.5 * n_features
b = self.b
for i in range(self.n_iter):
# In the E-step we update approximation of
# posterior distribution q(w,alpha) = q(w)*q(alpha)
# --------- update q(w) ------------------
l = lam(eps)
w,Ri = self._posterior_dist(X,l,a,b,XY)
# -------- update q(alpha) ---------------
            if self.fit_intercept:
                # exclude the intercept from E[w'w]; since sigma = Ri.T * Ri,
                # diag(sigma)_j = sum_k Ri[k,j]**2 (column sums of Ri**2)
                b = self.b + 0.5*(np.sum(w[1:]**2) + np.sum(Ri[:,1:]**2))
            else:
                b = self.b + 0.5*(np.sum(w**2) + np.sum(Ri**2))
# -------- update eps ------------
# In the M-step we update parameter eps which controls
# accuracy of local variational approximation to lower bound
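            # optimal local parameter: eps_i = sqrt( E[(w'x_i)^2] )
            #                                = sqrt( (mu'x_i)^2 + x_i' Sigma x_i )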
XMX = np.dot(X,w)**2
XSX = np.sum( np.dot(X,Ri.T)**2, axis = 1)
eps = np.sqrt( XMX + XSX )
# convergence
if np.sum(abs(w-w0) > self.tol) == 0 or i==self.n_iter-1:
break
w0 = w
l = lam(eps)
coef_, sigma_ = self._posterior_dist(X,l,a,b,XY,True)
return coef_, sigma_
def _add_intercept(self,X):
'''Adds intercept to data matrix'''
return np.hstack((np.ones([X.shape[0],1]),X))
def _get_intercept(self, coef):
''' Returns intercept and coefficients '''
return coef[0], coef[1:]
def _get_sigma(self,X):
''' Compute variance of predictive distribution'''
return np.asarray([np.sum(np.dot(X,s)*X,axis = 1) for s in self.sigma_])
def _posterior_dist(self,X,l,a,b,XY,full_covar = False):
'''
        Finds the Gaussian approximation to the posterior of the coefficients
        using the local variational bound of Jaakkola & Jordan
'''
sigma_inv = 2*np.dot(X.T*l,X)
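        # data term of the posterior precision: 2 * X' diag(lam) X
        # (the expected prior precision E[alpha] = a/b is added to the
        # diagonal below)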
alpha_vec = np.ones(X.shape[1])*float(a) / b
if self.fit_intercept:
alpha_vec[0] = np.finfo(np.float16).eps
np.fill_diagonal(sigma_inv, np.diag(sigma_inv) + alpha_vec)
R = np.linalg.cholesky(sigma_inv)
Z = solve_triangular(R,XY, lower = True)
mean = solve_triangular(R.T,Z,lower = False)
        # invert the lower-triangular Cholesky factor
        # (scipy.linalg.lapack.dtrtri could also be used here)
        Ri = solve_triangular(R, np.eye(X.shape[1]), lower = True)
if full_covar:
sigma = np.dot(Ri.T,Ri)
return mean, sigma
else:
return mean , Ri
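

#============== Usage sketch ========================================================
# A minimal, illustrative sketch (not part of the original module): fits both
# models on synthetic data; assumes a scikit-learn version compatible with the
# private imports at the top of this file.
if __name__ == '__main__':
    np.random.seed(0)
    X_demo = np.random.randn(200, 3)
    y_demo = (X_demo[:, 0] + 0.5 * X_demo[:, 1] > 0).astype(int)
    for Model in (EBLogisticRegression, VBLogisticRegression):
        clf = Model(verbose = False)
        clf.fit(X_demo, y_demo)
        accuracy = np.mean(clf.predict(X_demo) == y_demo)
        print('{0}: train accuracy = {1:.3f}'.format(Model.__name__, accuracy))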