# **Import Statements**

In [32]:
# Import Statements

# Standard Stuff
import numpy as np
from numpy import ma # (Masked Array): Has NumPy Functions That Work With NaN Data
from numpy.linalg import pinv # Moore-Penrose Pseudoinverse

# Sklearn Imports 
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from scipy.special import expit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.optimize import fmin_bfgs # BFGS Algorithm: Extremely Common Optimization Algorithm

# Scipy Imports
from scipy.optimize import minimize_scalar

# **Binary Logistic Regression**

In [33]:
# Standard Binary Logistic Regression
class BinaryLogisticRegression:
    def __init__(self, eta, iterations=20, C=0.001, penalty="l2"):
        self.eta = eta
        self.iters = iterations
        self.C = C
        self.penalty = penalty # Regularization penalty type: "none", "l1", "l2", or "both"
        # internally we will store the weights as self.w_ to keep with sklearn conventions
        
    def __str__(self):
        if(hasattr(self,'w_')):
            return 'Binary Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained Binary Logistic Regression Object'
        
    # convenience, private:
    @staticmethod
    def _add_bias(X):
        return np.hstack((np.ones((X.shape[0],1)),X)) # add bias term
    
    @staticmethod
    def _sigmoid(theta):
        # increase stability, redefine sigmoid operation
        return expit(theta) #1/(1+np.exp(-theta))
    
    # vectorized gradient calculation with regularization using L2 Norm
    def _get_gradient(self,X,y):
        ydiff = y-self.predict_proba(X,add_bias=False).ravel() # get y difference
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0) # make ydiff a column vector and multiply through
        
        gradient = gradient.reshape(self.w_.shape)
        # gradient[1:] += -2 * self.w_[1:] * self.C

        # add regularization (none does nothing)
        if self.penalty == "l1":
            gradient[1:] += -self.C * np.sign(self.w_[1:])
        elif self.penalty == "l2":
            gradient[1:] += -2 * self.C * self.w_[1:]
        elif self.penalty == "both":
            gradient[1:] += -self.C * (np.sign(self.w_[1:]) + 2 * self.w_[1:])
        
        return gradient
    
    # public:
    def predict_proba(self,X,add_bias=True):
        # add bias term if requested
        Xb = self._add_bias(X) if add_bias else X
        return self._sigmoid(Xb @ self.w_) # return the probability y=1
    
    def predict(self,X):
        return (self.predict_proba(X)>0.5) #return the actual prediction
    
    
    def fit(self, X, y):
        Xb = self._add_bias(X) # add bias term
        num_samples, num_features = Xb.shape
        
        self.w_ = np.zeros((num_features,1)) # init weight vector to zeros
        
        # for as many as the max iterations
        for _ in range(self.iters):
            gradient = self._get_gradient(Xb,y)
            self.w_ += gradient*self.eta # multiply by learning rate 
            # add bacause maximizing 

# **Line Search Logistic Regression**

In [34]:
# Update Binary Logistic Regression To Use Line Search
class LineSearchLogisticRegression(BinaryLogisticRegression):
    
    # define custom line search for problem
    def __init__(self, line_iters=0.0, **kwds):        
        self.line_iters = line_iters
        # but keep other keywords
        super().__init__(**kwds) # call parent initializer
    
    # this defines the function with the first input to be optimized
    # therefore eta will be optimized, with all inputs constant
    @staticmethod
    def objective_function(eta,X,y,w,grad,C):
        wnew = w - grad*eta
        g = expit(X @ wnew)
        # the line search is looking for minimization, so take the negative of l(w)
        return -np.sum(ma.log(g[y==1]))-ma.sum(np.log(1-g[y==0])) + C*sum(wnew**2)
    
        
    def fit(self, X, y):
        Xb = self._add_bias(X) # add bias term
        num_samples, num_features = Xb.shape
        
        self.w_ = np.zeros((num_features,1)) # init weight vector to zeros
        
        # for as many as the max iterations
        for _ in range(self.iters):
            gradient = -self._get_gradient(Xb,y)
            # minimization is in opposite direction
            
            # do line search in gradient direction, using scipy function
            opts = {'maxiter':self.line_iters} # unclear exactly what this should be
            res = minimize_scalar(self.objective_function, # objective function to optimize
                                  bounds=(0,self.eta*10), #bounds to optimize
                                  args=(Xb,y,self.w_,gradient,self.C), # additional argument for objective function
                                  method='bounded', # bounded optimization for speed
                                  options=opts) # set max iterations
            
            eta = res.x # get optimal learning rate
            self.w_ -= gradient*eta # set new function values
            # subtract to minimize 

# **Stochastic Logistic Regression**

In [35]:
class StochasticLogisticRegression(BinaryLogisticRegression):
    # stochastic gradient calculation 
    def _get_gradient(self,X,y):
        idx = int(np.random.rand()*len(y)) # grab random instance
        ydiff = y[idx]-self.predict_proba(X[idx],add_bias=False) # get y difference (now scalar)
        gradient = X[idx] * ydiff[:,np.newaxis] # make ydiff a column vector and multiply through
        
        gradient = gradient.reshape(self.w_.shape)
        gradient[1:] += -2 * self.w_[1:] * self.C
        
        return gradient     

# **Hessian Binary Logistic Regression**

In [36]:
class HessianBinaryLogisticRegression(BinaryLogisticRegression):
    # just overwrite gradient function
    def _get_gradient(self,X,y):
        g = self.predict_proba(X,add_bias=False).ravel() # get sigmoid value for all classes
        hessian = X.T @ np.diag(g*(1-g)) @ X - 2 * self.C # calculate the hessian

        ydiff = y-g # get y difference
        gradient = np.sum(X * ydiff[:,np.newaxis], axis=0) # make ydiff a column vector and multiply through
        gradient = gradient.reshape(self.w_.shape)
        gradient[1:] += -2 * self.w_[1:] * self.C
        
        return pinv(hessian) @ gradient

# **BFGS Binary Logistic Regression**

In [37]:
# BFGS (Broyden–Fletcher–Goldfarb–Shanno) Algorithm Implemented 
class BFGSBinaryLogisticRegression(BinaryLogisticRegression):
    
    @staticmethod
    def objective_function(w,X,y,C):
        g = expit(X @ w)
        # invert this because scipy minimizes, but we derived all formulas for maximzing
        return -np.sum(ma.log(g[y==1]))-np.sum(ma.log(1-g[y==0])) + C*sum(w**2) 
        #-np.sum(y*np.log(g)+(1-y)*np.log(1-g))

    @staticmethod
    def objective_gradient(w,X,y,C):
        g = expit(X @ w)
        ydiff = y-g # get y difference
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0)
        gradient = gradient.reshape(w.shape)
        gradient[1:] += -2 * w[1:] * C
        return -gradient
    
    # just overwrite fit function
    def fit(self, X, y):
        Xb = self._add_bias(X) # add bias term
        num_samples, num_features = Xb.shape
        
        self.w_ = fmin_bfgs(self.objective_function, # what to optimize
                            np.zeros((num_features,1)), # starting point
                            fprime=self.objective_gradient, # gradient function
                            args=(Xb,y,self.C), # extra args for gradient and objective function
                            gtol=1e-03, # stopping criteria for gradient, |v_k|
                            maxiter=self.iters, # stopping criteria iterations
                            disp=False)
        
        self.w_ = self.w_.reshape((num_features,1))

# **Multi Class Logistic Regression**

In [38]:
# allow for the user to specify the algorithm they want to solver the binary case
class MultiClassLogisticRegression:
    def __init__(self, eta, iterations=20, 
                 C=0.0001, 
                 solver=BFGSBinaryLogisticRegression,
                 penalty = 'l2'):
        self.eta = eta
        self.iters = iterations
        self.C = C
        self.solver = solver
        self.penalty = penalty
        self.classifiers_ = []
        # internally we will store the weights as self.w_ to keep with sklearn conventions
    
    def __str__(self):
        if(hasattr(self,'w_')):
            return 'MultiClass Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained MultiClass Logistic Regression Object'
        
    def fit(self,X,y):
        num_samples, num_features = X.shape
        self.unique_ = np.sort(np.unique(y)) # get each unique class value
        num_unique_classes = len(self.unique_)
        self.classifiers_ = []
        for i,yval in enumerate(self.unique_): # for each unique value
            y_binary = np.array(y==yval).astype(int) # create a binary problem
            # train the binary classifier for this class
            
            hblr = self.solver(eta=self.eta,iterations=self.iters,C=self.C)
            hblr.fit(X,y_binary)

            # add the trained classifier to the list
            self.classifiers_.append(hblr)
            
        # save all the weights into one matrix, separate column for each class
        self.w_ = np.hstack([x.w_ for x in self.classifiers_]).T
        
    def predict_proba(self,X):
        probs = []
        for hblr in self.classifiers_:
            probs.append(hblr.predict_proba(X).reshape((len(X),1))) # get probability for each classifier
        
        return np.hstack(probs) # make into single matrix
    
    def predict(self,X):
        return np.argmax(self.predict_proba(X),axis=1) # take argmax along row

In [42]:
%%time
lr = MultiClassLogisticRegression(eta=1.0,
                                  iterations=4,
                                  C=0.01,
                                  solver=BFGSBinaryLogisticRegression,
                                  penalty='l2'
                                 )
lr.fit(X_train,y_train)
print(lr)

yhat = lr.predict(X_test)
print('Accuracy of: ',accuracy_score(y_test,yhat))

MultiClass Logistic Regression Object with coefficients:
[[ 0.29042546  0.66260688  0.74538491 -1.13375801 -0.75909516]
 [-0.29042546 -0.66260688 -0.74538491  1.13375801  0.75909516]]
Accuracy of:  0.9666666666666667
CPU times: total: 0 ns
Wall time: 11.1 ms


In [53]:
%%time
lr = MultiClassLogisticRegression(eta=1.0,
                                  iterations=5,
                                  C=0.001,
                                  solver=HessianBinaryLogisticRegression,
                                  penalty='l1'
                                 )
lr.fit(X_train,y_train)
print(lr)

yhat = lr.predict(X_test)
print('Accuracy of: ',accuracy_score(y_test,yhat))

MultiClass Logistic Regression Object with coefficients:
[[ 15.79560094   1.9123695    1.14728782  -3.81520138  -7.45947451]
 [-15.79560094  -1.9123695   -1.14728782   3.81520138   7.45947451]]
Accuracy of:  0.9666666666666667
CPU times: total: 0 ns
Wall time: 3.5 ms


In [None]:
%%time
# how do we compare now to sklearn?
from sklearn.linear_model import LogisticRegression

lr_sk = LogisticRegression(solver='lbfgs',n_jobs=1,
                           multi_class='ovr', C = 1/0.001, 
                           penalty='l2',
                          max_iter=50) # all params default
# note that sklearn is optimized for using the liblinear library with logistic regression
# ...and its faster than our implementation here

lr_sk.fit(X_train, y_train) # no need to add bias term, sklearn does it internally!!
print(lr_sk.coef_)
yhat = lr_sk.predict(X_test)
print('Accuracy of: ',accuracy_score(y_test,yhat))

In [None]:
%%time
# actually, we aren't quite as good as the lib linear implementation
# how do we compare now to sklearn?
from sklearn.linear_model import LogisticRegression

lr_sk = LogisticRegression(solver='liblinear',n_jobs=1, 
                           multi_class='ovr', C = 1/0.001, 
                           penalty='l2',max_iter=100) 

lr_sk.fit(X_train,y_train) # no need to add bias term, sklearn does it internally!!
print(lr_sk.coef_)
yhat = lr_sk.predict(X_test)
print('Accuracy of: ',accuracy_score(y_test,yhat))

In [28]:
from sklearn.datasets import load_iris
import numpy as np
from sklearn.metrics import accuracy_score
from scipy.special import expit
from sklearn.model_selection import train_test_split

ds = load_iris()
X = ds.data
y = (ds.target>1).astype(int) # make problem binary
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.8, test_size=0.2)

In [30]:
%%time

#

blr = BinaryLogisticRegression(eta=0.1, iterations=50,C=0.001)

blr.fit(X_train,y_train)
print(blr)

yhat = blr.predict(X_test)
print('Accuracy of: ',accuracy_score(y_test,yhat))

#

lslr = LineSearchLogisticRegression(eta=1.0,
                                    iterations=6, 
                                    line_iters=8, 
                                    C=0.001)

lslr.fit(X_train,y_train)

yhat = lslr.predict(X_test)
print(lslr)
print('Accuracy of: ',accuracy_score(y_test,yhat))

#

slr = StochasticLogisticRegression(eta=0.01, iterations=800, C=0.001) # take a lot more steps!!

slr.fit(X_train,y_train)

yhat = slr.predict(X_test)
print(slr)
print('Accuracy of: ',accuracy_score(y_test,yhat))

#

hlr = HessianBinaryLogisticRegression(eta=1.0,
                                      iterations=4,
                                      C=0.001) # note that we need only a few iterations here

hlr.fit(X_train,y_train)
yhat = hlr.predict(X_test)
print(hlr)
print('Accuracy of: ',accuracy_score(y_test,yhat))

#

bfgslr = BFGSBinaryLogisticRegression(_,iterations=3,C=0.001) # note that we need only a few iterations here

bfgslr.fit(X_train,y_train)
yhat = bfgslr.predict(X_test)
print(bfgslr)
print('Accuracy of: ',accuracy_score(y_test,yhat))

Binary Logistic Regression Object with coefficients:
[[-0.19967076]
 [-0.47455757]
 [-0.49940795]
 [ 0.77090226]
 [ 0.53602837]]
Accuracy of:  0.9666666666666667
Binary Logistic Regression Object with coefficients:
[[-1.06296708]
 [-2.49138229]
 [-3.0503734 ]
 [ 4.23395947]
 [ 2.68939073]]
Accuracy of:  1.0
Binary Logistic Regression Object with coefficients:
[[-0.25950273]
 [-0.59364614]
 [-0.61723524]
 [ 1.04556247]
 [ 0.71349414]]
Accuracy of:  0.9333333333333333
Binary Logistic Regression Object with coefficients:
[[-12.02978114]
 [ -1.18083715]
 [ -0.13854738]
 [  1.98140576]
 [  6.15424339]]
Accuracy of:  0.9666666666666667
Binary Logistic Regression Object with coefficients:
[[-0.19732186]
 [-0.47166791]
 [-0.58692298]
 [ 0.86137527]
 [ 0.5429108 ]]
Accuracy of:  0.9666666666666667
CPU times: total: 0 ns
Wall time: 35.2 ms


  return -np.sum(ma.log(g[y==1]))-ma.sum(np.log(1-g[y==0])) + C*sum(wnew**2)
  r = (xf - nfc) * (fx - ffulc)
  q = (xf - fulc) * (fx - fnfc)
