# Binary Logistic regression

In [44]:
import numpy as np
# Render every float inside printed arrays with exactly two decimals.
np.set_printoptions(formatter={'float': "{0:0.2f}".format})
# binary logistic regression
class Logistic_regression(object):
    """Binary logistic regression trained with per-sample (stochastic) gradient descent.

    Parameters
    ----------
    n_iter : int
        Number of passes (epochs) over the training samples.
    l_rate : float
        Learning rate that scales every per-sample gradient step.
    fit_intercept : bool
        When True, a column of ones is prepended to the inputs so that the
        first weight plays the role of the bias term.
    verbose : bool
        When True, print the design matrix, the step-by-step training trace,
        a per-epoch log-likelihood summary and the predicted probabilities.

    Attributes set by ``fit``
    -------------------------
    coef : full weight vector (bias first when ``fit_intercept`` is True).
    intercept : learned bias, or 0.0 when ``fit_intercept`` is False.
    trained_coef : feature weights without the bias.
    """

    # Scores are clipped to this magnitude before exponentiation so that
    # np.exp stays finite in float64.
    _MAX_ABS_SCORE = 700.0

    def __init__(self, n_iter=10, l_rate = 0.3, fit_intercept=True, verbose=True):
        self.l_rate = l_rate
        self.n_iter=n_iter
        self.fit_intercept = fit_intercept
        self.verbose = verbose

    def __add_intercept(self, X):
        """Return ``X`` with a leading column of ones (the bias input x_0 = 1)."""
        intercept = np.ones((X.shape[0], 1))
        # Stack onto the argument itself, not onto a global training set.
        return np.hstack([intercept, X])

    def get_score(self, newdata):
        """Return the linear score ``newdata . coef`` for one sample or a batch."""
        return np.dot(newdata, self.coef)

    def sigmoid(self,score):
        """Map a score (scalar or array) to a probability in (0, 1)."""
        # Clipping keeps exp() from overflowing for very negative scores;
        # values inside the clip range are computed exactly as 1 / (1 + e^-s).
        bounded = np.clip(score, -self._MAX_ABS_SCORE, self._MAX_ABS_SCORE)
        return 1 / (1 + np.exp(-bounded))

    def predict_prob(self, newdata):
        """Return the predicted probability of class 1 using the current weights."""
        return self.sigmoid(self.get_score(newdata))

    def log_likelihood(self, features, target, weights):
        """Return the Bernoulli log-likelihood of ``target`` under ``weights``.

        Computes sum(y * s - log(1 + e^s)) with s = features . weights.
        """
        scores = np.dot(features, weights)
        # logaddexp(0, s) == log(1 + e^s) but does not overflow for large s.
        logli = np.sum(target * scores - np.logaddexp(0, scores))
        return logli

    # estimate the coefficient values
    def stochastic_gradient_descent(self, data, label, l_rate, n_iter):
        """Fit ``self.coef`` by updating the weights after every single sample.

        Parameters
        ----------
        data : 2-D array-like, one row per sample (bias column already added
            by the caller if wanted).
        label : 1-D array-like of 0/1 targets, one per row of ``data``.
        l_rate : float, step size applied to each per-sample gradient.
        n_iter : int, number of passes over the data.

        Returns
        -------
        numpy.ndarray
            The learned weight vector (also stored in ``self.coef``).

        Raises
        ------
        ValueError
            If ``data`` and ``label`` do not contain the same number of samples.
        """
        data = np.asarray(data, dtype=float)
        label = np.asarray(label)
        if len(data) != len(label):
            raise ValueError(
                "data and label must have the same number of samples "
                "(got %d and %d)" % (len(data), len(label))
            )
        # initial 0 weight
        self.coef = np.zeros(data.shape[1])
        # run through the training data while updating the coefficients for each iteration
        for cycle in range(n_iter):
            for sample, target in zip(data, label):
                iter_result = self.predict_prob(sample)
                # derivative of the loss is (sigmoid(score) - y) * x
                error_signal = iter_result - target
                gradient = sample * error_signal
                # Honour the learning rate passed by the caller.
                weight_change = (-l_rate) * gradient
                if self.verbose:
                    print("Expect: ", target," Predict: ", iter_result)
                    print("Input sample x: ", sample)
                    print("weigth before update: ", self.coef)
                    print("w*x: ", self.get_score(sample))
                    print("error signal strength: ",error_signal)
                    print("gradient: ", gradient)
                    print("Weight changes: ", weight_change)
                self.coef += weight_change
                if self.verbose:
                    print("weigth after update: ", self.coef)
                    print("w*x: ", self.get_score(sample))
                    print("\n")
            if self.verbose:
                # The reported quantity is the log-likelihood (higher is better).
                print('\n>epoch=%d, lrate=%.3f, log_likelihood=%.3f \n' % (cycle, l_rate, self.log_likelihood(data, label, self.coef)))
        return self.coef

    def fit(self, data, label):
        """Train on ``data`` (2-D array-like) and ``label`` (0/1 targets).

        Sets ``self.coef``, ``self.intercept`` and ``self.trained_coef``.
        """
        data = np.asarray(data, dtype=float)
        # add x_0 as 1 for bias term
        if self.fit_intercept:
            data = self.__add_intercept(data)
        if self.verbose:
            print(data)
        weights = self.stochastic_gradient_descent(data, label, self.l_rate, self.n_iter)
        if self.fit_intercept:
            self.intercept = weights[0]
            self.trained_coef = weights[1:]
        else:
            # No bias column was added, so every weight belongs to a feature.
            self.intercept = 0.0
            self.trained_coef = weights
        if self.verbose:
            print("Intercept: ",self.intercept, "Final coefficient: ", self.trained_coef)

    def predict(self, newdata):
        """Return a list of 0/1 class labels for the rows of ``newdata``.

        A sample is labelled 1 when its predicted probability is >= 0.5.
        """
        newdata = np.asarray(newdata, dtype=float)
        if self.fit_intercept:
            newdata = self.__add_intercept(newdata)
        probabilities = self.predict_prob(newdata)
        if self.verbose:
            print(probabilities)
        return [1 if sample >= 0.5 else 0 for sample in probabilities]

1. Linear regression function: 
\begin{equation}
y=WX+b = w_{0}+w_{1}x_{1}+w_{2}x_{2}+...+w_{n}x_{n}
\end{equation}
where $W$ is the weight matrix, $X$ is the input data, and $b$ is the bias. The rightmost expression is the feature-wise expansion of $WX+b$, where $w_{0}=b$.
2. Sigmoid activation function:
\begin{equation}
Sigmoid(y) = \frac{1} {1 + e^{-y}}
\end{equation}
3. Apply the sigmoid function to the linear regression output to get a probability:
\begin{equation}
P = \frac{1} {1 + e^{-(WX+b)}}
\end{equation}
4. Cost function of linear regression, which can't be used directly for logistic regression (combined with the sigmoid hypothesis it is no longer convex):
\begin{equation}
J(W) = \frac{1}{2m} \sum_{i=1}^{m} (h_w(x^{(i)}) - y^{(i)})^2
\end{equation}
which can be rewritten as:
\begin{equation}
J(W) = \frac{1}{m} \sum_{i=1}^{m} \frac{1}{2}(h_w(x^{(i)}) - y^{(i)})^2
\end{equation}
define a Cost function: 
\begin{equation}
\mathrm{Cost}(h_w(x^{(i)}),y^{(i)}) = \frac{1}{2}(h_w(x^{(i)}) - y^{(i)})^2
\end{equation}
It takes two inputs: $h_w(x^{(i)})$, the hypothesis function (the prediction), and $y^{(i)}$, the output (the true label). Think of it as the error the algorithm makes when the model predicts $h_w(x^{(i)})$ while the actual label is $y^{(i)}$.
Now rewrite the cost function for the entire linear regression as:
\begin{equation}
J(W) = \dfrac{1}{m} \sum_{i=1}^m \mathrm{Cost}(h_w(x^{(i)}),y^{(i)})
\end{equation}
5. Optimize weight matrix W and bias b (intercept) for logistic regression, form objective (cost) function:
\begin{equation}
\mathrm{Cost}(h_w(x),y) =
\begin{cases}
-\log(h_w(x)) & \text{if y = 1} \\
-\log(1-h_w(x)) & \text{if y = 0}
\end{cases}
\end{equation}
which can be rewritten as
\begin{equation}
\mathrm{Cost}(h_w(x),y) = -y \log(h_w(x)) - (1 - y) \log(1-h_w(x))
\end{equation}
Now the final logistic regression objective function is:
\begin{equation}
J(W)= - \dfrac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log(h_w(x^{(i)})) + (1 - y^{(i)}) \log(1-h_w(x^{(i)})) \right]
\end{equation}
6. Optimize weight matrix W and bias b (intercept) for logistic regression, gradient descent:
\begin{equation}
\Delta W = - \eta \nabla J = -\eta \frac{\partial}{\partial W} J(W)= - \eta (\dfrac{1}{m} \sum_{i=1}^{m} (h_w(x^{(i)}) - y^{(i)}) x^{(i)})
\end{equation}
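First compute the weight change $\Delta W$ with the equation above, where $\eta$ is the user-supplied learning rate and $\nabla J$ is the gradient of the cost function $J$. The `stochastic_gradient_descent` method in the class above applies this rule one sample at a time (i.e. with $m=1$). Then update the weights $W$: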
\begin{equation}
W := W + \Delta W
\end{equation}



Example: 
Our goal is to predict whether a student will pass or fail based on number of hours slept and hours spent studying.

| Studied | Slept | Passed |
|---|---|---|
|  4.85	  |  9.63 |  	1  |
|  8.62	  |  3.23 |  	0  |
|  5.43	  |  8.23 |  	1  |
|  9.21	  |  6.34 |  	0  |

In [45]:
# Training set: one row per student -> [hours studied, hours slept]
data = [
    [4.85, 9.63],
    [8.62, 3.23],
    [5.43, 8.23],
    [9.21, 6.34],
]

# Target per row: 1 = passed, 0 = failed
label = [1, 0, 1, 0]

# Train the hand-written classifier, then classify the same four students.
lrclf = Logistic_regression(n_iter=10, l_rate=0.2, fit_intercept=True)
lrclf.fit(data, label)
lrclf.predict(data)

[[1.00 4.85 9.63]
 [1.00 8.62 3.23]
 [1.00 5.43 8.23]
 [1.00 9.21 6.34]]
Expect:  1  Predict:  0.5
Input sample x:  [1.00 4.85 9.63]
weigth before update:  [0.00 0.00 0.00]
w*x:  0.0
error signal strength:  -0.5
gradient:  [-0.50 -2.42 -4.82]
Weight changes:  [0.10 0.48 0.96]
weigth after update:  [0.10 0.48 0.96]
w*x:  11.725940000000001


Expect:  0  Predict:  0.9993837183479858
Input sample x:  [1.00 8.62 3.23]
weigth before update:  [0.10 0.48 0.96]
w*x:  7.39119
error signal strength:  0.9993837183479858
gradient:  [1.00 8.61 3.23]
Weight changes:  [-0.20 -1.72 -0.65]
weigth after update:  [-0.10 -1.24 0.32]
w*x:  -9.74570233502335


Expect:  1  Predict:  0.014633632790014773
Input sample x:  [1.00 5.43 8.23]
weigth before update:  [-0.10 -1.24 0.32]
w*x:  -4.209691023209496
error signal strength:  -0.9853663672099853
gradient:  [-0.99 -5.35 -8.11]
Weight changes:  [0.20 1.07 1.62]
weigth after update:  [0.10 -0.17 1.94]
w*x:  15.146412333061884


Expect:  0  Predict:  0.999980531

[1, 0, 1, 0]

In [3]:
from sklearn.linear_model import LogisticRegression

# Reference fit with scikit-learn on the same data. C is the inverse of the
# regularization strength, so a huge C makes the penalty negligible.
clf = LogisticRegression(C=1e15, fit_intercept=True).fit(data, label)

print(clf.intercept_, clf.coef_)
clf.predict(data)

[0.11420064] [[-3.51068824  3.50466732]]




array([1, 0, 1, 0])