In [1]:
%matplotlib inline

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [3]:
# Step by Step Logistic regression

In [4]:
# This note book is fucused in Binary logistic regression. 

## Introduction

<h4>
    <font color='black'>
       The logistic regression starts with the assumption of the following equation.
    </font>
  
  <font color='#3333cc'>
  $$(1) \; \;  log(\frac{p}{1-p}) = \sum_i^n \theta_n x_n + ... +\theta_1 x_1 +\theta_0$$
  </font>
That is the base point.What does this equation say ? <br> <br>
The independent variables (feature variables) are linearly related to log-odds of the probability of some classification variable to be equal to 1.<br> <br>
The main important properties of the log-odds function $F(p)=log(\frac{p}{1-p})$ : <br> <br>
<font color='#3333cc'>
  $$1)\; p \in [0,1] ;(because\; that\; is\; probability)$$ <br>
  $$2)\; F(p) \in [-\infty,-\infty,] \;(that\; makes \;it\; capable\; to\; be\; fited\; to\; linear\; function )$$<br>
  $$3) The\;  probability\;  p \; refers \;to \;the \;label(target) \;variable\; to \;be\; eaqual\; to\; 1$$ 
  </font> 
  <br>
 </h4>

<h4>
Taking the exponent of eq(1) followed by some basic algebraic handling we can achieve the :
 <font color='#3333cc'>
  $$(2) h_{\Theta}(X.\Theta) =\sigma(z) = \frac{1}{1 +e^{-z}}  \; where  \; z= \sum_i^n \theta_n x_n + ... +\theta_1 x_1 +\theta_0\;  $$  
  </font>
The eq(2) is called sigmoid function. $X$ is the feature vector $\Theta$ is our estimator vector.This function is the hypothesis function, it is used in Logistic regression for predicting the classification variable.<br> <br>
The l-regression deals with finding the best estimators parameters  $\Theta(\theta) = [\theta_n x_n ,... ,\theta_1 x_1 ,\theta_0]$ by fitting it according to a given data set.
</h4>



<h5> 
    More details about logg-odds can be found here <a href="https://daodavid93.github.io/Machine-Learning/source/html/ML/logistic-regression/Cross-entropy%20function.Investigation%20and%20gradient%20descent.html">Origin of Sigmoid</a>
</h5>

<h4>
 We need a loss function that expresses ,for an observation x ,how close the classifier output ($\hat{y}=\sigma(\theta_i.x_i)$) is to the correct function output (which is 1 ,0 ).We will call this : <br> <br>
    $$ L(\hat{y},y)= How\;much\;\hat{y}\;is\;different\;from \;true \; y$$ <br>
    We do this via a loss function that prefers the correct class labels of the training data exaples to be more quickly.<br> <br>
    
In order to be fitting the sigmoid function, the cost function is not  R-squared which is used in Linear regression, because the sigmoid is very complex and non-convex, contains many extrema. The cost function which is used is the so-called Cross-entropy function<br> <br>

</h4>

<h4>
  <font color='#3333cc'>
       $$ \;(3)\; L(\hat{y},y)= J(\Theta) = \sum_i^m Cost(h_{\Theta}(x_i),y_i)=-\frac{1}{m}\big(\sum_i^my_i.log(h_{\Theta}(x_i)) + (y_i-1)log(1 - h_{\Theta}(x_i))\big)$$
   </font>
   <br>
  For the optimization problem, we will use the well-known algorithm <a href="https://daodavid93.github.io/Machine-Learning/source/html/Linear-Algebra/gradient%20descent.html">Gradient Descent</a>.<br> <br>
  Applying Gradient descent into Cross-entropy function we achieve the folowing equations : 
  <br> 
</h4>  
  

<h4>
   
  <font color='#3333cc'> 
      $$(4) \; \theta_i = \theta_i - \frac{1}{m}\sum_i^m \big( h_{\Theta}(z) - y_i\big )x_i = $$
      $$\theta_i - \frac{1}{m}\sum_i^m \big( h_{\Theta}(\theta_n x_n + ... +\theta_1 x_1 +\theta_0) - y_i\big )x_i $$
  </font>
</h1>  

<h4>
   for intersept $\theta_0$
  <font color='#3333cc'> 
      $$(5) \;\theta_0 = \theta_0 - \frac{1}{m}\sum_i^m \big( h_{\Theta}(z) - y_i\big )$$
  </font>
  <br>
 We will try to implement step by step the above eqations,In order to opimize (3) and to fited the best estimators <br>  <br>
 $$\vec{\Theta}(\theta)= [\theta_n,\theta_{n-1},..,\theta_1,\theta_0]$$ <br>  by given dataset
</h4>  



##### There is more mathematical details and proofs about cross-entropy and gradient descent here  : <a href="https://daodavid93.github.io/Machine-Learning/source/html/ML/logistic-regression/Cross-entropy%20function.Investigation%20and%20gradient%20descent.html">Investigation of Cross-entropy function. Math proof of formula of gradient descent over Cross-entropy. Mathеmaticaly resolved </a>.<br> <br>


## Methods

In [5]:
def sigmoid(X, theta, intercept = 0 ):
    """
    Target vector X,X can be the matrix of many vectors and numer as well
    theta is an estimator vector
    intercept is theta zero elemt
    
    """
    #convertions to ndarray
    X = np.array(X)
    theta = np.array(theta)
    z = X.dot(theta.T)+intercept # scallar product : <X|theta^(-1)> + intercept
    return 1/(1+np.exp(-z)) #sigmoid transformation of z
    

In [6]:
x = np.array([[1,2,3],
         [2,2,3]])
#x = np.array([1,2,3])
theta = np.array([1,2,3])

In [7]:
def lost(arg, y_target, x_i=1):
    """
    takes arg ,that is the result of sigmoid it has to be array
    y_target label variable which is 1 or 0
    x_i lement is every i element from X vectors in one array [x[i][j]] j is constant refer to column related to j_estimator
    """
    y = np.array(y_target)
    x_i = np.array(x_i)
    return (arg-y)*x_i
    

In [8]:
def cost(X, estimators, Y_label, intecept, x_i=1):
    """
    takes:
    X is Target vectors 
    estimators are our fitin parametes theta_i
    Y_label is our target values zero or one
    x_i is the i_th element (column) of target element related to Theta i_th estimator
    """
    m = Y_label.shape[0]
    n = np.array(x_i).shape[0]
    if m != n :
        print(n)
        raise ValueError('x_i and Y_label must have same shape')
    sigmoid_result = sigmoid(X, estimators, intecept)    
    result = lost(sigmoid_result, Y_label, x_i)
    return result.sum()   
        
        

In [9]:
#cost([[1,2],[1,3]],[3,2],np.array([1,0]),1,1)
n = np.array(1)
print(n)

1


In [10]:
def gradient_descent(X_data, Y_label, times_interaction=100, learning_rate=0.1, init_value=10):
    x = np.array(X_data)
    y_l = np.array(Y_label)
    n = x.shape[1]
    m = x.shape[0]
    intercept = init_value
    esimators = np.full(n, init_value)
    
    for i in range(times_interaction):
        for i in range(len(esimators)):
            esimators[i]-=cost(x, esimators, y_l, intercept, x[:,i] )
            
        intercept -=cost(x, esimators, y_l, intercept, np.full(m, 1) )     
        
    
    print("coef:",esimators)
    print("intercept:",intercept)
    
    

In [11]:
gradient_descent([[1,2],[2,3]], [1,0])

coef: [ 0 -2]
intercept: 5.846070847119247


Let to generate out training data from back to front : 
1) We should define the training data taking into acout that the variables will be linear dependent to one hidden variable,wich will the argument of sigmoid.

Let to difine $z = a*x + b$ where $a=2$ and $b=4$ and use to define some sequence ,find the out from sigmoid, 

Then out main purpose will be to find a,b from back to front,and to see is approximatlyu equal

In [12]:

#define z argument as lininear euation of z = 2*x + 4
z_f = lambda x : 2*x+4
x = np.linspace(-10,10,30)

z_args = np.array([z_f(i) for i in x])

In [13]:

y_prime = np.array( 1/(1+np.exp(z_args)))

In [14]:
print(y_prime.shape)
print(z_args.shape)
print(x.shape)

(30,)
(30,)
(30,)


In [15]:
## generate labeled  from already define y_prime data given the sigmoid with args z = 2*x+4
def generated_label(i):
        if i < 0.5 :
            return 0
        elif i > 0.5 :
            return 1
        else :
            return np.random.randint(0,2)

In [16]:
y_label = np.array([generated_label(i) for i in y_prime])