# Logistic Regression From Scratch

In this notebook, we will implement the logistic regression algorithm from scratch.

In [29]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Load the `fair` dataset bundled with statsmodels and turn the `affairs`
# column into a 0/1 indicator (1 when the recorded value is above zero).
data = sm.datasets.fair.load_pandas().data
data['affairs'] = data['affairs'].gt(0).astype(int)

# Target is the binarised column; every remaining column is a feature.
y = data['affairs']
x = data.drop('affairs', axis = 1)

In [30]:
class logisticRegression(object):
    """Binary logistic regression trained with batch gradient descent.

    Attributes:
        coef: 1-D numpy array ``[bias, w_1, ..., w_k]`` after ``fit``; the
            scalar ``0`` before the model has been fitted.
        cost_history: list with the log-loss recorded at every update made by
            the most recent ``fit`` call (empty before fitting).
    """

    # The linear predictor is clipped to this magnitude before np.exp so that
    # extreme inputs cannot overflow; sigmoid(+-500) is already 0/1 to far
    # more than the 3 decimals that `sigmoid` reports.
    _MAX_LOGIT = 500.0
    # Probabilities are kept this far from exact 0 and 1 inside the cost so
    # that the logarithms stay finite.
    _EPS = 1e-12

    def __init__(self):
        self.coef = 0
        self.cost_history = []

    def _probabilities(self, design_matrix):
        """Return the unrounded sigmoid of ``design_matrix . coef``."""
        logits = np.dot(design_matrix, self.coef)
        logits = np.clip(logits, -self._MAX_LOGIT, self._MAX_LOGIT)
        return 1.0 / (1.0 + np.exp(-logits))

    def sigmoid(self, array):
        """Return sigmoid(array . coef) rounded to 3 decimals.

        Args:
            array: numeric array whose columns line up with ``self.coef``
                (i.e. the bias column is already included).

        Returns:
            numpy array of probabilities rounded to 3 decimal places.

        Side effect: the input is stored on ``self.array`` (kept for
        backward compatibility with code that reads it).
        """
        self.array = array
        return self._probabilities(array).round(3)

    def fit(self, x, y, alpha, threshold, max_iter=100, random_state=None):
        """Estimate the coefficients by gradient descent on the log-loss.

        Steps: (1) check that x and y have the same number of rows, (2) reject
        missing values, (3) reject object-typed (categorical) columns,
        (4) convert to numpy arrays, (5) prepend a bias column of ones,
        (6) initialise the coefficients uniformly in [-1, 1), (7) keep the
        cost of every iteration, (8) repeat

            gradient = t(x) . (predicted - y) / number_of_rows
            coef     = coef - alpha * gradient
            alpha    = alpha * 0.95

        until at least 5 costs exist and the absolute difference between the
        mean of the last 5 costs and the last cost is below ``threshold``,
        or until ``max_iter`` updates have been made.

        Args:
            x: input pandas DataFrame (features).
            y: output pandas Series (0/1 class labels).
            alpha: initial learning rate; multiplied by 0.95 after each update.
            threshold: convergence tolerance on the cost (see above).
            max_iter: maximum number of updates. Defaults to 100; pass a
                larger value (e.g. 10000) for a longer run.
            random_state: optional seed for the coefficient initialisation.
                When None, numpy's global random state is used.

        Returns:
            None. The result is stored in ``self.coef`` and
            ``self.cost_history``.

        Raises:
            ValueError: on a row-count mismatch, empty data, missing values
                or object-typed columns.
        """
        #1 no of rows check
        if x.shape[0] != y.shape[0]:
            raise ValueError('Number of rows in x and y do not match')
        if x.shape[0] == 0:
            raise ValueError('No rows in data')

        #2 missing value check
        if x.isnull().values.any() or y.isnull().values.any():
            raise ValueError('Missing value in data')

        #3 categorical columns in data
        if len(x.select_dtypes(include = ['object']).columns) > 0 or y.dtype == 'object':
            raise ValueError('Categorical columns exist in data')

        #4 convert to numpy array
        features = np.asarray(x, dtype=float)
        targets = np.asarray(y, dtype=float)

        #5 add bias to the data
        n_rows = features.shape[0]
        design = np.concatenate((np.ones((n_rows, 1)), features), axis = 1)

        #6 init coeff for all columns + bias
        if random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(random_state)
        self.coef = rng.uniform(low=-1, high=1, size=design.shape[1])

        #7 list for cost values for each iteration
        costs = []

        #8 gradient descent
        for _ in range(max_iter):

            # Stop as soon as the cost has flattened out. The absolute value
            # keeps a rising cost from being mistaken for convergence.
            if len(costs) >= 5:
                recent_mean = float(sum(costs[-5:])) / 5
                if abs(recent_mean - costs[-1]) < threshold:
                    break

            # Unrounded probabilities: rounding here would quantise the
            # gradient and stall learning.
            preds = self._probabilities(design)

            # Cross-entropy (log-loss) averaged over the rows.
            safe = np.clip(preds, self._EPS, 1.0 - self._EPS)
            cost = -np.mean(targets * np.log(safe)
                            + (1.0 - targets) * np.log(1.0 - safe))
            costs.append(cost)

            # d(cost)/d(coef) = X^T (preds - y) / n. The sign matters: with
            # (y - preds) the update below would climb the cost instead.
            error = preds - targets
            gradient = np.dot(design.T, error) / n_rows

            self.coef = self.coef - alpha * gradient
            alpha = alpha * 0.95

        self.cost_history = costs

    def predict_prob(self, x):
        """Return the predicted probability of class 1 for each row of x.

        x is converted to a numpy array and a bias column is prepended, then
        the number of columns is checked against ``self.coef``.

        Args:
            x: 2-D array-like of features (without the bias column).

        Returns:
            numpy array of probabilities rounded to 3 decimals.

        Raises:
            ValueError: if the model has not been fitted, or the number of
                columns does not match the coefficients.
        """
        if np.ndim(self.coef) == 0:
            raise ValueError('model is not fitted; call fit before predicting')

        features = np.asarray(x)
        bias = np.ones((features.shape[0], 1))
        design = np.concatenate((bias, features), axis = 1)

        if design.shape[1] != self.coef.shape[0]:
            raise ValueError('size of input array does not match coefficients')

        return self.sigmoid(design)

    def predict_class(self, x):
        """Make discrete predictions: 1.0 where probability > 0.5, else 0.0.

        Returns:
            numpy float array of 0.0 / 1.0 labels, one per row of x.
        """
        return (self.predict_prob(x) > 0.5).astype(float)

    def get_accuracy(self, x, y):
        """Print and return the accuracy rate on (x, y).

        Accuracy is the number of correct classifications (true positives
        plus true negatives) divided by the total number of instances.

        Args:
            x: 2-D array-like of features (without the bias column).
            y: array-like of true 0/1 labels, one per row of x.

        Returns:
            The accuracy as a float (also printed).

        Raises:
            ValueError: if x and y have a different number of rows.
        """
        pred = self.predict_class(x)
        actual = np.asarray(y).ravel()
        if actual.shape[0] != pred.shape[0]:
            raise ValueError('Number of rows in x and y do not match')

        accuracy = float(np.mean(pred == actual))
        print(accuracy)
        return accuracy
        