#### Logistical Regression from Scratch in python

In [1]:
# sigmoid function used to reduce the continuous value from 
# LinearRegression into the (0, 1) 
# sigmoid(z) = 1 / (1 + e^-z)
# also we can't use the cost function from LinearRegression
# because it is a non-convex function of weights
# so we use J = -y*log(h(x)) - (1-y)log(1-h(x))
# where h(x) = sigmoid( theta * x + intercept)
# !! y -> real target value(ClassA y = 1, ClassB y = 0)
# y = 1 => J = -log(h(x))
# y = 0 => J = -log(1-h(x))
# we need to maximize the probability by minimizing the loss function
# to use now gradient descent for this cost function 
# theta = theta - alpha * dtheta

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# import warnings
# warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression

In [15]:
# Class
from numpy import ndarray

class LogisticReg():
    
    def __init__(self, learningrate, iterations):
        self.learningrate = learningrate
        self.iterations = iterations
    
    def fit(self, X:ndarray, Y:ndarray) -> None:
        # partition the no. features and no.entries
        self.noentries, self.nofeatures = X.shape
        # theta coeff init
        self.theta = np.zeros(self.nofeatures)
        self.intercept = 0
        self.X = X
        self.Y = Y

        # gradient descent learning
        for i in range(self.iterations):
            self.update_theta()
        
        return self
    
    def update_theta(self):
        # compute the sigmoid
        linearvalue = self.X.dot(self.theta) + self.intercept
        sig = 1 / (1 + np.exp(-linearvalue))

        # compute the gradient 
        # the final formula of the gradient is 
        # 1/m * sum(h(x[i]) - y[i]) * x[i][j]
        # where j is the index of the theta after which we derivated
        # in practice we make use of the matrices and vectors
        tmp = (sig - self.Y.T)
        tmp = np.reshape(tmp, self.noentries)
        dtheta = np.dot(self.X.T, tmp) / self.noentries
        dintercept = np.sum(tmp) / self.noentries

        #update the weights
        self.theta = self.theta - self.learningrate * dtheta
        self.intercept = self.intercept - self.learningrate * dintercept

        return self


    def predict(self, X: ndarray):
        linearvalue = X.dot(self.theta) + self.intercept
        Z = 1 / (1 + np.exp(-linearvalue))
        Y = np.where(Z > .5, 1, 0)
        return Y

In [None]:
# main function

def main():
    data = pd.read_csv('../database/diabetes.csv')
    X = data.iloc[: , :-1].values
    Y = data.iloc[:, -1:].values

    xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=1/3, random_state=0)

    #model training
    selfmademodel = LogisticReg(learningrate=.01, iterations=1000)

    selfmademodel.fit(xtrain, ytrain)
    model = LogisticRegression()
    model.fit(xtrain, ytrain)

    #predict the test set
    selfmadepred = selfmademodel.predict(xtest)
    pred = model.predict(xtest)

    #measure performance 
    self_correctly_classified = 0
    correctly_classified = 0

    for i in range(np.size(selfmadepred)):
        if selfmadepred[i] == ytest[i]:
            self_correctly_classified += 1
        if pred[i] == ytest[i]:
            correctly_classified += 1
        
    size = np.size(selfmadepred)
    print(f"Accuracy on test set by our model: {(self_correctly_classified / size) * 100}")
    print(f"Accuracy on the model from sklearn: {(correctly_classified / size) * 100}")

main()

256
Accuracy on test set by our model: 67.578125
Accuracy on the model from sklearn: 78.125


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [1]:
import numpy as np
from numpy import ndarray 

class LogisticRegression():

    def __init__(self, learning_rate = 0.001, iteration=1000):
        self.learning_rate = learning_rate
        self.iteration = iteration
        self.theta = None
        self.intercept = None 

    def _sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def fit(self, X:ndarray, y:ndarray):
        samplesno, featuresno = X.shape
        self.theta = np.zeros(featuresno)
        self.intercept = 0

        for _ in range(self.iteration):
            #compute the linear equivalent of the prediction
            linear_pred = np.dot(X, self.theta) + self.intercept
            #combine it with the sigmoid value to map it between 0 and 1
            predict = LogisticRegression._sigmoid(linear_pred)
            #use gradient descent 
            dtheta = (2/samplesno) * np.dot(X.T, (predict - y))
            dintercept = (1/samplesno) * np.sum(predict-y)

            self.theta = self.theta - self.learning_rate * dtheta
            self.intercept = self.intercept - self.learning_rate * dintercept

    def predict(self, X: ndarray):
        linear_pred = np.dot(X, self.theta) + self.intercept
        pred = LogisticRegression._sigmoid(linear_pred)
        label_pred = [0 if y <= .5 else 1 for y in pred]
        return label_pred


# use the algorithm
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

def act():
    regr = LogisticRegression()
    data = datasets.load_breast_cancer()
    X, y = data.data, data.target
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=1234)

    regr.fit(xtrain, ytrain)
    pred = regr.predict(xtest)

    print(np.sum(pred == ytest)/len(ytest))

act()

0.9210526315789473


  return 1 / (1 + np.exp(-x))
