Let's derive equation (1): $l(\theta) = \frac{1}{n}\sum_{i=1}^n \log (1+e^{ - y_ix_{i}^{T} \theta}) $ \\
\
$
\mathbb{P}(y_i|x_i;\theta) =
\left\{
    \begin{array}{ll}
        \frac{1}{1+e^{-x_{i}^{T} \theta}} & \mbox{if } y_i = 1 \\
        1 - \frac{1}{1+e^{-x_{i}^{T} \theta}} & \mbox{if } y_i = -1
    \end{array}
\right.
\implies
\mathbb{P}(y_i|x_i;\theta) =
\left\{
    \begin{array}{ll}
        \frac{1}{1+e^{-x_{i}^{T} \theta}} & \mbox{if } y_i = 1 \\
        \frac{1}{1+e^{x_{i}^{T} \theta}} & \mbox{if } y_i = -1
    \end{array}
\right.
$ \\
\
$$
\mathbb{P}(y_i|x_i;\theta) = \frac{1}{1+e^{ - y_ix_{i}^{T} \theta}} \space \text{ then, }
\mathbb{P}(Y|X;\theta) = \prod_{i=1}^n \frac{1}{1+e^{ - y_ix_{i}^{T} \theta}}
$$
$$
\log \mathbb{P}(Y|X;\theta) = \log( \prod_{i=1}^n \frac{1}{1+e^{ - y_ix_{i}^{T} \theta}}) 
\implies
\log \mathbb{P}(Y|X;\theta) = \sum_{i=1}^n \log (\frac{1}{1+e^{ - y_ix_{i}^{T} \theta}}) 
$$
$$
\implies
\log \mathbb{P}(Y|X;\theta) = - \sum_{i=1}^n \log (1+e^{ - y_ix_{i}^{T} \theta})
\implies
- \log \mathbb{P}(Y|X;\theta) =  \sum_{i=1}^n \log (1+e^{ - y_ix_{i}^{T} \theta})
$$
$$
\boxed{ l(\theta) = -\frac{1}{n} \log \mathbb{P}(Y|X;\theta) = \frac{1}{n}\sum_{i=1}^n \log (1+e^{ - y_ix_{i}^{T} \theta}) }
$$ \\
\
$$
\frac{\partial l(\theta)}{\partial \theta} = - \frac{1}{n}  \sum_{i=1}^n \frac{y_ix_{i}}{1+e^{ y_ix_{i}^{T} \theta}} 
$$ \\
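To update the parameters of our model, we take a gradient-descent step with learning rate $\alpha$ (`lr` in the code):
$$
\theta \leftarrow \theta - \alpha \frac{\partial l(\theta)}{\partial \theta} = \theta + \frac{\alpha}{n}  \sum_{i=1}^n \frac{y_ix_{i}}{1+e^{ y_ix_{i}^{T} \theta}}
$$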

# Libraries

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
# NOTE(review): `epsilon` is not referenced by any cell visible in this notebook; presumably it was
# meant as a numerical-stability guard (e.g. inside a log) — confirm it is still needed.
epsilon = 1e-7

# Data

In [2]:
# Load the Iris dataset and unpack it into the feature matrix and the class labels.
iris = datasets.load_iris()
features, labels = iris.data, iris.target

# Model

In [3]:
class LogisticRegression:
    """Binary logistic regression for labels in {-1, +1}, trained by batch gradient descent.

    Minimises the loss of equation (1):
        l(theta) = (1/n) * sum_i log(1 + exp(-y_i * x_i^T theta))

    Parameters
    ----------
    lr : float
        Step size applied to every gradient-descent update.
    num_iter : int
        Number of full-batch updates performed by `fit`.
    fit_intercept : bool
        When True, a column of ones is prepended to X so theta[0] acts as the bias.
    verbose : bool
        When truthy, the loss is printed every 100 iterations.

    Attributes
    ----------
    theta : ndarray of shape (n_columns, 1)
        Weight column set by `fit`; n_columns is the number of features, plus one
        when `fit_intercept` is True.
    """

    def __init__(self, lr=0.01, num_iter=100000, fit_intercept=True, verbose=True):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.verbose = verbose

    def __add_intercept(self, X):
        """Return X with a leading column of ones (the bias feature)."""
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)

    def __sigmoid(self, z):
        """Logistic function 1 / (1 + exp(-z)), evaluated without overflow."""
        # log(1 + exp(-z)) == logaddexp(0, -z); unlike np.exp(-z) it does not
        # overflow when z is very negative.
        return np.exp(-np.logaddexp(0, -z))

    def __loss(self, margins):
        """Mean logistic loss of equation (1) for margins y_i * x_i^T theta."""
        # -log(sigmoid(m)) == log(1 + exp(-m)); logaddexp avoids log(0) for
        # strongly misclassified samples.
        return np.mean(np.logaddexp(0, -margins))

    def fit(self, X, y):
        """Fit `theta` with `num_iter` full-batch gradient-descent updates.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Training labels encoded as -1 / +1.

        Raises
        ------
        ValueError
            If X and y do not contain the same number of samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).reshape(-1, 1)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f'X has {X.shape[0]} samples but y has {y.shape[0]} labels'
            )

        if self.fit_intercept:
            X = self.__add_intercept(X)

        # weights initialization
        self.theta = np.zeros((X.shape[1], 1))

        # Rows y_i * x_i do not change between iterations, so build them once.
        signed_X = X * y

        for i in range(self.num_iter):
            # margins[i] = y_i * x_i^T theta, shape (n_samples, 1)
            margins = signed_X @ self.theta

            # Report the loss of the current (pre-update) theta; it is only
            # evaluated on the iterations that actually print it.
            if self.verbose and i % 100 == 0:
                print(f'loss: {self.__loss(margins)} \t')

            # Negative gradient: (1/n) * sum_i y_i x_i * sigmoid(-margin_i)
            descent = (signed_X.T @ self.__sigmoid(-margins)) / y.size
            self.theta += self.lr * descent
    

# Training 

In [4]:
#split data in train and test set
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=2,stratify=labels)

# We apply OneVsAll
list_theta = []
u = np.unique(labels)
for i in u:
    X= X_train
    y = y_train
    # if label = 0 encodes y-->1 and for labels =/= 0 encodes y--> -1  for i = 0 and so on
    y=np.where((y==i),1,-1)
    model = LogisticRegression(lr=0.1, num_iter=1000)
    %time model.fit(X, y)
    list_theta.append(model.theta) #For each encoding, we collect its classifier (the calculated theta). 
theta = np.hstack(list_theta) #We put them in the theta so that if we want to predict, we choose the classifier with the highest confidence. 

loss: 0.6931471805599453 	
loss: 0.05585423996945716 	
loss: 0.029977142628093017 	
loss: 0.02078039554786432 	
loss: 0.016016391951014432 	
loss: 0.01308524424192899 	
loss: 0.011092352617869396 	
loss: 0.009645614536444178 	
loss: 0.008545491837373747 	
loss: 0.007679496405965182 	
CPU times: user 60.7 ms, sys: 166 µs, total: 60.8 ms
Wall time: 60.7 ms
loss: 0.6931471805599453 	
loss: 0.5745999875627902 	
loss: 0.5630583801192413 	
loss: 0.5543809058800406 	
loss: 0.5477609309709068 	
loss: 0.5426523347684292 	
loss: 0.5386625528692957 	
loss: 0.5355082934014916 	
loss: 0.5329837517303795 	
loss: 0.5309381992028918 	
CPU times: user 54.6 ms, sys: 134 µs, total: 54.7 ms
Wall time: 53.7 ms
loss: 0.6931471805599453 	
loss: 0.30010470718450466 	
loss: 0.2479547651271465 	
loss: 0.2177897711282658 	
loss: 0.19713583039225266 	
loss: 0.18191776216157848 	
loss: 0.17017619426590935 	
loss: 0.16081041124403608 	
loss: 0.15314586551266152 	
loss: 0.14674397210043072 	
CPU times: user 92.6 ms,

# Testing

In [5]:
def predict(X, theta):
    """Predict a class index for every row of X with the one-vs-all classifiers.

    `theta` holds one weight column per classifier, with the intercept weight in
    its first row. Each sample is assigned the index of the column whose linear
    score is largest, i.e. the classifier that is most confident.

    Returns an integer array of shape (n_samples, 1).
    """
    n_samples = X.shape[0]
    # Prepend the bias feature so the columns line up with theta's rows.
    design = np.hstack((np.ones((n_samples, 1)), X))
    scores = design @ theta
    return np.argmax(scores, axis=1).reshape(-1, 1)


In [6]:
# Predicted class index for every test sample, as a column of shape (n_test, 1).
preds = predict(X_test,theta)

# Performance

In [8]:
# Micro-averaged precision and recall on the held-out test set.
# NOTE(review): with average='micro' on a single-label multiclass problem, precision and recall
# both equal plain accuracy, which is why the two printed values match. The train/test split is
# stratified, so class imbalance does not appear to be the motivation here — consider
# average='macro' or per-class scores if class-level detail is wanted.
precision, recall = (
    score(y_test, preds, average='micro')
    for score in (precision_score, recall_score)
)

print('Precision: ', precision)
print('Recall: ', recall)

Precision:  0.98
Recall:  0.98


In [7]:
# Confusion matrix of the one-vs-all predictions on the test set
# (scikit-learn convention: rows are true classes, columns are predicted classes).
confusion_matrix(y_test, preds)


array([[17,  0,  0],
       [ 0, 16,  1],
       [ 0,  0, 16]])