<a href="https://colab.research.google.com/github/CedricDamais/LogisticRegression/blob/main/LogisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logistic Regression from Scratch

- In this Project I coded from scratch a `Logistic Regression`, with different loss function, `Log Loss and MSE` . Then I tested them on the `Breast cancer Dataset`. I ended my project by comparing my Logistic Regression made from scratch with the built in Logistic Regression within sklearn.

In [2]:
import math

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [17]:
class CustomLogisticRegression:

    def __init__(self, fit_intercept=True, lr=0.01, epochs=1000):
        self.fit_intercept = fit_intercept
        self.lr = lr
        self.epochs = epochs

    def sigmoid(self, t: np.ndarray) -> float:
        val:float = 1 / (1+math.exp(-t))
        return val

    def predict_proba(self, row, cof_):

        t = np.dot(cof_, row)
        return self.sigmoid(t)

    def fit_mse(self, X_train, y_train):
        row_count = X_train.shape[0]
        mse_error_first = [0.]*y_train.shape[0]
        mse_error_last = [0.]*y_train.shape[0]
        self.coef = self.coef_ = np.array([0.] * len(X_train.keys())) if self.fit_intercept else np.array(
            [0.] * len(X_train.keys()))

        for epoch in range(self.epochs):
            for i in range(len(X_train)):
                row = X_train.iloc[i]
                y_hat = self.predict_proba(np.array(row), self.coef_)
                y_i = y_train.iloc[i]

                # update the weights
                for index, train_row in enumerate(row):
                    if self.fit_intercept and index == 0:
                        self.coef_[index] += -self.lr * (y_hat - y_i) * y_hat * (1 - y_hat)
                        continue

                    if train_row != 'intercept':
                        x_i_j = row[index]
                        derive = self.lr * (y_hat - y_i) * y_hat * (1 - y_hat) * x_i_j
                        self.coef_[index] = self.coef_[index] - derive

                if epoch == 0:
                    mse_error_first[i] = ((y_hat - y_i) ** 2) / row_count

                if epoch == self.epochs - 1:
                    mse_error_last[i] = ((y_hat - y_i) ** 2) / row_count
            return mse_error_first, mse_error_last



    def fit_log_loss(self, X_train, y_train):
            row_count = X_train.shape[0]
            log_loss_error_first = [0.] * y_train.shape[0]
            log_loss_error_last = [0.] * y_train.shape[0]
            self.coef_ = np.array([0.] * len(X_train.keys())) if self.fit_intercept else np.array(
                [0.] * len(X_train.keys()))
            for epoch in range(self.epochs):
                for i in range(len(X_train)):

                    row = X_train.iloc[i]
                    y_hat = self.predict_proba(np.array(row), self.coef_)
                    y_i = y_train.iloc[i]

                    for index, train_row in enumerate(row):
                        if self.fit_intercept and index == 0:
                            self.coef_[index] += - ((self.lr * (y_hat - y_i)) / row_count)
                            continue
                        x_i_j = row[index]
                        derive = (self.lr * (y_hat - y_i) * x_i_j) / row_count
                        self.coef_[index] -= derive

                    if epoch == 0:
                        log_loss_error_first[i] = -(y_i * math.log(y_hat) + (1 - y_i) * math.log(1 - y_hat)) / row_count

                    if epoch == self.epochs - 1:
                        log_loss_error_last[i] = -(y_i * math.log(y_hat) + (1 - y_i) * math.log(1 - y_hat)) / row_count

            return log_loss_error_first, log_loss_error_last


    def predict(self, X_test, cut_off=0.5):
            predictions = []
            for i in range(len(X_test)):
                y_hat = self.predict_proba(X_test.iloc[i], self.coef_)
                predictions.append(1) if y_hat >= cut_off else predictions.append(0)
            return np.array(predictions)  # predictions are binary values - 0 or 1


In [4]:
def standardize(feature):
    mean = np.mean(feature)
    std_deviation = np.std(feature)
    z = (feature - mean) / std_deviation
    return z

In [5]:
def intercept_is_true(X_train, X_test):
    X_test['intercept'] = 1.
    X_train['intercept'] = 1.
    x_test_col = X_test.pop('intercept')
    x_train_col = X_train.pop('intercept')
    X_train.insert(0, 'intercept', x_train_col)
    X_test.insert(0, 'intercept', x_test_col)

    return X_test, X_train

- The features of the data we are going to use to make predictions are `worst concave points`, `worst perimeter`, `worst radius`.

In [23]:

    df_features, y = load_breast_cancer(return_X_y=True, as_frame=True)
    # The features we will be using to make predictions are
    X = df_features[['worst concave points', 'worst perimeter', 'worst radius']]
    X = standardize(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size=0.8,
                                                        random_state=43)

    # Creating  LogisiticRegression with different loss functions each
    model_2 = LogisticRegression()
    model_1 = CustomLogisticRegression(fit_intercept=True, lr=0.01, epochs=1000)
    model_0 = CustomLogisticRegression(fit_intercept=True, lr=0.01, epochs=1000)
    if model_0.fit_intercept:
        X_test, X_train = intercept_is_true(X_test=X_test, X_train=X_train)

    # Getting the mse error from the last and first epochs
    mse_error_first, mse_error_last = model_1.fit_mse(X_train, y_train)
    # Same for the log_loss to later compare the models
    log_loss_error_first, log_loss_error_last = model_0.fit_log_loss(X_train, y_train)



  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [26]:
predictions_mse = model_1.predict(X_test=X_test)

predictions_log_loss = model_0.predict(X_test=X_test)
mse_acc = accuracy_score(y_true=y_test, y_pred=predictions_mse)
log_loss_acc = accuracy_score(y_true=y_test, y_pred=predictions_log_loss)

model_2.fit(X_train, y_train)
sklearn_prediction = model_2.predict(X_test)
sklearn_acc = accuracy_score(y_true=y_test, y_pred=sklearn_prediction)

dic = {'mse_accuracy': mse_acc, 'logloss_accuracy': log_loss_acc, 'sklearn_accuracy': sklearn_acc,
           'mse_error_first': mse_error_first,
           'mse_error_last': mse_error_last, 'logloss_error_first': log_loss_error_first,
           'logloss_error_last': log_loss_error_last}

print(dic)

{'mse_accuracy': 0.9649122807017544, 'logloss_accuracy': 0.9649122807017544, 'sklearn_accuracy': 0.9649122807017544, 'mse_error_first': [0.0005494505494505495, 0.0005479373271209118, 0.0005462883498398668, 0.0005483642804399022, 0.000544972318400189, 0.0005465302795766751, 0.0005461069779873782, 0.0005437570329137248, 0.0005449449074052375, 0.0005332036459325841, 0.0005466937294306692, 0.0005348518144103991, 0.0005418792268097456, 0.0005452077846616447, 0.0005255235277976168, 0.0005410726787893249, 0.0005389618114841878, 0.0005382571392119537, 0.000535985471203609, 0.0005391434306907999, 0.0005285877446330425, 0.0005243358860323738, 0.0005194245003163275, 0.0005218662799461391, 0.0005275402556038037, 0.0005294432983057608, 0.000544080605494967, 0.0005275679254600012, 0.0005305357034998527, 0.0005150953830473473, 0.0004898084788349188, 0.000493493712331345, 0.0005908325058103857, 0.0005415874659092546, 0.0005292226643911327, 0.0005193244971354963, 0.0005299924268356507, 0.00050255898945