In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = (15, 10)
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

In [2]:
dataset = load_breast_cancer()

In [3]:
print(dataset.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

In [4]:
data = pd.DataFrame(dataset.data, columns=dataset.feature_names)
data['target'] = dataset.target

In [5]:
X, y = data.drop('target', axis=1), data[['target']]

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=2020)

In [7]:
from sklearn.linear_model import LogisticRegressionCV

In [16]:
from sklearn.metrics import roc_auc_score
model = LogisticRegressionCV(max_iter=1000)
model.fit(X_train, y_train)
print(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

0.9914918414918414


In [6]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [34]:
class MyLogisticRegression:
    def __init__(self):
        self.n_iter = 300
        self.lambda_ = 0.1
    
    def fit(self, X, y):
        X = X.copy()
        X = np.concatenate([np.ones((X.shape[0], 1)), X], axis=1)
        self.w = np.random.randn(X.shape[1], 1)
        for it in range(self.n_iter):
            y_pred = self.predict(X)
            grad = X.T @(y * (1 - y_pred) + (1 - y) * y_pred) / len(X)
            #grad = X.T @ (y_pred - y) / len(X)
            self.w = self.w - self.lambda_ * grad
    
    def predict(self, X):
        z = X @ self.w
        return self.sigmoid(z)
    
    @staticmethod
    def sigmoid(z):
        return 1 / (1 + np.exp(-z))

In [35]:
lr = MyLogisticRegression()
lr.fit(X_train, y_train.values)

In [36]:
X_ = np.concatenate([np.ones((X_test.shape[0], 1)), X_test], axis=1)
y_pred = lr.predict(X_)

In [37]:
y_pred

array([[3.30844576e-31],
       [7.07824835e-23],
       [6.78162827e-19],
       [2.30628139e-25],
       [1.52181567e-23],
       [1.77987979e-20],
       [6.59384916e-29],
       [2.65773770e-28],
       [1.15714662e-25],
       [1.41992261e-27],
       [3.29454565e-18],
       [1.67786094e-18],
       [7.31763004e-23],
       [4.73221442e-22],
       [5.55781401e-22],
       [5.40448590e-18],
       [1.61681534e-19],
       [7.33735084e-19],
       [6.48008326e-32],
       [6.78430783e-21],
       [4.34520504e-25],
       [1.02246255e-23],
       [4.01172455e-26],
       [9.21263830e-18],
       [3.11542525e-18],
       [9.13045676e-22],
       [6.18864759e-21],
       [2.24332000e-21],
       [1.10890897e-20],
       [2.68453320e-25],
       [1.74083828e-32],
       [5.80432933e-24],
       [5.36296358e-26],
       [3.50446615e-26],
       [1.32682736e-29],
       [4.53302858e-20],
       [4.29111323e-31],
       [1.72036638e-24],
       [6.75334824e-21],
       [1.07750593e-21],


In [38]:
from sklearn.metrics import roc_auc_score

In [39]:
roc_auc_score(y_test, y_pred)

0.9659673659673659