Alohan'ny mamerina dia avereno atao Run ny notebook iray manontolo. Ny fanaovana azy dia redémarrena mihitsy ny kernel aloha (jereo menubar, safidio **Kernel$\rightarrow$Restart Kernel and Run All Cells**).

Izay misy hoe `YOUR CODE HERE` na "YOUR ANSWER HERE" ihany no fenoina. Afaka manampy cells vaovao raha ilaina. Aza adino ny mameno references eo ambany raha ilaina.

## References
Eto ilay references rehetra no apetraka

---

In [1]:
from random import randrange
import numpy as np
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.datasets import load_breast_cancer, load_diabetes


def grad_check_sparse(f, x, analytic_grad, num_checks=10, h=1e-5, error=1e-9):
    """
    Spot-check an analytic gradient against centred finite differences.

    For `num_checks` randomly drawn coordinates of `x`, compute
    (f(x + h) - f(x - h)) / (2 * h) along that coordinate, print it next to
    the matching entry of `analytic_grad`, and assert that their relative
    error is below `error`.

    Inputs:
    - f: callable taking `x` and returning the loss (a scalar or a
      size-1 array)
    - x: numpy array at which the gradient is checked; it is perturbed in
      place during each check and always restored afterwards
    - analytic_grad: array indexable like `x` holding the gradient to verify
    - num_checks: number of random coordinates to test
    - h: finite-difference step
    - error: strict upper bound accepted for the relative error

    Returns nothing; raises AssertionError on the first coordinate whose
    relative error is not below `error`.
    """

    for _ in range(num_checks):
        ix = tuple(randrange(m) for m in x.shape)

        oldval = x[ix]
        try:
            x[ix] = oldval + h  # step forward by h
            fxph = f(x)  # evaluate f(x + h)
            x[ix] = oldval - h  # step backward by h
            fxmh = f(x)  # evaluate f(x - h)
        finally:
            # Restore the coordinate even if f raises, so the caller's array
            # is never left perturbed.
            x[ix] = oldval

        # .item() collapses size-1 arrays (e.g. a loss of shape (1,)) to plain
        # floats so the comparison and the %-formatting below act on scalars.
        grad_numerical = np.asarray((fxph - fxmh) / (2 * h)).item()
        grad_analytic = np.asarray(analytic_grad[ix]).item()

        abs_diff = abs(grad_numerical - grad_analytic)
        scale = abs(grad_numerical) + abs(grad_analytic)
        # Two exactly-zero gradients agree perfectly; dividing 0 by 0 would
        # give NaN and make the assertion below fail spuriously.
        relative_error = abs_diff / scale if scale > 0 else 0.0

        print(
            "numerical: %f analytic: %f, relative error: %e"
            % (grad_numerical, grad_analytic, relative_error)
        )
        assert relative_error < error

def rel_error(x, y):
    """
    Return the largest element-wise relative difference between x and y.

    Each |x - y| is divided by |x| + |y|, with that denominator floored at
    1e-8 so that two zeros compare as equal instead of dividing by zero.
    """
    gap = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(gap / scale)

# Linear regression

In [2]:
# Load the scikit-learn diabetes dataset; it is the training set for the
# linear-regression part of the notebook.
data = load_diabetes()
X_train1, y_train1 = data.data, data.target
# Starting parameters: standard-normal draws scaled by 1e-4, one weight per
# feature column and a length-1 array for the bias.
w1 = np.random.randn(X_train1.shape[1]) * 0.0001
b1 = np.random.randn(1) * 0.0001

In [3]:
def mse_loss_naive(w, b, X, y, alpha=0):
    """
    Mean squared error with an optional L2 penalty, accumulated one sample
    at a time (explicit Python loop).

    Inputs:
    - w: weight vector, one entry per feature
    - b: bias (scalar or length-1 array)
    - X: samples, one row per sample
    - y: targets, one per row of X
    - alpha: weight of the penalty alpha * ||w||^2 (0 disables it)

    Returns a tuple of:
    - loss
    - gradient with respect to weights w
    - gradient with respect to bias b, as an array of shape (1,)
    """
    n = len(X)
    loss = 0.0
    dw = np.zeros_like(w)
    db = 0.0

    for idx, features in enumerate(X):
        # Residual of this sample; the loss and both gradients derive from it.
        residual = y[idx] - w @ features - b
        loss += (residual ** 2) / n
        dw += -2.0 * (features * residual) / n
        db += -2.0 * residual / n

    # Ridge term: only the weights are penalised, the bias is left alone.
    squared_norm = (np.linalg.norm(w, 2)) ** 2
    loss += alpha * squared_norm
    dw += alpha * 2.0 * w

    return loss, dw, np.array(db).reshape(1,)

## Naive Linear regression loss

In [4]:
# Loop-based MSE loss and gradients at the initial parameters, no penalty.
loss, dw1, db1 = mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)

# Cross-check the loss value against scikit-learn's mean_squared_error.
# NOTE(review): the prediction is passed first and the target second; the
# squared error is symmetric so the value is the same either way, but
# presumably the intended order is (y_true, y_pred) — confirm.
sk_loss = mean_squared_error(X_train1 @ w1 + b1, y_train1)
print("Loss error : ",rel_error(loss, sk_loss))
assert rel_error(loss, sk_loss) < 1e-9

print("Gradient check w")
# Compare dw1 with a numerical gradient. The lambda's parameter shadows the
# global w1; grad_check_sparse perturbs the array it is given in place and
# passes that same array to the lambda.
f = lambda w1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)[0]
# grad_check_sparse has no return statement, so grad_numerical is None; the
# call is made for its printed report and its internal assertion.
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Same check for the bias gradient db1.
f2 = lambda b1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Loss error :  1.8768922252608535e-16
Gradient check w
numerical: -1.553187 analytic: -1.553187, relative error: 1.488043e-07
numerical: -1.275043 analytic: -1.275042, relative error: 7.413576e-08
numerical: -1.275043 analytic: -1.275042, relative error: 7.413576e-08
numerical: -1.376393 analytic: -1.376394, relative error: 1.275297e-07
numerical: -3.234125 analytic: -3.234125, relative error: 9.961176e-09
numerical: -1.553187 analytic: -1.553187, relative error: 1.488043e-07
numerical: -1.553187 analytic: -1.553187, relative error: 1.488043e-07
numerical: -1.376393 analytic: -1.376394, relative error: 1.275297e-07
numerical: -1.275043 analytic: -1.275042, relative error: 7.413576e-08
numerical: -0.315452 analytic: -0.315454, relative error: 1.905481e-06
numerical: -3.234125 analytic: -3.234125, relative error: 9.961176e-09
numerical: 2.892059 analytic: 2.892060, relative error: 1.975422e-07
numerical: -1.553187 analytic: -1.553187, relative error: 1.488043e-07
numerical: -3.153317 anal

## Naive Ridge regression loss

In [5]:
# Loop-based loss and gradients with the L2 penalty switched on (alpha=1).
loss, dw1, db1 = mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)

print("Gradient check w")
# Compare dw1 with a numerical gradient of the penalised loss. The lambda's
# parameter shadows the global w1, which grad_check_sparse perturbs in place.
f = lambda w1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)[0]
# grad_check_sparse returns nothing: grad_numerical is None and the call is
# made for its printed report and its internal assertion.
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Same check for the bias gradient db1.
f2 = lambda b1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Gradient check w
numerical: -4.145212 analytic: -4.145212, relative error: 3.211555e-08
numerical: -1.552932 analytic: -1.552931, relative error: 1.092921e-07
numerical: -1.552932 analytic: -1.552931, relative error: 1.092921e-07
numerical: -1.274897 analytic: -1.274897, relative error: 8.062249e-08
numerical: -0.315235 analytic: -0.315236, relative error: 1.889370e-06
numerical: -0.315235 analytic: -0.315236, relative error: 1.889370e-06
numerical: 2.892027 analytic: 2.892028, relative error: 2.053562e-07
numerical: -1.376433 analytic: -1.376433, relative error: 1.471336e-07
numerical: -1.552932 analytic: -1.552931, relative error: 1.092921e-07
numerical: -1.274897 analytic: -1.274897, relative error: 8.062249e-08
numerical: 2.892027 analytic: 2.892028, relative error: 2.053562e-07
numerical: 2.892027 analytic: 2.892028, relative error: 2.053562e-07
numerical: -0.315235 analytic: -0.315236, relative error: 1.889370e-06
numerical: -4.295941 analytic: -4.295940, relative error: 1.814921

In [6]:
def mse_loss_vectorized(w, b, X, y, alpha=0):
    """
    Mean squared error with an optional L2 penalty, WITHOUT Python loops.

    Inputs:
    - w: weight vector, one entry per feature
    - b: bias (scalar or length-1 array)
    - X: samples, one row per sample
    - y: targets, one per row of X
    - alpha: weight of the penalty alpha * ||w||^2 (0 disables it)

    Returns a tuple of:
    - loss
    - gradient with respect to weights w
    - gradient with respect to bias b, as an array of shape (1,)
    """
    n = len(X)

    # Every quantity below depends on the residual vector, so compute the
    # matrix-vector product once instead of once per expression.
    residual = y - X @ w - b

    loss = (residual @ residual) / n + alpha * np.dot(w, w)
    dw = -2.0 * (X.T @ residual) / n + 2.0 * alpha * w
    # np.sum reduces inside NumPy; the builtin sum would walk the array
    # element by element in Python.
    db = -2.0 * np.sum(residual) / n

    return loss, dw, np.array(db).reshape(1,)

## Vectorised Linear regression loss

In [7]:
# Vectorised MSE loss and gradients at the initial parameters, no penalty.
loss, dw1, db1 = mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)

# Cross-check the loss value against scikit-learn's mean_squared_error.
# NOTE(review): the prediction is passed first and the target second; the
# squared error is symmetric so the value is the same either way, but
# presumably the intended order is (y_true, y_pred) — confirm.
sk_loss = mean_squared_error(X_train1 @ w1 + b1, y_train1)
print("Loss error : ",rel_error(loss, sk_loss))
assert rel_error(loss, sk_loss) < 1e-9

print("Gradient check w")
# Compare dw1 with a numerical gradient. The lambda's parameter shadows the
# global w1, which grad_check_sparse perturbs in place.
f = lambda w1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)[0]
# grad_check_sparse returns nothing: grad_numerical is None and the call is
# made for its printed report and its internal assertion.
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Same check for the bias gradient db1.
f2 = lambda b1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Loss error :  0.0
Gradient check w
numerical: -1.275042 analytic: -1.275042, relative error: 2.805219e-09
numerical: -0.315454 analytic: -0.315454, relative error: 1.127156e-07
numerical: -2.801912 analytic: -2.801913, relative error: 6.651804e-08
numerical: -3.153316 analytic: -3.153316, relative error: 2.763992e-08
numerical: -2.801912 analytic: -2.801913, relative error: 6.651804e-08
numerical: -4.145423 analytic: -4.145423, relative error: 8.836474e-09
numerical: -2.801912 analytic: -2.801913, relative error: 6.651804e-08
numerical: -1.376394 analytic: -1.376394, relative error: 4.626485e-09
numerical: -4.296087 analytic: -4.296087, relative error: 2.480153e-08
numerical: -0.315454 analytic: -0.315454, relative error: 1.127156e-07
numerical: -1.553187 analytic: -1.553187, relative error: 3.169090e-08
numerical: -4.145423 analytic: -4.145423, relative error: 8.836474e-09
numerical: -3.234125 analytic: -3.234125, relative error: 9.961176e-09
numerical: -1.376394 analytic: -1.376394, 

## Vectorized ridge regression loss

In [8]:
# Vectorised loss and gradients with the L2 penalty switched on (alpha=1).
loss, dw1, db1 = mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)

print("Gradient check w")
# Compare dw1 with a numerical gradient of the penalised loss. The lambda's
# parameter shadows the global w1, which grad_check_sparse perturbs in place.
f = lambda w1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)[0]
# grad_check_sparse returns nothing: grad_numerical is None and the call is
# made for its printed report and its internal assertion.
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Same check for the bias gradient db1.
f2 = lambda b1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Gradient check w
numerical: -1.552931 analytic: -1.552931, relative error: 7.840469e-09
numerical: -0.315236 analytic: -0.315236, relative error: 1.302202e-07
numerical: -3.234494 analytic: -3.234494, relative error: 8.968587e-09
numerical: -3.153407 analytic: -3.153407, relative error: 1.954253e-08
numerical: -4.145212 analytic: -4.145212, relative error: 1.176615e-08
numerical: -2.801751 analytic: -2.801752, relative error: 6.392256e-08
numerical: 2.892028 analytic: 2.892028, relative error: 1.666610e-08
numerical: -3.234494 analytic: -3.234494, relative error: 8.968587e-09
numerical: -3.234494 analytic: -3.234494, relative error: 8.968587e-09
numerical: -3.153407 analytic: -3.153407, relative error: 1.954253e-08
numerical: -2.801751 analytic: -2.801752, relative error: 6.392256e-08
numerical: -1.376433 analytic: -1.376433, relative error: 1.498111e-08
numerical: -2.801751 analytic: -2.801752, relative error: 6.392256e-08
numerical: -0.315236 analytic: -0.315236, relative error: 1.30

# Logistic regression

In [9]:
def sigmoid(z):
    """
    Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Accepts a scalar or an array; a scalar input yields a scalar and an
    array input yields an array of the same shape.
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) always lies in (0, 1], so it cannot overflow, whereas
    # exp(-z) does for large negative z.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)).
    # Both are the same function written so that only `decay` is needed.
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # [()] turns a 0-d result back into a scalar and leaves arrays untouched.
    return result[()]

# Load the scikit-learn breast-cancer dataset; it is the training set for the
# logistic-regression part of the notebook. This rebinds the name `data`.
data = load_breast_cancer()
X_train2, y_train2 = data.data, data.target
# Starting parameters: standard-normal draws scaled by 1e-4, one weight per
# feature column and a length-1 array for the bias.
w2 = np.random.randn(X_train2.shape[1]) * 0.0001
b2 = np.random.randn(1) * 0.0001

# Naive

In [10]:
def log_loss_naive(w, b, X, y, alpha=0):
    """
    Average negative log-likelihood of a logistic model with an optional L2
    penalty, accumulated one sample at a time (explicit Python loop).

    Inputs:
    - w: weight vector, one entry per feature
    - b: bias (scalar or length-1 array)
    - X: samples, one row per sample
    - y: labels, one per row of X
    - alpha: weight of the penalty alpha * ||w||^2 (0 disables it)

    Returns a tuple of:
    - loss
    - gradient with respect to weights w
    - gradient with respect to bias b, as an array of shape (1,)
    """
    n = X.shape[0]
    loss = 0.0
    dw = np.zeros_like(w)
    db = 0.0

    for idx, features in enumerate(X):
        label = y[idx]
        # Predicted probability for this sample; reused by all three updates.
        prob = sigmoid((w.T) @ features + b)
        loss += -(label * np.log(prob) + (1 - label) * np.log(1 - prob)) / n
        # The derivative of the per-sample loss with respect to the logit is
        # -(label - prob); chain it through the features and through the bias.
        miss = label - prob
        dw += -1.0 * features * miss / n
        db += -1.0 * miss / n

    # Ridge term: only the weights are penalised, the bias is left alone.
    squared_norm = (np.linalg.norm(w, 2)) ** 2
    loss += alpha * squared_norm
    dw += alpha * 2.0 * w

    return loss, dw, np.array(db).reshape(1,)

In [11]:
# Reference loss from scikit-learn. Despite its name, y_pred_0 holds
# sigmoid(X w + b); it becomes the second column of the two-column matrix
# [1 - p, p] handed to log_loss.
y_pred_0 = sigmoid(X_train2 @ w2 + b2)
y_pred = np.vstack([1-y_pred_0, y_pred_0]).T
sk_loss = log_loss(y_train2, y_pred)

# Loop-based log loss and gradients at the initial parameters, no penalty.
loss, dw2, db2 = log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)
print("Loss error : ",rel_error(loss, sk_loss))
assert rel_error(loss, sk_loss) < 1e-9

print("Gradient check w")
# Compare dw2 with a numerical gradient. The lambda's parameter shadows the
# global w2, which grad_check_sparse perturbs in place. The tolerance here
# (1e-4) is looser than the one used for the bias below (1e-5).
f = lambda w2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)[0]
# grad_check_sparse returns nothing: grad_numerical is None and the call is
# made for its printed report and its internal assertion.
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Same check for the bias gradient db2.
f2 = lambda b2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Loss error :  8.086847907453055e-17
Gradient check w
numerical: 67.588416 analytic: 67.587746, relative error: 4.957629e-06
numerical: 0.025747 analytic: 0.025747, relative error: 2.976319e-10
numerical: -0.177988 analytic: -0.177988, relative error: 1.170945e-10
numerical: -0.177988 analytic: -0.177988, relative error: 1.170945e-10
numerical: 21.522550 analytic: 21.522355, relative error: 4.533852e-06
numerical: 0.007094 analytic: 0.007094, relative error: 1.826377e-09
numerical: -0.002993 analytic: -0.002993, relative error: 4.804952e-09
numerical: -0.177988 analytic: -0.177988, relative error: 1.170945e-10
numerical: -0.009121 analytic: -0.009121, relative error: 2.538396e-09
numerical: 0.008023 analytic: 0.008023, relative error: 2.816544e-09
numerical: -0.029713 analytic: -0.029713, relative error: 4.696918e-10
numerical: -0.000211 analytic: -0.000211, relative error: 3.300817e-09
numerical: 5.766730 analytic: 5.766730, relative error: 3.701303e-08
numerical: 0.008023 analytic: 0.

# Naive with regularization

In [12]:
# Loop-based log loss and gradients with the L2 penalty switched on (alpha=1).
loss, dw2, db2 = log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)

print("Gradient check w")
# Compare dw2 with a numerical gradient of the penalised loss. The lambda's
# parameter shadows the global w2, which grad_check_sparse perturbs in place.
f = lambda w2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)[0]
# grad_check_sparse returns nothing: grad_numerical is None and the call is
# made for its printed report and its internal assertion.
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Same check for the bias gradient db2.
f2 = lambda b2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Gradient check w
numerical: -0.844428 analytic: -0.844428, relative error: 4.408092e-10
numerical: 0.013208 analytic: 0.013208, relative error: 3.114761e-09
numerical: 21.522391 analytic: 21.522196, relative error: 4.533886e-06
numerical: 0.007790 analytic: 0.007790, relative error: 2.848646e-09
numerical: -0.029954 analytic: -0.029954, relative error: 4.247832e-10
numerical: 0.015271 analytic: 0.015271, relative error: 1.391651e-09
numerical: -0.001385 analytic: -0.001385, relative error: 1.107685e-08
numerical: 67.588324 analytic: 67.587653, relative error: 4.957635e-06
numerical: 0.015271 analytic: 0.015271, relative error: 1.391651e-09
numerical: -0.029954 analytic: -0.029954, relative error: 4.247832e-10
numerical: -0.001385 analytic: -0.001385, relative error: 1.107685e-08
numerical: 0.015271 analytic: 0.015271, relative error: 1.391651e-09
numerical: -0.001042 analytic: -0.001042, relative error: 6.312327e-09
numerical: 0.007383 analytic: 0.007383, relative error: 1.961315e-09
n

# Vectorized

In [13]:
def log_loss_vectorized(w, b,X, y, alpha=0):
    """
    Average negative log-likelihood of a logistic model with an optional L2
    penalty, WITHOUT Python loops.

    Inputs:
    - w: weight vector, one entry per feature
    - b: bias (scalar or length-1 array)
    - X: samples, one row per sample
    - y: labels (0 or 1), one per row of X
    - alpha: weight of the penalty alpha * ||w||^2 (0 disables it)

    Returns a tuple of:
    - loss
    - gradient with respect to weights w
    - gradient with respect to bias b, as an array of shape (1,)
    """
    n = len(X)

    # Logits are shared by the loss and by both gradients: compute them once.
    logits = X @ w + b

    # -[y log s(z) + (1 - y) log(1 - s(z))] simplifies to log(1 + e^z) - y z.
    # logaddexp(0, z) evaluates log(1 + e^z) without overflow, so the loss
    # stays finite even when the sigmoid saturates to exactly 0 or 1 (where
    # log(sigmoid) would give -inf and 0 * -inf would give NaN).
    loss = np.sum(np.logaddexp(0.0, logits) - y * logits) / n + alpha * np.dot(w, w)

    # The derivative of the per-sample loss with respect to the logit is
    # -(y - probability); chain it through X for w and sum it for b.
    miss = y - sigmoid(logits)
    dw = -(miss @ X) / n + alpha * 2.0 * w
    db = -np.sum(miss) / n

    return loss, dw, np.array(db).reshape(1,)

In [14]:
# Reference loss from scikit-learn. Despite its name, y_pred_0 holds
# sigmoid(X w + b); it becomes the second column of the two-column matrix
# [1 - p, p] handed to log_loss.
y_pred_0 = sigmoid(X_train2 @ w2 + b2)
y_pred = np.vstack([1-y_pred_0, y_pred_0]).T
sk_loss = log_loss(y_train2, y_pred)

# Vectorised log loss and gradients at the initial parameters, no penalty.
loss, dw2, db2 = log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)
print("Loss error : ",rel_error(loss, sk_loss))
assert rel_error(loss, sk_loss) < 1e-9

print("Gradient check w")
# Compare dw2 with a numerical gradient. The lambda's parameter shadows the
# global w2, which grad_check_sparse perturbs in place. The tolerance here
# (1e-4) is looser than the one used for the bias below (1e-5).
f = lambda w2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)[0]
# grad_check_sparse returns nothing: grad_numerical is None and the call is
# made for its printed report and its internal assertion.
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Same check for the bias gradient db2.
f2 = lambda b2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Loss error :  0.0
Gradient check w
numerical: -4.887680 analytic: -4.887681, relative error: 2.262328e-08
numerical: -0.021924 analytic: -0.021924, relative error: 7.325175e-11
numerical: 0.007094 analytic: 0.007094, relative error: 2.612871e-10
numerical: -0.177988 analytic: -0.177988, relative error: 2.325205e-11
numerical: 0.008023 analytic: 0.008023, relative error: 2.971495e-10
numerical: -0.002993 analytic: -0.002993, relative error: 1.687316e-10
numerical: -1.948738 analytic: -1.948738, relative error: 3.523462e-10
numerical: -0.000991 analytic: -0.000991, relative error: 2.911489e-09
numerical: -0.844697 analytic: -0.844697, relative error: 4.526792e-10
numerical: -0.011557 analytic: -0.011557, relative error: 2.439116e-10
numerical: -0.014574 analytic: -0.014574, relative error: 3.441431e-12
numerical: 5.766730 analytic: 5.766730, relative error: 3.701255e-08
numerical: -3.219932 analytic: -3.219932, relative error: 6.338770e-08
numerical: 0.007094 analytic: 0.007094, relative

# Vectorized with regularization

In [15]:
# Vectorised log loss and gradients with the L2 penalty switched on (alpha=1).
loss, dw2, db2 = log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)

print("Gradient check w")
# Compare dw2 with a numerical gradient of the penalised loss. The lambda's
# parameter shadows the global w2, which grad_check_sparse perturbs in place.
f = lambda w2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)[0]
# grad_check_sparse returns nothing: grad_numerical is None and the call is
# made for its printed report and its internal assertion.
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Same check for the bias gradient db2.
f2 = lambda b2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Gradient check w
numerical: -0.014836 analytic: -0.014836, relative error: 6.749290e-11
numerical: -0.009360 analytic: -0.009360, relative error: 5.716002e-11
numerical: -2.385659 analytic: -2.385659, relative error: 6.908610e-10
numerical: 0.015271 analytic: 0.015271, relative error: 6.233415e-11
numerical: -0.000641 analytic: -0.000641, relative error: 2.611388e-09
numerical: -0.009482 analytic: -0.009482, relative error: 1.686687e-10
numerical: 0.110301 analytic: 0.110301, relative error: 1.713414e-10
numerical: 0.007198 analytic: 0.007198, relative error: 2.659024e-10
numerical: -4.887458 analytic: -4.887458, relative error: 2.262412e-08
numerical: 0.015271 analytic: 0.015271, relative error: 6.233415e-11
numerical: 0.110301 analytic: 0.110301, relative error: 1.713414e-10
numerical: -0.009360 analytic: -0.009360, relative error: 5.716002e-11
numerical: -0.600694 analytic: -0.600694, relative error: 1.139616e-09
numerical: -0.003011 analytic: -0.003011, relative error: 3.192534e-10

# Gradient descent for Linear models

In [16]:
class LinearModel():
    """
    Base class for linear models fitted by mini-batch gradient descent.

    Subclasses provide `loss` (loss value and gradients on a batch) and
    `predict`; `train` implements the optimisation loop they share.
    """

    def __init__(self):
        # Parameters are created by the first call to train().
        self.w = None
        self.b = None

    def train(self, X, y, learning_rate=1e-3, alpha=0, num_iters=100, batch_size=200, verbose=False, print_every=10000):
        """
        Fit the model with mini-batch gradient descent.

        Inputs:
        - X: training samples, shape (N, d)
        - y: training targets, shape (N,)
        - learning_rate: step size of each parameter update
        - alpha: regularisation strength forwarded to `loss`
        - num_iters: number of gradient steps
        - batch_size: number of samples drawn for each step
        - verbose: if True, print the batch loss every `print_every` steps
        - print_every: reporting interval (positive int) used when verbose

        Returns the list of batch losses, one per iteration. Calling train
        again continues from the current parameters.
        """
        N, d = X.shape

        if self.w is None: # Initialization
            self.w = 0.001 * np.random.randn(d)
            self.b = 0.0

        loss_history = []
        for it in range(num_iters):
            # Draw the mini-batch. np.random.choice samples with replacement
            # by default, so batch_size may exceed N and a row can repeat.
            batch_idx = np.random.choice(N, batch_size)
            X_batch = X[batch_idx, :]
            y_batch = y[batch_idx]

            # evaluate loss and gradient
            loss, dw, db = self.loss(X_batch, y_batch, alpha)
            loss_history.append(loss)

            # Gradient step on the weights and on the bias.
            self.w = self.w - learning_rate * dw
            self.b = self.b - learning_rate * db

            if verbose and it % print_every == 0:
                print("iteration %d / %d: loss %f" % (it, num_iters, loss))

        return loss_history

    def predict(self, X):
        """Return predictions for the rows of X; subclasses must override."""
        raise NotImplementedError("subclasses must implement predict()")

    def loss(self, X_batch, y_batch, reg):
        """
        Return (loss, dw, db) for one batch; subclasses must override.

        Raising here makes a missing override fail with a clear message
        instead of train() trying to unpack None.
        """
        raise NotImplementedError("subclasses must implement loss()")

class LinearRegressor(LinearModel):
    """Linear regression fitted with the squared-error loss."""

    def loss(self, X_batch, y_batch, alpha):
        """Return (loss, dw, db) of the (optionally penalised) MSE on a batch."""
        weights, bias = self.w, self.b
        return mse_loss_vectorized(weights, bias, X_batch, y_batch, alpha)

    def predict(self, X):
        """Return the model output X w + b for every row of X."""
        linear_part = X @ self.w
        return linear_part + self.b

class LogisticRegressor(LinearModel):
    """Binary logistic regression fitted with the log loss."""

    def loss(self, X_batch, y_batch, alpha):
        """Return (loss, dw, db) of the (optionally penalised) log loss on a batch."""
        return log_loss_vectorized(self.w, self.b, X_batch, y_batch, alpha)

    def predict(self, X):
        """ Return prediction labels vector of 0 or 1 """
        # The predicted label only depends on the sign of the logit: a
        # negative score maps to 0, anything else to 1. One matrix-vector
        # product replaces the per-row Python loop.
        scores = X @ self.w + self.b
        return np.where(scores < 0, 0.0, 1.0)

## Linear regression with gradient descent

In [17]:
from sklearn.linear_model import LinearRegression

# Reference fit with scikit-learn's LinearRegression and its training MSE.
sk_model = LinearRegression(fit_intercept=True)
sk_model.fit(X_train1, y_train1)
sk_pred = sk_model.predict(X_train1)
# NOTE(review): the prediction is passed first and the target second; the
# squared error is symmetric so the value is the same either way.
sk_mse = mean_squared_error(sk_pred, y_train1)

# Same data fitted with the notebook's gradient-descent regressor. The loss
# history returned by train() is discarded here.
model = LinearRegressor()
model.train(X_train1, y_train1, num_iters=75000, batch_size=64, learning_rate=1e-2, verbose=True)
pred = model.predict(X_train1)
mse = mean_squared_error(pred, y_train1)

print("MSE scikit-learn:", sk_mse)
print("MSE gradient descent model :", mse)
# One-sided check: it fails only if the gradient-descent MSE exceeds the
# reference MSE by 100 or more.
assert mse - sk_mse < 100

iteration 0 / 75000: loss 30133.660194
iteration 10000 / 75000: loss 4252.919047
iteration 20000 / 75000: loss 2623.530449
iteration 30000 / 75000: loss 2856.680840
iteration 40000 / 75000: loss 3519.123356
iteration 50000 / 75000: loss 3688.946920
iteration 60000 / 75000: loss 2450.988196
iteration 70000 / 75000: loss 2844.601545
MSE scikit-learn: 2859.6903987680657
MSE gradient descent model : 2885.0645903782984


## Logistic regression with gradient descent

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Standardise the features. X_train2 is rebound to the scaled array, so any
# cell executed after this one sees the scaled features.
scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train2)

# Reference fit with scikit-learn's LogisticRegression.
sk_model = LogisticRegression(fit_intercept=True)
sk_model.fit(X_train2, y_train2)
sk_pred = sk_model.predict(X_train2)
# NOTE(review): log_loss receives the labels returned by predict() as its
# first argument and the true labels as its second, whereas earlier in this
# file it is called as log_loss(y_train2, y_pred) with a probability matrix.
# Presumably the roles are swapped and probabilities were intended — confirm.
sk_log_loss = log_loss(sk_pred, y_train2)

# Same data fitted with the notebook's gradient-descent classifier. The loss
# history returned by train() is discarded here.
model = LogisticRegressor()
model.train(X_train2, y_train2, num_iters=75000, batch_size=64, learning_rate=1e-3, verbose=True)
pred = model.predict(X_train2)
# Same call pattern as for the reference model (see the note above).
model_log_loss = log_loss(pred, y_train2)

print("Log-loss scikit-learn:", sk_log_loss)
print("Log-loss gradiet descent model :", model_log_loss)
print("Error :", rel_error(sk_log_loss, model_log_loss))
# Both numbers are computed from 0/1 label vectors only, so this compares the
# two models' label predictions rather than their predicted probabilities.
assert rel_error(sk_log_loss, model_log_loss) < 1e-7

iteration 0 / 75000: loss 0.694878
iteration 10000 / 75000: loss 0.110702
iteration 20000 / 75000: loss 0.153093
iteration 30000 / 75000: loss 0.069042
iteration 40000 / 75000: loss 0.080207
iteration 50000 / 75000: loss 0.125726
iteration 60000 / 75000: loss 0.033689
iteration 70000 / 75000: loss 0.102923
Log-loss scikit-learn: 0.4249086712816093
Log-loss gradiet descent model : 0.4249086712816093
Error : 0.0
