Before you turn this problem in, make sure everything runs as expected. First, **restart the kernel** (in the menubar, select Kernel$\rightarrow$Restart) and then **run all cells** (in the menubar, select Cell$\rightarrow$Run All).

Make sure you fill in any place that says `YOUR CODE HERE` or "YOUR ANSWER HERE", as well as your name and collaborators below:

---

In [1]:
# DO NOT ADD CODE IN THIS CELL; CREATE A NEW CELL INSTEAD

from random import randrange
import numpy as np
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.linear_model import HuberRegressor
from sklearn.datasets import load_boston, load_diabetes, load_iris, load_digits
from scipy.special import huber
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

def grad_check_sparse(f, x, analytic_grad, num_checks=12, h=1e-5, error=1e-9):
    """
    Spot-check an analytic gradient against central finite differences.

    A few randomly chosen entries of ``x`` are perturbed by ``+h`` and ``-h``
    and the resulting slope of ``f`` is compared with the matching entry of
    ``analytic_grad``.

    Inputs:
    - f: callable taking ``x`` and returning a scalar
    - x: array at which the gradient is checked; it is perturbed in place
      while ``f`` is evaluated and restored afterwards
    - analytic_grad: array with the same shape as ``x``
    - num_checks: number of random entries to test
    - h: finite-difference step
    - error: largest relative error accepted

    Raises AssertionError as soon as one sampled entry exceeds ``error``.
    Returns None.
    """

    for _ in range(num_checks):
        ix = tuple(randrange(m) for m in x.shape)

        oldval = x[ix]
        try:
            x[ix] = oldval + h  # step forward by h
            fxph = f(x)  # evaluate f(x + h)
            x[ix] = oldval - h  # step backward by h
            fxmh = f(x)  # evaluate f(x - h)
        finally:
            x[ix] = oldval  # always restore, even if f raises

        grad_numerical = (fxph - fxmh) / (2 * h)
        grad_analytic = analytic_grad[ix]
        denominator = abs(grad_numerical) + abs(grad_analytic)
        # Two gradients that are both exactly zero agree perfectly; without
        # this guard the ratio would be 0/0 = NaN and the assertion would fail.
        if denominator > 0:
            relative_error = abs(grad_numerical - grad_analytic) / denominator
        else:
            relative_error = 0.0
        print(
            "numerical: %f analytic: %f, relative error: %e"
            % (grad_numerical, grad_analytic, relative_error)
        )
        assert relative_error < error

def rel_error(x, y):
    """Return the largest element-wise relative error between x and y.

    The denominator |x| + |y| is floored at 1e-8 so that entries which are
    both (near) zero do not divide by zero.
    """
    gap = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(gap / scale)

# Robust linear regression - Huber loss

In [2]:
# Load the regression data set and draw a small random starting point.
# NOTE(review): load_boston is reported as removed in scikit-learn >= 1.2 —
# confirm the installed version before re-running this cell.
data = load_boston()
X_train1, y_train1 = data.data, data.target
# One weight per feature column, scaled down to be close to zero.
w1 = np.random.randn(X_train1.shape[1]) * 0.0001
# The bias is kept as a length-1 array (not a Python float).
b1 = np.random.randn(1) * 0.0001

In [3]:
def huber_loss_naive(w, b, X, y, epsilon=1.35, alpha=0.0001):
    """
    Huber loss for all observations, computed with explicit Python loops.

    For each sample the residual is r = x . w + b - y. The per-sample loss is
    r**2 / 2 when |r| <= epsilon and epsilon * |r| - epsilon**2 / 2 otherwise.
    The result is the mean per-sample loss plus alpha * sum(w**2).

    Inputs:
    - w: array of shape (D,) containing weights
    - b: bias, either a float or an array holding exactly one element
    - X: array of shape (N, D) containing a minibatch of data
    - y: array of shape (N,) containing training labels
    - epsilon: float, residual magnitude at which the loss becomes linear
    - alpha: regularization strength

    Returns a tuple of:
    - loss as a single float
    - dw: gradient with respect to w, shape (D,)
    - db: gradient with respect to b, as an array of shape (1,)
    """
    num_samples = y.shape[0]
    num_features = w.shape[0]
    # Accept both a plain float and a one-element array for the bias.
    bias = np.asarray(b, dtype=float).item()

    loss = 0.0
    dw = np.zeros_like(w, dtype=float)
    db = 0.0

    # One pass over the samples: each residual is computed once and reused
    # for the loss and for both gradients.
    for i in range(num_samples):
        r = float(X[i].dot(w) + bias - y[i])
        if abs(r) <= epsilon:
            loss += r ** 2 / 2
            slope = r  # d/dr of r**2 / 2
        else:
            loss += epsilon * abs(r) - epsilon ** 2 / 2
            slope = np.sign(r) * epsilon  # d/dr of epsilon * |r|
        for j in range(num_features):
            dw[j] += slope * X[i][j]
        db += slope

    loss /= num_samples
    db /= num_samples

    # Average the data term and add the L2 penalty and its gradient.
    for j in range(num_features):
        loss += alpha * w[j] ** 2
        dw[j] = dw[j] / num_samples + 2 * alpha * w[j]

    return loss, dw, np.array(db).reshape(1,)

## without regularization

In [5]:
# Gradient check of huber_loss_naive with the L2 penalty switched off (alpha=0).
# grad_check_sparse perturbs the array it receives in place and hands that same
# array to the lambda, so each lambda evaluates the loss at the perturbed point.
loss, dw1, db1 = huber_loss_naive(w1, b1, X_train1, y_train1, epsilon=1.35, alpha=0)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: huber_loss_naive(w1, b1, X_train1, y_train1, epsilon=1.35, alpha=0)[0]
# grad_check_sparse has no return statement, so grad_numerical is None.
grad_numerical = grad_check_sparse(f, w1, dw1, 15, error=1e-9)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: huber_loss_naive(w1, b1, X_train1, y_train1, epsilon=1.35, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15, error=1e-9)


# Large epsilon
# Same checks again with epsilon=135 instead of 1.35.
large_eps_loss, large_eps_dw1, large_eps_db1 = huber_loss_naive(w1, b1, X_train1, y_train1, epsilon=135, alpha=0)

print("Gradient check w large epsilon")
# Check with numerical gradient w
f = lambda w1: huber_loss_naive(w1, b1, X_train1, y_train1, epsilon=135, alpha=0)[0]
grad_numerical = grad_check_sparse(f, w1, large_eps_dw1, 15, error=1e-9)

print("Gradient check bias large epsilon")
# Check with numerical gradient b
f2 = lambda b1: huber_loss_naive(w1, b1, X_train1, y_train1, epsilon=135, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, large_eps_db1, 15, error=1e-9)

Gradient check w
numerical: -5.123308 analytic: -5.123308, relative error: 1.798891e-11
numerical: -481.509943 analytic: -481.509943, relative error: 1.043997e-12
numerical: -12.891700 analytic: -12.891700, relative error: 3.525018e-11
numerical: -92.576117 analytic: -92.576117, relative error: 6.322008e-12
numerical: -8.484256 analytic: -8.484256, relative error: 7.328381e-11
numerical: -12.891700 analytic: -12.891700, relative error: 3.525018e-11
numerical: -17.081635 analytic: -17.081635, relative error: 2.049780e-11
numerical: -481.509943 analytic: -481.509943, relative error: 1.043997e-12
numerical: -24.914970 analytic: -24.914970, relative error: 3.378290e-11
numerical: -4.878257 analytic: -4.878257, relative error: 5.050771e-12
numerical: -15.034651 analytic: -15.034651, relative error: 8.479515e-11
numerical: -15.340909 analytic: -15.340909, relative error: 1.550770e-11
numerical: -15.034651 analytic: -15.034651, relative error: 8.479515e-11
numerical: -5.123308 analytic: -5.12

 ## with regularization

In [6]:
# Gradient check of huber_loss_naive with the L2 penalty enabled (alpha=1).
# grad_check_sparse perturbs the array it receives in place and hands that same
# array to the lambda, so each lambda evaluates the loss at the perturbed point.
loss, dw1, db1 = huber_loss_naive(w1, b1, X_train1, y_train1, epsilon=1.35, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: huber_loss_naive(w1, b1, X_train1, y_train1, epsilon=1.35, alpha=1)[0]
# grad_check_sparse has no return statement, so grad_numerical is None.
grad_numerical = grad_check_sparse(f, w1, dw1, 15, error=1e-9)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: huber_loss_naive(w1, b1, X_train1, y_train1, epsilon=1.35, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15, error=1e-9)


# Large epsilon
# Same checks again with epsilon=135 instead of 1.35.
large_eps_loss, large_eps_dw1, large_eps_db1 = huber_loss_naive(w1, b1, X_train1, y_train1, epsilon=135, alpha=1)

print("Gradient check w large epsilon")
# Check with numerical gradient w
f = lambda w1: huber_loss_naive(w1, b1, X_train1, y_train1, epsilon=135, alpha=1)[0]
grad_numerical = grad_check_sparse(f, w1, large_eps_dw1, 15, error=1e-9)

print("Gradient check bias large epsilon")
# Check with numerical gradient b
f2 = lambda b1: huber_loss_naive(w1, b1, X_train1, y_train1, epsilon=135, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b1, large_eps_db1, 15, error=1e-9)

Gradient check w
numerical: -481.509934 analytic: -481.509934, relative error: 9.763526e-13
numerical: -8.484479 analytic: -8.484479, relative error: 6.828077e-11
numerical: -0.092852 analytic: -0.092852, relative error: 8.415943e-10
numerical: -24.914977 analytic: -24.914977, relative error: 3.488864e-11
numerical: -8.484479 analytic: -8.484479, relative error: 6.828077e-11
numerical: -0.748732 analytic: -0.748732, relative error: 8.104566e-11
numerical: -5.123156 analytic: -5.123156, relative error: 2.282643e-11
numerical: -17.081522 analytic: -17.081522, relative error: 2.076759e-11
numerical: -481.509934 analytic: -481.509934, relative error: 9.763526e-13
numerical: -24.914977 analytic: -24.914977, relative error: 3.488864e-11
numerical: -4.878509 analytic: -4.878509, relative error: 1.321660e-12
numerical: -24.914977 analytic: -24.914977, relative error: 3.488864e-11
numerical: -24.914977 analytic: -24.914977, relative error: 3.488864e-11
numerical: -8.484479 analytic: -8.484479, 

In [7]:
def huber_loss_vectorized(w, b, X, y, epsilon=1.35, alpha=0.0001):
    """
    Huber loss for all observations, computed without Python-level loops.

    For each sample the residual is r = x . w + b - y. The per-sample loss is
    r**2 / 2 when |r| <= epsilon and epsilon * (|r| - epsilon / 2) otherwise.
    The result is the mean per-sample loss plus alpha * (w . w).

    Inputs:
    - w: array of shape (D,) containing weights
    - b: bias, a float or a one-element array (it is broadcast over samples)
    - X: array of shape (N, D) containing a minibatch of data
    - y: array of shape (N,) containing training labels
    - epsilon: float, residual magnitude at which the loss becomes linear
    - alpha: regularization strength

    Returns a tuple of:
    - loss as a single float
    - dw: gradient with respect to w, shape (D,)
    - db: gradient with respect to b, as an array of shape (1,)
    """
    num_samples = y.shape[0]

    residual = X.dot(w) + b - y
    abs_residual = np.abs(residual)
    is_quadratic = abs_residual <= epsilon

    # Per-sample loss: quadratic inside the epsilon band, linear outside it.
    per_sample = np.where(
        is_quadratic,
        0.5 * residual ** 2,
        epsilon * (abs_residual - 0.5 * epsilon),
    )
    loss = np.sum(per_sample) / num_samples + alpha * w.dot(w)

    # Derivative of the per-sample loss with respect to the residual:
    # r inside the band, sign(r) * epsilon outside it.
    slope = np.where(is_quadratic, residual, epsilon * np.sign(residual))

    dw = X.T.dot(slope) / num_samples + 2 * alpha * w
    db = np.sum(slope) / num_samples

    return loss, dw, np.array(db).reshape(1,)

## without regularization

In [10]:
# Gradient check of huber_loss_vectorized with the L2 penalty switched off
# (alpha=0). grad_check_sparse perturbs the array it receives in place and
# hands that same array to the lambda.
loss, dw1, db1 = huber_loss_vectorized(w1, b1, X_train1, y_train1, epsilon=1.35, alpha=0)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: huber_loss_vectorized(w1, b1, X_train1, y_train1, epsilon=1.35, alpha=0)[0]
# grad_check_sparse has no return statement, so grad_numerical is None.
grad_numerical = grad_check_sparse(f, w1, dw1, 15, error=1e-9)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: huber_loss_vectorized(w1, b1, X_train1, y_train1, epsilon=1.35, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15, error=1e-9)


# Large epsilon
# The analytic gradients must come from the vectorized implementation too;
# taking them from huber_loss_naive would leave the vectorized gradient
# unchecked for epsilon=135.
large_eps_loss, large_eps_dw1, large_eps_db1 = huber_loss_vectorized(w1, b1, X_train1, y_train1, epsilon=135, alpha=0)

print("Gradient check w large epsilon")
# Check with numerical gradient w
f = lambda w1: huber_loss_vectorized(w1, b1, X_train1, y_train1, epsilon=135, alpha=0)[0]
grad_numerical = grad_check_sparse(f, w1, large_eps_dw1, 15, error=1e-9)

print("Gradient check bias large epsilon")
# Check with numerical gradient b
f2 = lambda b1: huber_loss_vectorized(w1, b1, X_train1, y_train1, epsilon=135, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, large_eps_db1, 15, error=1e-9)

Gradient check w
numerical: -15.340909 analytic: -15.340909, relative error: 1.550781e-11
numerical: -12.891700 analytic: -12.891700, relative error: 4.214096e-11
numerical: -551.120158 analytic: -551.120158, relative error: 1.287207e-12
numerical: -8.484256 analytic: -8.484256, relative error: 7.328402e-11
numerical: -5.123308 analytic: -5.123308, relative error: 1.798839e-11
numerical: -8.484256 analytic: -8.484256, relative error: 7.328402e-11
numerical: -24.914970 analytic: -24.914970, relative error: 2.665693e-11
numerical: -15.340909 analytic: -15.340909, relative error: 1.550781e-11
numerical: -92.576117 analytic: -92.576117, relative error: 7.282179e-12
numerical: -24.914970 analytic: -24.914970, relative error: 2.665693e-11
numerical: -24.914970 analytic: -24.914970, relative error: 2.665693e-11
numerical: -15.340909 analytic: -15.340909, relative error: 1.550781e-11
numerical: -4.878257 analytic: -4.878257, relative error: 5.050862e-12
numerical: -0.748838 analytic: -0.748838

## with regularization

In [15]:
# Gradient check of huber_loss_vectorized with the L2 penalty enabled
# (alpha=1). grad_check_sparse perturbs the array it receives in place and
# hands that same array to the lambda.
loss, dw1, db1 = huber_loss_vectorized(w1, b1, X_train1, y_train1, epsilon=1.35, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: huber_loss_vectorized(w1, b1, X_train1, y_train1, epsilon=1.35, alpha=1)[0]
# grad_check_sparse has no return statement, so grad_numerical is None.
grad_numerical = grad_check_sparse(f, w1, dw1, 15, error=1e-9)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: huber_loss_vectorized(w1, b1, X_train1, y_train1, epsilon=1.35, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15, error=1e-9)


# Large epsilon
# The analytic gradients must come from the vectorized implementation too;
# taking them from huber_loss_naive would leave the vectorized gradient
# unchecked for epsilon=135.
large_eps_loss, large_eps_dw1, large_eps_db1 = huber_loss_vectorized(w1, b1, X_train1, y_train1, epsilon=135, alpha=1)

print("Gradient check w large epsilon")
# Check with numerical gradient w
f = lambda w1: huber_loss_vectorized(w1, b1, X_train1, y_train1, epsilon=135, alpha=1)[0]
grad_numerical = grad_check_sparse(f, w1, large_eps_dw1, 15, error=1e-9)

print("Gradient check bias large epsilon")
# Check with numerical gradient b
f2 = lambda b1: huber_loss_vectorized(w1, b1, X_train1, y_train1, epsilon=135, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b1, large_eps_db1, 15, error=1e-9)

Gradient check w
numerical: -0.748732 analytic: -0.748732, relative error: 8.104537e-11
numerical: -0.092852 analytic: -0.092852, relative error: 8.415939e-10
numerical: -92.576492 analytic: -92.576492, relative error: 7.283531e-12
numerical: -5.123156 analytic: -5.123156, relative error: 2.282591e-11
numerical: -17.081522 analytic: -17.081522, relative error: 3.116688e-11
numerical: -0.748732 analytic: -0.748732, relative error: 8.104537e-11
numerical: -12.891763 analytic: -12.891763, relative error: 4.521767e-11
numerical: -15.034556 analytic: -15.034556, relative error: 9.373634e-11
numerical: -481.509934 analytic: -481.509934, relative error: 9.758804e-13
numerical: -4.878509 analytic: -4.878509, relative error: 1.321569e-12
numerical: -15.034556 analytic: -15.034556, relative error: 9.373634e-11
numerical: -0.748732 analytic: -0.748732, relative error: 8.104537e-11
numerical: -8.484479 analytic: -8.484479, relative error: 6.828098e-11
numerical: -4.878509 analytic: -4.878509, rela

In [16]:
class LinearModel():
    """
    Base class for linear models with a separate bias, trained by minibatch
    stochastic gradient descent.

    Subclasses must implement ``loss`` (returning ``(loss, dw, db)``) and
    ``predict``.
    """

    def __init__(self):
        # Weights of shape (d,) and bias; both are created on the first
        # call to train().
        self.w = None
        self.b = None

    def train(self, X, y, learning_rate=1e-3, alpha=0.0001, num_iters=100, batch_size=200, verbose=False):
        """
        Fit the model with minibatch stochastic gradient descent.

        Inputs:
        - X: array of shape (N, d) containing training data
        - y: array of shape (N,) containing training targets
        - learning_rate: step size of each update
        - alpha: regularization strength forwarded to ``self.loss``
        - num_iters: number of gradient steps
        - batch_size: rows sampled (without replacement) per step; values
          larger than N are clamped to N
        - verbose: if True, print the loss every 10000 iterations

        Returns the list of minibatch losses, one per iteration. Training
        continues from the current parameters when they already exist.
        """
        N, d = X.shape

        if self.w is None: # Initialization
            self.w = 0.001 * np.random.randn(d)
            self.b = 0.0

        # Sampling without replacement cannot draw more rows than exist;
        # without the clamp np.random.choice raises ValueError when
        # batch_size > N.
        effective_batch = min(batch_size, N)

        # Run stochastic gradient descent to optimize w
        loss_history = []
        for it in range(num_iters):
            # Sample the minibatch: X_batch is (effective_batch, d) and
            # y_batch is (effective_batch,).
            rand = np.random.choice(N, effective_batch, replace=False)
            X_batch = X[rand, :]
            y_batch = y[rand]

            # evaluate loss and gradient
            loss, dw, db = self.loss(X_batch, y_batch, alpha)
            loss_history.append(loss)

            # Gradient step on the weights and the bias.
            self.w -= learning_rate * dw
            self.b -= learning_rate * db

            if verbose and it % 10000 == 0:
                print("iteration %d / %d: loss %f" % (it, num_iters, loss))

        return loss_history

    def predict(self, X):
        """Return predictions for X; must be provided by subclasses."""
        raise NotImplementedError("subclasses must implement predict()")

    def loss(self, X_batch, y_batch, reg):
        """Return (loss, dw, db) for a minibatch; must be provided by subclasses."""
        raise NotImplementedError("subclasses must implement loss()")

class HuberRegression(LinearModel):
    """Linear regressor whose training objective is the regularized Huber loss."""

    def loss(self, X_batch, y_batch, alpha):
        """Return (loss, dw, db) of the Huber loss on one minibatch.

        epsilon is left at the default of huber_loss_vectorized.
        """
        batch_loss, grad_w, grad_b = huber_loss_vectorized(
            self.w, self.b, X_batch, y_batch, alpha=alpha
        )
        return batch_loss, grad_w, grad_b

    def predict(self, X):
        """Return the linear prediction X . w + b for every row of X."""
        return X.dot(self.w) + self.b

In [17]:
# Compare the gradient-descent Huber model with scikit-learn's HuberRegressor
# on the same standardized training data.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Overwrites X_train1 with its standardized version.
X_train1 = scaler.fit_transform(X_train1)

# Reference model from scikit-learn, evaluated on the training set itself.
sk_model = HuberRegressor(fit_intercept=True)
sk_model.fit(X_train1, y_train1)
sk_pred = sk_model.predict(X_train1)
sk_mse = mean_squared_error(sk_pred, y_train1)

# Our model; with verbose=True the loss is printed every 10000 iterations.
model = HuberRegression()
model.train(X_train1, y_train1, num_iters=75000, batch_size=64, learning_rate=1e-2, verbose=True)
pred = model.predict(X_train1)
mse = mean_squared_error(pred, y_train1)

print("MSE scikit-learn:", sk_mse)
print("MSE gradient descent model :", mse)
# Fails when our training MSE is worse than scikit-learn's by 1 or more.
assert mse - sk_mse < 1

iteration 0 / 75000: loss 31.111990
iteration 10000 / 75000: loss 2.116153
iteration 20000 / 75000: loss 3.255500
iteration 30000 / 75000: loss 3.887477
iteration 40000 / 75000: loss 3.823629
iteration 50000 / 75000: loss 3.545063
iteration 60000 / 75000: loss 2.888331
iteration 70000 / 75000: loss 4.189011
MSE scikit-learn: 24.04102301055778
MSE gradient descent model : 24.429435217767683


# Multinomial logistic regression

In [18]:
# Load the iris classification data and draw small random initial weights.
data = load_iris()
X_train2, y_train2 = data.data, data.target

# One column of weights per class (3 classes are hard-coded here).
W = np.random.randn(X_train2.shape[1], 3) * 0.0001

In [19]:
def softmax_loss_naive(W, X, y, alpha):
    """
    Softmax (cross-entropy) loss and gradient, written with explicit loops.

    Inputs:
    - W: array of shape (D, C) containing weights
    - X: array of shape (N, D) containing a minibatch of data
    - y: array of shape (N,) containing training labels (class indices)
    - alpha: (float) regularization strength

    Returns a tuple of:
    - loss as single float: mean cross-entropy plus 0.5 * alpha * sum(W**2)
    - gradient with respect to weights W; same shape as W
    """
    weights_by_class = W.T  # (C, D): one row of weights per class
    grad = np.zeros_like(W)  # (D, C): one column of gradient per class
    n_classes = weights_by_class.shape[0]
    n_samples = X.shape[0]

    total = 0.0
    for i in range(n_samples):
        sample = X.T[:, i]
        scores = weights_by_class.dot(sample)  # one score per class

        # Beware of numeric instability: shifting by the largest score keeps
        # every exponent <= 0 without changing the softmax.
        scores -= np.max(scores)

        # L_i = -score[y_i] + log(sum_j exp(score_j))
        normalizer = 0.0
        for score in scores:
            normalizer += np.exp(score)
        total += -scores[y[i]] + np.log(normalizer)

        # Contribution of sample i to column j of the gradient:
        # (p(class j | x_i) - [y_i == j]) * x_i
        for j in range(n_classes):
            prob = np.exp(scores[j]) / normalizer
            grad[:, j] += (prob - (j == y[i])) * sample

    # Average over the minibatch.
    total /= n_samples
    grad /= n_samples

    # L2 regularization.
    total += 0.5 * alpha * np.sum(weights_by_class * weights_by_class)
    grad += alpha * W

    return total, grad

## Without regularization

In [20]:
# Gradient check of softmax_loss_naive without regularization (alpha=0.0).
loss, dW = softmax_loss_naive(W, X_train2, y_train2, 0.0)

# grad_check_sparse perturbs W in place and passes that same array to f.
f = lambda W: softmax_loss_naive(W, X_train2, y_train2, 0.0)[0]
# grad_check_sparse has no return statement, so grad_numerical is None.
grad_numerical = grad_check_sparse(f, W, dW, error=1e-7)

numerical: -0.275613 analytic: -0.275613, relative error: 3.023324e-11
numerical: -0.166945 analytic: -0.166945, relative error: 3.223255e-10
numerical: 0.027606 analytic: 0.027606, relative error: 1.193053e-09
numerical: -0.042097 analytic: -0.042097, relative error: 2.910109e-10
numerical: 0.027606 analytic: 0.027606, relative error: 1.193053e-09
numerical: 0.027606 analytic: 0.027606, relative error: 1.193053e-09
numerical: 0.317710 analytic: 0.317710, relative error: 5.755739e-11
numerical: 0.027606 analytic: 0.027606, relative error: 1.193053e-09
numerical: -0.248529 analytic: -0.248529, relative error: 5.489662e-10
numerical: -0.166945 analytic: -0.166945, relative error: 3.223255e-10
numerical: 0.317710 analytic: 0.317710, relative error: 5.755739e-11
numerical: -0.166945 analytic: -0.166945, relative error: 3.223255e-10


## With regularization

In [21]:
# Gradient check of softmax_loss_naive with regularization (alpha=2).
loss, dW = softmax_loss_naive(W, X_train2, y_train2, 2)

# grad_check_sparse perturbs W in place and passes that same array to f.
f = lambda W: softmax_loss_naive(W, X_train2, y_train2, 2)[0]
# grad_check_sparse has no return statement, so grad_numerical is None.
grad_numerical = grad_check_sparse(f, W, dW, error=1e-7)

numerical: 0.764987 analytic: 0.764987, relative error: 1.029771e-10
numerical: 0.317602 analytic: 0.317602, relative error: 7.396658e-11
numerical: -0.248252 analytic: -0.248252, relative error: 5.693897e-10
numerical: 0.764987 analytic: 0.764987, relative error: 1.029771e-10
numerical: -0.167027 analytic: -0.167027, relative error: 2.909990e-10
numerical: 0.027272 analytic: 0.027272, relative error: 1.222492e-09
numerical: -0.042216 analytic: -0.042216, relative error: 4.166766e-10
numerical: -0.275733 analytic: -0.275733, relative error: 3.017799e-11
numerical: -0.167027 analytic: -0.167027, relative error: 2.909990e-10
numerical: -0.248252 analytic: -0.248252, relative error: 5.693897e-10
numerical: 0.764987 analytic: 0.764987, relative error: 1.029771e-10
numerical: -0.123645 analytic: -0.123645, relative error: 1.829084e-11


In [22]:
def softmax_loss_vectorized(W, X, y, alpha, fit_intercept=False):
    """
    Softmax (cross-entropy) loss and gradient WITHOUT FOR LOOPS

    Inputs:
    - W: array of shape (D, C) containing weights
    - X: array of shape (N, D) containing a minibatch of data
    - y: array of shape (N,) containing training labels (class indices)
    - alpha: (float) regularization strength
    - fit_intercept: accepted for interface compatibility; not used here

    Returns a tuple of:
    - loss as single float: mean cross-entropy plus 0.5 * alpha * sum(W**2)
    - gradient with respect to weights W; same shape as W
    """
    num_train = X.shape[0]
    rows = np.arange(num_train)

    # Shift every row by its largest score so that all exponents are <= 0.
    scores = X.dot(W)
    scores = scores - np.max(scores, axis=1, keepdims=True)

    exp_scores = np.exp(scores)
    sum_exp = np.sum(exp_scores, axis=1, keepdims=True)

    # Log-softmax form: L_i = log(sum_j exp(f_j)) - f_{y_i}.
    # Taking log(exp(f_y) / sum) instead yields log(0) = -inf as soon as
    # exp(f_y) underflows, i.e. when the true class scores far below the max.
    loss = np.sum(np.log(sum_exp[:, 0]) - scores[rows, y]) / num_train
    loss += 0.5 * alpha * np.sum(W * W)

    # d(loss)/d(scores) = softmax probabilities minus the one-hot labels.
    dscores = exp_scores / sum_exp
    dscores[rows, y] -= 1.0

    dW = X.T.dot(dscores) / num_train
    dW += alpha * W

    return loss, dW

## Without regularization

In [23]:
# Gradient check of softmax_loss_vectorized without regularization (alpha=0.0).
loss, dW = softmax_loss_vectorized(W, X_train2, y_train2, 0.0)

# grad_check_sparse perturbs W in place and passes that same array to f.
f = lambda W: softmax_loss_vectorized(W, X_train2, y_train2, 0.0)[0]
# grad_check_sparse has no return statement, so grad_numerical is None.
grad_numerical = grad_check_sparse(f, W, dW, error=1e-7)

numerical: -0.248529 analytic: -0.248529, relative error: 5.266291e-10
numerical: 0.317710 analytic: 0.317710, relative error: 4.727635e-11
numerical: 0.027606 analytic: 0.027606, relative error: 5.898000e-10
numerical: -0.248529 analytic: -0.248529, relative error: 5.266291e-10
numerical: 0.278824 analytic: 0.278824, relative error: 4.533724e-10
numerical: 0.317710 analytic: 0.317710, relative error: 4.727635e-11
numerical: -0.166945 analytic: -0.166945, relative error: 3.223255e-10
numerical: -0.248529 analytic: -0.248529, relative error: 5.266291e-10
numerical: 0.096088 analytic: 0.096088, relative error: 1.738586e-10
numerical: 0.765127 analytic: 0.765127, relative error: 8.198148e-11
numerical: -0.598182 analytic: -0.598182, relative error: 1.021405e-10
numerical: -0.030295 analytic: -0.030295, relative error: 4.516753e-09


## With regularization

In [24]:
# Gradient check of softmax_loss_vectorized with regularization (alpha=2).
loss, dW = softmax_loss_vectorized(W, X_train2, y_train2, 2)

# grad_check_sparse perturbs W in place and passes that same array to f.
f = lambda W: softmax_loss_vectorized(W, X_train2, y_train2, 2)[0]
# grad_check_sparse has no return statement, so grad_numerical is None.
grad_numerical = grad_check_sparse(f, W, dW, error=1e-7)

numerical: 0.278889 analytic: 0.278889, relative error: 4.551382e-10
numerical: -0.123645 analytic: -0.123645, relative error: 1.529763e-10
numerical: 0.027272 analytic: 0.027272, relative error: 6.118414e-10
numerical: -0.030150 analytic: -0.030150, relative error: 4.526397e-09
numerical: -0.598349 analytic: -0.598349, relative error: 1.068972e-10
numerical: -0.598349 analytic: -0.598349, relative error: 1.068972e-10
numerical: 0.096206 analytic: 0.096206, relative error: 1.930709e-10
numerical: 0.317602 analytic: 0.317602, relative error: 3.090258e-11
numerical: -0.030150 analytic: -0.030150, relative error: 4.526397e-09
numerical: 0.027272 analytic: 0.027272, relative error: 6.118414e-10
numerical: -0.598349 analytic: -0.598349, relative error: 1.068972e-10
numerical: 0.278889 analytic: 0.278889, relative error: 4.551382e-10


## Gradient descent

In [27]:
class LinearModel():
    """
    Base class for linear classifiers trained by minibatch stochastic
    gradient descent, with the intercept folded into the weight matrix.

    Subclasses must implement ``loss`` (returning ``(loss, dW)``) and
    ``predict``.
    """

    def __init__(self, fit_intercept=True):
        # Weight matrix of shape (d, C); created on the first call to train().
        self.W = None
        # When True, train() prepends a column of ones to X.
        self.fit_intercept = fit_intercept

    def train(self, X, y, learning_rate=1e-3, alpha=0, num_iters=100, batch_size=200, verbose=False):
        """
        Fit the model with minibatch stochastic gradient descent.

        Inputs:
        - X: array of shape (N, D) containing training data
        - y: array of shape (N,) containing integer class labels; the number
          of classes is taken to be max(y) + 1
        - learning_rate: step size of each update
        - alpha: regularization strength forwarded to ``self.loss``
        - num_iters: number of gradient steps
        - batch_size: rows sampled (without replacement) per step; values
          larger than N are clamped to N
        - verbose: if True, print the loss every 10000 iterations

        Returns the list of minibatch losses, one per iteration. Training
        continues from the current weights when they already exist.
        """
        if self.fit_intercept:
            # Prepend a constant feature so that row 0 of W acts as the bias.
            tmp = np.ones((len(X),1))
            X = np.append(tmp, X, axis = 1)

        N, d = X.shape

        C = (np.max(y) + 1)
        if self.W is None: # Initialization
            self.W = 0.001 * np.random.randn(d, C)

        # Sampling without replacement cannot draw more rows than exist;
        # without the clamp np.random.choice raises ValueError when
        # batch_size > N.
        effective_batch = min(batch_size, N)

        # Run stochastic gradient descent to optimize W
        loss_history = []
        for it in range(num_iters):
            # Sample the minibatch: X_batch is (effective_batch, d) and
            # y_batch is (effective_batch,).
            rand = np.random.choice(N, effective_batch, replace=False)
            X_batch = X[rand, :]
            y_batch = y[rand]

            # evaluate loss and gradient
            loss, dW = self.loss(X_batch, y_batch, alpha)
            loss_history.append(loss)

            # Gradient step on the weights.
            self.W -= learning_rate * dW

            if verbose and it % 10000 == 0:
                print("iteration %d / %d: loss %f" % (it, num_iters, loss))

        return loss_history

    def predict(self, X):
        """Return predictions for X; must be provided by subclasses."""
        raise NotImplementedError("subclasses must implement predict()")

    def loss(self, X_batch, y_batch, reg):
        """Return (loss, dW) for a minibatch; must be provided by subclasses."""
        raise NotImplementedError("subclasses must implement loss()")

class MultinomialLogisticRegressor(LinearModel):
    """Softmax (multinomial logistic) regression classifier."""

    def loss(self, X_batch, y_batch, alpha):
        """Return (loss, dW) of the regularized softmax loss on one minibatch."""
        batch_loss, grad_W = softmax_loss_vectorized(self.W, X_batch, y_batch, alpha)
        return batch_loss, grad_W

    def predict(self, X):
        """
        Predict a class index for every row of X.

        Inputs:
        - X: array of shape (N, D)

        Returns:
        - y_pred: 1-dimensional array of length N, each element is an integer giving the predicted class
        """
        if self.fit_intercept:
            # Prepend a column of ones so the intercept row of W is applied.
            ones = np.ones((len(X), 1))
            X = np.append(ones, X, axis=1)

        # The class with the largest linear score is the prediction.
        scores = X.dot(self.W)
        return np.argmax(scores, axis=1)

In [28]:
# Compare the gradient-descent softmax model with scikit-learn's
# LogisticRegression, both without an intercept, on standardized data.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Overwrites X_train2 with its standardized version.
X_train2 = scaler.fit_transform(X_train2)

# Reference model from scikit-learn, evaluated on the training set itself.
sk_model = LogisticRegression(fit_intercept=False)
sk_model.fit(X_train2, y_train2)
sk_pred = sk_model.predict(X_train2)
sk_accuracy = accuracy_score(y_train2, sk_pred)

# Our model; with verbose=True the loss is printed every 10000 iterations.
model = MultinomialLogisticRegressor(fit_intercept=False)
model.train(X_train2, y_train2, num_iters=75000, batch_size=64, learning_rate=1e-3, verbose=True)
pred = model.predict(X_train2)
model_accuracy = accuracy_score(y_train2, pred)

print("Accuracy scikit-learn:", sk_accuracy)
print("Accuracy gradient descent model :", model_accuracy)
# Fails when our training accuracy trails scikit-learn's by 0.01 or more.
assert sk_accuracy - model_accuracy < 0.01

iteration 0 / 75000: loss 1.098594
iteration 10000 / 75000: loss 0.377476
iteration 20000 / 75000: loss 0.377048
iteration 30000 / 75000: loss 0.339127
iteration 40000 / 75000: loss 0.371754
iteration 50000 / 75000: loss 0.305053
iteration 60000 / 75000: loss 0.358625
iteration 70000 / 75000: loss 0.333108
Accuracy scikit-learn: 0.86
Accuracy gradient descent model : 0.8666666666666667


In [29]:
# Same comparison as before, this time with an intercept in both models.
# Reference model from scikit-learn, evaluated on the training set itself.
sk_model = LogisticRegression(fit_intercept=True)
sk_model.fit(X_train2, y_train2)
sk_pred = sk_model.predict(X_train2)
sk_accuracy = accuracy_score(y_train2, sk_pred)

# Our model; with verbose=True the loss is printed every 10000 iterations.
model = MultinomialLogisticRegressor(fit_intercept=True)
model.train(X_train2, y_train2, num_iters=75000, batch_size=64, learning_rate=1e-3, verbose=True)
pred = model.predict(X_train2)
model_accuracy = accuracy_score(y_train2, pred)

print("Accuracy scikit-learn:", sk_accuracy)
print("Accuracy gradient descent model :", model_accuracy)
# Fails when our training accuracy trails scikit-learn's by 0.02 or more.
assert sk_accuracy - model_accuracy < 0.02

iteration 0 / 75000: loss 1.098336
iteration 10000 / 75000: loss 0.327248
iteration 20000 / 75000: loss 0.262469
iteration 30000 / 75000: loss 0.186599
iteration 40000 / 75000: loss 0.194513
iteration 50000 / 75000: loss 0.198297
iteration 60000 / 75000: loss 0.173120
iteration 70000 / 75000: loss 0.176601
Accuracy scikit-learn: 0.9733333333333334
Accuracy gradient descent model : 0.96


# K-Nearest Neighbor

## Computing distances

In [30]:
# Load the digits data and hold out 33% of it as a test set; random_state=2
# fixes the split.
data = load_digits()
X_train3, y_train3 = data.data, data.target
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_train3, y_train3, test_size=0.33, random_state=2)

def get_distances_two_loops_with_norm(X_train, X_test):
    """
    Reference implementation: Euclidean distance between every test point
    and every training point, computed pair by pair with np.linalg.norm.

    Inputs:
    - X_train: array of shape (num_train, D)
    - X_test: array of shape (num_test, D)

    Returns:
    - distances: array of shape (num_test, num_train)
    """
    distances = np.zeros((X_test.shape[0], X_train.shape[0]))
    for row, test_point in enumerate(X_test):
        for col, train_point in enumerate(X_train):
            distances[row, col] = np.linalg.norm(test_point - train_point)
    return distances

In [31]:
def get_distances_two_loops(X_train, X_test):
    """
    Euclidean distance between each test point in X_test and each training
    point in X_train, using two nested loops and no np.linalg.norm.

    Inputs:
    - X_train: array of shape (num_train, D)
    - X_test: array of shape (num_test, D)

    Returns:
    - distances: array of shape (num_test, num_train), distances[i, j] is the
      Euclidean distance between the ith test point and the jth training point.
    """
    distances = np.zeros((X_test.shape[0], X_train.shape[0]))
    for row, test_point in enumerate(X_test):
        for col, train_point in enumerate(X_train):
            # No further loop and no np.linalg.norm here: square the
            # coordinate differences, add them up and take the root.
            offset = test_point - train_point
            distances[row][col] = np.sqrt(np.sum(np.square(offset)))

    return distances

In [32]:
# Validate the two-loop implementation against the np.linalg.norm reference.
distances = get_distances_two_loops(X_train3, X_test3)
true_distances = get_distances_two_loops_with_norm(X_train3, X_test3)

# Frobenius norm of the difference between the two distance matrices.
difference = np.linalg.norm(distances - true_distances, ord='fro')

print(difference)
assert difference < 1e-10

0.0


In [33]:
def compute_distances_one_loop(X_train, X_test):
    """
    Euclidean distance between each test point in X_test and each training
    point in X_train, using a single loop over the test points.

    Inputs:
    - X_train: array of shape (num_train, D)
    - X_test: array of shape (num_test, D)

    Returns:
    - dists: array of shape (num_test, num_train), dists[i, j] is the
      Euclidean distance between the ith test point and the jth training point.
    """
    distances = np.zeros((X_test.shape[0], X_train.shape[0]))
    for row, test_point in enumerate(X_test):
        # No inner loop and no np.linalg.norm: broadcasting subtracts the
        # test point from every training row at once.
        offsets = test_point - X_train
        distances[row] = np.sqrt(np.sum(np.square(offsets), axis=1))
    return distances
# Smoke run of the one-loop implementation on the digits split.
distances = compute_distances_one_loop(X_train3, X_test3)

In [34]:
# Validate the one-loop implementation against the np.linalg.norm reference.
distances = compute_distances_one_loop(X_train3, X_test3)
true_distances = get_distances_two_loops_with_norm(X_train3, X_test3)

# Frobenius norm of the difference between the two distance matrices.
difference = np.linalg.norm(distances - true_distances, ord='fro')

print(difference)
assert difference < 1e-10

0.0


In [35]:
def get_distances_zero_loop(X_train, X_test):
    """
    Compute the Euclidean distance between each test point in X_test and each
    training point in X_train without any explicit Python loop.

    Inputs:
    - X_train: array of shape (num_train, D)
    - X_test: array of shape (num_test, D)

    Returns:
    - distances: float array of shape (num_test, num_train); distances[i, j]
      is the Euclidean distance between the ith test point and the jth
      training point.
    """
    # Exercise constraints: no loops, no np.linalg.norm(), no scipy helpers.
    # Approach adapted from "https://programmerall.com/article/1079947341/".

    # Work in float64 so integer inputs (e.g. uint8 pixels) cannot overflow
    # in the squares and dot products below.
    X_train = np.asarray(X_train, dtype=np.float64)
    X_test = np.asarray(X_test, dtype=np.float64)

    # Expansion used: ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * (a . b).
    # The squared norms are row-wise sums of squares; there is no need to
    # build the full Gram matrices X.dot(X.T) just to read their diagonals.
    test_sq = np.sum(X_test ** 2, axis=1)
    train_sq = np.sum(X_train ** 2, axis=1)
    cross = X_test.dot(X_train.T)

    # Broadcasting a column of test norms against a row of train norms yields
    # the (num_test, num_train) grid directly.
    sq = test_sq[:, np.newaxis] + train_sq[np.newaxis, :] - 2 * cross

    # Cancellation can leave tiny negative values for (near-)identical points;
    # clamp them so the square root never produces NaN.
    distances = np.sqrt(np.maximum(sq, 0.0))

    return distances

# Run the loop-free implementation once on the notebook's X_train3 / X_test3 arrays.
distances = get_distances_zero_loop(X_train3, X_test3)

In [36]:
# Sanity check for the loop-free implementation: compare it with
# get_distances_two_loops_with_norm (defined elsewhere in the notebook;
# presumably the np.linalg.norm-based reference -- TODO confirm).
distances = get_distances_zero_loop(X_train3, X_test3)
true_distances = get_distances_two_loops_with_norm(X_train3, X_test3)

# The Frobenius norm collapses the element-wise differences of the two
# distance matrices into one non-negative number.
difference = np.linalg.norm(distances - true_distances, ord='fro')

print(difference)
# The two matrices must agree up to floating-point noise.
assert difference < 1e-10

0.0


## K-Nearest Neighbor (knn) classifier

In [37]:
class KNearestNeighborClassifier():
    """ kNN classifier using L2 distance """

    def __init__(self, k=1):
        """
        Inputs:
        - k: number of nearest neighbors that vote for the predicted labels;
          must be at least 1.

        Raises:
        - ValueError: if k is smaller than 1.
        """
        # With k < 1 no neighbor would vote and prediction would fail later
        # with an obscure "argmax of an empty sequence" error; fail early.
        if k < 1:
            raise ValueError("k must be at least 1, got %r" % (k,))
        self.k = k

    def fit(self, X, y):
        """
        Train the classifier. Just memorize the training data.

        Inputs:
        - X: array of shape (num_train, D)
        - y: array of shape (num_train,) holding non-negative integer labels
          (required by the np.bincount vote in predict_labels)
        """
        # asarray lets plain lists be used too; ndarrays are stored as-is.
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def predict(self, X):
        """
        Predict labels for test data using this classifier.

        Inputs:
        - X: array of shape (num_test, D)

        Returns:
        - y: array of shape (num_test,)
        """
        distances = get_distances_zero_loop(self.X_train, X)
        return self.predict_labels(distances)

    def predict_labels(self, distances):
        """
        Given a matrix of distances between test points and training points,
        predict a label for each test point.

        Inputs:
        - distances: array of shape (num_test, num_train), distances[i, j] is
          the Euclidean distance between the ith test point and the jth
          training point.

        Returns:
        - y: array of shape (num_test,) with the dtype of the training labels
        """
        num_test = distances.shape[0]
        # Predictions are class labels, so keep the label dtype instead of
        # the float64 default of np.zeros.
        y_pred = np.zeros(num_test, dtype=self.y_train.dtype)
        for i in range(num_test):
            # Indices of the k training points closest to the ith test point.
            nearest = np.argsort(distances[i, :])[:self.k]
            closest_y = self.y_train[nearest]

            # Majority vote: bincount tallies the votes per label, and argmax
            # returns the first maximum, so a tie goes to the smallest label.
            y_pred[i] = np.bincount(closest_y).argmax()
        return y_pred

In [38]:
# Reference: scikit-learn's kNN with 3 neighbors, fitted and evaluated on the
# notebook's X_train3 / X_test3 split.
sk_model = KNeighborsClassifier(n_neighbors=3)
sk_model.fit(X_train3, y_train3)
sk_pred = sk_model.predict(X_test3)
sk_accuracy = accuracy_score(y_test3, sk_pred)

# Same experiment with the hand-written classifier defined above.
model = KNearestNeighborClassifier(k=3)
model.fit(X_train3, y_train3)
pred = model.predict(X_test3)
model_accuracy = accuracy_score(y_test3, pred)

print("Accuracy scikit-learn:", sk_accuracy)
# NOTE(review): the label below says "gradient descent model" although the
# value printed is the accuracy of the kNN classifier above.
print("Accuracy gradient descent model :", model_accuracy)
# One-sided check: passes whenever the custom model is not worse than
# scikit-learn by more than 1e-10 (a better custom score also passes).
assert sk_accuracy - model_accuracy < 1e-10

Accuracy scikit-learn: 0.9831649831649831
Accuracy gradient descent model : 0.9831649831649831


## cross-validation

In [39]:
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

# Split the data into folds.
# X_train_folds and y_train_folds are lists of length num_folds.
# NOTE(review): this uses every sample of the current `data` object rather
# than the X_train3 / y_train3 split used above -- confirm this is intended.
X_train, y_train = data.data, data.target

X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)

# The train/test arrays of a fold do not depend on k, so assemble them once
# here instead of rebuilding them for every value of k.
# For fold i: testing data is the i-th fold, training data is all other folds.
fold_splits = []
for i in range(num_folds):
    fold_splits.append((
        np.concatenate(X_train_folds[:i] + X_train_folds[i + 1:], axis=0),
        np.concatenate(y_train_folds[:i] + y_train_folds[i + 1:], axis=0),
        X_train_folds[i],
        y_train_folds[i],
    ))

# Maps each k in k_choices to an array of num_folds accuracies (one per fold).
k_to_accuracies = {}

# k-fold cross-validation: for every k, train on num_folds - 1 folds and
# score on the held-out fold.
for k in k_choices:
    model_cv = KNearestNeighborClassifier(k)
    k_to_accuracies[k] = np.zeros(num_folds)
    for i, (X_train_cv, y_train_cv, X_test_cv, y_test_cv) in enumerate(fold_splits):
        # "Training" only memorizes the data, so the same model object can be
        # refitted for each fold.
        model_cv.fit(X_train_cv, y_train_cv)
        pred_cv = model_cv.predict(X_test_cv)

        # Accuracy of this k on the held-out fold.
        accuracy = accuracy_score(y_test_cv, pred_cv)
        k_to_accuracies[k][i] = accuracy

In [40]:
# Print one line per (k, fold) pair, with k in increasing order and the folds
# in the order they were evaluated.
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))

k = 1, accuracy = 0.961111
k = 1, accuracy = 0.952778
k = 1, accuracy = 0.966574
k = 1, accuracy = 0.988858
k = 1, accuracy = 0.955432
k = 3, accuracy = 0.955556
k = 3, accuracy = 0.961111
k = 3, accuracy = 0.963788
k = 3, accuracy = 0.986072
k = 3, accuracy = 0.966574
k = 5, accuracy = 0.950000
k = 5, accuracy = 0.963889
k = 5, accuracy = 0.963788
k = 5, accuracy = 0.980501
k = 5, accuracy = 0.963788
k = 8, accuracy = 0.941667
k = 8, accuracy = 0.961111
k = 8, accuracy = 0.966574
k = 8, accuracy = 0.974930
k = 8, accuracy = 0.949861
k = 10, accuracy = 0.938889
k = 10, accuracy = 0.952778
k = 10, accuracy = 0.966574
k = 10, accuracy = 0.974930
k = 10, accuracy = 0.949861
k = 12, accuracy = 0.941667
k = 12, accuracy = 0.955556
k = 12, accuracy = 0.966574
k = 12, accuracy = 0.974930
k = 12, accuracy = 0.949861
k = 15, accuracy = 0.941667
k = 15, accuracy = 0.955556
k = 15, accuracy = 0.966574
k = 15, accuracy = 0.972145
k = 15, accuracy = 0.947075
k = 20, accuracy = 0.930556
k = 20, accu