# Feature engineering - nonlinear decision boundaries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import sklearn.metrics as mcs

In [None]:
# Read the comma-separated data set: two feature values and a class label per row.
X = []
y = []
with open("./data/log_reg_3.txt") as f:
    for line in f:
        line = line.strip()
        # A blank (e.g. trailing) line would break the 3-way unpacking below.
        if not line:
            continue
        x0, x1, label = line.split(',')
        X.append((float(x0), float(x1)))
        # int(float(...)) also accepts labels written like "1.0".
        y.append(int(float(label)))


# X: one row per sample with both features; y: column vector of labels.
X = np.array(X)
y = np.expand_dims(np.array(y), 1)

In [None]:
# Scatter plot of the two raw features, colored by class label.
plt.figure(figsize=(12, 8))
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()

In [None]:
def initialize_params(X):
    """Return zero-initialized parameters matching the columns of X.

    Args:
        X: 2-D array with one row per sample and one column per feature.

    Returns:
        Tuple ``(w0, b)``: a float64 column vector of zeros with shape
        ``(nr_features, 1)`` and the scalar bias ``0.0``.
    """
    _, nr_features = X.shape
    # np.float_ was removed in NumPy 2.0; np.float64 is the same type and
    # works on every NumPy version.
    w0 = np.zeros((nr_features, 1), dtype=np.float64)
    b = 0.0
    return w0, b


def activation(Z):
    """Apply the logistic sigmoid 1 / (1 + exp(-Z)) elementwise."""
    denominator = 1 + np.exp(-Z)
    return 1 / denominator


def predict(X, w, b):
    """Return rounded sigmoid outputs for the rows of X.

    Args:
        X: feature matrix, one row per sample.
        w: weight column vector.
        b: scalar bias.

    Returns:
        Array of ``np.round``-ed probabilities (values 0.0 or 1.0).
    """
    logits = np.matmul(X, w) + b
    probabilities = activation(logits)
    return np.round(probabilities)


def calc_gradient(X, y, w, b):
    """Compute the cross-entropy cost and its gradients for logistic regression.

    Args:
        X: feature matrix, one row per sample.
        y: labels, shaped like the model output (a column vector in this notebook).
        w: weight column vector.
        b: scalar bias.

    Returns:
        Tuple ``(cost, dw, db)``: the mean cross-entropy over the rows of X,
        the gradient with respect to ``w`` and the gradient with respect to ``b``.
    """
    m = len(X)
    A = activation(np.matmul(X, w) + b)

    # Once the sigmoid saturates to exactly 0 or 1, log(0) turns the cost into
    # inf/nan (0 * -inf). Clip for the cost only; the gradient keeps the
    # unclipped activations.
    eps = 1e-15
    A_safe = np.clip(A, eps, 1 - eps)
    cost = (-1 / m) * np.sum(
        np.multiply(y, np.log(A_safe)) + np.multiply(1 - y, np.log(1 - A_safe))
    )

    dZ = A - y
    dw = (1 / m) * np.matmul(X.T, dZ)
    db = (1 / m) * np.sum(dZ)
    return cost, dw, db

In [None]:
def batch_gd(X, y, alpha, nr_epochs=1000):
    """Fit logistic regression with full-batch gradient descent.

    Args:
        X: feature matrix, one row per sample.
        y: labels for the rows of X.
        alpha: learning rate.
        nr_epochs: number of gradient steps over the whole data set.

    Returns:
        Tuple ``(costs, w, b)``: the cost measured before each step, and the
        final weights and bias.
    """
    weights, bias = initialize_params(X)
    history = []
    for _epoch in range(nr_epochs):
        current_cost, grad_w, grad_b = calc_gradient(X, y, weights, bias)
        history.append(current_cost)
        weights = weights - alpha * grad_w
        bias = bias - alpha * grad_b
    return history, weights, bias


# Fit logistic regression on the two raw features (default 1000 epochs).
alpha = 0.01  # learning rate
costs, w, b = batch_gd(X, y, alpha)

In [None]:
# Cost recorded at every epoch of the preceding run; y-axis limited to [0, 1].
figure = plt.figure(figsize=(16, 8))
plt.plot(costs)
plt.ylim(0, 1)
plt.show()

In [None]:
# Rounded 0/1 predictions on the training data with the fitted parameters.
predictions = predict(X, w, b)


def accuracy(predicted, ground_truth):
    """Return the fraction of entries where predicted equals ground_truth.

    The match count is divided by ``len(ground_truth)``; both arguments are
    expected to have the same shape.
    """
    hits = predicted == ground_truth
    total = len(ground_truth)
    return np.sum(hits) / total


# Fraction of training samples that were classified correctly.
print(accuracy(predictions, y))
# Last expression of the cell, so the notebook displays the confusion matrix.
mcs.confusion_matrix(y, predictions)

In [None]:
def expand(X):
    """Append the second-order terms x0^2, x0*x1 and x1^2 to the columns of X.

    Args:
        X: 2-D array whose first two columns are the features x0 and x1.

    Returns:
        New array holding a copy of X followed by the three quadratic columns.
    """
    x0, x1 = X[:, 0], X[:, 1]
    quadratic = np.array([x0 * x0, x0 * x1, x1 * x1]).T
    return np.hstack((np.copy(X), quadratic))


def expand_3(X):
    """Append the third-order terms to the second-order expansion of X.

    The result is ``expand(X)`` followed by the columns
    x0^3, x0^2 * x1, x0 * x1^2 and x1^3.
    """
    x0, x1 = X[:, 0], X[:, 1]
    cubic_terms = [
        np.power(x0, 3),
        np.multiply(np.power(x0, 2), x1),
        np.multiply(x0, np.power(x1, 2)),
        np.power(x1, 3),
    ]
    lower_order = expand(X)
    return np.hstack((lower_order, np.array(cubic_terms).T))

In [None]:
# Second-order polynomial features of the raw inputs.
M = expand(X)


# Standardize every column. m and s are kept because the mesh points for the
# decision-boundary plot are scaled with the same statistics.
m = np.mean(M, axis=0)
s = np.std(M, axis=0, ddof=1)
M = (M - m) / s


alpha = 0.01  # learning rate
costs, w, b = batch_gd(M, y, alpha, 10000)

In [None]:
# Cost recorded at every epoch of the preceding run; y-axis limited to [0, 1].
figure = plt.figure(figsize=(16, 8))
plt.plot(costs)
plt.ylim(0, 1)
plt.show()

In [None]:
# Rounded 0/1 predictions on the standardized, expanded training data.
predictions = predict(M, w, b)


def accuracy(predicted, ground_truth):
    """Return the fraction of entries where predicted equals ground_truth.

    The match count is divided by ``len(ground_truth)``; both arguments are
    expected to have the same shape.
    """
    hits = predicted == ground_truth
    total = len(ground_truth)
    return np.sum(hits) / total


# Fraction of training samples that were classified correctly.
print(accuracy(predictions, y))
# Last expression of the cell, so the notebook displays the confusion matrix.
mcs.confusion_matrix(y, predictions)

In [None]:
# Regular grid with step h over the raw feature range, padded by 1 on each side.
h = 0.01
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# One (x, y) row per grid node.
meshpoints = np.c_[xx.ravel(), yy.ravel()]

In [None]:
# Expand and scale the grid with the training statistics (m, s), classify every
# node, and reshape the result back to the grid layout.
Z = (expand(meshpoints) - m) / s
Z = predict(Z, w, b)
Z = Z.reshape(xx.shape)

# Filled contours of the predicted class with the training points drawn on top.
plt.figure(figsize=(16, 10))
plt.contourf(xx, yy, Z, alpha=0.1)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.show()

In [None]:
# Polynomial features of the raw inputs up to third order.
M = expand_3(X)


# Standardize every column. m and s are kept because the mesh points for the
# decision-boundary plot are scaled with the same statistics.
m = np.mean(M, axis=0)
s = np.std(M, axis=0, ddof=1)
M = (M - m) / s


alpha = 0.01  # learning rate
costs, w, b = batch_gd(M, y, alpha, 10000)

In [None]:
# Cost recorded at every epoch of the preceding run; y-axis limited to [0, 1].
figure = plt.figure(figsize=(16, 8))
plt.plot(costs)
plt.ylim(0, 1)
plt.show()

In [None]:
# Rounded 0/1 predictions on the standardized, expanded training data.
predictions = predict(M, w, b)


def accuracy(predicted, ground_truth):
    """Return the fraction of entries where predicted equals ground_truth.

    The match count is divided by ``len(ground_truth)``; both arguments are
    expected to have the same shape.
    """
    hits = predicted == ground_truth
    total = len(ground_truth)
    return np.sum(hits) / total


# Fraction of training samples that were classified correctly.
print(accuracy(predictions, y))
# Last expression of the cell, so the notebook displays the confusion matrix.
mcs.confusion_matrix(y, predictions)

In [None]:
# Regular grid with step h over the raw feature range, padded by 1 on each side.
h = 0.01
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# One (x, y) row per grid node.
meshpoints = np.c_[xx.ravel(), yy.ravel()]

In [None]:
# Expand and scale the grid with the training statistics (m, s), classify every
# node, and reshape the result back to the grid layout.
Z = (expand_3(meshpoints) - m) / s
Z = predict(Z, w, b)
Z = Z.reshape(xx.shape)

# Filled contours of the predicted class with the training points drawn on top.
plt.figure(figsize=(16, 10))
plt.contourf(xx, yy, Z, alpha=0.1)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.show()

# Batch gradient descent and other optimization methods

In [None]:
# Read the comma-separated data set: two feature values and a class label per row.
X = []
y = []
with open("./data/log_reg_1.txt") as f:
    for line in f:
        line = line.strip()
        # A blank (e.g. trailing) line would break the 3-way unpacking below.
        if not line:
            continue
        x0, x1, label = line.split(',')
        X.append((float(x0), float(x1)))
        # Parse via float, like the loader for log_reg_3.txt, so labels
        # written as "1.0" are accepted as well as "1".
        y.append(int(float(label)))

X = np.array(X)
y = np.expand_dims(np.array(y), 1)

# Standardize each feature column (zero mean, unit sample standard deviation).
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0, ddof=1)

In [None]:
# Scatter plot of the two standardized features, colored by class label.
plt.figure(figsize=(12, 8))
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()

In [None]:
def generate_batches(X, y, batch_size):
    """Yield shuffled mini-batches that together cover every sample once.

    Args:
        X: feature matrix, one row per sample.
        y: labels, indexable by the same row indices as X.
        batch_size: positive number of samples per batch; the last batch may
            be smaller.

    Yields:
        Tuples ``(X_batch, y_batch)`` holding matching rows of X and y.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    # A non-positive batch size would never advance through the data.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    length = len(y)
    indices = np.arange(length)
    np.random.shuffle(indices)
    for start in range(0, length, batch_size):
        mini_batch_indices = indices[start:start + batch_size]
        yield X[mini_batch_indices, :], y[mini_batch_indices]
    


def minibatch_gd(X, y, alpha, batch_size, nr_epochs=1000):
    """Fit logistic regression with mini-batch gradient descent.

    Args:
        X: feature matrix, one row per sample.
        y: labels for the rows of X.
        alpha: learning rate.
        batch_size: number of samples per mini-batch.
        nr_epochs: number of passes over the (reshuffled) data.

    Returns:
        Tuple ``(costs, w, b)``: the cost of every mini-batch measured before
        its update, and the final weights and bias.
    """
    weights, bias = initialize_params(X)
    history = []
    for _epoch in range(nr_epochs):
        for batch_X, batch_y in generate_batches(X, y, batch_size):
            batch_cost, grad_w, grad_b = calc_gradient(batch_X, batch_y, weights, bias)
            history.append(batch_cost)
            weights = weights - alpha * grad_w
            bias = bias - alpha * grad_b
    return history, weights, bias

In [None]:
# Mini-batch gradient descent: batches of 20 samples, 2000 epochs.
alpha = 0.01  # learning rate
costs, w, b = minibatch_gd(X, y, alpha, 20, 2000)

In [None]:
# Cost of every mini-batch update of the preceding run; y-axis limited to [0, 1].
figure = plt.figure(figsize=(16, 8))
plt.plot(costs)
plt.ylim(0, 1)
plt.show()

### Gradient descent with momentum

Momentum is a method that helps accelerate SGD in the relevant direction and dampens oscillations.

$$
v_{dw} = \beta v_{dw} + (1 - \beta) \cdot dw
$$

$$
v_{db} = \beta v_{db} + (1 - \beta) \cdot db
$$


The update step then uses these averaged gradients, which smooths out the heavy oscillations:

$$
w = w - \alpha \cdot v_{dw}
$$

$$
b = b - \alpha\cdot v_{db}
$$

In [None]:
def minibatch_gd_momentum(X, y, alpha, beta, batch_size, nr_epochs=1000):
    """Fit logistic regression with mini-batch gradient descent and momentum.

    Each step updates an exponentially weighted average of the gradients
    (decay ``beta``) and moves the parameters along that average.

    Args:
        X: feature matrix, one row per sample.
        y: labels for the rows of X.
        alpha: learning rate.
        beta: decay factor of the gradient average.
        batch_size: number of samples per mini-batch.
        nr_epochs: number of passes over the (reshuffled) data.

    Returns:
        Tuple ``(costs, w, b)``: the cost of every mini-batch measured before
        its update, and the final weights and bias.
    """
    weights, bias = initialize_params(X)
    # The averaged gradients start at zero, shaped like the parameters.
    velocity_w, velocity_b = initialize_params(X)
    history = []
    for _epoch in range(nr_epochs):
        for batch_X, batch_y in generate_batches(X, y, batch_size):
            batch_cost, grad_w, grad_b = calc_gradient(batch_X, batch_y, weights, bias)
            history.append(batch_cost)

            velocity_w = beta * velocity_w + (1 - beta) * grad_w
            velocity_b = beta * velocity_b + (1 - beta) * grad_b

            weights = weights - alpha * velocity_w
            bias = bias - alpha * velocity_b
    return history, weights, bias

In [None]:
# Momentum run: batches of 20 samples, 2000 epochs.
alpha = 0.001  # learning rate
beta = 0.9  # decay factor of the gradient average
costs, w, b = minibatch_gd_momentum(X, y, alpha, beta, 20, 2000)

In [None]:
# Cost of every mini-batch update of the preceding run; y-axis limited to [0, 1].
figure = plt.figure(figsize=(16, 8))
plt.plot(costs)
plt.ylim(0, 1)
plt.show()

The learning algorithm can be sped up in various other ways.


### RMSprop 

RMSprop is an unpublished optimization algorithm designed for neural networks, proposed by Geoff Hinton in Lecture 6 of the online course "Neural Networks for Machine Learning".


Root mean square prop (RMSprop) uses the same idea of an exponentially weighted average as gradient descent with momentum, but it averages the squared gradients, and the parameter update differs: each gradient is divided by the square root of its running average.

![](images/hinton_rmsprop.jpg)

### RMSProp

$$
s_{dw} = \beta\cdot s_{dw} + (1 - \beta)\cdot dw^2 \qquad \text{(elementwise square)}
$$

$$
s_{db} = \beta\cdot s_{db} + (1 - \beta)\cdot db^2\qquad \text{(elementwise square)}
$$

$$
w = w - \alpha \frac{dw}{\sqrt{\epsilon + s_{dw}}}
$$
 
$$
b = b - \alpha \frac{db}{\sqrt{\epsilon + s_{db}}}
$$

In [None]:
def minibatch_gd_rmsprop(X, y, alpha, beta, batch_size, nr_epochs=1000):
    """Fit logistic regression with mini-batch gradient descent and RMSprop.

    Each step updates an exponentially weighted average of the squared
    gradients (decay ``beta``) and divides the gradient by the square root of
    that average before applying it:

        s = beta * s + (1 - beta) * grad ** 2
        param = param - alpha * grad / sqrt(eps + s)

    Args:
        X: feature matrix, one row per sample.
        y: labels for the rows of X.
        alpha: learning rate.
        beta: decay factor of the squared-gradient average.
        batch_size: number of samples per mini-batch.
        nr_epochs: number of passes over the (reshuffled) data.

    Returns:
        Tuple ``(costs, w, b)``: the cost of every mini-batch measured before
        its update, and the final weights and bias.
    """
    w, b = initialize_params(X)
    # The squared-gradient averages start at zero, shaped like the parameters.
    s_dw, s_db = initialize_params(X)
    costs = []
    eps = 1e-8  # keeps the denominator away from zero
    for _ in range(nr_epochs):
        batch_generator = generate_batches(X, y, batch_size)
        for X_batch, y_batch in batch_generator:
            cost, dw, db = calc_gradient(X_batch, y_batch, w, b)
            costs.append(cost)

            s_dw = beta * s_dw + (1 - beta) * np.square(dw)
            s_db = beta * s_db + (1 - beta) * np.square(db)

            w = w - alpha * dw / np.sqrt(eps + s_dw)
            b = b - alpha * db / np.sqrt(eps + s_db)
    return costs, w, b

### Adaptive moment estimation (ADAM)

On iteration $k$:
$$
v_{dw} = \beta_1\cdot v_{dw} + (1 - \beta_1)\cdot dw,\qquad v_{db} = \beta_1\cdot v_{db} + (1 - \beta_1)\cdot db
$$

$$
s_{dw} = \beta_2\cdot s_{dw} + (1 - \beta_2)\cdot dw^2,\qquad s_{db} = \beta_2\cdot s_{db} + (1 - \beta_2)\cdot db^2,
$$

$$
v_{dw}^c = \frac{v_{dw}}{1 - \beta_1^k},\qquad v_{db}^c = \frac{v_{db}}{1 - \beta_1^k},\qquad s_{dw}^c = \frac{s_{dw}}{1 - \beta_2^k},\qquad s_{db}^c = \frac{s_{db}}{1 - \beta_2^k}
$$
Finally,

$$
w = w - \alpha\cdot \frac{v_{dw}^c}{\sqrt{s_{dw}^c + \epsilon}}
$$

$$
b = b - \alpha\cdot \frac{v_{db}^c}{\sqrt{s_{db}^c + \epsilon}}
$$

$\beta_1\approx 0.9$, $\beta_2\approx 0.99$, $\epsilon\approx 10^{-10}$.

In [None]:
def minibatch_gd_adam(X, y, alpha, beta1, beta2, batch_size, nr_epochs=1000):
    """Fit logistic regression with mini-batch gradient descent and Adam.

    Each step updates exponentially weighted averages of the gradients
    (decay ``beta1``) and of the squared gradients (decay ``beta2``),
    bias-corrects both by ``1 - beta ** k`` with ``k`` the 1-based number of
    updates performed so far, and then applies

        param = param - alpha * v_corrected / sqrt(s_corrected + eps)

    Args:
        X: feature matrix, one row per sample.
        y: labels for the rows of X.
        alpha: learning rate.
        beta1: decay factor of the gradient average.
        beta2: decay factor of the squared-gradient average.
        batch_size: number of samples per mini-batch.
        nr_epochs: number of passes over the (reshuffled) data.

    Returns:
        Tuple ``(costs, w, b)``: the cost of every mini-batch measured before
        its update, and the final weights and bias.
    """
    w, b = initialize_params(X)
    # Both moment estimates start at zero, shaped like the parameters.
    v_dw, v_db = initialize_params(X)
    s_dw, s_db = initialize_params(X)
    eps = 1e-10  # keeps the denominator away from zero
    costs = []
    # Counts parameter updates starting at 1: with k = 0 the correction factor
    # 1 - beta ** k would be zero and the division below would blow up.
    step = 0
    for _ in range(nr_epochs):
        batch_generator = generate_batches(X, y, batch_size)
        for X_batch, y_batch in batch_generator:
            cost, dw, db = calc_gradient(X_batch, y_batch, w, b)
            costs.append(cost)
            step += 1

            v_dw = beta1 * v_dw + (1 - beta1) * dw
            v_db = beta1 * v_db + (1 - beta1) * db
            s_dw = beta2 * s_dw + (1 - beta2) * np.square(dw)
            s_db = beta2 * s_db + (1 - beta2) * np.square(db)

            v_correction = 1 - beta1 ** step
            s_correction = 1 - beta2 ** step

            w = w - alpha * (v_dw / v_correction) / np.sqrt(s_dw / s_correction + eps)
            b = b - alpha * (v_db / v_correction) / np.sqrt(s_db / s_correction + eps)
    return costs, w, b