In [1]:
import numpy as np
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
plt.style.use('ggplot')

ModuleNotFoundError: No module named 'sklearn'

# Neural Network from scratch

Today, you'll learn how to code and train a neural network from scratch using just `numpy` and your brain. 

> This notebook closely follows the exercises and content on the course website (chapter 10.1 and 10.2)


Let's start with a toy dataset:

In [None]:
# Toy data: 50 noisy points from two interleaving half-moons (2 features, binary label).
X, y = make_moons(n_samples=50, noise=0.1, random_state=42)
y = y.reshape(-1, 1)  # column vector so it lines up with the (n, 1) model output

# Colour each point by its class label.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y, cmap='coolwarm')
ax.set_xlabel('x1')
ax.set_ylabel('x2')

In [None]:
# (observations, features) for X and (observations, 1) for the reshaped y
X.shape, y.shape

### Recap questions

- How many observations does your data have? How many input features?
- For this classification task, why does a simple Logistic Regression (LogReg) model perform poorly?
- How many model parameters (weights) does a LogReg model have for this task?
- Which feature engineering could you apply to solve this classification problem with a LogReg model?

## Elements of a Supervised Learning model

1. A prediction function that maps the input `X` to the output `y`: $F(X;w) = \hat{y}$
2. A loss function that evaluates the goodness of fit: $L(y, \hat{y})$
3. Training data that is used to find the weights `w` that minimize the loss function. This is done via the Gradient Descent Algorithm:

    $$
    w_{new} = w_{old} - LR \cdot \nabla_w L(w_{old})
    $$

4. Separate validation data that is used to assess the model's performance on unseen data.

### Let's start with a Log Reg model!

In [None]:
# add a column of ones to the input data. why are we doing this?

def add_bias(X):
    """Return a copy of the 2-D array X with an extra last column of ones.

    The constant column lets the bias be learned as one more weight, so the
    whole linear predictor can be written as a single matrix product.
    """
    ones_column = np.ones((X.shape[0], 1))
    return np.hstack((X, ones_column))

# Append the bias column only while X still has its 2 raw features, so that
# re-running this cell does not keep adding columns (and break the assert).
if X.shape[1] == 2:
    X = add_bias(X)


assert X.shape[1] == 3

$$
F(X) = w_0 X_0 + w_1 X_1 + w_2 \cdot 1 = Xw
$$

In [None]:
# Initialize random weights drawn from the standard normal distribution:
# one weight per column of X (x1, x2 and the bias column), hence shape (3, 1).
# No seed is set, so the values differ on every run.
w = np.random.randn(3,1)
w

In [None]:
# Calculate the linear predictor (the linear combination of the inputs and the weights).
# Row by row this is X[:,0]*w[0] + X[:,1]*w[1] + 1*w[2]; the matrix product
# does it for all observations at once and yields one value per row of X.
X.dot(w)

In [None]:
# calculate the sigmoid non linear transformation
def sigmoid(x):
    """Logistic sigmoid 1 / (1 + exp(-x)), applied element-wise.

    Computed as exp(-log(1 + exp(-x))) via np.logaddexp, which is
    mathematically the same expression but never evaluates exp() of a large
    positive number, so very negative inputs do not trigger an overflow
    RuntimeWarning.

    Parameters
    ----------
    x : scalar or numpy array

    Returns
    -------
    Values in [0, 1] with the same shape as x.
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x))
    return np.exp(-np.logaddexp(0, -x))

# Sanity check: compare against known sigmoid values at two decimals.
a = np.array([-10.0, -1.0, 0.0, 1.0, 10.0])
expected = np.array([0.0, 0.27, 0.5, 0.73, 1.0])
rounded = sigmoid(a).round(2)
assert (rounded == expected).all()

In [None]:
# calculate the log loss (aka binary crossentropy)

def log_loss(y, y_pred, eps=1e-15):
    """Element-wise log loss (binary cross-entropy).

    Parameters
    ----------
    y : array of true labels (0 or 1)
    y_pred : array of predicted probabilities, same shape as y
    eps : predictions are clipped to [eps, 1 - eps] before taking the log.
        Without the clipping a prediction of exactly 0 or 1 produces
        0 * log(0) = NaN (or an infinite loss).

    Returns
    -------
    Array with one loss value per element (not summed or averaged).
    """
    y_pred = np.clip(y_pred, eps, 1 - eps)
    return -(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))

# Sanity check: confident-and-right predictions cost ~0.01,
# confident-and-wrong predictions cost ~4.61.
a = np.array([0.0, 0.0, 1.0, 1.0])
b = np.array([0.01, 0.99, 0.01, 0.99])
expected = np.array([0.01, 4.61, 4.61, 0.01])
rounded = log_loss(a, b).round(2)
assert (rounded == expected).all()

# Part I: Making predictions with a neural net (the feed forward function)

We build a Neural Net with 

- one hidden layer that contains 3 "neurons"/ units
- one output layer with 1 unit
- a `sigmoid` activation function

In [None]:
# How can we calculate the output of several LogReg models at the same time?
# Stack their weight vectors side by side as columns of one matrix -
# this is the first layer of a neural net!

weights = []

weights.append(np.random.randn(3,3)) # 3 input columns (x1, x2, bias) x 3 units
weights[0]

In [None]:
# First layer: one matrix product evaluates all hidden units at once;
# column j of the result is the sigmoid output of unit j for every observation.
X_hidden = sigmoid(X.dot(weights[0]))
X_hidden

In [None]:
# Feed the output of the first hidden layer into a second layer!
# This is an ordinary logistic regression on the hidden features.

X_hidden_with_bias = add_bias(X_hidden)

# NOTE: re-running this cell appends another matrix to `weights`;
# weights[1] keeps pointing at the first one that was appended.
weights.append(np.random.randn(4, 1))  # one weight per hidden unit (3) + 1 bias weight

sigmoid(X_hidden_with_bias.dot(weights[1]))

In [None]:
# combine everything in one function

def feed_forward(X, weights):
    """
    Forward pass through the network with one hidden layer.

    Steps:
    1. Take the dot product of X and the hidden-layer weights (weights[0]).
    2. Squash the result with the sigmoid function.
    3. Append a column of ones (the bias input of the next layer).
    4. Take the dot product with the output-layer weights (weights[1]).
    5. Squash the result with the sigmoid function again.

    Returns both activations, hidden layer first and network output second,
    i.e. everything that came out of an activation function.
    """
    hidden_activation = sigmoid(np.dot(X, weights[0]))
    hidden_with_bias = add_bias(hidden_activation)
    output_activation = sigmoid(np.dot(hidden_with_bias, weights[1]))
    return hidden_activation, output_activation

In [None]:
# initialize some random weights (standard normal, like the cells above)

weights = [
    np.random.randn(3, 3),
    np.random.randn(4, 1)
]

# testing: output shapes on the toy data

out1, out2 = feed_forward(X, weights)

assert out1.shape == (50, 3)
assert out2.shape == (50, 1)

# testing: a tiny example that can be verified by hand
# (1 observation, 2 hidden units, no bias contribution in the hidden layer)

Xref = np.array([[1.0, 2.0, 1.0]])
whidden = np.array([[1.0, 2.0, 0.0],
                    [-1.0, -2.0, 0.0]]).T
wout = np.array([[1.0, -1.0, 0.5]]).T

out1, out2 = feed_forward(Xref, [whidden, wout])
# assert_allclose reports the mismatching values on failure and does not
# depend on exact floating-point equality of the rounded numbers
np.testing.assert_allclose(out1.round(2), [[0.99, 0.01]])
np.testing.assert_allclose(out2.round(2), [[0.82]])

# Part II: Training a Neural Net via Gradient Descent and Backpropagation

Backpropagation is the fancy name for calculating the gradient (partial derivatives) of the loss function with respect to all the weights in every layer. To make this easier, we first calculate the derivatives for the individual parts of the model.

In [None]:
def sigmoid_der(X):
    """Derivative of the sigmoid with respect to its input X: s(X) * (1 - s(X))."""
    activation = sigmoid(X)
    return activation * (1 - activation)

In [None]:
def log_loss(y, y_pred, eps=1e-15):
    """Element-wise log loss (binary cross-entropy).

    Re-stated here next to its derivative. log_loss is also defined earlier
    in the notebook; this later definition is the one in effect from here on.

    Predictions are clipped to [eps, 1 - eps] before taking the log, because a
    prediction of exactly 0 or 1 would otherwise produce 0 * log(0) = NaN
    (or an infinite loss).
    """
    y_pred = np.clip(y_pred, eps, 1 - eps)
    return -(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))

def log_loss_der(y, y_pred, eps=1e-15):
    """Derivative of the log loss with respect to y_pred, element-wise.

    d/dp [-(y*log(p) + (1-y)*log(1-p))] = (1-y)/(1-p) - y/p

    Parameters
    ----------
    y : array of true labels (0 or 1)
    y_pred : array of predicted probabilities, same shape as y
    eps : predictions are clipped to [eps, 1 - eps] first. A saturated
        sigmoid returns exactly 0.0 or 1.0, which would otherwise cause a
        division by zero and inf/NaN gradients.
    """
    p = np.clip(y_pred, eps, 1 - eps)
    return (1 - y) / (1 - p) - y / p

In [None]:
def backprop(weights,
             output1,
             output2,
             ytrue,
             X_input,
             LR):
    """
    One gradient-descent step for the network with one sigmoid hidden layer,
    a sigmoid output unit and the log loss.

    Parameters
    ----------
    weights : sequence with the hidden-layer weights at index 0 and the
        output-layer weights at index 1. The last row of the output weights
        is the bias weight.
    output1 : hidden-layer activations, as returned by feed_forward for
        these weights and X_input.
    output2 : network predictions, as returned by feed_forward for these
        weights and X_input.
    ytrue : true labels, same shape as output2.
    X_input : input data including the bias column.
    LR : learning rate.

    Returns
    -------
    (wH_new, wO_new) : the updated hidden-layer and output-layer weights.

    Notes
    -----
    Gradients are summed over all observations (not averaged).

    The derivatives are evaluated on the activations that were passed in
    instead of re-running the forward pass: sigmoid'(z) = a * (1 - a) where
    a = sigmoid(z).
    """
    wH = weights[0]
    wO = weights[1]

    # EQUATIONS A + B: gradient of the loss w.r.t. the input of the output
    # sigmoid. The chain rule gives
    #     dL/dy_pred * sigmoid'(z) = (y_pred - y) / (y_pred * (1 - y_pred)) * y_pred * (1 - y_pred)
    # which simplifies to y_pred - y. Multiplying the two factors numerically
    # instead yields inf * 0 = NaN as soon as a prediction saturates at 0 or 1.
    y_grad = output2 - ytrue

    # EQUATION C: gradient w.r.t. the output weights.
    # Don't forget the bias column of the hidden output!
    hidden_out_with_bias = add_bias(output1)
    delta_wO = -LR * np.dot(hidden_out_with_bias.T, y_grad)

    # and finally, old weights + delta weights -> new weights!
    wO_new = wO + delta_wO

    # EQUATION D: propagate the error back through the output weights.
    # The bias weight (last row of wO) is excluded: the bias input is a
    # constant and receives no error signal.
    H_grad = output1 * (1 - output1) * y_grad.dot(wO[:-1].T)

    # EQUATION E: gradient w.r.t. the hidden weights.
    delta_wH = -LR * np.dot(X_input.T, H_grad)
    # old weights + delta weights -> new weights!
    wH_new = wH + delta_wH

    return wH_new, wO_new

## Final test

In [None]:
X, y = make_moons(n_samples=200, noise=0.1, random_state=42)
X = add_bias(X)
y = y.reshape(-1, 1)

N_EPOCHS = 500
LEARNING_RATE = 0.01

# Seeded generator so the initial weights (and the loss curve) are reproducible.
rng = np.random.default_rng(42)
weights = [
   rng.normal(size=(3, 4)), # 4 neurons in the hidden layer
   rng.normal(size=(5, 1))  # 4 hidden outputs + 1 bias weight
]

LOSS_VEC = []
for i in range(N_EPOCHS):
    out1, out2 = feed_forward(X, weights)
    # total (summed) log loss over all observations before this update
    LOSS_VEC.append(log_loss(y, out2).sum())
    weights = backprop(weights, out1, out2, y, X, LEARNING_RATE)

fig, ax = plt.subplots()
ax.plot(LOSS_VEC)
ax.set_xlabel('iteration')
ax.set_ylabel('summed log loss')

## Decision boundary

In [None]:
# Evaluate the trained network on a regular 200 x 200 grid over [-3, 3] x [-3, 3].
x = np.linspace(-3, 3, 200)
# indexing='ij' followed by a C-order ravel enumerates the pairs (x1, x2) with
# x2 varying fastest - the same row order as a nested "for x1 ... for x2 ..."
# loop, but without 40,000 Python-level iterations.
grid_x1, grid_x2 = np.meshgrid(x, x, indexing='ij')
X_vis = np.column_stack([grid_x1.ravel(), grid_x2.ravel()])
X_vis = add_bias(X_vis)
_, y_pred = feed_forward(X_vis, weights)
# Fortran-order reshape puts x1 along the columns and x2 along the rows,
# which is the Z[row = y-axis, column = x-axis] layout contourf expects.
Z = y_pred.reshape((len(x), len(x)), order='F')

In [None]:
fig, ax = plt.subplots()
# Filled contours show the predicted probability over the grid ...
filled = ax.contourf(x, x, Z, alpha=0.8, cmap='coolwarm')
# ... and the single contour line at 0.5 is the decision boundary.
ax.contour(x, x, Z, levels=[0.5])
fig.colorbar(filled)  # probability scale for the filled contours
# Overlay the training points, coloured by their true class.
ax.scatter(X[:, 0], X[:, 1], c=y, cmap='coolwarm')

# Part III: Neural Nets with Keras (tensorflow)

In [None]:
import tensorflow.keras as keras

In [None]:
# Step 1: Setup the model and its layers
# Step 2: Compile the model (optimization algorithm)
# Step 3: Fit the model on training data

In [None]:
# Step 1
model = keras.models.Sequential()
# The input shape is declared once, up front, with an Input object (2 features).
# Passing input_dim / input_shape to a layer is discouraged in current Keras.
model.add(keras.Input(shape=(2,)))
# Same architecture as the numpy network: 4 sigmoid hidden units, 1 sigmoid output.
model.add(keras.layers.Dense(units=4, activation=keras.activations.sigmoid))
model.add(keras.layers.Dense(units=1, activation=keras.activations.sigmoid))

In [None]:
# Dense layers include a bias by default, so expect 17 trainable parameters:
# 2*4 weights + 4 biases in the hidden layer, 4 weights + 1 bias in the output layer.
model.summary()

In [None]:
# Step 2
optimizer = keras.optimizers.Adam(learning_rate=0.3)
model.compile(
    optimizer=optimizer,
    loss=keras.losses.binary_crossentropy,  # just another name for the log-loss
    metrics=[keras.metrics.binary_accuracy],
)

In [None]:
# Step 3
# Re-create the data WITHOUT the manual bias column: the model was declared
# with 2 input features, and the X left over from Part II has 3 columns.
X, y = make_moons(n_samples=200, noise=0.1, random_state=42)
y = y.reshape(-1, 1)


hist = model.fit(X, y, 
          epochs=200,       # number of passes over all data points
          batch_size=200  # number of observations per weight update (here: all 200, i.e. one update per epoch)
)

In [None]:
# Training loss (binary cross-entropy) recorded by Keras after each epoch
plt.plot(hist.history['loss'])