### Problem statement

Toy example of a simple binary classification problem:
- randomly generated data set
- logistic regression loss
- ANN - forward and back-propagation with gradient descent and optional regularization

### Requirements
The logic is straightforward and should remain executable on any modern Python 3 version (>= 3.5.0).

Requirements include:
- numpy
- matplotlib
- scikit-learn (imported as `sklearn`)

### Licensing terms and copyright notice
***None.***
This is free as in both "free speech" and "free beer".

In [1]:
# Boiler-plate libraries
import os, sys, locale, time, ast, pickle, string, random, faulthandler  # builtins
from typing import Union, Any, List, Tuple, Optional, Iterable, cast
import copy                      # enable 'copy.deepcopy()'
import multiprocessing as mp
from multiprocessing.pool import ThreadPool

# Set cell display width, matplotlib image formats
import matplotlib_inline
%matplotlib inline
# Several image formats can be enabled: 'png', 'retina', 'jpeg', 'svg', 'pdf'.
matplotlib_inline.backend_inline.set_matplotlib_formats('png', 'jpeg', quality=90)

from IPython.display import HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Dump the Python traceback on fatal errors (e.g. a segmentation fault)
faulthandler.enable()

# import python debugger
import pdb

In [3]:
import numpy as np
from numpy.random import default_rng
# Call default_rng to get new instance of Generator, then call its methods to obtain samples from different distributions.
# E.g.:    vals = rng.standard_normal(10)
rng = default_rng()                       
# By default, uses bits provided by PCG64 with better statistical properties than other legacy random generators, e.g. MT19937.

from sklearn import datasets, linear_model
import matplotlib.pyplot as plt

### Methods

In [None]:
class Config:
    '''
    Hyper-parameters shared by the functions below.
    They are read as class attributes, e.g. Config.epsilon.
    '''

    # Network layout
    nn_output_dim = 2  # output layer dimensionality
    nn_input_dim = 2   # input layer dimensionality

    # Gradient descent parameters
    reg_lambda = 0.01  # regularization strength
    epsilon = 0.01     # learning rate for gradient descent



def generate_data(n_samples=200, noise=0.20, random_state=None):
    '''
    Make two interleaving half circles
    See: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html
    Input:
      - n_samples: number of samples to generate (default: 200)
      - noise: standard deviation of Gaussian noise added to the data (default: 0.20)
      - random_state: forwarded to make_moons; pass an int to get a reproducible
           data set. With the default (None) every call yields different samples.
    Output:
      - X: generated samples
           ndarray of shape (n_samples, 2)
      - y: integer labels (0 or 1) for class membership of each sample.
           ndarray of shape (n_samples,)
    '''
    if random_state is None:
        # With random_state=None, make_moons draws from numpy's global generator.
        # Re-seed it with fresh entropy so that a fixed seed set earlier by other
        # code does not make every "random" data set identical.
        np.random.seed()
    X, y = datasets.make_moons(n_samples=n_samples, noise=noise, random_state=random_state)
    return X, y



def visualize(X, y, model):
    '''
    Plot the decision boundary of a trained model on top of the data set.
    Input:
      - X: samples, indexed as X[:, 0] and X[:, 1] by the plotting code
      - y: class label of each sample (used to color the scatter plot)
      - model: dict with keys 'W1', 'b1', 'W2', 'b2', as consumed by predict()
    '''
    # The title has to be set BEFORE plotting: plot_decision_boundary() finishes
    # with plt.show(), so a title set afterwards would not end up on the figure
    # that was displayed.
    plt.title("Logistic Regression")
    plot_decision_boundary(lambda x: predict(model, x), X, y)


    
def plot_decision_boundary(pred_func, X, y):
    '''
    Draw the regions predicted by pred_func and overlay the samples.
    Input:
      - pred_func: callable taking an array with one 2-D point per row and
           returning one value per row
      - X: samples; column 0 is drawn on the horizontal axis, column 1 on the vertical
      - y: per-sample value used to color the scatter plot
    The figure is displayed with plt.show() before returning.
    '''
    padding = .5      # margin added around the data range
    step = 0.01       # distance between neighbouring grid points

    # Padded bounding box of the data
    x0_low, x0_high = X[:, 0].min() - padding, X[:, 0].max() + padding
    x1_low, x1_high = X[:, 1].min() - padding, X[:, 1].max() + padding

    # Regular grid covering the bounding box
    axis0 = np.arange(x0_low, x0_high, step)
    axis1 = np.arange(x1_low, x1_high, step)
    grid0, grid1 = np.meshgrid(axis0, axis1)

    # Evaluate the prediction on every grid point, then restore the grid shape
    grid_points = np.c_[grid0.ravel(), grid1.ravel()]
    grid_pred = pred_func(grid_points).reshape(grid0.shape)

    # Filled contour of the predictions, with the samples on top
    plt.contourf(grid0, grid1, grid_pred, cmap=plt.cm.Spectral)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)
    plt.show()


    
def calculate_loss(model, X, y, reg_lambda=None):
    '''
    Evaluate the mean regularized loss on a dataset.
    Input:
      - model: dict with keys 'W1', 'b1', 'W2', 'b2'
      - X: samples, one per row
      - y: integer class labels, used as column indices into the class scores
      - reg_lambda: regularization strength; defaults to Config.reg_lambda
    Output:
      - (sum of per-sample cross-entropy losses
         + reg_lambda / 2 * (sum(W1^2) + sum(W2^2))) / number of samples
    '''
    if reg_lambda is None:
        reg_lambda = Config.reg_lambda

    num_examples = len(X)  # training set size
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']

    # Forward propagation up to the class scores
    a1 = np.tanh(X.dot(W1) + b1)
    z2 = a1.dot(W2) + b2

    # Log-softmax, computed on scores shifted by their row maximum.
    # The shift leaves the result unchanged mathematically but keeps np.exp()
    # from overflowing to inf (which would turn the loss into nan).
    shifted = z2 - np.max(z2, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

    # Cross-entropy: negative log-probability of the correct class of each sample
    data_loss = -np.sum(log_probs[np.arange(num_examples), y])

    # Add regularization term to loss (biases are not regularized)
    data_loss += reg_lambda / 2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)))
    return data_loss / num_examples



def predict(model, x):
    '''
    Forward propagation with tanh() activation function
    Input:
      - model: dict with keys 'W1', 'b1', 'W2', 'b2'
      - x: samples, one per row
    Output:
      - index of the highest-scoring class for each row
    '''
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # Forward propagation
    a1 = np.tanh(x.dot(W1) + b1)
    z2 = a1.dot(W2) + b2
    # Softmax is strictly increasing in each score, so the most probable class is
    # the one with the largest raw score. Skipping exp()/normalization avoids
    # overflow and underflow, which would yield nan probabilities and a wrong argmax.
    return np.argmax(z2, axis=1)



def build_model(X, y, nn_hdim, num_passes=20000, print_loss=False, *,
                epsilon=None, reg_lambda=None, nn_input_dim=None, nn_output_dim=None):
    '''
    Learn the weights of a network with one tanh hidden layer and a softmax
    output, using full-batch gradient descent (forward pass + back-propagation).
    Input:
      - X: training samples, one per row
      - y: integer class labels, used as column indices into the class scores
      - nn_hdim: number of units in the hidden layer
      - num_passes: number of gradient descent passes over the whole training set
      - print_loss: if True, print the loss every 1000 passes
      - epsilon: learning rate; defaults to Config.epsilon
      - reg_lambda: regularization strength; defaults to Config.reg_lambda
      - nn_input_dim: input layer dimensionality; defaults to Config.nn_input_dim
      - nn_output_dim: output layer dimensionality; defaults to Config.nn_output_dim
    Output:
      - model: dict with keys 'W1', 'b1', 'W2', 'b2'.
           With num_passes=0 it holds the initial (untrained) parameters.
    '''
    if epsilon is None:
        epsilon = Config.epsilon
    if reg_lambda is None:
        reg_lambda = Config.reg_lambda
    if nn_input_dim is None:
        nn_input_dim = Config.nn_input_dim
    if nn_output_dim is None:
        nn_output_dim = Config.nn_output_dim

    num_examples = len(X)
    sample_idx = np.arange(num_examples)

    # Initialize the parameters to random values. We need to learn these.
    # A private generator seeded with 0 yields the same initial weights as
    # np.random.seed(0) would, without resetting numpy's global random state.
    init_rng = np.random.RandomState(0)
    W1 = init_rng.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = init_rng.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))

    # This is what we return at the end. The arrays are updated in place below,
    # so the dict always refers to the current parameters.
    model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

    # Gradient descent. Every pass uses the whole training set.
    for i in range(num_passes):

        # Forward propagation
        a1 = np.tanh(X.dot(W1) + b1)
        probs = _softmax(a1.dot(W2) + b2)

        # Backpropagation
        delta3 = probs  # reuses the probs buffer; probs is not needed afterwards
        delta3[sample_idx, y] -= 1
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))  # 1 - tanh^2 is the tanh derivative
        dW1 = np.dot(X.T, delta2)
        db1 = np.sum(delta2, axis=0, keepdims=True)

        # Add regularization terms (b1 and b2 don't have regularization terms)
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1

        # Gradient descent parameter update (in place)
        W1 -= epsilon * dW1
        b1 -= epsilon * db1
        W2 -= epsilon * dW2
        b2 -= epsilon * db2

        # Optionally print the loss.
        # This is expensive because it uses the whole dataset, so we don't want to do it too often.
        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" % (i, _regularized_loss(model, X, y, reg_lambda)))

    return model


def _softmax(scores):
    '''
    Row-wise softmax. Scores are shifted by their row maximum first, which does
    not change the result but keeps np.exp() from overflowing.
    '''
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)


def _regularized_loss(model, X, y, reg_lambda):
    '''
    Mean cross-entropy loss of model on (X, y), including the regularization
    term for the given reg_lambda.
    '''
    num_examples = len(X)
    a1 = np.tanh(X.dot(model['W1']) + model['b1'])
    probs = _softmax(a1.dot(model['W2']) + model['b2'])
    data_loss = -np.sum(np.log(probs[np.arange(num_examples), y]))
    data_loss += reg_lambda / 2 * (np.sum(np.square(model['W1'])) + np.sum(np.square(model['W2'])))
    return data_loss / num_examples



def classify(X, y):
    '''
    Placeholder for a baseline classifier; currently not implemented.
    Both arguments are ignored and None is returned.
    '''
    # Sketch of the baseline that was left disabled:
    #   clf = linear_model.LogisticRegressionCV()
    #   clf.fit(X, y)
    #   return clf
    return None



def main():
    '''
    Run the toy example end to end: generate the data set, train a network
    with 3 hidden units (printing the loss along the way), and plot the result.
    '''
    samples, labels = generate_data()
    trained = build_model(samples, labels, nn_hdim=3, print_loss=True)
    visualize(samples, labels, trained)

In [None]:
# Run the example only when this code is executed directly
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()