In [None]:
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython import display
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import sklearn.datasets
from sklearn.linear_model import LogisticRegression
import matplotlib
cmap=plt.cm.Spectral
np.random.seed(42)

In [None]:
# Toy binary problem: two interleaving half-moons that a straight line cannot separate.
X, y = sklearn.datasets.make_moons(n_samples=200, noise=0.20)
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=cmap)

# A very large C makes the regularisation penalty negligible; without an
# intercept the model is described by the two weights in coef_ alone.
log_reg = LogisticRegression(C=1e20, fit_intercept=False).fit(X, y)

def plot_decision(X, model, threshold = 0.5, labels=None):
    """Draw the decision regions of a classifier together with the points in ``X``.

    Parameters
    ----------
    X : ndarray of shape (n_samples, 2)
        Points to scatter; their extent (padded by 0.5) defines the plotted area.
    model : estimator or dict
        Either an object exposing ``predict`` / ``predict_proba``, or a dict
        whose ``'predict'`` entry is a callable mapping points to class labels.
    threshold : float, default 0.5
        Cut-off applied to the predicted probability of class 1. With the
        default value ``model.predict`` is used directly. Ignored for dict models.
    labels : array-like, optional
        Values used to colour the scattered points. Defaults to the
        notebook-level ``y`` so existing calls keep working.
    """
    point_labels = y if labels is None else labels

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    delta = 0.01
    # Generate a grid of points with distance delta between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, delta), np.arange(y_min, y_max, delta))
    # Predict the function value for the whole grid
    grid_X = np.c_[xx.ravel(), yy.ravel()]
    if isinstance(model, dict):
        Z = model['predict'](grid_X)
    elif threshold == 0.5:
        Z = model.predict(grid_X)
    else:
        # Column 1 of predict_proba is P(y = 1). Thresholding column 0 would
        # label the class-0 region as class 1 and flip the picture.
        proba_class1 = model.predict_proba(grid_X)[:, 1]
        Z = (proba_class1 > threshold).astype(np.int8)
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.figure()
    plt.contourf(xx, yy, Z, cmap=cmap, alpha=0.8)
    plt.scatter(X[:, 0], X[:, 1], c=point_labels, cmap=cmap)


plot_decision(X,log_reg,0.5)

In [None]:
# Slider over the classification threshold; the decision plot is redrawn for each value.
interact(
    lambda Threshold: plot_decision(X, log_reg, Threshold),
    Threshold=widgets.FloatSlider(value=0.5, min=0, max=1, step=0.05),
)

## Logistic regression

To predict a probability, we pass the linear score $\langle w, x \rangle$ through the logistic (sigmoid) function:

$$ P( y=1) = \dfrac{1}{1 + \exp(- \langle w, x \rangle)} = \sigma(\langle w, x \rangle)$$


In [None]:
def probability(X, w):
    """Return P(y = 1) for every row of ``X`` under a logistic model.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Input data.
    w : ndarray of shape (n_features,)
        Weights of the linear score <w, x>.

    Returns
    -------
    ndarray of shape (n_samples,)
        sigma(<w, x>) = 1 / (1 + exp(-<w, x>)) for each sample.
    """
    scores = np.dot(X, w)
    # sigma(z) = exp(-log(1 + exp(-z))). logaddexp evaluates the inner log
    # without overflowing for large |z|, unlike a direct 1 / (1 + exp(-z)).
    return np.exp(-np.logaddexp(0.0, -scores))

In logistic regression the optimal parameters $w$ are found by cross-entropy minimization:

$$ L(w) = - \dfrac{1}{N} \sum_{i=1}^N \left[ y_i \log P(\hat{y}_i = 1) + (1 - y_i) \log \left(1 - P(\hat{y}_i = 1)\right) \right] $$



In [None]:
def compute_loss(X, y, w):
    """Mean binary cross-entropy of a logistic model with weights ``w``.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Input data.
    y : array-like of shape (n_samples,)
        Binary targets (0 or 1).
    w : ndarray of shape (n_features,)
        Model weights.

    Returns
    -------
    float
        -(1/N) * sum(y * log(p) + (1 - y) * log(1 - p)) with p = sigma(<w, x>).
    """
    scores = np.dot(X, w)
    targets = np.asarray(y)
    # With p = sigma(z): -log(p) = log(1 + exp(-z)) and -log(1 - p) = log(1 + exp(z)).
    # Evaluating both through logaddexp avoids log(0) when p saturates at 0 or 1.
    per_sample = (targets * np.logaddexp(0.0, -scores)
                  + (1 - targets) * np.logaddexp(0.0, scores))
    loss = np.mean(per_sample)
    return loss

Check your implementation against the reference value computed by scikit-learn's `log_loss`:

In [None]:
from sklearn.metrics import log_loss

# Two independent floating-point implementations rarely agree bit-for-bit,
# so compare within a tolerance instead of with ==.
assert np.isclose(compute_loss(X, y, log_reg.coef_[0]), log_loss(y, log_reg.predict_proba(X))), "Wrong implementation sorry :/"

In [None]:
def compute_grad(X, y, w):
    """Gradient of the mean cross-entropy loss with respect to ``w``.

    For p = sigma(<w, x>) the gradient is (1/N) * X^T (p - y).

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Input data (the full set or a mini-batch).
    y : array-like of shape (n_samples,)
        Binary targets (0 or 1).
    w : ndarray of shape (n_features,)
        Current weights.

    Returns
    -------
    ndarray of shape (n_features,)
        The gradient; all zeros for an empty batch.
    """
    n_samples = X.shape[0]
    if n_samples == 0:
        # An empty batch carries no information; avoid dividing by zero.
        return np.zeros((X.shape[1]))

    scores = np.dot(X, w)
    # Stable sigmoid: sigma(z) = exp(-log(1 + exp(-z))).
    probs = np.exp(-np.logaddexp(0.0, -scores))
    residual = probs - np.asarray(y)
    gradient = np.dot(X.T, residual) / n_samples
    return gradient

In [None]:
w = log_reg.coef_[0]
def visualize(X,y,w):
    """Plot the probability surface given by ``probability(., w)`` behind the points ``X``.

    The surface is evaluated on a regular grid covering the extent of ``X``
    padded by 0.5 on every side and drawn as 50 filled contour levels; the
    points are scattered on top, coloured by ``y``.
    """
    pad = 0.5
    step = 0.01
    x_lo, x_hi = X[:, 0].min() - pad, X[:, 0].max() + pad
    y_lo, y_hi = X[:, 1].min() - pad, X[:, 1].max() + pad

    # Regular grid with spacing `step`, flattened into an (n_points, 2) array.
    xs = np.arange(x_lo, x_hi, step)
    ys = np.arange(y_lo, y_hi, step)
    xx, yy = np.meshgrid(xs, ys)
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))

    # Evaluate the model on every grid point and fold back to the grid shape.
    surface = probability(grid_points, w).reshape(xx.shape)

    plt.figure()
    plt.contourf(xx, yy, surface, 50, cmap=cmap, alpha=0.7)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap)

visualize(X,y,w)

## Stochastic gradient descent (mini batch)

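Mini-batch stochastic gradient descent draws a random batch of $m$ examples on each iteration, computes the gradient of the loss averaged over that batch, and takes a step against it:
$$ w_t = w_{t-1} - \eta \dfrac{1}{m} \sum_{j=1}^m \nabla_w L(w_{t-1}, x_{i_j}, y_{i_j}) $$

Here $m$ is the batch size and $\eta$ is the learning rate; with $m = 1$ this reduces to classic stochastic gradient descent on a single random example.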

In [None]:
# Start from zero weights; a float array keeps the dtype stable across updates.
w = np.zeros(X.shape[1])

eta = 0.1 # learning rate

n_iter = 1000
batch_size = 1
loss_history = np.zeros(n_iter)


for i in range(n_iter):
    # Random mini-batch (indices drawn with replacement).
    ind = np.random.choice(X.shape[0], batch_size)
    # Track the loss on the full dataset, measured before this step's update.
    loss_history[i] = compute_loss(X, y, w)
    if i % 10 == 0 or i == n_iter - 1:
        fig, ax = plt.subplots(figsize=(15, 10))
        # Plot only the values computed so far: the rest of the preallocated
        # array is still zero and would draw a misleading drop to 0.
        ax.plot(loss_history[:i + 1])
        ax.set_xlim(0, n_iter)
        ax.set_ylim(bottom=0)
        ax.set_xlabel("iteration")
        ax.set_ylabel("loss on the full dataset")
        display.clear_output(wait=True)
        plt.show()
        # Release the figure so repeated redraws do not accumulate open figures.
        plt.close(fig)
    w = w - eta*compute_grad(X[ind,:], y[ind], w)

In [None]:
# Weights fitted by scikit-learn, displayed for comparison with the `w` produced by the SGD loop
log_reg.coef_

In [None]:
nn_input_dim = 2 # input layer dimensionality (number of columns the first weight matrix expects)
nn_output_dim = 2 # output layer dimensionality (number of softmax scores per sample)

# Gradient descent parameters (picked by hand)
eta = 0.01 # learning rate for gradient descent; replaces the 0.1 set for the SGD loop earlier

In [None]:
def calculate_loss(model, inputs=None, labels=None):
    """Mean softmax cross-entropy of the two-layer network on a dataset.

    Parameters
    ----------
    model : dict
        Must contain the parameters 'W1', 'b1', 'W2' and 'b2'.
    inputs : ndarray of shape (n_samples, n_features), optional
        Data to evaluate on. Defaults to the notebook-level ``X``.
    labels : integer ndarray of shape (n_samples,), optional
        Class index of every sample. Defaults to the notebook-level ``y``.

    Returns
    -------
    float
        Average negative log-probability assigned to the correct class.
    """
    inputs = X if inputs is None else inputs
    labels = y if labels is None else labels
    num_examples = len(inputs)
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']

    # Forward propagation up to the class scores
    z1 = inputs.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2

    # Log-softmax via log-sum-exp: subtracting the row maximum leaves the
    # result unchanged but keeps exp() from overflowing (which gave NaN).
    shifted = z2 - z2.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))

    # Pick the log-probability of the true class for each sample
    logs = -log_probs[np.arange(num_examples), labels]
    entropy_loss = np.sum(logs)
    return 1./num_examples * entropy_loss

In [None]:
def build_model(nn_hdim, n_iter=1000, print_loss=False):
    """Train a one-hidden-layer network (tanh hidden units, softmax output).

    Training is full-batch gradient descent on the notebook-level ``X`` and
    ``y``, using the notebook-level ``eta``, ``nn_input_dim`` and
    ``nn_output_dim``. The NumPy global random state is re-seeded with 42 on
    every call, so the initial weights are the same for a given ``nn_hdim``.

    Parameters
    ----------
    nn_hdim : int
        Number of nodes in the hidden layer.
    n_iter : int, default 1000
        Number of gradient-descent steps, each using the whole dataset.
    print_loss : bool, default False
        If True, print the loss every 100 iterations.

    Returns
    -------
    dict
        'W1', 'b1', 'W2', 'b2' (learned parameters), 'loss_history' (loss
        after each step) and 'predict' (callable mapping points to class indices).
    """
    def _softmax(scores):
        # Subtracting the row maximum does not change the softmax but keeps
        # exp() from overflowing for large scores.
        shifted = scores - scores.max(axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / exp_scores.sum(axis=1, keepdims=True)

    # Initialize the parameters to random values. We need to learn these.
    num_examples = len(X)
    np.random.seed(42)
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))

    # This is what we return at the end. The arrays are updated in place
    # below, so the dict always refers to the current parameters; this also
    # holds when n_iter is 0.
    model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
    loss_history = np.zeros(n_iter)

    # Gradient descent on the full dataset
    for i in range(n_iter):

        # Forward propagation
        z1 = X.dot(W1) + b1
        a1 = np.tanh(z1)
        z2 = a1.dot(W2) + b2
        probs = _softmax(z2)

        # Backpropagation: the gradient of the cross-entropy w.r.t. the
        # scores is (probs - one_hot(y)). probs is a fresh array, so it can
        # be modified in place.
        delta3 = probs
        delta3[range(num_examples), y] -= 1
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))  # tanh'(z) = 1 - tanh(z)^2
        dW1 = np.dot(X.T, delta2)
        db1 = np.sum(delta2, axis=0)

        # Gradient descent parameter update (in place)
        W1 += -eta * dW1
        b1 += -eta * db1
        W2 += -eta * dW2
        b2 += -eta * db2

        loss_history[i] = calculate_loss(model)

        if print_loss and i % 100 == 0:
            # Reuse the value just stored instead of evaluating the loss twice.
            print("Loss after iteration %i: %f" %(i, loss_history[i]))

    model['loss_history'] = loss_history

    # Helper function to predict an output (0 or 1)
    def NN_predict(x):
        W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
        # Forward propagation
        z1 = x.dot(W1) + b1
        a1 = np.tanh(z1)
        z2 = a1.dot(W2) + b2
        # Softmax is monotonic within a row, so the most probable class is
        # the one with the largest score.
        return np.argmax(z2, axis=1)

    model['predict'] = NN_predict
    return model

In [None]:
# Build a model with 10 neurons in the hidden layer
model = build_model(10, print_loss=True)

In [None]:
# Train a network with the chosen hidden-layer size and plot its decision regions.
# continuous_update=False: react when the slider is released, not on every drag step.
interact(
    lambda neurons: plot_decision(X, build_model(neurons)),
    neurons=widgets.IntSlider(value=3, min=3, max=50, step=1, continuous_update=False),
)

# Varying the hidden layer size and experimenting with new dataset


In [None]:
# Change these parameters to generate a new dataset.
# NOTE: this rebinds the notebook-level X and y that build_model and
# calculate_loss read, so every later cell works on the new data.
num_observations = 400
noise = 0.3
X, y = sklearn.datasets.make_moons(n_samples=num_observations, noise=noise)

model = build_model(5, print_loss=True)
plot_decision(X, model)

### Higher level NNets library example - Keras 

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [None]:
# Same architecture idea as the hand-written network: one hidden layer of 10
# units, here with ReLU activations and a single sigmoid output unit.
model = Sequential([
    Dense(10, input_dim=2, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='RMSprop', loss='binary_crossentropy')

model.fit(X, y, batch_size=50, epochs=100, verbose=0)

In [None]:
def plot_decision_keras(X, model):
    """Draw the decision regions of a Keras classifier together with the points in ``X``.

    Parameters
    ----------
    X : ndarray of shape (n_samples, 2)
        Points to scatter; their extent (padded by 0.5) defines the plotted area.
    model : Keras model
        Trained model whose ``predict`` returns either one sigmoid probability
        per sample or one score per class.

    The scattered points are coloured by the notebook-level ``y``.
    """
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    delta = 0.01
    # Generate a grid of points with distance delta between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, delta), np.arange(y_min, y_max, delta))
    # Predict the function value for the whole grid
    grid_X = np.c_[xx.ravel(), yy.ravel()]

    # `predict_classes` no longer exists in current Keras releases; derive the
    # class from `predict` instead. The grid is large, so use a big batch and
    # silence the progress bar.
    scores = np.asarray(model.predict(grid_X, batch_size=1024, verbose=0))
    if scores.ndim > 1 and scores.shape[-1] > 1:
        # One score per class: take the most probable class
        Z = scores.argmax(axis=-1)
    else:
        # Single sigmoid output: class 1 where the probability exceeds 0.5
        Z = (scores > 0.5).astype("int32")

    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.figure()
    plt.contourf(xx, yy, Z, cmap=cmap, alpha=0.8)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap)
    


In [None]:
# Decision regions of the Keras network over the current X
plot_decision_keras(X, model)