### Perceptrons

In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import Perceptron

# Train a single-layer Perceptron to separate Iris setosa from the other two species,
# using only the two petal measurements as features.
iris = load_iris()
X = iris.data[:, (2, 3)]  # petal length, petal width

# Binary target: 1 where the sample is class 0 (Iris setosa), 0 otherwise.
# The builtin `int` is used because the `np.int` alias was deprecated in
# NumPy 1.20 and removed in 1.24 (it was only ever an alias for `int`).
y = (iris.target == 0).astype(int)

per_clf = Perceptron(random_state=42)  # fixed seed so the fit is reproducible
per_clf.fit(X, y)

# Predict for one new flower with petal length 2 and petal width 0.5.
y_pred = per_clf.predict([[2, 0.5]])

y_pred



## Training Multilayer Perceptrons  

    

In [None]:
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf



# Layer sizes for the multilayer perceptron.
n_inputs = 28 * 28  # one input per pixel of a 28x28 MNIST image
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

# Placeholders stand in for the data while the graph is built; at execution time
# each is fed one mini-batch through `feed_dict`.
# NOTE: these rebind `X` and `y`, which the Perceptron cell used for NumPy arrays.
# X is the input layer: a batch of any size (None) of flattened images.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
# y holds one integer class id per example. The shape is constrained to a 1-D
# vector so that mis-shaped labels (e.g. one-hot matrices) are rejected when fed,
# instead of being accepted by an unconstrained `shape=None` placeholder.
y = tf.placeholder(tf.int64, shape=(None,), name="y")

# Build the network with TensorFlow's ready-made fully connected layer:
# two ReLU hidden layers followed by an output layer.
# The output layer is given no activation, so `logits` holds raw scores.
with tf.name_scope("s_dnn"):
    hidden1 = tf.layers.dense(
        inputs=X, units=n_hidden1, activation=tf.nn.relu, name="hidden1"
    )
    hidden2 = tf.layers.dense(
        inputs=hidden1, units=n_hidden2, activation=tf.nn.relu, name="hidden2"
    )
    logits = tf.layers.dense(inputs=hidden2, units=n_outputs, name="outputs")
    
# Cost function used for training: cross entropy, which penalizes the model
# when it assigns a low probability to the target class.
# `xentropy` is the per-example value; `loss` is its mean over the batch.
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits,
        labels=y,
    )
    loss = tf.reduce_mean(input_tensor=xentropy, name="loss")
    
# Plain gradient descent: `training_op` performs one update of the model
# parameters in the direction that reduces `loss`.
learning_rate = 0.01
with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss=loss)

# Evaluation metric: accuracy.
# `correct` is a boolean per example, true when the target class `y` is the
# top-1 entry of `logits`; casting to float and averaging gives the fraction correct.
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    accuracy = tf.reduce_mean(input_tensor=tf.cast(correct, dtype=tf.float32))
    
# Create the op that initializes all global variables, and a Saver used to
# write and restore checkpoints of the model.
# NOTE(review): both are created after the layers above; presumably the Saver
# captures the variables that exist at this point, so keep these lines after
# graph construction.
init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Execution phase starts here.
# Fetch MNIST through TensorFlow's tutorial helper; the data files are kept
# under /tmp/data/.
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets(train_dir="/tmp/data/")

# Training schedule: number of passes over the training set and mini-batch size.
n_epochs = 40
batch_size = 50
n_batches = mnist.train.num_examples // batch_size  # mini-batches per epoch

# Train the model, report accuracy after every epoch, then checkpoint it.
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for iteration in range(n_batches):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        # "Train" accuracy is measured on the last mini-batch of the epoch only.
        acc_train = sess.run(accuracy, feed_dict={X: X_batch, y: y_batch})
        acc_val = sess.run(
            accuracy,
            feed_dict={X: mnist.validation.images, y: mnist.validation.labels},
        )
        print(epoch, " Train accuracy: ", acc_train, " Val accuracy: ", acc_val)
    save_path = saver.save(sess, "./my_model_final.ckpt")

## Use the Neural Network

In [None]:
# Restore the trained parameters from the checkpoint and classify new images.
with tf.Session() as sess:
    saver.restore(sess, "./my_model_final.ckpt")
    # Some new images to classify. The original `[...]` stub was a list holding
    # Ellipsis and could not be fed to the float32 placeholder, so a slice of the
    # already-loaded MNIST test set is used instead. Any replacement must be
    # scaled from 0 to 1 like the training data.
    X_new_scaled = mnist.test.images[:20]
    Z = logits.eval(feed_dict={X: X_new_scaled})
    # The predicted class is the index of the largest logit in each row.
    y_pred = np.argmax(Z, axis=1)

## Fine-Tuning Neural Network Hyperparameters

### Number of Hidden Layers
One hidden layer can model even the most complex functions if it has enough neurons.<br/>
But deep networks have higher parameter efficiency: they need fewer neurons to model the same functions, making them faster to train.<br/>
You can use a neural network trained on a more general problem as the lower layers for a more specific problem.<br/>
For most problems, start with 1 or 2 hidden layers, then ramp up the number of layers once you determine whether you are 
getting a good solution or not.  

### Number of Neurons Per Hidden Layer
The size of the input layer is determined by the number of input features (e.g. 28×28 = 784 for MNIST), and the size of the
output layer is determined by the size of the output (how many classes for the decision).  The hidden layers should form a
funnel, with the input feeding into the largest layer and each later layer having fewer neurons.  The number
of neurons can be increased gradually until the network starts overfitting.  It is generally better to increase the number of layers than
the number of neurons within a layer.  Stretch pants method: choose a model larger than you need, then use early stopping to prevent 
overfitting.

### Activation Function
ReLU is the most common choice for the hidden layers because it is a bit faster to compute and Gradient Descent doesn't tend to get stuck 
on plateaus.  Softmax is preferred for the output layer of classification tasks where the classes are mutually exclusive.  If 
they are not mutually exclusive (or the task is binary), then the logistic function is preferred.
