In [1]:
"""
    TensorFlow --> Define graph model in Python for computations
    you'll perform, then Tf will take that graph and run it efficiently
    using optimized C++ code; 
    
    Some benefits of TF over competitors:
    
    - Runs on mobile as well
    - Provides compatible APIs for Python and Scikit-Learn
    - Provides TF-Slim API to simplify building, training, and evaluating NN
    - Keras is built on top of TF
    - Its main Python API offers much more flexibility to create all sorts
        of computations, including any nn architectures you can think of
    - It includes highly efficient C++ implementations of many ML operations
        , particularly those needed for NN. + C++ API to define customized
        high-performance operations
    - Advanced optimization nodes to search for the params that minimize
        a cost function!! Because TF takes care of computing the
        gradients of the functions => they are very easy to use?
    - Comes with TensorBoard, visualization tool, for browsing 
        through the computation graph
    - Google has launched a cloud service to run TF graphs
        
"""

# Linear Regression with TF
import numpy as np
import tensorflow as tf
from sklearn.datasets import fetch_california_housing

In [6]:
housing = fetch_california_housing()
m, n = housing.data.shape
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]

X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name="y")
XT = tf.transpose(X)
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), y)

with tf.Session() as sess:
    theta_value = theta.eval()

# Chief benefit of this code instead of computing the Normal Eq directly
# using numpy, is that tf will automatically run this on your gpu - if av & tf-gpu

# Next we use autodiff for computing gradients in bgd

n_epochs = 1000
learning_rate = 0.01

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_housing_data = scaler.fit_transform(housing.data)
scaled_housing_data_plus_bias = np.c_[np.ones((m, 1)), scaled_housing_data]

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name="y")


In [7]:
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
gradients = 2/m * tf.matmul(tf.transpose(X), error)
training_op = tf.assign(theta, theta - learning_rate * gradients)
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        if epoch % 100 == 0:
            print("Epoch", epoch, "MSE =", mse.eval())
            sess.run(training_op)
    best_theta = theta.eval()

('Epoch', 0, 'MSE =', 13.90306)
('Epoch', 100, 'MSE =', 13.90306)
('Epoch', 200, 'MSE =', 13.90306)
('Epoch', 300, 'MSE =', 13.90306)
('Epoch', 400, 'MSE =', 13.90306)
('Epoch', 500, 'MSE =', 13.90306)
('Epoch', 600, 'MSE =', 13.90306)
('Epoch', 700, 'MSE =', 13.90306)
('Epoch', 800, 'MSE =', 13.90306)
('Epoch', 900, 'MSE =', 13.90306)


In [8]:
def my_func(a, b):
    z = 0
    for i in range(100):
        z = a * np.cos(z + i) + z * np.sin(b - i)
        return z

In [9]:
gradients = tf.gradients(mse, [theta])[0]

In [10]:
# Using GD optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)

In [12]:
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
momentum=0.9)

In [14]:
A = tf.placeholder(tf.float32, shape=(None, 3))

In [15]:
B = A + 5

In [16]:
 with tf.Session() as sess:
        B_val_1 = B.eval(feed_dict={A: [[1, 2, 3]]})
        B_val_2 = B.eval(feed_dict={A: [[4, 5, 6], [7, 8, 9]]})

In [17]:
print(B_val_1)

[[6. 7. 8.]]


In [18]:
print(B_val_2)

[[ 9. 10. 11.]
 [12. 13. 14.]]


In [19]:
batch_size = 100
n_batches = int(np.ceil(m / batch_size))

In [23]:
# def fetch_batch(epoch, batch_index, batch_size):
#     return X_batch, y_batch

# with tf.Session() as sess:
#     sess.run(init)
#     for epoch in range(n_epochs):
#         for batch_index in range(n_batches):
#             X_batch, y_batch = fetch_batch(epoch, batch_index, batch_size)
#             sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
#     best_theta = theta.eval()

In [28]:
"""
    Theory behind backprop:
    For each training instance the BP alg first makes a prediction (
    forward pass), measures the error, then goes through each layer 
    in reverse to measure the error contribution from each connection
    (reverse pass), and finally slightly tweaks the connection weights
    to reduce the error (GD step)
    
    In order for this alg to work properly, a key change was done to the
    MLP's architecture: they replaecd the step function 
    with the logistic regression function: sigma(z) = 1/(1+exp(-z)). 
    This was essential because the step function contains only 
    flat segments, so there is no gradient to work with 
    because GD cannot move on a flat surface, while the logistic regression
    aka - sigmoid, has a well-defined nonzero derivative everywhere, allowing
    the GD to make some progress at every step. The backprop
    may be used with various activation functions instead of logistic;
    
    To popular ones are:
        - tanh(z) -- tangent fn
        - ReLU --> is continuous but unfortunately not differentiable at z = 0
            (the slope changes abruptly, which can make GD bounce around). 
            However, in practice it works very well and has the advantage of 
            being fast to compute. Most importantly, the fact that it 
            doesn't have a maximum output value also helps reduce 
            some issues during GD. 
            
            
    Fine-Tuning Neural Network Hyperparameters
    
        
    The flexibility of nn is a double-edged sword: there are simply 
    many hyperparams to tweak. Not only can you use any imaginable 
    network topology, but even in a simple MLP you can change the 
    number of layers, the number of neurons per layer, 
    the type of activation function to use in each layer, the weight 
    initialization logic etc. 
    How to know what combination of hyperparams is best for your task?
    
    
    1. Number of hidden layers:
        
        Because MLPs with a single hidden layer and enough neurons
        gives reasonable results, deeper architectures were not considered;
        But they had overlooked the fact that DEEP NETWORKS HAVE 
        A MUCH HIGHER __PARAMETER EFFICIENCY__ than shallow ones:
        Because they can model complex functions
        with less neurons in hidden layers => much faster to train!
    
    
    2. Number of neurons per Hidden Layer:
        
        For the input and output layers, this is already predetermined by 
        the task at hand. As for the hidden layers, a common practice is to size them to form a funnel, with fewer
        and fewer neurons at each layer — the rationale being that many low-level features can coalesce into far
        fewer high-level features.
        
    3. Activation Functions:
    
        In most cases you can use the:
        
            ___ReLU___ activation funtion 
            in the hidden layers (or one of its variants). It is a bit FASTER
            to compute than other atc fn. Also GD doesn't get stuck as much
            on plateaous, thanks to the fact that it does not saturate 
            for large input values (as opposed to the logistic function
            or the hyperbolic tanh fn which saturate)
            
        For the output layer, the ___softmax___ act fn is generally good
        for classification tasks, as for regression tasks, you can simply use
        no activation fn at all. 
        
        

    The BIG ISSUE OF: Vanishing/Exploding Gradients Problems
    
    Because we'll be using Backprop as a training alg, and it goes from 
    the output layer to the input layer, propagating the error gradient
    on the way. Once the algorithm has computed the gradient of the cost
    function with regards to each parameter in the network, it uses these 
    gradients to UPDATE EACH PARAM WITH A GD STEP; 
    
    Unfortunately, gradients often get smaller and smaller as the alg 
    progresses down to the lower layers. As a result, the GD update
    leaves the lower layer connection weights virtually unchanged, and 
    training never converges to a good solution (So divergence can occur
    on two cases: 1. Too large learning rate 2. Vanishing GD). Vice-versa 
    is called exploding GD, which is mostly encountered in RNN; 
    
    this was one of the reasons why dnn were abandoned for a long time. 
    Until Glorot and Bengio found a new suspect.
    
    Why was this they asked? It seemed that the combination of 
    logistic sigmoid act fn with a classical set up for weight initialization
    , namely random init using a normal distribution with a mean 
    of 0 and a standard deviation of 1. => this makes the variance 
    of the outputs of each layer much greater than the variance of its inputs
    So going foward in the network, the variance keeps increasing 
    after each layer until the activation fn saturates at the top layers. 
    This is actually made worse by the fact that the logistic fn has a 
    mean of .5, not 0 like tanh (which behaves slightly better in dnn). 
    
    fix: variance of outputs of each layer to == variance of layers inputs
        + gradients should have equal variance before and after flowing through
        a layer in the reverse direction. 
        
        They gave the Xavier initialization strategy. 
        
        Act fn have either uniform or normal distribution
        

    Faster Optimizers:
        Training a very large deep neural network can be painfully slow. 
        Huge speed can be guaranteed optimizers other than GD, most populars:
            - Momentum optimization
            - Nesterov Accelerated Gradient
            - AdaGrad
            - RMSProp
            - Adam optimizer - optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

    Regularization:
        - The most pop in dnn: ___Dropout___ :D. 
        - ___Max-Norm Regularization___
        
    Most cases you would pick:
    
        Initialization --------> He init. 
        Activation Function ---> ELU
        Normalization ---------> Batch 
        Regularization --------> Dropout
        Optimizaer ------------> Nesteroc Accelerated Gradient
        Learning rate schedule - none
"""

# rnn
n_inputs = 3
n_neurons = 5

X0 = tf.placeholder(tf.float32, [None, n_inputs])
X1 = tf.placeholder(tf.float32, [None, n_inputs])

Wx = tf.Variable(tf.random_normal(shape=[n_inputs, n_neurons],dtype=tf.float32))
Wy = tf.Variable(tf.random_normal(shape=[n_neurons,n_neurons],dtype=tf.float32))
b = tf.Variable(tf.zeros([1, n_neurons], dtype=tf.float32))

Y0 = tf.tanh(tf.matmul(X0, Wx) + b)
Y1 = tf.tanh(tf.matmul(Y0, Wy) + tf.matmul(X1, Wx) + b)

init = tf.global_variables_initializer()

import numpy as np

# Mini-batch
X0_batch = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 0, 1]]) 
X1_batch = np.array([[9, 8, 7], [0, 0, 0], [6, 5, 4], [3, 2, 1]]) 

with tf.Session() as sess:
    init.run()
    Y0_val, Y1_val = sess.run([Y0, Y1], feed_dict={X0: X0_batch, X1: X1_batch})
    
print(Y0_val)
print(Y1_val)

[[ 0.9716347   0.9352136   0.9957704   0.995877   -0.42612398]
 [ 0.9999853   0.9990953   1.          0.99999875 -0.99958736]
 [ 1.          0.9999877   1.          1.         -0.9999998 ]
 [ 0.9660293   0.4489524  -0.99848855 -0.99995774 -1.        ]]
[[ 1.          0.99891627  1.          1.         -0.9999999 ]
 [ 0.9962279  -0.42565945 -0.945565   -0.74405974  0.98450255]
 [ 0.9999999   0.9731786   0.9999598   0.9995155  -0.99983245]
 [ 0.9859449  -0.5882173   0.99929863 -0.9871588  -0.9998044 ]]


In [38]:
tf.reset_default_graph()
n_steps = 28
n_outputs = 10
learning_rate = 0.001

X0 = tf.placeholder(tf.float32, [None, n_inputs])
X1 = tf.placeholder(tf.float32, [None, n_inputs])

basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
output_seqs, states = tf.contrib.rnn.static_rnn(basic_cell, [X0, X1],
dtype=tf.float32)

Y0, Y1 = output_seqs

In [39]:
tf.reset_default_graph()

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
X_seqs = tf.unstack(tf.transpose(X, perm=[1, 0, 2]))

basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
output_seqs, states = tf.contrib.rnn.static_rnn(basic_cell, X_seqs,
                                                dtype=tf.float32)

outputs = tf.transpose(tf.stack(output_seqs), perm=[1, 0, 2])

In [42]:
#######################################################################
####### Predicting Stock Prices
#######################################################################

tf.reset_default_graph()

# unroll over 20 time steps, since each sequence
# will be 20 inputs long.
n_steps = 20     
# each input will contain only one feature (the value at 
# that time)    
n_inputs = 1
n_neurons = 100 # 100 recurrent neurons
n_outputs = 1

# The targets are also sequences of 20 inputs, 
# each containing a single value

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])
cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu)
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)
