**Lab1**
Carlos García 21000475

### Import libraries

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from datetime import datetime

print(tf.__version__)

In [None]:
# On TensorFlow 2.x, rebind `tf` to the TF1 compatibility API and switch off
# v2 behavior and eager execution: the cells below use TF1 graph-mode calls
# (tf.placeholder, tf.get_variable, tf.Session).
if tf.__version__.startswith("2."):
  import tensorflow.compat.v1 as tf
  tf.compat.v1.disable_v2_behavior()
  tf.compat.v1.disable_eager_execution()
  print("Enabled compatitility to tf1.x")

### Getting the data

In [None]:
# Load the project data set from disk as a NumPy array; it is split into
# train/test rows and given column names in the following cells.
data = np.load('inputs/proyecto_training_data.npy')

In [None]:
# Show the data set dimensions, then size the training split at 80% of the
# rows (rounded up).
print(data.shape)
n_train = int(np.ceil(0.8 * data.shape[0]))

In [None]:
# The first n_train rows are used for training; the remaining rows are held out.
train = data[:n_train, :]
test = data[n_train:, :]
# Sanity check: both splits together account for every row of the data set.
assert data.shape[0] == (train.shape[0] + test.shape[0])

In [None]:
# Wrap the training split in a DataFrame with named columns and preview it.
ds_train = pd.DataFrame(
    train,
    columns=[
        'SalePrice',
        'OverallQual',
        '1stFlrSF',
        'TotRmsAbvGrd',
        'YearBuilt',
        'LotFrontage',
    ],
)
ds_train.head()

In [None]:
# Column names passed to trainModel in the experiments: `response` is the
# target (y) and `regressor` is the single input feature (x).
response, regressor = 'SalePrice', 'OverallQual'

### Defining the model

In [None]:
def error(y_real, y_aprox):
    """Return half of the mean squared difference between y_real and y_aprox.

    The 1/2 factor cancels the 2 produced when the square is differentiated.
    """
    residual = y_real - y_aprox
    return 0.5 * tf.reduce_mean(tf.math.square(residual))

In [None]:
#mini batch model
def trainModel(x, y, epochs = 100, batch_size = 10, lr = 0.001, kprint = 10):
    """Fit y ~ m*x + b with mini-batch gradient descent on a TF1 graph.

    A fresh default graph is built on every call. After each epoch the cost
    returned by error() is evaluated on the whole data set and written to
    TensorBoard under ./graphs/ (summary tag 'MSE'); progress is printed every
    `kprint` epochs and once more when training ends.

    Args:
        x: 1-D NumPy array with the regressor values.
        y: 1-D NumPy array with the response values, same length as x.
        epochs: number of passes over the data.
        batch_size: samples per mini-batch. Only full batches are used, so the
            trailing x.shape[0] % batch_size samples never drive an update
            (they still take part in the per-epoch cost evaluation).
        lr: learning rate of the gradient-descent step.
        kprint: print the parameters and cost every `kprint` epochs.

    Raises:
        ValueError: if batch_size is not positive or exceeds the number of
            samples (no update step would ever run).
    """
    #validate before touching the graph so bad arguments fail fast and clearly
    n_samples = x.shape[0]
    if batch_size <= 0 or batch_size > n_samples:
        raise ValueError(
            "batch_size must be between 1 and the number of samples ({}), got {}".format(
                n_samples, batch_size))

    #define iterations (full mini batches only)
    total_iterations = n_samples // batch_size

    #initializing the graph
    tf.reset_default_graph()

    #initializing data
    tensor_x = tf.placeholder(tf.float32, [None, 2], "tensor_x")
    tensor_y = tf.placeholder(tf.float32, [None, 1], "tensor_y")
    w = tf.get_variable("parameters_m_b", dtype=tf.float32, shape=[2,1],
                    initializer=tf.zeros_initializer())

    #estimating values
    yhat = tf.matmul(tensor_x, w, name = 'yhat')

    #Cost/Error calculation
    with tf.name_scope("cost_definition"):
        cost = error(tensor_y, yhat)

    #Scalar summary
    cost_summary = tf.summary.scalar(name = 'MSE', tensor = cost)

    #gradients and cost/error optimization
    with tf.name_scope("params_update"):
        gradients = tf.gradients(cost, [w], name = 'gradients') #calculating error and gradients
        w_update = tf.assign(w, w - lr * gradients[0], name = 'weigths_update') #updating parameters weights

    #Reshaping the data: x becomes [x, 1] columns so w holds (m, b); y becomes a column vector
    x = x[:, np.newaxis]
    x = np.hstack((x, np.ones_like(x)))
    y = y[:, np.newaxis]

    feed_dict_model = {tensor_x: x, tensor_y: y} #whole batch dictionary

    #tf.Session is used instead of tf.train.MonitoredSession: the latter doesn't generate a clean graph due to initialized variables
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        #Define tensorboard writer and config string
        dt_string = datetime.now().strftime("%Y%m%d_%H%M")
        writer = tf.summary.FileWriter('./graphs/{}_lm_epochs={}_mbatch={}_lr={}'.format(dt_string, epochs, batch_size, lr), session.graph)

        #try/finally guarantees the event file is flushed and closed even if training fails
        try:
            for epoch in range(epochs):
                for i in range(total_iterations):
                    start_sample = i * batch_size
                    end_sample = start_sample + batch_size
                    feed_dict = {tensor_x: x[start_sample:end_sample],
                                 tensor_y: y[start_sample:end_sample]}
                    #only the update op is fetched here: fetching w in the same run
                    #does not guarantee whether its value is read before or after the assign
                    session.run(w_update, feed_dict = feed_dict)

                #evaluate cost and read the parameters together, with no update in
                #this run, so the printed m, b and mse belong to the same state
                mse, csummary, weights = session.run([cost, cost_summary, w], feed_dict = feed_dict_model)
                writer.add_summary(csummary, epoch + 1)

                if (epoch + 1) % kprint == 0:
                    m, b = weights[0, 0], weights[1, 0]
                    print("Epoch {} parameters: m={} b={} mse={}".format(epoch + 1, m, b, mse))

            #report the trained parameters as they are; no extra update step is applied
            mse, weights = session.run([cost, w], feed_dict = feed_dict_model)
            m, b = weights[0, 0], weights[1, 0]
            print("Final model parameters: m={} b={} mse={}".format(m,b,mse))
        finally:
            writer.close()
            

### Graph definition

<img src="imgs/graph_definition.png">

### Experiments

In [None]:
#Experiment 1. Baseline run: the MSE is expected to go down. It serves as the starting point for tuning the next experiments. Small lr and few epochs
trainModel(ds_train[regressor].values, ds_train[response].values, epochs = 100, batch_size = 250, lr = 0.01, kprint = 10 )

Good starting point: the MSE is going down at a good pace, but too few epochs were run.

In [None]:
#Experiment 2. The MSE decayed really quickly, so only the number of epochs is increased (1000) to keep lowering it
trainModel(ds_train[regressor].values, ds_train[response].values, epochs = 1000, batch_size = 250, lr = 0.01, kprint = 100)

As expected, the initial hyperparameters work fine; increasing the number of epochs shows good progress.

In [None]:
#Experiment 3. A larger batch size may give better performance, so batch_size is increased to 500
trainModel(ds_train[regressor].values, ds_train[response].values, epochs = 1000, batch_size = 500, lr = 0.01, kprint = 100)

Too large a batch size does not work well, since the MSE goes up again.

In [None]:
#Experiment 4. The larger batch size didn't work out, so batch_size is lowered to 200 to look for better hyperparameters; better performance is expected
trainModel(ds_train[regressor].values, ds_train[response].values, epochs = 1000, batch_size = 200, lr = 0.01, kprint = 100)

Lowering the batch size a bit more improves the result without consuming too much additional time.

In [None]:
#Experiment 5. Lower the batch size even further (50) to check the time-performance trade-off; poor performance is expected
trainModel(ds_train[regressor].values, ds_train[response].values, epochs = 1000, batch_size = 50, lr = 0.01, kprint = 100)

Too small a batch size converges faster than the others, leaving no room for further improvement.

In [None]:
#Experiment 6. Back to the best-performing settings (experiment 4) with a lower learning rate (0.001); this is expected to improve the model
trainModel(ds_train[regressor].values, ds_train[response].values, epochs = 1000, batch_size = 200, lr = 0.001, kprint = 100)

Too small a learning rate makes optimizing the cost function time-consuming.

In [None]:
#Experiment 7. With the lower learning rate the cost decayed too slowly; use more epochs (2000) to check whether it keeps going down
trainModel(ds_train[regressor].values, ds_train[response].values, epochs = 2000, batch_size = 200, lr = 0.001, kprint = 100)

Definitely going down, but taking too much time in comparison with other models

In [None]:
#Experiment 8. The lower learning rate is too slow; return to the best parameters (experiment 4) but raise the learning rate to 0.1, expecting better performance
trainModel(ds_train[regressor].values, ds_train[response].values, epochs = 1000, batch_size = 200, lr = 0.1, kprint = 100)

Too high a learning rate crashes the model; this is probably related to divergence.

In [None]:
#Experiment 9. Slightly increase the mini-batch size (300) to try to improve on the performance of experiment 4
trainModel(ds_train[regressor].values, ds_train[response].values, epochs = 1000, batch_size = 300, lr = 0.01, kprint = 100)

The batch size should not be increased any further; its value seems optimal for the model.

In [None]:
#Experiment 10. With the mini-batch size fixed, increase the epochs (2000) to tune the best model and check whether it still improves; only a small improvement is expected
trainModel(ds_train[regressor].values, ds_train[response].values, epochs = 2000, batch_size = 200, lr = 0.01, kprint = 200)

Convergence was reached at around 1000 epochs, so more epochs are not needed.

### Models results

<img src="imgs/ModelsOutput.png">

### Conclusions

- Hyperparameter tuning makes a big difference in model performance. As shown in the graph, **experiment number 4** (epochs = 1000, batch_size = 200, lr = 0.01) was the best in the time-performance evaluation.
- The learning rate of experiment 4 allowed a fast optimization of the cost function (MSE): it was large enough not to take too much time and small enough to avoid divergence.
- More epochs do help the optimization, but we should check for the point where convergence starts to avoid training longer than needed.
- The mini-batch size has a sweet spot between small and large: too large a batch size is not so useful, and a smaller one makes training take longer.