In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from keras import backend as K
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

def generate_report(y_actual, y_pred):
    """Print and return regression metrics for a set of predictions.

    Parameters
    ----------
    y_actual : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values for the same samples, in the same order. May be 1-D
        or a single-column 2-D array.

    Returns
    -------
    tuple
        ``(mse, rmse, r2, error)``. The first three are rounded to three
        decimals; ``error`` is the unrounded mean of ``y_actual - y_pred``.
    """
    raw_mse = mean_squared_error(y_actual, y_pred)
    mse = round(raw_mse, 3)
    rmse = round(sqrt(raw_mse), 3)
    r2 = round(r2_score(y_actual, y_pred), 3)
    # Compare against the targets that were passed in (not a notebook global).
    # Both sides are flattened so that (n,) targets and (n, 1) predictions are
    # subtracted element-wise instead of being broadcast to an (n, n) matrix.
    residuals = np.asarray(y_actual, dtype=float).ravel() - np.asarray(y_pred, dtype=float).ravel()
    error = float(np.mean(residuals))
    print('mse', mse)
    print('RMSE', rmse)
    print('R2', r2)
    print('error', error)
    return mse, rmse, r2, error

def generate_loss_plot(history, filename=None):
    """Plot the training and validation loss recorded during fitting.

    Parameters
    ----------
    history : object
        Fit result exposing a ``history`` mapping with ``'loss'`` and
        ``'val_loss'`` entries.
    filename : str or None, optional
        When given, the figure is saved to this path before it is shown.
    """
    # Draw on a dedicated figure so leftover pyplot state cannot leak into it.
    fig, ax = plt.subplots()
    ax.plot(history.history['loss'])
    ax.plot(history.history['val_loss'])
    ax.set_title('loss curve')
    ax.set_ylabel('loss')
    ax.set_xlabel('epoch')
    ax.legend(['train', 'test'], loc='upper left')
    if filename is not None:
        fig.savefig(filename)
    plt.show()

def generate_hist_plot(y_actual, y_pred, filename=None, value_range=(-75000, 75000)):
    """Plot a histogram of the residuals ``y_actual - y_pred``.

    Parameters
    ----------
    y_actual : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values for the same samples, in the same order. May be 1-D
        or a single-column 2-D array.
    filename : str or None, optional
        When given, the figure is saved to this path before it is shown.
    value_range : tuple of (float, float), optional
        Lower and upper residual bounds of the histogram; residuals outside
        this interval are not drawn.
    """
    # Flatten both inputs so the subtraction is positional and element-wise,
    # whatever mix of (n,) and (n, 1) shapes is passed in.
    residuals = np.asarray(y_actual).ravel() - np.asarray(y_pred).ravel()
    fig, ax = plt.subplots()
    ax.hist(residuals, bins='auto', range=value_range)
    ax.set_xlabel('actual - predicted')
    ax.set_ylabel('count')
    if filename is not None:
        fig.savefig(filename)
    plt.show()


def get_data(path='pluto5_stddum.csv', test_size=0.2, random_state=0):
    """Load the dataset and split it into train and test sets.

    The ``assessland`` column is discarded, ``assesstot`` is the target and
    every remaining column is a predictor.

    Parameters
    ----------
    path : str, optional
        CSV file to read.
    test_size : float, optional
        Value forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, optional
        Value forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test, predictors)`` where ``predictors``
        holds the predictor column labels.
    """
    df = pd.read_csv(path)
    # Build the feature frame with a non-mutating drop, so neither the loaded
    # frame nor a derived view is modified in place.
    features = df.drop(columns=['assessland', 'assesstot'])
    predictors = features.columns
    X = features.values
    Y = df['assesstot'].values
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test, predictors

#3) Adam combines the good properties of Adadelta and RMSprop and hence tends to do better for most problems.
def fit_model(model, x_train, x_test, y_train, y_test, optimizer, epochs):
    """Compile ``model`` with MSE loss, train it and plot the loss curves.

    The test split is handed to ``fit`` as validation data, and training runs
    with ``verbose=0``. The fitted ``model`` is returned.
    """
    model.compile(optimizer=optimizer, loss='mse', metrics=['mse'])
    held_out = (x_test, y_test)
    training_log = model.fit(
        x_train,
        y_train,
        epochs=epochs,
        verbose=0,
        validation_data=held_out,
    )
    generate_loss_plot(training_log, filename=None)
    return model

def plot_comparation(y_test, y_test_pred, filename=None):
    """Overlay actual and predicted values on a single line plot.

    Parameters
    ----------
    y_test : array-like
        Actual target values, drawn in blue.
    y_test_pred : array-like
        Predicted values, drawn in red.
    filename : str or None, optional
        When given, the figure is saved to this path before it is shown.
    """
    fig, ax = plt.subplots()
    ax.plot(y_test, color='blue')
    ax.plot(y_test_pred, color='red')
    ax.legend(['Real', 'Predicted'])
    if filename is not None:
        fig.savefig(filename)
    plt.show()

def predict(model, x_train, y_train, x_test, y_test, filename=None):
    """Score ``model`` on both splits, printing metrics and drawing histograms.

    Predictions are made for the training and test inputs, a metric report is
    printed for each split, and a residual histogram is drawn for each split.

    ``filename`` is accepted but not used by this function.

    Returns ``(y_train_pred, y_test_pred, mse, rmse, r2, error)``; the four
    metrics are those reported for the test split.
    """
    train_pred = model.predict(x_train)
    test_pred = model.predict(x_test)

    # Metric reports: training first, then test (whose values are returned).
    print('ERROR Training')
    generate_report(y_train, train_pred)
    print('ERROR Test')
    test_metrics = generate_report(y_test, test_pred)
    mse, rmse, r2, error = test_metrics

    # Residual histograms, in the same order.
    print('Histogram Training')
    generate_hist_plot(y_train, train_pred)
    print('Histogram Test')
    generate_hist_plot(y_test, test_pred)

    return train_pred, test_pred, mse, rmse, r2, error
    
def run_model(hidden_nodes, x_train, x_test, y_train, y_test, optimizer, epochs):
    """Build, train and evaluate a network with one ReLU hidden layer.

    The network is: a linear Dense layer with one unit per input feature, a
    ReLU Dense layer with ``hidden_nodes`` units, and a single linear output.

    Parameters
    ----------
    hidden_nodes : int
        Number of units in the ReLU hidden layer.
    x_train, x_test : 2-D array-like
        Feature matrices; the second dimension is the number of features.
    y_train, y_test : array-like
        Target values for the corresponding split.
    optimizer : str or optimizer object
        Passed through to ``fit_model``.
    epochs : int
        Passed through to ``fit_model``.

    Returns
    -------
    tuple
        ``(y_train_pred, y_test_pred, mse, rmse, r2, error)`` as returned by
        ``predict``.
    """
    # Size the first layer from the data itself rather than from a notebook
    # global, so the function works regardless of cell execution order.
    n_inputs = np.shape(x_train)[1]
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(n_inputs, activation=tf.keras.activations.linear))
    model.add(tf.keras.layers.Dense(hidden_nodes, activation=tf.keras.activations.relu))
    model.add(tf.keras.layers.Dense(1, activation=tf.keras.activations.linear))
    model = fit_model(model, x_train, x_test, y_train, y_test, optimizer, epochs)
    y_train_pred, y_test_pred, mse, rmse, r2, error = predict(
        model, x_train, y_train, x_test, y_test, filename=None
    )
    plot_comparation(y_test, y_test_pred, filename=None)
    return y_train_pred, y_test_pred, mse, rmse, r2, error



In [None]:
# Load the train/test split once; the model cells below reuse these globals.
x_train, x_test, y_train, y_test, predictors = get_data()
# One unit per predictor column; also feeds the hidden-layer sizing of Model 0.
input_nodes = predictors.size
# Number of training epochs shared by every experiment.
epochs = 20

In [None]:
# Choosing the number of nodes in the hidden layers:
# http://www.faqs.org/faqs/ai-faq/neural-nets/part3/section-10.html
# NN0: 1 hidden layer with (number of inputs + outputs) * (2/3) nodes: overfitting.
# It does not predict high values well, but that might be because of the sample.
print('Model 0')
y_train_pred, y_test_pred, mse, rmse, r2, error = run_model(
    hidden_nodes=int((input_nodes + 1) * (2 / 3)),
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
    optimizer='adam',
    epochs=epochs,
)

In [None]:
# NN1: a typical recommendation is that the number of weights should be no more
# than 1/30 of the number of training cases: underfitting
print('Model 1')
# run_model returns six values; unpack all of them (unpacking only two raises ValueError).
y_train_pred, y_test_pred, mse, rmse, r2, error = run_model(int(len(x_train)/(30*2)), x_train, x_test, y_train, y_test, 'adam', epochs)

In [None]:
# NN2: reduce the number of nodes in the hidden layer: underfitting
print('Model 2')
# run_model returns six values; unpack all of them (unpacking only two raises ValueError).
y_train_pred, y_test_pred, mse, rmse, r2, error = run_model(int(len(x_train)/(30*4)), x_train, x_test, y_train, y_test, 'adam', epochs)

In [None]:
# NN3: reduce the number of nodes in the hidden layer: underfitting
print('Model 3')
# run_model returns six values; unpack all of them (unpacking only two raises ValueError).
y_train_pred, y_test_pred, mse, rmse, r2, error = run_model(int(len(x_train)/(30*6)), x_train, x_test, y_train, y_test, 'adam', epochs)

In [None]:
# NN4: reduce the number of nodes in the hidden layer: underfitting
print('Model 4')
# run_model returns six values; unpack all of them (unpacking only two raises ValueError).
y_train_pred, y_test_pred, mse, rmse, r2, error = run_model(int(len(x_train)/(30*8)), x_train, x_test, y_train, y_test, 'adam', epochs)

In [None]:
# NN5: without hidden layer (both layers use a linear activation)
print('Model 5')
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(input_nodes, tf.keras.activations.linear))
model.add(tf.keras.layers.Dense(1, tf.keras.activations.linear))
# fit_model compiles with MSE loss, trains and plots the loss curve, so no
# separate compile/fit/plot is needed here (repeating it would train twice).
# The optimizer must be passed explicitly: fit_model has no default for it.
model = fit_model(model, x_train, x_test, y_train, y_test, 'adam', epochs)
# Capture the results instead of leaving a bare call whose returned tuple
# would be displayed as the cell output.
y_train_pred, y_test_pred, mse, rmse, r2, error = predict(model, x_train, y_train, x_test, y_test, filename=None)

In [None]:
#ACTIVATION FUNCTION
#What is it? It converts the input signal of a node in an ANN into an output signal.
#It decides whether a neuron should be activated or not by calculating the weighted sum of its inputs and adding a bias to it.
#The purpose of the activation function is to introduce non-linearity into the output of a neuron.

#https://medium.com/the-theory-of-everything/understanding-activation-functions-in-neural-networks-9491262884e0
#Sigmoid -> good for classifier 
#Tanh -> scaled sigmoid 
#Relu -> outputs the input itself if it is greater than 0, otherwise 0 (max(0, x)); makes the activations sparse and efficient. Good when you don’t know the nature of the function you are trying to learn.
#But its limitation is that it should only be used within Hidden layers of a Neural Network Model.
#softmax -> classifier with multiple classes

#For a prediction (regression) problem, simply use a linear function for the output layer; for classification, use sigmoid.
#The basic rule of thumb is if you really don’t know what activation function to use, then simply use RELU

In [None]:
#OPTIMIZERS
#https://medium.com/datadriveninvestor/overview-of-different-optimizers-for-neural-networks-e0ed119440c3
#https://www.dlology.com/blog/quick-notes-on-how-to-choose-optimizer-in-keras/

#RMSprop: Root Mean Square Propagation. 
#It utilizes the magnitude of the recent gradient descents to normalize the gradient.
#This optimizer is usually a good choice for recurrent neural networks.
#learning rate gets adjusted automatically and it chooses a different learning rate for each parameter.
#RMSProp divides the learning rate by the square root of an exponentially decaying average of squared gradients.

#Adam: Adam can be viewed as a combination of Adagrad, which works well on sparse gradients and 
#RMSprop which works well in online and nonstationary settings.

#Stochastic gradient descent(SGD): for shallow networks.

#Adagrad: performs larger updates for infrequent parameters and smaller updates for frequent parameters.

#AdaDelta: Adadelta is an extension of Adagrad that tries to reduce Adagrad’s aggressive, monotonically decreasing learning rate.
