# Tensorflow examples

## Deep Neural Network for regression
*I will be using plotly for its interactive capabilities and its easy integration into plotly-dash dashboards*

In [7]:
import tensorflow as tf
import numpy as np
import pandas as pd
import plotly.express as px

### Dataset inspection

Let's use the sample datasets we get in colab automatically when we open the session

In [8]:
import os

# Show the current working directory of the session
os.getcwd()

'/content'

In [9]:
# List the entries of the current working directory
os.listdir()

['.config', 'sample_data']

In [29]:
# Loading the dataset - in this case the Anscombe quartet dataset

import pathlib

# The sample datasets live in ./sample_data, relative to the working directory
dir_path = pathlib.Path('./').absolute() / 'sample_data'
# Raise explicitly instead of `assert cond, print(...)`: print() returns None,
# so the AssertionError carried no message, and asserts are stripped under -O
if not dir_path.is_dir():
  raise NotADirectoryError(f"The provided path to a directory isn't an actual directory: {dir_path}")

#filename = 'california_housing_train.csv'
filename = 'anscombe.json'

file_path = dir_path / filename
if not file_path.is_file():
  raise FileNotFoundError(f"The provided path to a file doesn't exist - no file found: {file_path}")


df = pd.read_json(file_path)

# Preview the first 10 rows
df.head(10)

Unnamed: 0,Series,X,Y
0,I,10,8.04
1,I,8,6.95
2,I,13,7.58
3,I,9,8.81
4,I,11,8.33
5,I,14,9.96
6,I,6,7.24
7,I,4,4.26
8,I,12,10.84
9,I,7,4.81


In [30]:
# Not a huge dataset - shape is reported as (rows, columns)
df.shape

(44, 3)

In [32]:
# Distinct labels found in the 'Series' column
df['Series'].unique()

array(['I', 'II', 'III', 'IV'], dtype=object)

So, we have 4 series of data from Anscombe's quartet - 4 wildly different distributions with similar statistics

In [58]:
# Summary statistics of Y, computed separately for each series
df.groupby('Series')['Y'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Series,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
I,11.0,7.5,2.03289,4.26,6.315,7.58,8.57,10.84
II,11.0,7.500909,2.031657,3.1,6.695,8.14,8.95,9.26
III,11.0,7.5,2.030424,5.39,6.25,7.11,7.98,12.74
IV,11.0,7.500909,2.030579,5.25,6.17,7.04,8.19,12.5


In the descriptive statistics above we see that some of the usual statistical descriptors are deceiving - mean and std are almost equal in all 4 sets. However, the median tells a different story (although the values are still pretty close)

Let's do some plotting - **graphical analytical inspection**

In [60]:
# Scatter plot of all four series, told apart by colour and marker symbol,
# with a box plot of Y on the margin
fig = px.scatter(
  df,
  x = 'X',
  y = 'Y',
  color = 'Series',
  symbol = 'Series',
  marginal_y = 'box',
)
# Enlarge the markers and make them slightly transparent
fig.update_traces(marker_size = 10, opacity = 0.75)

Clearly 4 very different distributions. 

The one we will be using is Series I - a clear linear tendency can be seen in it, so let's see if a single NN model can infer the linear regression and predict some new values ...

We will also try the second series, which is a quadratic-like distribution, and series III, which appears to be a linear model with a single outlier point.

In [100]:
# Keep only the rows of Series I and split them into inputs (X) and targets (Y)
data = df.loc[df['Series'] == 'I', ['X', 'Y']].to_numpy()
xs, ys = data[:, 0], data[:, 1]

In [101]:
# One Dense unit fed by a single input feature.
# Sequential is documented as taking a *list* of layers; a bare layer is not
# accepted by every Keras version, so the layer is wrapped in a list.
model = tf.keras.Sequential([
  tf.keras.layers.Dense(1, input_shape = [1]),
])

# Stochastic gradient descent minimising the mean squared error
optimizer = tf.keras.optimizers.SGD()
loss = tf.keras.losses.MeanSquaredError()

In [102]:
# Attach the optimizer and loss created in the previous cell to the model
model.compile(optimizer=optimizer, loss=loss)

In [105]:
# Train on Series I for 5000 epochs (verbose = 0); the returned history is plotted below
fitting_history = model.fit(epochs=5000,x = xs, y = ys, verbose = 0)

In [114]:
# Loss curve, sampling only every 100th epoch
sampled_epochs = fitting_history.epoch[::100]
sampled_loss = fitting_history.history['loss'][::100]
fig = px.scatter(x = sampled_epochs, y = sampled_loss)
fig.show()

In [115]:
# After approx epoch 1250, the loss stagnates
# Let's try to predict some numbers and plot them against the initial ones,
# to see if we are following the same statistics

# 250 evenly spaced inputs between 1 and 20
new_xs = np.linspace(1, 20, 250)
new_ys = model.predict(new_xs)



In [121]:
# Two-column float array: the inputs next to the first output column of the predictions
predicted_data = np.column_stack((new_xs, new_ys[:, 0])).astype(np.float64)

In [122]:
# Wrap the predictions in a DataFrame with named X / Y columns for plotting
df_predicted = pd.DataFrame(data = predicted_data, columns=['X','Y'])

In [126]:
# Raw Series I observations as points ...
fig_raw = px.scatter(df.loc[df['Series'] == 'I'], x = 'X', y = 'Y')
# ... and the model's predictions as a line
fig_prediction = px.line(df_predicted, x = 'X', y = 'Y')

In [127]:
import plotly.graph_objects as go

In [128]:
# Overlay both plots by concatenating the data of the two figures into one Figure
fig_total =  go.Figure(data = fig_prediction.data + fig_raw.data)
fig_total.show()

Alright, so a single neuron with a linear activation can perform linear regression ... which is expected.

Let's see how an outlier will affect the predictions

In [134]:
# Keep only the rows of Series III and split them into inputs (X) and targets (Y)
ds = df.loc[df['Series'] == 'III', ['X', 'Y']].to_numpy()
xs, ys = ds[:, 0], ds[:, 1]

In [138]:
# Fresh single-unit model for Series III.
# Sequential is documented as taking a *list* of layers; a bare layer is not
# accepted by every Keras version, so the layer is wrapped in a list.
model = tf.keras.Sequential([
  tf.keras.layers.Dense(1, input_shape = [1]),
])

# Same loss and optimizer as before, this time selected by their string names
model.compile(loss='mean_squared_error', optimizer = 'sgd')

fitting_history = model.fit(epochs=2000,x = xs, y = ys, verbose = 0)

In [140]:
# Predict over the same input grid and collect inputs and predictions side by side
new_ys = model.predict(new_xs)
predicted_data = np.column_stack((new_xs, new_ys[:, 0])).astype(np.float64)
df_predicted = pd.DataFrame(data = predicted_data, columns = ['X', 'Y'])

# Fitted line plus the raw Series III points, merged into one figure
fig_prediction = px.line(df_predicted, x = 'X', y = 'Y')
fig_raw = px.scatter(df.loc[df['Series'] == 'III'], x = 'X', y = 'Y')

fig_total = go.Figure(data = fig_prediction.data + fig_raw.data)
fig_total.show()



Such a small training dataset and an extremely simple model result in an overrepresented weight for the outlier


Let's do this process in a more elegant way

In [None]:
class Linear_regression_evaluator:

  def __init__(self, epochs = 100, loss_function = 'mean_squared_error', optimizer = 'sgd'):
    self.epochs = 100
    self.loss_function = loss_function
    self.optimizer = optimizer

  def nn_model_creation_compilation(self):
    """Build a single-unit Dense model and compile it.

    The model is stored in `self.model` and compiled with the loss and
    optimizer held in `self.loss_function` and `self.optimizer`.
    """
    # Sequential is documented as taking a *list* of layers; a bare layer is
    # not accepted by every Keras version, so the layer is wrapped in a list
    self.model = tf.keras.Sequential([
        tf.keras.layers.Dense(1, input_shape = [1]),
    ])
    self.model.compile(loss = self.loss_function, optimizer=self.optimizer)

  def model_fit(self, x_data, y_data, verbose = 0, return_h = True):
    print('Fitting model - begun')
    fitting_history = self.model.fit(epochs = self.epochs, x = x_data, y = y_data, verbose = verbose)
    print('Finnished fitting model')
    if return_h:
      return fitting_history

  def plot_loss(self,model_h, spacing = 1):
    """Show a scatter plot of the training loss against the epoch.

    Args:
      model_h: fit history exposing `.epoch` and `.history['loss']`.
      spacing: step used to subsample both sequences (1 keeps every epoch).
    """
    epochs_shown = model_h.epoch[::spacing]
    loss_shown = model_h.history['loss'][::spacing]
    px.scatter(x = epochs_shown, y = loss_shown).show()

  def show_prediction()