In [None]:
#!pip install torchviz

In [None]:
import numpy as np
import torch
from torch import nn
import torch.functional as F
from torchviz import make_dot
import hiddenlayer as hl

import pandas as pd

import matplotlib
import matplotlib.pyplot as plt


# Introduction to Neural Networks.
 <a target="_blank" href="https://colab.research.google.com/github/ChemAI-Lab/Math4Chem/blob/main/website/Lecture_Notes/Coding/Introduction_to_neural_networks.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

## **Abstract**

1. **Introduction to Activation Functions**: Activation functions in neural networks, such as sigmoid, ReLU, and softmax, apply non-linear transformations to inputs, enabling the network to capture complex data patterns and behaviors.

2. **Introduction to Non-Linear Layers**: Non-linear layers in neural networks transform linear inputs into non-linear outputs using activation functions, enabling the network to handle complex, non-linear relationships in data.


>### **References: Essential Resources for Further Learning**
>
>- **PyTorch**: [Official Documentation](https://pytorch.org/docs/stable/index.html)
>- **Activation Functions in Neural Networks**: [Online Course](https://towardsdatascience.com/activation-functions-neural-networks-1cbd9f8d91d6)
>- **Activation Functions in Neural Networks [12 Types & Use Cases]**: [Blog](https://www.v7labs.com/blog/neural-networks-activation-functions)



## Beyond Linear Models

In extending the linear model to a **non-linear model**, the function $f(\mathbf{x})$ becomes dependent on both external and internal parameters. This can be dissected as follows:

- **Non-Linear Model Representation**:
  - The model is now expressed as $f(\mathbf{x}) = \mathbf{w}^\top \phi(\mathbf{x},\mathbf{w}') = \sum_i w_i \phi_i(\mathbf{x},\mathbf{w}')$.
  - In this representation, $\phi(\mathbf{x},\mathbf{w}')$ indicates a non-linear map to a new **feature representation**, which also depends on internal parameters $\mathbf{w}'$.

- **Internal Parameter Optimization**:
  - The model includes optimization of the non-linear parameters $\mathbf{w}'$.
  - This introduces an additional layer of complexity compared to the linear model, as $\mathbf{w}'$ needs to be adjusted along with $\mathbf{w}$.

Let's assume $\phi(\mathbf{x},\mathbf{w}')$ is another linear model,\
$\phi(\mathbf{x},\mathbf{w}') = \mathbf{z} = [z_0,z_1,\cdots,z_\ell]$, where $\ell$ is the "new" number of features.




In [None]:
# Function
def f(x):
    """Evaluate the 1-D target function that the networks are trained to fit.

    Computes ``-(1.4 - 3.0 * x) * sin(18.0 * x)`` element-wise.

    Args:
        x: Tensor of input locations.

    Returns:
        Tensor of function values with the same shape as ``x``.
    """
    envelope = 1.4 - 3.0 * x
    oscillation = torch.sin(18.0 * x)
    return -envelope * oscillation

def get_data(n_batch=200):
    """Draw a random training batch for the target function ``f``.

    Args:
        n_batch: Number of points to sample.

    Returns:
        Tuple ``(X, y)``: ``X`` has shape ``(n_batch, 1)`` and is sampled
        uniformly between -0.01 and 1; ``y`` is ``f(X)``.
    """
    sampler = torch.distributions.uniform.Uniform(-0.01, 1.)
    inputs = sampler.sample([n_batch, 1])
    targets = f(inputs)
    return inputs, targets

In [None]:
# Model definition: three stacked linear layers (1 -> 100 -> 75 -> 1) with no
# activation in between.

model = nn.Sequential(
    *(nn.Linear(n_in, n_out) for n_in, n_out in ((1, 100), (100, 75), (75, 1)))
)
# Show the shape of every trainable tensor (a weight and a bias per layer).
for parameter in model.parameters():
    print(parameter.shape)


In [None]:
# Build the three layers by hand (1 -> 100 -> 75 -> 1) so a single input can
# be followed through the network one layer at a time.
layer_1, layer_2, layer_3 = nn.Linear(1, 100), nn.Linear(100, 75), nn.Linear(75, 1)

x = torch.randn(1, 1)  # one sample with one feature
z1 = layer_1(x)
z2 = layer_2(z1)
z3 = layer_3(z2)

# Report the shape of the output of each layer.
print(
    *(f'layer {depth} {hidden.shape}'
      for depth, hidden in enumerate((z1, z2, z3), start=1)),
    sep='\n',
)

In [None]:
# Run one forward pass, then draw its autograd graph with torchviz and write
# it to disk as a PNG named after "mlp".
y = model(x)
make_dot(
    y,
    params={name: tensor for name, tensor in model.named_parameters()},
).render("mlp", format="png")

In [None]:
  # Training function
def training(model, training_iter=500, n_batch=50, lr=0.05, feedback_interval=100):
    """Fit ``model`` to fresh batches from ``get_data`` using Adam and an MSE loss.

    Args:
        model: Network whose parameters are optimised in place.
        training_iter: Number of gradient steps to take.
        n_batch: Number of points drawn for every step.
        lr: Learning rate handed to the Adam optimizer.
        feedback_interval: The loss is printed every this many iterations.

    Returns:
        The same ``model`` object that was passed in, after training.
    """
    model.train()
    criterion = nn.MSELoss()
    adam = torch.optim.Adam(model.parameters(), lr=lr)

    for itr in range(1, training_iter + 1):
        # Clear gradients from the previous step before this step's backward pass.
        adam.zero_grad()

        inputs, targets = get_data(n_batch)
        loss_val = criterion(model(inputs), targets)
        loss_val.backward()
        adam.step()  # gradient step

        # Only report on multiples of feedback_interval.
        if itr % feedback_interval:
            continue
        print(f'Iteration = {itr}, Loss = {loss_val.item():.4f}')

    return model

In [None]:
def plot_model_performance(model, n_samples=75, n_grid_points=5000):
    """Plot a random batch, the true ``f(x)`` and the model prediction together.

    Args:
        model: Callable network evaluated on the grid points.
        n_samples: Number of points in the scattered batch from ``get_data``.
        n_grid_points: Number of evenly spaced points on [0, 1] used for the curves.
    """
    def as_numpy(tensor):
        # Detach from autograd before handing the values to matplotlib.
        return tensor.detach().numpy()

    plt.clf()  # start from an empty figure
    batch_x, batch_y = get_data(n_samples)
    grid = torch.linspace(0., 1., n_grid_points).unsqueeze(1)
    grid_np = as_numpy(grid)

    plt.scatter(as_numpy(batch_x), as_numpy(batch_y), label='Batch')
    plt.plot(grid_np, as_numpy(f(grid)), ls='--', c='k', label=r'$f(x)$')
    plt.plot(grid_np, as_numpy(model(grid)), c='red', label=r'$NN(x)$')

    plt.xlabel(r'$x$', fontsize=12)
    plt.ylabel(r'$f(x)$', fontsize=12)
    plt.legend()

In [None]:
# training() optimises the network in place and returns that same object,
# so `model_trained` and `model` refer to one network.
model_trained = training(model)
# Compare the fitted network with f(x) on a grid, together with a random batch.
plot_model_performance(model_trained)

### **Non-Linear Layers**

Consider the composition of two linear models:

$$
f(x,\{\mathbf{W}\}_{\ell=1}^{2}) = \mathbf{W}_2^\top (\mathbf{W}^\top_1\mathbf{x}) = \mathbf{W}^\top_2 \mathbf{z}
$$

<br>

In this expression, $\mathbf{W}_1$ and $\mathbf{W}_2$ are the weight matrices of the two linear models. The function $f(x,\{\mathbf{W}\}_{\ell=1}^{2})$ can be rewritten to highlight the composition of these models:

$$
f(x,\{\mathbf{W}\}_{\ell=1}^{2}) = \mathbf{W}_2^\top \phi(\mathbf{W}_1, \mathbf{x})
$$

<br>

Where the function $\phi(\mathbf{W}_1, \mathbf{x})$ is defined as the product of the input $\mathbf{x}$ and the transpose of the first weight matrix $\mathbf{W}_1$:



$$
\phi(\mathbf{W}_1, \mathbf{x}) = \mathbf{W}^\top_1\mathbf{x}
$$


## **Introducing Activation functions**

Let's revisit the structure of Neural Networks to understand activation functions:

<!DOCTYPE html>
<html>
<head>
    <style>
        .centered-image {
            display: block;
            margin-left: auto;
            margin-right: auto;
            width: 50%;
        }
    </style>
</head>
<body>

<a href="https://www.geeksforgeeks.org/activation-functions/" target="_blank">
    <img src="https://media.geeksforgeeks.org/wp-content/cdn-uploads/33-1-1.png"
         alt="Activation functions in a neural network"
         class="centered-image">
</a>

<br>
<figcaption align = "center"><b>Figure 1 - Activation Functions. Figure by
Vineet Joshi.</b></figcaption>

</body>
</html>

<br>

An **activation function** (also known as a "transfer function") in a neural network defines how the weighted sum of the input (**Wnj**) is transformed into an output from a node or nodes in a layer of the network.

<br>

<!DOCTYPE html>
<html>
<head>
    <style>
        .centered-image {
            display: block;
            margin-left: auto;
            margin-right: auto;
            width: 50%;
        }
    </style>
</head>
<body>

<a href="https://prabhakar-rangarao.medium.com/activation-functions-9020acfa80b6" target="_blank">
    <img src="https://miro.medium.com/v2/resize:fit:1358/1*uUzr1DsZm5P6IqRXIlpfhQ.gif"
         alt="Activation Functions Animated"
         class="centered-image">
</a>

<br>
<figcaption align = "center"><b>Figure 2 - Activation Functions Animated. Figure by
Prabhakar Rangarao.</b></figcaption>

</body>
</html>

<br>

Typically, a differentiable nonlinear activation function is used in the hidden layers of a neural network. This allows the model to learn more complex functions than a network trained using a linear activation function.

<br>

* Hyperbolic tangent
$$
\tanh(x) = \frac{\exp(x)-\exp(-x)}{\exp(x)+\exp(-x)}
$$

* Sigmoid
$$
\text{sigmoid}(x) = \frac{1}{1+ \exp(-x)}
$$

* ReLU
$$
\text{ReLU}(x) = \max(0,x)
$$

* Leaky ReLU
$$
\text{LeakyReLU}(x) = \max(0,x) + \beta*\min(0,x)
$$

* SiLU
$$
\text{SiLU}(x) = x * \sigma(x)\\
\sigma(x) = \frac{1}{1+\exp(-x)}
$$


In [None]:
# Evaluation grid shared by all activation functions.
x = torch.linspace(-5, 5, 1000)

# Instantiate each activation module (LeakyReLU uses a negative slope of 0.1).
act_tanh, act_sigmoid, act_relu, act_lrelu, act_silu = (
    nn.Tanh(),
    nn.Sigmoid(),
    nn.ReLU(),
    nn.LeakyReLU(0.1),
    nn.SiLU(),
)

# Apply every activation to the same grid.
y_tanh, y_sigmoid, y_relu, y_lrelu, y_silu = (
    activation(x)
    for activation in (act_tanh, act_sigmoid, act_relu, act_lrelu, act_silu)
)

In [None]:
xnp = x.detach().numpy()
# Overlay every activation curve on the same axes, one legend entry each.
for name, curve in (
    ('Tanh', y_tanh),
    ('Sigmoid', y_sigmoid),
    ('ReLU', y_relu),
    ('Leaky ReLU', y_lrelu),
    ('SiLU', y_silu),
):
    plt.plot(xnp, curve.detach().numpy(), label=name)
plt.xlabel('x', fontsize=15)
plt.ylabel('activation function', fontsize=15)
plt.legend()

## **In Class Activity - Discuss one of the Activation Functions**

**Diagram**\
<img src="https://raw.github.com/RodrigoAVargasHdz/CHEM-4PB3/master/Course_Notes/Figures/MLP_diagram.png"  width="400" height="300">

Choose one of the activation functions discussed and use it in your linear model. Which function works the best?

In [None]:
# Code here
# Define a model: one hidden layer of 100 units with a SiLU non-linearity
# between the two linear maps.
model = nn.Sequential(
    nn.Linear(1, 100),
    nn.SiLU(),
    nn.Linear(100, 1),
)

# Train for 10000 iterations; the other settings use training()'s defaults.
model = training(model, training_iter=10000)

# Sanity check: inputs, targets and predictions should all have shape (25, 1).
X, y = get_data(25)
print(X.shape, y.shape, model(X).shape)

# Reuse the shared plotting helper instead of repeating its body here.
# It draws its own 25-point batch, the true f(x) and the network prediction.
plot_model_performance(model, n_samples=25)


Work in small groups and discuss the following.
1. How many layers do we need?
2. What is the *best* activation function?


# Extra

Go to the following [link](https://playground.tensorflow.org/#activation=tanh&batchSize=10&dataset=circle&regDataset=reg-plane&learningRate=0.03&regularizationRate=0&noise=0&networkShape=4,2&seed=0.03345&showTestData=false&discretize=false&percTrainData=50&x=true&y=true&xTimesY=false&xSquared=false&ySquared=false&cosX=false&sinX=false&cosY=false&sinY=false&collectStats=false&problem=classification&initZero=false&hideText=false) and try to solve all the different tasks!