In [None]:
# python libraries

import math
from threading import Thread
from typing import Tuple

import torch

In [None]:
# Forward-Forward Class Objects

class FFAdamParams:
    '''
    Bundle of Adam hyper-parameters for the forward-forward algorithm.

    An instance only stores the values it is given; FFLinear reads them back
    when it builds a torch.optim.Adam optimizer for its own parameters.
    '''

    def __init__(self,
                 lr: float = 0.001,
                 betas: Tuple[float, float] = (0.9, 0.999),
                 eps: float = 1e-08,
                 weight_decay: float = 0,
                 amsgrad: bool = False) -> None:
        '''
        Record the Adam settings on the instance, unchanged and unvalidated.

        :param lr: Learning rate (default: 0.001)
        :param betas: Pair of beta coefficients (default: (0.9, 0.999))
        :param eps: Epsilon term (default: 1e-08)
        :param weight_decay: Weight decay (default: 0)
        :param amsgrad: Whether to enable the AMSGrad variant (default: False)
        '''

        # Attributes are assigned left to right, keeping the original creation order
        self.lr, self.betas, self.eps = lr, betas, eps
        self.weight_decay, self.amsgrad = weight_decay, amsgrad


class FFSGDParams:
    '''
    Bundle of SGD hyper-parameters for the forward-forward algorithm.

    An instance only stores the values it is given; FFLinear reads them back
    when it builds a torch.optim.SGD optimizer for its own parameters.
    '''

    def __init__(self,
                 lr: float = 0.001,
                 momentum: float = 0,
                 weight_decay: float = 0,
                 dampening: float = 0,
                 nesterov: bool = False) -> None:
        '''
        Record the SGD settings on the instance, unchanged and unvalidated.

        :param lr: Learning rate (default: 0.001)
        :param momentum: Momentum factor (default: 0)
        :param weight_decay: Weight decay (default: 0)
        :param dampening: Dampening applied to the momentum (default: 0)
        :param nesterov: Whether to enable Nesterov momentum (default: False)
        '''

        # Attributes are assigned left to right, keeping the original creation order
        self.lr, self.momentum, self.weight_decay = lr, momentum, weight_decay
        self.dampening, self.nesterov = dampening, nesterov


class FFLinear(torch.nn.Linear):
    '''
    Linear layer trained with Geoffrey Hinton's Forward-Forward algorithm. The layer greedily learns its own
    goodness function (the mean of its squared activities) with a private optimizer: goodness is pushed above
    `thresh` for the positive signal and below `thresh` for the negative signal through a pair of softplus
    (log(1 + exp(x))) terms. Signals are scaled to (almost) unit length along dim 1 before they enter the
    layer, so only the direction of the incoming activity vector is used.
    '''

    # The annotation of `optimizer_params` is a string so that it is a valid type expression and is not
    # evaluated when the class body is executed.
    def __init__(self,
                 in_features: int,
                 out_features: int,
                 optimizer_params: 'FFAdamParams | FFSGDParams | None' = None,
                 num_epochs: int = 100,
                 thresh: float = 2.0,
                 active_func=torch.nn.ReLU(),
                 bias: bool = True,
                 device=None,
                 dtype=None):
        """
        Initialize FF Linear layer by providing tuning parameters, in and out features, number of training epochs,
        and settings necessary for a normal linear layer

        @param in_features: Total number of in features to train Linear Layer (dtype: int)
        @param out_features: Total number of hidden out units for linear layer (dtype: int)
        @param optimizer_params: Optimizer parameters to create optimizer (dtype: [FFAdamParams, FFSGDParams, None], default: None)
        @param num_epochs: Total number of epochs for training (dtype: int, default: 100)
        @param thresh: FF layer threshold for layer goodness (dtype: float, default: 2.0)
        @param active_func: Layer Activation Function (dtype: callable, default: torch.nn.ReLU)
        @param bias: Add or refrain from using bias weight (dtype: bool, default: True)
        @param device: Pytorch device type whether cpu or gpu (default: None)
        @param dtype: Data type of data (default: None)
        @raise ValueError: If optimizer_params is neither FFAdamParams nor FFSGDParams (this includes the
                           default None, because optimizer-free learning is not implemented)
        """

        # Set device to cpu if is None
        if device is None:
            device = torch.device("cpu")

        # Make torch.nn.Linear as parent class to FFLinear class
        super(FFLinear, self).__init__(in_features, out_features, bias, device, dtype)

        # initialize necessary parameters for layer
        self.active_func = active_func
        self.num_epochs = num_epochs
        self.thresh = thresh
        # Handle of the most recently started training thread (None until train() is given signals)
        self.thread = None

        if isinstance(optimizer_params, FFAdamParams):
            # Create Adam optimizer if adam parameters are provided
            self.optimizer = torch.optim.Adam(self.parameters(),
                                              lr=optimizer_params.lr,
                                              betas=optimizer_params.betas,
                                              eps=optimizer_params.eps,
                                              weight_decay=optimizer_params.weight_decay,
                                              amsgrad=optimizer_params.amsgrad)
        elif isinstance(optimizer_params, FFSGDParams):
            # Create SGD optimizer if sgd parameters are provided
            self.optimizer = torch.optim.SGD(self.parameters(),
                                             lr=optimizer_params.lr,
                                             momentum=optimizer_params.momentum,
                                             weight_decay=optimizer_params.weight_decay,
                                             dampening=optimizer_params.dampening,
                                             nesterov=optimizer_params.nesterov)
        else:
            # Learning without an optimizer (online layer learning) is not implemented
            raise ValueError("Fast Learning isn't programmed yet")

    @staticmethod
    def _unit_length(signal):
        '''
        Return a copy of the signal with every row scaled to (almost) unit L2 length along dim 1

        @param signal: Tensor with at least two dimensions
        @return: New tensor; the argument itself is not modified
        '''

        # Out-of-place division so the caller's tensor is never overwritten; 1e-4 guards against zero rows
        return signal / (signal.norm(2, 1, keepdim=True) + 1e-4)

    def forward(self, signal, norm=True):
        '''
        Apply layer neuron activities on signal for hidden activities

        @param signal: Input signal that is either positive or negative
        @param norm: Perform normalization to signal before Neuron Activities (dtype: bool, Default: True)
        @return: Hidden neuron activities
        '''

        # Normalize signal if norm parameters is True (Needed so that weights don't explode)
        if norm:
            signal = self._unit_length(signal)

        # functional.linear computes signal @ weight.T (+ bias) and accepts bias=None, so no branch on the
        # bias is needed (taking the truth value of a multi-element bias tensor raises a RuntimeError)
        return self.active_func(torch.nn.functional.linear(signal, self.weight, self.bias))

    def train(self, d_pos=True, d_neg=None):
        '''
        Trains the goodness function concurrently with neuron activities passed to next layer

        This method shadows torch.nn.Module.train, which torch itself calls with a single boolean (for
        example from eval() or from a parent module's train()). When d_neg is omitted the call is therefore
        forwarded to torch.nn.Module.train with d_pos as the mode flag.

        The goodness training runs on a background thread stored in self.thread; this method does not wait
        for it. The returned activities are computed while that thread may already be updating the weights,
        so join self.thread before relying on the trained parameters.

        @param d_pos: Positive Signal necessary for training (or the boolean mode flag when d_neg is omitted)
        @param d_neg: Negative Signal necessary for training
        @return: Tuple of detached neuron activities (positive, negative), or self when used as a mode switch
        '''

        # Standard torch.nn.Module.train(mode) / eval() call path
        if d_neg is None:
            return super().train(d_pos)

        # Normalize each signal to remove the lengths before training (inputs are left unmodified)
        d_pos = self._unit_length(d_pos)
        d_neg = self._unit_length(d_neg)

        # Initiate concurrent thread to train goodness function
        self.thread = Thread(target=self._learn_goodness_func, args=(d_pos, d_neg))
        self.thread.start()

        # Pass neuron activities to next layer; no autograd graph is needed for them. Grad mode is
        # thread-local, so this does not disable gradients inside the training thread.
        with torch.no_grad():
            return self.forward(d_pos, norm=False), self.forward(d_neg, norm=False)

    def _learn_goodness_func(self, d_pos, d_neg):
        '''
        Trains the goodness function for the layer without backward propagation through other layers

        @param d_pos: Positive data signal (already normalized)
        @param d_neg: Negative data signal (already normalized)
        @return: None
        '''

        softplus = torch.nn.functional.softplus

        # iterate by epoch to train goodness function
        for _ in range(self.num_epochs):
            # Goodness = mean of the squared neuron activities over the last dimension
            g_pos = self.forward(d_pos, norm=False).pow(2).mean(dim=-1)
            g_neg = self.forward(d_neg, norm=False).pow(2).mean(dim=-1)
            # Calculate the loss by maximizing positive goodness and minimizing negative goodness.
            # softplus(x) == log(1 + exp(x)) but does not overflow to inf (and NaN gradients) for large x.
            loss = (softplus(self.thresh - g_pos) + softplus(g_neg - self.thresh)).mean() / 2

            # Zero optimizer gradients
            self.optimizer.zero_grad()
            # Find goodness function gradient
            loss.backward()
            # Adjust weights with optimizer
            self.optimizer.step()

        # Remove all gradients
        for param in self.parameters():
            param.grad = None

        # Free cuda resources
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


# Declare FFLayers as all possible layers for the Forward-Forward Algorithm.
# The trailing comma is required: `(FFLinear)` is only FFLinear in parentheses, whereas `(FFLinear,)` is a
# real one-element tuple that can be iterated, extended with further layer classes, and passed to isinstance().
FFLayers = (FFLinear,)