In [5]:
import sys
sys.path.append("..")
import numpy as np
from lazydiff import regression
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

# LazyDiff

## Project Group 7


**Team Members**: 

Joe Davison

Raymond Lin

Zheng Yang

Matteo Zhang

# What is Automatic Differentiation (AD)?

- Automatically evaluate the gradient of a function using the chain rule


- Numerical approximation is often not accurate enough and can be computationally expensive


- Symbolic differentiation tends to lead to inefficient code and faces the difficulty of converting a computer program into a single expression

- Many uses, including backpropagation in neural networks and sampling from complex distributions


- Forward and reverse modes of gradient propagation


- Uses chain rule to build up gradients

# Forward Mode
![](https://wikimedia.org/api/rest_v1/media/math/render/svg/9b5370f1b8e313d47eb2a2ebac437cf88a7a1d78)
![](https://upload.wikimedia.org/wikipedia/commons/a/a4/ForwardAccumulationAutomaticDifferentiation.png)


# Reverse Mode
![](https://wikimedia.org/api/rest_v1/media/math/render/svg/206c0444486628d70d2617e8eee8a528775e516c)
![](https://upload.wikimedia.org/wikipedia/commons/a/a0/ReverseaccumulationAD.png)

# LazyDiff

- `Var` class wrapping numpy arrays for autodiff scalars and vectors


- Custom operations (including elementary functions) through `ops` module


- Supports forward and reverse mode (extension)


- How it works
    - `Var` objects store their parents and children, along with information on how each object relates to its parents and children
    - Forward mode requires propagating derivatives to children in topological order
    - Reverse mode requires propagating derivatives to parents in reverse topological order



# How to Use

#### Setup

`pip install lazydiff`

_Note: depends on numpy for efficient vectorization_

# Basic Demo

### Manipulate In Terms of Var Class

In [6]:
from lazydiff.vars import Var

# A Var can be built from a single float ...
scalar = Var(1.0)

# ... from a plain Python list of floats ...
vector1 = Var([1.0, 1.0])
# ... or from a numpy array (np.ones(2) holds the same two values as the list above)
vector2 = Var(np.ones(2))

# Printing a Var shows its printed representation (see the cell output)
print("vector1: {}".format(vector1))

vector1: Var([1.0, 1.0], seed=1.0)


### Custom Operations through the `ops` Module

In [7]:
from lazydiff import ops

# conventional: an elementary function applied to a Var returns a new object
# whose computed values are exposed through `.val`
fx = ops.exp(vector1)
print("fx values are: {}".format(fx.val))

# non-conventional: operations beyond the elementary functions
# NOTE(review): presumably sum/norm reduce the vector to a single value — confirm in ops
fy = ops.sum(vector1)
fz = ops.norm(vector2)

fx values are: [2.71828183 2.71828183]


### Getting the Gradient

In [8]:
# forward mode
vector1.forward()
for_grad = fx.grad(vector1)

# reverse mode
fx.backward()
back_grad = fx.grad(vector1)

print("Gradient of exp([1,1]):")
print(for_grad)

Gradient of exp([1,1]):
[2.71828183 2.71828183]


# Extension - Backprop

![](https://timvieira.github.io/blog/images/backprop-brain-meme.png)

# Iterative Linear Regression 

### Objective Function

In [9]:
def MSE(X, y, m, b):
    """Mean squared error of the linear model ``ops.sum(m * x) + b``.

    Args:
        X: sequence of feature rows; must support ``len()``.
        y: targets, iterated in step with the rows of ``X``.
        m: coefficients (a ``Var`` in this notebook).
        b: intercept (a ``Var`` in this notebook).

    Returns:
        The accumulated squared residuals divided by ``len(X)``.
    """
    # The intercept is kept as its own argument. The alternative would be to
    # append a column of ones to X so that b is absorbed into m.
    total = Var(0)
    for features, target in zip(X, y):
        prediction = ops.sum(m*features) + b
        residual = prediction - target
        total = total + residual**2
    return total/len(X)

### Gradient Descent

In [10]:
def gradient_descent(X, y, loss_function, m, b, lr = 0.1, forward = True):
    """Perform a single gradient-descent update of ``m`` and ``b``.

    Args:
        X, y: data passed straight through to ``loss_function``.
        loss_function: callable invoked as ``loss_function(X, y, m, b)``.
        m, b: current coefficients and intercept.
        lr: step size applied to each gradient.
        forward: truthy selects forward mode (``m.forward()`` and
            ``b.forward()``); falsy selects reverse mode (``loss.backward()``).

    Returns:
        ``(m, b, loss)`` where ``loss`` was evaluated at the parameters as
        they were *before* this update.
    """
    loss = loss_function(X, y, m, b)

    if not forward:
        # reverse mode: a single pass started from the loss
        loss.backward()
    else:
        # forward mode: one pass started from each parameter
        m.forward()
        b.forward()

    # Each scaled gradient is wrapped in a newly created Var before being
    # subtracted (the original note describes this as clearing the cache by
    # reinstantiating).
    step_m = Var(lr*loss.grad(m))
    m -= step_m
    step_b = Var(lr*loss.grad(b))
    b -= step_b

    return m, b, loss

### Update Iteratively

In [None]:
def iterative_regression(X, y, m, b, loss_function, lr = 0.1,
        epochs = 100, earlyStop = 0, forward = True):
    """Run repeated ``gradient_descent`` steps and return the fitted parameters.

    Args:
        X, y: training data handed to ``loss_function`` on every step.
        m, b: starting coefficients and intercept.
        loss_function: objective, called as ``loss_function(X, y, m, b)``.
        lr: step size forwarded to ``gradient_descent``.
        epochs: maximum number of update steps.
        earlyStop: absolute tolerance; the loop ends once the loss value
            changes by less than this between two consecutive steps. With the
            default of 0 the check never triggers.
        forward: forwarded to ``gradient_descent`` to pick the AD mode.

    Returns:
        ``(m, b, loss)`` — coefficients, intercept, and the loss returned by
        the last step (``Var(0)`` if no step ran).
    """
    # The first step is compared against a zero loss.
    loss = Var(0)
    for _ in range(epochs):
        previous_loss = loss
        m, b, loss = gradient_descent(X, y, loss_function, m, b, lr, forward)
        change = abs(loss.val - previous_loss.val)
        if change < earlyStop:
            # absolute tolerance reached: stop early
            break
    return m, b, loss

![](demo.gif)

Backup Link: https://youtu.be/n5h-UVG1yYE

## Too slow with original implementation

![](old_implementation.png)

## Issues

- Scalar class wrapping each numeric value

- Vector class as vector wrapper of Scalar

- A lot of unwrapping and wrapping involved

## New Implementation: Wrapper for numpy

- One single class, `Var`, wrapping a numpy array

- Makes use of numpy array optimization for calculations

## Significant Improvement on Performance

- new: time difference noticeable only when incrementing by 1000 features 

- old: noticeable for each additional feature added

Old Implementation            |  New Implementation
:-------------------------:|:-------------------------:
![](old_implementation.png)  | ![](new_implementation.png)

## Additional Extension

- Available in `regression` module

- Support for Linear, Ridge, Lasso, Elastic Net Regression

- Adaptable to Polynomial Regression

In [12]:
# Ridge regression: objective function
def ridge_loss(X, y, m, b, C = 1):
    """Sum of squared residuals plus a penalty on the coefficients.

    Args:
        X: sequence of feature rows.
        y: targets, iterated in step with the rows of ``X``.
        m: coefficients (a ``Var`` in this notebook).
        b: intercept (a ``Var`` in this notebook).
        C: weight multiplying the penalty term ``ops.pow_sum(m, 2)``.

    Returns:
        The squared error summed over all rows (not averaged), plus
        ``C * ops.pow_sum(m, 2)``.
    """
    squared_error = Var(0)
    for features, target in zip(X, y):
        residual = ops.sum(m*features) + b - target
        squared_error = squared_error + residual**2
    # NOTE(review): pow_sum(m, 2) is presumably the sum of squared coefficients — confirm in ops
    penalty = C*ops.pow_sum(m,2)
    return squared_error + penalty

In [13]:
# --- optimisation settings ---
earlyStop = 0       # early stopping disabled (1e-8 was the alternative tried)
forward = False
epochs = 300
# Ridge needs a very small learning rate, otherwise the iteration blows up
lr = 0.001
plot = False

# --- synthetic regression problem ---
dim = 2
X, y, true_coef = make_regression(
    n_samples = 100,
    n_features = dim,
    n_informative = dim,
    bias = 10,
    coef = True,
    noise = 1,
    random_state = 1,
)

# --- starting point: all-ones coefficients, zero intercept ---
m_ridge = Var(np.ones(X.shape[1]))
b_ridge = Var(0)

m_ridge, b_ridge, loss = regression.iterative_regression(
    X, y, m_ridge, b_ridge, regression.ridge_loss, lr, epochs, earlyStop, forward
)

In [14]:
# Reference solution: scikit-learn's Ridge with its default settings
clf_l2 = Ridge()
clf_l2.fit(X, y)

# Element-wise gap between the iterative fit and the reference fit
coef_gap = m_ridge.val - clf_l2.coef_
intercept_gap = b_ridge.val - clf_l2.intercept_

print("Difference between iterative and sklearn Ridge regression")
print("Coefficient: {}".format(coef_gap))
print("Intercept: {}".format(intercept_gap))

Difference between iterative and sklearn Ridge regression
Coefficient: [ 0.00000000e+00 -2.84217094e-14]
Intercept: -1.7763568394002505e-15


### Polynomial Regression

Let's model $y = x^3-10x^2+3$

In [15]:
from sklearn.preprocessing import PolynomialFeatures

# 20 evenly spaced sample points on [-10, 10], shaped as a single-feature column
x_grid = np.linspace(-10, 10, 20).reshape(-1, 1)

# Noise-free targets from y = x^3 - 10x^2 + 3 (same column shape as x_grid)
y = x_grid**3 - 10*x_grid**2 + 3

# Degree-3 polynomial expansion of x without the constant column
poly = PolynomialFeatures(3, include_bias = False)
X = poly.fit_transform(x_grid)

In [16]:
# Draw the starting coefficients from a seeded generator so that
# Restart & Run All reproduces the same fit; np.random.rand would use the
# unseeded global state and give a different starting point on every run.
SEED = 0
rng = np.random.RandomState(SEED)
m = Var(rng.rand(X.shape[1]), seed = 1.0)
b = Var(0)

# Stop once the loss changes by less than this between consecutive epochs
earlyStop = 1e-8
forward = False
# NOTE(review): the tiny step size is presumably needed because the polynomial
# features are unscaled — confirm before changing it
lr = 0.000001
epochs = 10000

m, b, loss = regression.iterative_regression(X, y, m, b, regression.MSE, lr, epochs,
                                             earlyStop, forward)

![](demo_2.gif)

Backup Link: https://youtu.be/DF-dh6MBm8Q

## Conclusion

- AD package wrapping numpy array


- Support for both forward and reverse mode


- Linear, Lasso, Ridge, Elastic Net, Polynomial Regression support


- Easy to generalize to other loss minimization problems

Mention again that the package is much faster with the numpy implementation

## Picture Reference

- https://wikimedia.org/api/rest_v1/media/math/render/svg/9b5370f1b8e313d47eb2a2ebac437cf88a7a1d78

- https://upload.wikimedia.org/wikipedia/commons/a/a4/ForwardAccumulationAutomaticDifferentiation.png

- https://wikimedia.org/api/rest_v1/media/math/render/svg/206c0444486628d70d2617e8eee8a528775e516c

- https://upload.wikimedia.org/wikipedia/commons/a/a0/ReverseaccumulationAD.png

- https://timvieira.github.io/blog/images/backprop-brain-meme.png