# Torch Basics

Yihui "Ray" Ren 
yren@bnl.gov

## TOC:

* Vectorized Computation
    - numpy and torch interchangeable API
    - simple linear regression in numpy and torch
* AutoGrad (Automatic Differentiation)
    - torch tensor, backward and `grad` 
    - autograd demo
    - `torch.nn.Module` and `forward`
    - re-write linear regression as a `torch.nn.Module`
* Handling Data 
    - Stochastic Gradient Descent (SGD)
    - `torch.utils.data.Dataset`
    - `torch.utils.data.DataLoader`
    - re-write linear regression
* Multi-layer Perceptron
    - activation functions
* GPU offloading
    - parameter and buffer

In [None]:
## load modules
import time
import numpy as np
import pandas as pd
import torch 
import torch.nn as nn
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, clear_output

!date +%D
# Print the version of each core library next to the alias used in this
# notebook. The modules are referenced directly instead of being resolved
# from their names with eval().
for pkg, module in (("np", np), ("pd", pd), ("torch", torch)):
    print(f"{pkg:<6} ver: {module.__version__}")

# 10/14/21
# np     ver: 1.20.3
# pd     ver: 1.3.3
# torch  ver: 1.9.1

In [None]:
def line_breaker(foo=None):
    """Return a banner line with an optional centered label.

    The banner is 30 ``=`` characters, a field of width 20 holding ``foo``
    centered, and 30 more ``=`` characters. When ``foo`` is falsy (``None``
    or an empty string) the middle field is filled with ``=`` too, giving a
    solid 80-character rule. Labels longer than 20 characters are not
    truncated.
    """
    edge = "=" * 30
    label = foo if foo else "=" * 20
    return f"{edge}{label:^20}{edge}"

def plt_linear_fit(x, y, yhat, a, b, xlim=(0, 30), ylim=(0.985, 1.01)):
    """Plot a fitted line over the data and predictions against ground truth.

    Left panel: scatter of (x, y) with the line ``a*x + b`` drawn in red.
    Right panel: scatter of (yhat, y), titled with the mean absolute error.

    Args:
        x, y: input and ground truth values.
        yhat: model predictions for ``x``.
        a, b: slope and intercept of the linear model.
        xlim: (min, max) of the left panel's x axis; the fitted line is
            drawn across this range. The default matches the ranges this
            function previously hard-coded.
        ylim: (min, max) used for the shared y axis and for the prediction
            axis of the right panel.

    Returns:
        The matplotlib figure holding both panels.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4), sharey=True)
    marker = dict(facecolor='none', edgecolor='b', alpha=0.1)

    # left panel: raw data with the fitted line across the visible x range
    line_x = np.linspace(*xlim)
    ax1.scatter(x, y, **marker)
    ax1.plot(line_x, a*line_x+b, 'r')
    ax1.set_title(f"a={a:.3E}, b={b:.3E}")
    ax1.set_xlim(xlim)
    ax1.set_ylim(ylim)
    ax1.set_ylabel("ground truth y")
    ax1.set_xlabel(" x ")

    # right panel: prediction vs. ground truth, both axes on the same range
    ax2.scatter(yhat, y, **marker)
    ax2.set_title(f"MAE = {np.abs(y-yhat).mean():.5E}")
    ax2.set_ylim(ylim)
    ax2.set_xlim(ylim)
    ax2.set_xlabel("yhat")
    return fig
    

## Vectorized Computation
Modified from the [myazdani/numpy-pytorch-cheatsheet](https://github.com/myazdani/numpy-pytorch-cheatsheet) repository.


In [None]:
# Array Creation
some_shape = (5,3)
some_list = [5,3,2,1]
def compare_numpy_torch(np_cb, th_cb, some):
    """Call a NumPy and a torch callable with the same argument.

    Args:
        np_cb: NumPy callable, e.g. ``np.ones``.
        th_cb: torch callable, e.g. ``torch.ones``.
        some: argument passed unchanged to both callables.

    Returns:
        Tuple ``(numpy_shape, torch_shape)`` holding the ``.shape`` of each
        result.
    """
    np_result = np_cb(some)
    th_result = th_cb(some)
    return np_result.shape, th_result.shape

# The constructors share a name and accept the same shape tuple in both libraries.
for func in ["empty", "ones", "zeros"]:
    print(line_breaker("comparing "+func))
    # look the constructor up by name on each module; no eval() needed
    npf, thf = getattr(np, func), getattr(torch, func)
    print(compare_numpy_torch(npf, thf, some_shape))

# random tensor
print(line_breaker("comparing "+"rand"))
x = np.random.rand(*some_shape) # np rand does not take a tuple for shape
y = torch.rand(some_shape)
print(x.shape, y.shape)

## change random seed, get and set state
np.random.seed(5)
rng_state = np.random.get_state()
np.random.set_state(rng_state)

torch.random.manual_seed(5)
rng_state = torch.random.get_rng_state()
torch.random.set_rng_state(rng_state)

# convert between numpy and torch
print(line_breaker("convert btwn np and torch"))
x = np.random.rand(2,2)
y = torch.tensor(x) # convert np to torch (copies the data)
z = y.numpy() # convert torch to np (shares memory with y)
assert (x == z).all()

In [None]:
## tensor operation
x_shape = (3, 3)
y_shape = (3, 3)
op = "init"
print(line_breaker(op))
# the torch tensors are built from the NumPy arrays, so both hold the same values
npx = np.random.rand(*x_shape)
npy = np.random.rand(*y_shape)
thx = torch.tensor(npx)
thy = torch.tensor(npy)
print("x:",npx)
print("y:",npy)

### add 
op = "add"
print(line_breaker(op))
npz = npx + npy
thz = thx + thy
print("x+y=z:", npz)
assert (npz == thz.numpy()).all()

### mat product
op = "multiply"
# same banner as before, produced by the shared helper instead of an inline copy
print(line_breaker(op))
npz = npx@npy
thz = thx@thy
print("x@y=z:", npz)
assert np.isclose(npz, thz.numpy()).all()


# method form: ndarray.dot vs Tensor.mm
npz = npx.dot(npy)
thz = thx.mm(thy)
print("x.mm(y)=z:", npz)
assert np.isclose(npz, thz.numpy()).all()

# function form: np.matmul vs torch.mm
npz = np.matmul(npx,npy)
thz = torch.mm(thx, thy)
print("pkg.mm(x, y)=z:", npz)
assert np.isclose(npz, thz.numpy()).all()

### elementwise mult aka Hadamard product
op = "elementwise multi"
print(line_breaker(op))
npz = npx*npy
thz = thx*thy
print("x*y=z:", npz)
assert np.isclose(npz, thz.numpy()).all()

In [None]:
## Tensor Manipulations
def create_test_tensors(x_shape):
    """Create a random NumPy array and a torch tensor with the same values.

    Args:
        x_shape: iterable of dimension sizes, unpacked into ``np.random.rand``.

    Returns:
        Tuple ``(ndarray, tensor)``; the tensor is built from the array with
        ``torch.tensor``.
    """
    array = np.random.rand(*x_shape)
    return array, torch.tensor(array)
    
### transpose
op = "transpose"
print(line_breaker(op))

# 2-D case: the `.T` attribute exists on both the array and the tensor
tensor_shape = (1,3)
npx, thx = create_test_tensors(tensor_shape)
npxT = npx.T
thxT = thx.T

# 3-D case: pass the new axis order explicitly (here axes 0 and 1 are swapped)
tensor_shape = (3,4,5)
npx, thx = create_test_tensors(tensor_shape)
print("before transpose", npx.shape, thx.shape)
# npxT = np.transpose(npx, (1,0,2)) # also works
npxT = npx.transpose((1,0,2))
# thxT = torch.permute(thx, (1,0,2)) # does not work in torch 1.7
thxT = thx.permute((1,0,2)) 
print("after transpose ", npxT.shape, thxT.shape)

### flatten and reshape 
op = "flatten"
print(line_breaker(op))
tensor_shape = (3,4,5)
npx, thx = create_test_tensors(tensor_shape)
# several spellings that all collapse the input to one dimension
npflat1 = npx.reshape(-1)
npflat2 = npx.flatten()
thflat1 = thx.reshape(-1)
thflat2 = thx.flatten()
thflat3 = thx.view(-1)
thflat4 = torch.flatten(thx)
# print each resulting shape so they can be compared
for x in [npflat1, npflat2, thflat1, thflat2, thflat3, thflat4]:
    print(x.shape)

In [None]:
### Squeeze and Unsqueeze (adding and removing dummy dimensions)
op = "squeeze"
print(line_breaker(op))
tensor_shape = (3,1,5)
npx, thx = create_test_tensors(tensor_shape)
print("before squeeze", npx.shape, thx.shape)
# squeeze() with no argument removes every size-1 dimension
npxs = npx.squeeze() 
thxs = thx.squeeze() 
print("after squeeze ", npxs.shape, thxs.shape)
op = "unsqueeze"
print(line_breaker(op))
# insert a size-1 dimension at position 1: np.expand_dims vs Tensor.unsqueeze
npxus = np.expand_dims(npxs,1)
thxus = thxs.unsqueeze(1)
print("after unsqueeze at dim 1:", npxus.shape, thxus.shape)

### Concat 
op = "concatenate"
print(line_breaker(op))
tensor_shape = (3,5)
npx, thx = create_test_tensors(tensor_shape)
npy, thy = create_test_tensors(tensor_shape)
print("before concat", npx.shape, npy.shape)
# join along dim 0
npz0 = np.concatenate((npx, npy), axis=0)
thz0 = torch.cat((thx, thy), axis=0)
assert npz0.shape == thz0.shape
print("after concat along dim 0:", npz0.shape)
# join along dim 1
npz1 = np.concatenate((npx, npy), axis=1)
thz1 = torch.cat((thx, thy), axis=1)
assert npz1.shape == thz1.shape
print("after concat along dim 1:", npz1.shape)

## Linear Regression Using Vectorized Computation

### Wine Quality Dataset
### Linear regression using a formula
### Homework: convert numpy implementation to torch


In [None]:
## get data for linear regression. 
wine_quality_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
df = pd.read_csv(wine_quality_url, delimiter=";")
w = df.corr()
sns.heatmap(w)
# Keep only the strict upper triangle (k=1) of |corr| so the diagonal and the
# mirrored duplicates cannot win the argmax.
up_tri = np.triu(np.abs(w.to_numpy()),k=1)
# argmax works on the flattened matrix; convert the flat index back to (row, col)
max_idx = np.argmax(up_tri)
col_sz = len(df.columns)
col1, col2 = df.columns[max_idx//col_sz], df.columns[max_idx%col_sz]
print(max_idx,up_tri.flatten()[max_idx], col1, col2)

# f-string so the selected column names are actually interpolated
print(f"found {col1} and {col2} for linear regression") # most correlated two features
plt.figure()
sns.scatterplot(x=df[col1], y=df[col2])
plt.title(f"{col1} and {col2} \n corr.coef. = {np.corrcoef(df[col1], df[col2])[0,1]:.4f}")

# 1-D NumPy arrays used by all the regression cells below
wine_x, wine_y = df[col1].to_numpy(), df[col2].to_numpy()

### Simple Linear Regression 
Find a and b such that:
$\sum (y-y')^2$
is minimized, where
$y' = ax+b$

with solution 

$a = \sum(x - \bar{x})(y - \bar{y}) / \sum (x - \bar{x})^2$

$b = \bar{y} - a\bar{x}$


In [None]:
# %%timeit -n10 -r10
## here is a numpy implementation
# Closed-form least squares for y' = a*x + b:
#   a = sum((x - xbar) * (y - ybar)) / sum((x - xbar)^2),  b = ybar - a * xbar
xbar, ybar = np.mean(wine_x), np.mean(wine_y)
dx = wine_x - xbar
dy = wine_y - ybar
# for 1-D arrays `@` is the inner product, i.e. the sum of elementwise products
a = dx @ dy / np.power(dx, 2).sum()
b = ybar - a * xbar

In [None]:
plt_linear_fit(wine_x, wine_y, a*wine_x+b, a, b);

### Exercise: re-write above code in torch
here is the torch doc https://pytorch.org/docs/stable/torch.html

In [None]:
thx = torch.tensor(wine_x)
thy = torch.tensor(wine_y)
##                            ##
##                            ##
##  Intentionally Left Blank  ## 
##                            ##
##                            ##

In [None]:
# %%timeit -n10 -r10
## Solution
# Same closed-form solution as the NumPy cell, expressed with torch ops.
thx = torch.tensor(wine_x)
thy = torch.tensor(wine_y)
xbar = torch.mean(thx)
ybar = torch.mean(thy)
# thx and thy are 1-D, so `@` is already a dot product. The former `.T` was a
# no-op on a 1-D tensor (and newer torch versions deprecate `.T` on tensors
# that are not 2-D), so it is dropped.
a = (thx - xbar) @ (thy - ybar) / torch.pow(thx - xbar, 2).sum()
b = ybar - a * xbar

In [None]:
plt_linear_fit(wine_x, wine_y, (a*thx+b).numpy(), a.numpy(), b.numpy());

## AutoGrad
In this section, you will learn about automatic differentiation.
1. Torch tensors have a built-in `grad` attribute.
2. Wrapping tensors and operations in an `nn.Module` chains them together.

In [None]:
## y = a*x, dy/dx = ?
# Analytically dy/dx = a, so x.grad should equal a after backward().
a = torch.tensor(3)
x = torch.tensor(0.1, requires_grad=True)
y = torch.mul(a, x)
y.backward()  # populates x.grad with dy/dx
print("variable x", x, "has grad of", x.grad)

In [None]:
## A second example: y = exp(a*x), dy/dx = ?
## Compare the autograd result with the chain-rule derivation below.
a = torch.tensor(3)
x = torch.tensor(0.3, requires_grad=True)
y = (a*x).exp()
# by hand, using the chain rule:
# y = exp(ax), dy/dx = exp(ax) d(ax)/dx = a * exp(ax)
print("do it by hand:\n", "y = exp(ax), dy/dx = exp(ax) d(ax)/dx = a * exp(ax)")
y.backward()
# detach() keeps the hand-computed value out of the autograd graph
analytic = a*torch.exp(a*x).detach()
print("variable x", x, "has grad of", x.grad, "which should be the same as", analytic)

In [None]:
## Exercise: pick a f(x) you like, and autograd it!  


In [None]:
##                            ##
##                            ##
##  Intentionally Left Blank  ## 
##                            ##
##                            ##

Let's consider this function: $ y = ( \sin x + 1 )^x $

In [None]:
## Solution:
## for example y = (sin(x)+1)^x
print("for example y = (sin(x)+1)^x")
x = torch.tensor(0.2, requires_grad=True)
y = torch.pow(torch.sin(x)+1, x)
y.backward()
# Hand-derived derivative for comparison:
#   y = exp(x*log(u)) with u = sin(x)+1, so dy/dx = y * (log(u) + x*cos(x)/u)
base = torch.sin(x)+1
log_base = torch.log(base)
ans = torch.exp(x*log_base)*(log_base+x*base.pow(-1)*torch.cos(x))
print(x, x.grad, ans)

In [None]:
## Torch Module 
# https://pytorch.org/docs/stable/generated/torch.nn.Module.html
# torch.module: Packing parameters and functions together
# two APIs:
# __init__() and forward()

class Func(nn.Module):
    """Module computing ``(sin(x) + 1) ** x`` for an internal scalar ``x``.

    ``x`` is kept as a plain tensor attribute with ``requires_grad=True``;
    it is not wrapped in ``nn.Parameter``.
    """

    def __init__(self):
        super().__init__()
        self.x = torch.tensor(0.2, requires_grad=True)

    def forward(self, input):
        # `input` is not used; the value depends only on self.x
        base = torch.sin(self.x) + 1
        return torch.pow(base, self.x)

## instantiate the module, evaluate it, and back-propagate to x
func = Func()
y = func(None)
y.backward()
print(func.x.grad)

In [None]:
## Error: if requires_grad=False
print("Warning: this will produce error")
class FuncError(nn.Module):
    """Same function as ``Func``, but ``x`` does not track gradients.

    ``x`` is created without ``requires_grad``, so the output has no graph
    to back-propagate through.
    """

    def __init__(self):
        super().__init__()
        self.x = torch.tensor(0.2)

    def forward(self, input):
        # `input` is not used; the value depends only on self.x
        base = torch.sin(self.x) + 1
        return torch.pow(base, self.x)

## instantiate the module and show what backward() does without requires_grad
func = FuncError()
y = func(None)
try:
    y.backward()
except RuntimeError as err:
    # the error is printed rather than raised so the cell keeps running
    print(err)

print("func.x.grad is", func.x.grad)


In [None]:
## Better to use torch.nn.Parameter
## and rename x as w (as weights) as a convention.

class FuncPara(nn.Module):
    """Module computing ``(sin(w) + 1) ** w`` with ``w`` as an ``nn.Parameter``.

    Assigning an ``nn.Parameter`` to an attribute registers it with the
    module, so it appears in ``parameters()``.
    """

    def __init__(self):
        super().__init__()
        self.w = nn.Parameter(torch.tensor(0.2))

    def forward(self, input):
        # `input` is not used; the value depends only on self.w
        base = torch.sin(self.w) + 1
        return torch.pow(base, self.w)

## instantiate the module, evaluate it, and back-propagate to w
func = FuncPara()
y = func(None)
y.backward()
print(func.w.grad)

In [None]:
## [Optional]
## The benefits of using Parameter are twofold:
#  * it is registered in the module's parameters.
#  * it moves with the module when the module changes device.

print(line_breaker())
print("nn.Parameters are registered to nn.Module")
func = FuncPara()
for p in func.parameters():
    print(p)
    
print(line_breaker())
print("torch.tensor did not")
# Func stores x as a plain tensor attribute, so parameters() yields nothing
func = Func()
for p in func.parameters():
    print(p)
print("got nothing")
print(line_breaker())

print(line_breaker("move btwn cpu&gpu"))
# the device comparison only runs when a CUDA device is available
if torch.cuda.is_available():
    print("registered parameter moves with Module")
    func_p = FuncPara()
    print(func_p.w.device)
    # .cuda() on the module moves the registered parameter to the GPU
    func_p = func_p.cuda()
    print(func_p.w.device)
    
    print("unregistered tensor does not")
    # same experiment with Func, whose x is a plain tensor attribute
    func_t = Func()
    print(func_t.x.device)
    # .cuda() on the module leaves the unregistered tensor on the CPU
    func_t = func_t.cuda()
    print(func_t.x.device)
    
    ## print out:
    #  cpu
    #  cuda:0
    #  cpu
    #  cpu

Modules can be composed together or called in sequence; autograd works in both cases.

Let's still use this function: $ y = ( \sin x + 1 )^x $, and break it into two simpler ones $\sin(x)$ and $u^x$

In [None]:
## Modules can be composed and auto grad works 
## let's break the previous function into two parts, 
class SinFunc(nn.Module):
    """Module returning ``sin(x)`` of the tensor given at construction."""

    def __init__(self, x):
        super().__init__()
        # keep a reference to the caller's tensor; it is not copied
        self.x = x

    def forward(self):
        x = self.x
        return torch.sin(x)
    
class PowFunc(nn.Module):
    """Module raising its input to the power ``x`` given at construction."""

    def __init__(self, x):
        super().__init__()
        # keep a reference to the caller's tensor; it is not copied
        self.x = x

    def forward(self, input):
        exponent = self.x
        return torch.pow(input, exponent)

In [None]:
# A single parameter is shared by both modules, so backward() accumulates the
# contributions from the sin branch and the pow branch into x.grad.
x = nn.Parameter(torch.tensor(0.2))

f, g = SinFunc(x), PowFunc(x)
inner = f()+1
y = g(inner)
y.backward()
x.grad

In [None]:
class CombineFunc(nn.Module):
    """Compose ``SinFunc`` and ``PowFunc`` into ``(sin(x) + 1) ** x``.

    Both sub-modules receive the same ``x``.
    """

    def __init__(self, x):
        super().__init__()
        self.f = SinFunc(x)
        self.g = PowFunc(x)

    def forward(self):
        shifted = self.f()+1
        return self.g(shifted)

h = CombineFunc(x)

In [None]:
y = h()
y.backward()
x.grad
# what's the value of x.grad? why is it? what if you run several times?

In [None]:
## let's get back to the linear regression case.
## we can solve it another way using gradient descent

In [None]:
class LinearModel(nn.Module):
    """Linear model ``y = a*x + b`` with learnable ``a`` and ``b``."""

    def __init__(self):
        super().__init__()
        # Re-seeding the global RNG here makes every instance start from the
        # same initial guess for a and b.
        torch.random.manual_seed(7)
        # nn.Parameter tracks gradients by default
        self.a = nn.Parameter(torch.rand(1))
        self.b = nn.Parameter(torch.rand(1))

    def forward(self, x):
        scaled = x * self.a
        return scaled + self.b

In [None]:
# Wrap the NumPy data as tensors and run one forward pass through a fresh model.
thx = torch.tensor(wine_x)
thy = torch.tensor(wine_y)
lin = LinearModel()
yhat = lin(thx)
# V: tensor -> NumPy value for plotting (move to CPU, detach from the autograd
# graph, drop size-1 dimensions, convert)
V = lambda x: x.cpu().detach().squeeze().numpy()

In [None]:
# Fit the linear model with hand-written full-batch gradient descent.
lr = 0.003 # learning rate
fig = None # no figure exists yet, so there is nothing to close on the first plot
for ep in range(10**4):
    yhat = lin(thx)
    # mean squared error over the whole dataset
    loss = ((thy-yhat)*(thy-yhat)).mean()
    loss.backward()
    for p in lin.parameters():
        # Q: what is p? 
        # update through .data so the step itself is not tracked by autograd
        p.data -= lr * p.grad
        p.grad.zero_() 
        # Q: recall why we need to set grad to zero?
    
    # refresh the plot every 1000 epochs
    if ep%10**3==0:
        clear_output()
        if fig is not None:
            plt.close(fig)
        fig = plt_linear_fit(wine_x, wine_y, V(lin(thx)), V(lin.a), V(lin.b))
        display(fig)
        time.sleep(1)

## Pytorch Built-in Modules, DataLoader and Loss Functions

In [None]:
from torch.utils.data import Dataset, DataLoader
class WineData(Dataset):
    """Dataset of (x, y) sample pairs stored as float column vectors.

    Args:
        x, y: 1-D array-likes of equal length (NumPy arrays, lists, ...).
            Each is copied and converted to float once, then given a
            trailing axis so that sample ``i`` is an array of shape ``(1,)``.
    """

    def __init__(self, x, y):
        # np.array copies, so the dataset never aliases the caller's data
        self.x = np.expand_dims(np.array(x, dtype=float), axis=1)
        self.y = np.expand_dims(np.array(y, dtype=float), axis=1)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` pair at ``idx``."""
        return self.x[idx], self.y[idx]

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

# examples:
dataset = WineData(wine_x, wine_y)
print("data sz = ", len(dataset), "\nget 3rd item",dataset[2])

In [None]:
# Assemble the training pipeline: dataset, mini-batch loader, model, loss, optimizer.
dataset = WineData(wine_x, wine_y)
# shuffle=True reorders the samples every epoch; drop_last=True discards the
# final incomplete batch so every batch holds exactly 2048 samples
dataloader = DataLoader(dataset, batch_size=2048, drop_last=True, shuffle=True)
model = LinearModel() 
loss_fn = torch.nn.MSELoss()
# the optimizer updates exactly the parameters handed to it here
optim = torch.optim.SGD(model.parameters(), lr=0.006)

In [None]:
x, y = next(iter(dataloader))
print(x.shape, y.shape)
yhat = model(x)
print(yhat.shape)

In [None]:
# Mini-batch SGD using the DataLoader, the built-in loss, and the optimizer.
n_epochs = 10**3
plot_every = 10**2
for ep in range(n_epochs):
    for x, y in dataloader:
        yhat = model(x)
        loss = loss_fn(yhat, y)
        # clear the previous step's gradients before computing new ones
        optim.zero_grad()
        loss.backward()
        optim.step()

    if ep % plot_every != 0:
        continue
    # periodic progress plot on the full dataset
    clear_output()
    plt.close(fig)
    yhat = model(torch.tensor(wine_x))
    fig = plt_linear_fit(wine_x, wine_y, V(yhat), V(model.a), V(model.b))
    display(fig)
    time.sleep(1)

In [None]:
## Exercise: try to replace your LinearModel, with the built-in one:
# model = nn.Linear(1,1,bias=True) 
# and inspect  model.weight and model.bias

In [None]:
# Train on the GPU when one is available; otherwise fall back to the CPU so
# the cell still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): `optim` was constructed before this move. Module.to() is
# documented as modifying the module in place; confirm the optimizer still
# references the moved parameters on the torch version in use.
model.to(device)
for ep in range(10**3):
    for x, y in dataloader:
        # each batch must live on the same device as the model
        x, y = x.to(device), y.to(device)
        yhat = model(x)
        loss = loss_fn(yhat, y)
        optim.zero_grad()
        loss.backward()
        optim.step()
    
    # refresh the plot every 100 epochs
    if ep%10**2==0:
        clear_output()
        plt.close(fig)
        yhat = model(torch.tensor(wine_x).to(device))
        # V() moves tensors back to the CPU before converting them for plotting
        fig = plt_linear_fit(wine_x, wine_y, V(yhat), V(model.a), V(model.b))
        display(fig)
        time.sleep(1)