# Week H

More Neural Networks

In [None]:
!wget -q https://github.com/DM-GY-9103-2024F-H/9103-utils/raw/main/src/data_utils.py
!wget -q https://github.com/DM-GY-9103-2024F-H/9103-utils/raw/main/src/image_utils.py

In [None]:
import torch
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, Dataset

from data_utils import display_confusion_matrix, object_from_json_url
from data_utils import LFWUtils, StandardScaler
from image_utils import make_image

## More Tensors and Why They're Awesome

Multi-dimensional slicing is definitely a nice property of tensors, but what really sets them apart is their ability to keep track of all the operations performed on them using _computational graphs_.

If we define a tensor and set its `requires_grad` parameter to `True` we unlock some really nice properties that we can use for training neural networks.

One of these properties is the ability to automatically calculate derivatives (OMG, calculus!) of functions defined in terms of our tensor.

Let's investigate.

### Easy Calculus and Free Derivatives

Let's pretend we have the following function:

$f(x) = x^4 - 0.7x^3 - 2x^2 + x + 1$

And we want to find out when the function achieves its maximum and minimum values, when it equals $0$, or when it equals $0.5$.

We can plot it, and easily approximate those values visually:

In [None]:
def peaks(x):
  """Evaluate f(x) = x^4 - 0.7x^3 - 2x^2 + x + 1 for a number or a tensor."""
  quartic = x**4
  cubic = 0.7*x**3
  quadratic = 2*x**2
  return quartic - cubic - quadratic + x + 1

In [None]:
# torch.linspace is the float-friendly cousin of range(): its 3rd argument
#   is the number of samples to generate, not the distance between them

x = torch.linspace(start=-1.3, end=1.6, steps=300)
y = peaks(x)

# the curve itself
plt.plot(x, y)
# horizontal reference lines at y = 0 and y = 0.5 across the plotted x range
plt.plot([-1.3, 1.6], [0, 0], '-')
plt.plot([-1.3, 1.6], [0.5, 0.5], '-')
plt.show()

Looks like local minimum and maximum values are approximately:
- $x = -0.9$ (global minimum)
- $x = 0.2$ (local maximum)
- $x = 1.2$ (local minimum)

It crosses $y = 0$ at:
- $x = -1.2$
- $x = -0.6$

And, it crosses $y=0.5$ a bunch of times, so we'll look at that later.

We can calculate much more precise values for these points in our graph if we define $x$ and $y$ as tensors and enable automatic differentiation (autograd) by setting `requires_grad=True`.

In [None]:
# Sample the function densely and let autograd compute dy/dx at every sample.
xt = torch.linspace(start=-1.3, end=1.6, steps=8000, requires_grad=True)
yt = peaks(xt)
# yt is not a scalar, so backward() needs one upstream gradient per element;
#   all-ones makes xt.grad[i] the derivative of peaks at xt[i]
yt.backward(gradient=torch.ones_like(xt))

dydx = xt.grad
print("derivatives:", dydx[:5])

# minima/maxima are the samples where the derivative is (almost) zero
minmax_idx = dydx.abs().lt(9e-4)
minmax_x, minmax_y = xt[minmax_idx], yt[minmax_idx]

plt.plot(x, y)
plt.plot(minmax_x.tolist(), minmax_y.tolist(), 'o')
plt.show()

print("min/max:", minmax_x, minmax_y)

In [None]:
# Squaring the function turns its zero crossings into minima whose value is ~0.
xt = torch.linspace(start=-1.3, end=1.6, steps=8000, requires_grad=True)
yt = peaks(xt) ** 2
yt.backward(gradient=torch.ones_like(xt))

dydx = xt.grad
print("derivatives:", dydx[:5])

# keep samples where the squared value is tiny AND its slope is nearly flat
zeros_idx = (yt < 1e-7) & (dydx.abs() < 0.005)
zeros_x, zeros_y = xt[zeros_idx], yt[zeros_idx]

plt.plot(x, y)
plt.plot(zeros_x.tolist(), zeros_y.tolist(), 'o')
plt.show()

print("zeros:", zeros_x, zeros_y)

In [None]:
# Same trick as for the zeros, shifted: (f(x) - 0.5)^2 is ~0 where f(x) = 0.5.
xt = torch.linspace(start=-1.3, end=1.6, steps=8000, requires_grad=True)
yt = peaks(xt)
yt2 = (yt - 0.5) ** 2
yt2.backward(gradient=torch.ones_like(xt))

dydx = xt.grad
print("derivatives:", dydx[:5])

# keep samples where the squared distance to 0.5 is tiny and nearly flat
y05_idx = (yt2 < 2e-7) & (dydx.abs() < 0.005)
# report the un-squared function values so the markers sit on the curve
y05_x, y05_y = xt[y05_idx], yt[y05_idx]

plt.plot(x, y)
plt.plot(y05_x.tolist(), y05_y.tolist(), 'o')
plt.show()

print("y=0.5:", y05_x, y05_y)

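Another way of doing it:

Evaluating the function (and its derivative) over a large, dense range is very expensive. Instead, we can start from a single point and repeatedly use the gradient at that point to decide which way to step.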

In [None]:
# Start from a single point instead of evaluating the whole range
x_ = torch.tensor(0.5, requires_grad=True)
y_ = peaks(x_)

# history of visited points, used for plotting
xs = [x_.item()]
ys = [y_.item()]

y_.backward()
print(x_, y_, x_.grad)

# take one step along the gradient (uphill, because the gradient is added);
#   the result is a new non-leaf tensor, so ask autograd to keep its .grad
x_ = x_ + 0.1 * x_.grad
x_.retain_grad()
y_ = peaks(x_)

xs.append(x_.item())
ys.append(y_.item())

# TODO: more steps

In [None]:
# Overlay the points visited by the gradient steps (red dots) on the curve
plt.plot(x, y)
plt.scatter(xs, ys, s=14, c='r', marker='o')
plt.show()
# last visited point, shown as the cell output
(x_.item(), y_.item())

## Ok, so what ?

Well, now we have most of the ingredients for using a neural network to build a linear regression model using data.

Let's load the housing prices dataset from `HW03`:

In [None]:
# Location of the LA housing dataset (json)
HOUSES_FILE = "https://raw.githubusercontent.com/DM-GY-9103-2024F-H/9103-utils/main/datasets/json/LA_housing.json"

# Fixed seed so the train/test split (and everything trained on it) is
#   the same on every Restart & Run All
SPLIT_SEED = 42

houses_info = object_from_json_url(HOUSES_FILE)

houses_raw_df = pd.DataFrame.from_records(houses_info)

# NOTE(review): the scaler is fit on the full dataset before splitting, so
#   statistics from the test rows influence the scaling of the training rows.
#   Fitting on the training rows only would avoid that, but depends on the
#   data_utils.StandardScaler API, which is not visible here - TODO confirm
std_scaler = StandardScaler()
houses_df = std_scaler.fit_transform(houses_raw_df)

houses_train, houses_test = train_test_split(houses_df, test_size=0.2, random_state=SPLIT_SEED)

houses_train.head()

In [None]:
# Separate the regression target ("value") from the input features
train_values = houses_train["value"]
train_features = houses_train.drop(columns=["value"])

# float32 tensors: features as a 2-D tensor, targets as a 1-D tensor
Y_train = torch.tensor(train_values.to_numpy(), dtype=torch.float32)
x_train = torch.tensor(train_features.to_numpy(), dtype=torch.float32)

In [None]:
# Same feature/target separation for the held-out rows
test_values = houses_test["value"]
test_features = houses_test.drop(columns=["value"])

Y_test = torch.tensor(test_values.to_numpy(), dtype=torch.float32)
x_test = torch.tensor(test_features.to_numpy(), dtype=torch.float32)

In [None]:
# Linear regression trained with hand-written gradient descent on the RMSE.
learning_rate = 1e-2
# every column except the "value" target is an input feature
n_features = len(houses_df.columns) - 1
model = nn.Linear(n_features, 1)

for c in range(32):
  # model() returns shape (N, 1) while Y_train is (N,). Without the squeeze the
  #   subtraction broadcasts to an (N, N) matrix and the loss compares every
  #   prediction against every target instead of its own.
  Y_ = model(x_train).squeeze(-1)
  loss = (Y_ - Y_train).pow(2).mean().pow(0.5)
  loss.backward()
  if c % 4 == 0:
    print(c, loss.item())

  # manual gradient-descent step; no_grad keeps the in-place parameter
  #   update out of the autograd graph
  with torch.no_grad():
    for p in model.parameters():
      p.sub_(p.grad * learning_rate)
      # gradients accumulate across backward() calls, so reset them
      p.grad.zero_()

In [None]:
# Training-set predictions, detached from autograd and flattened to 1-D,
#   wrapped in a Series named after the target column
Y_std = pd.Series(
  data=model(x_train).detach().numpy().squeeze(),
  name="value",
)

# presumably maps the first few standardized predictions back to the
#   original units - behavior of the project's StandardScaler, TODO confirm
std_scaler.inverse_transform(Y_std[:4])

In [None]:
# Same regression task, now with a hidden layer and a ReLU non-linearity.
learning_rate = 1e-2
# every column except the "value" target is an input feature
n_features = len(houses_df.columns) - 1
model = nn.Sequential(
  nn.Linear(n_features, n_features),
  nn.ReLU(),
  # TODO: add layers
  nn.Linear(n_features, 1),
)

for c in range(32):
  # model() returns shape (N, 1) while Y_train is (N,). Without the squeeze the
  #   subtraction broadcasts to an (N, N) matrix and the loss compares every
  #   prediction against every target instead of its own.
  Y_ = model(x_train).squeeze(-1)
  loss = (Y_ - Y_train).pow(2).mean().pow(0.5)
  loss.backward()
  if c % 4 == 0:
    print(c, loss.item())

  # manual gradient-descent step; no_grad keeps the in-place parameter
  #   update out of the autograd graph
  with torch.no_grad():
    for p in model.parameters():
      p.sub_(p.grad * learning_rate)
      # gradients accumulate across backward() calls, so reset them
      p.grad.zero_()
  

In [None]:
# Root-mean-squared error on the held-out rows (no gradients needed)
with torch.no_grad():
  # squeeze the (N, 1) predictions to (N,) so they line up element-wise with
  #   Y_test instead of broadcasting to an (N, N) matrix
  Y_ = model(x_test).squeeze(-1)
  loss = (Y_ - Y_test).pow(2).mean().pow(0.5)
  print(loss.item())

In [None]:
# Same network, but the parameter updates are delegated to an optimizer.
learning_rate = 1e-2
# every column except the "value" target is an input feature
n_features = len(houses_df.columns) - 1
model = nn.Sequential(
  nn.Linear(n_features, n_features),
  nn.ReLU(),
  # TODO: add layers
  nn.Linear(n_features, 1),
)

optim = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

for c in range(32):
  optim.zero_grad()
  # model() returns shape (N, 1) while Y_train is (N,). Without the squeeze the
  #   subtraction broadcasts to an (N, N) matrix and the loss compares every
  #   prediction against every target instead of its own.
  Y_ = model(x_train).squeeze(-1)
  loss = (Y_ - Y_train).pow(2).mean().pow(0.5)
  loss.backward()
  optim.step()
  if c % 4 == 0:
    print(c, loss.item())

In [None]:
# Root-mean-squared error on the held-out rows (no gradients needed)
with torch.no_grad():
  # squeeze the (N, 1) predictions to (N,) so they line up element-wise with
  #   Y_test instead of broadcasting to an (N, N) matrix
  Y_ = model(x_test).squeeze(-1)
  loss = (Y_ - Y_test).pow(2).mean().pow(0.5)
  print(loss.item())

## Images?

Load the Labeled Faces in the Wild dataset

In [None]:
# 0. Split data into train/test and do any pre-processing

# 1. Dataloader? Can computer handle entire dataset at once?

# 2. What's my architecture/model?

# 3. What's my cost/loss function?

# 4. What's my optimizer?

# 5. What's my evaluation function?

In [None]:
# NOTE(review): 0.333 is presumably the fraction of samples held out for
#   testing - defined by LFWUtils, TODO confirm
train, test = LFWUtils.train_test_split(0.333)

# pixels become float32 inputs; labels keep the dtype torch infers for them
x_train, Y_train = torch.tensor(train["pixels"], dtype=torch.float32), torch.tensor(train["labels"])
x_test, Y_test = torch.tensor(test["pixels"], dtype=torch.float32), torch.tensor(test["labels"])

In [None]:
# Preview every 100th training image together with its numeric label and name
n_train_images = len(train["pixels"])
for idx in range(0, n_train_images, 100):
  display(make_image(train["pixels"][idx], 130))
  label_id = train["labels"][idx]
  print(label_id, LFWUtils.LABELS[label_id])

## DataLoader

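- batch: serves the dataset in fixed-size groups of samples instead of all at once
- random: can shuffle the order of the samples on every pass through the data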

In [None]:
class FaceDataset(Dataset):
  """Pairs a collection of images with their labels so they can be indexed together."""

  def __init__(self, imgs, labels):
    # kept exactly as given; indexing is delegated to these containers
    self.imgs, self.labels = imgs, labels

  def __len__(self):
    # the label collection defines how many samples there are
    n_samples = len(self.labels)
    return n_samples

  def __getitem__(self, idx):
    # one (image, label) pair at position idx
    img = self.imgs[idx]
    label = self.labels[idx]
    return img, label

In [None]:
# training batches are reshuffled on every pass; test batches keep dataset order
train_dataloader = DataLoader(
  dataset=FaceDataset(x_train, Y_train),
  batch_size=256,
  shuffle=True,
)
test_dataloader = DataLoader(
  dataset=FaceDataset(x_test, Y_test),
  batch_size=128,
)

In [None]:
def calc_accuracy(model, data):
  """Return the fraction of samples in `data` that `model` classifies correctly.

  Args:
    model: module whose output has one score per class along dim 1.
    data: DataLoader yielding (inputs, labels) batches; its `.dataset`
      length is used as the total sample count.

  Returns:
    Accuracy as a float in [0, 1]; 0.0 when the dataset is empty.

  The model is evaluated in eval mode with gradients disabled, and its
  previous train/eval mode is restored afterwards so that calling this in the
  middle of training does not silently disable layers such as dropout.
  """
  was_training = model.training
  model.eval()
  csum = 0
  try:
    with torch.no_grad():
      for x, Y in data:
        # predicted class = index of the highest score
        Y_ = model(x).argmax(dim=1)
        csum += (Y_ == Y).sum().item()
  finally:
    # put the model back in whichever mode the caller had it in
    model.train(was_training)

  total = len(data.dataset)
  # avoid ZeroDivisionError on an empty dataset
  return csum / total if total > 0 else 0.0

In [None]:
# Single linear layer: one input per pixel column, one output score per
#   distinct label seen in the training set
learning_rate = 1e-6
model = nn.Linear(in_features=x_train.shape[1], out_features=len(Y_train.unique()))

loss_fn = nn.CrossEntropyLoss()
optim = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

In [None]:
# Train for 32 epochs, reporting accuracy on every other epoch.
for e in range(32):
  # the evaluation helpers call model.eval(), so make sure the model is in
  #   training mode at the start of every epoch
  model.train()
  for x, Y in train_dataloader:
    optim.zero_grad()
    Y_ = model(x)
    loss = loss_fn(Y_, Y)
    loss.backward()
    optim.step()

  if e % 2 == 0:
    train_acc = calc_accuracy(model, train_dataloader)
    test_acc = calc_accuracy(model, test_dataloader)
    # the reported loss is the loss of the last batch of this epoch
    print(f"Epoch: {e} loss: {loss.item():.4f}, train acc: {train_acc:.4f}, test acc: {test_acc:.4f}")

In [None]:
# Accuracy of the current model on the test batches, shown as the cell output
calc_accuracy(model, test_dataloader)

In [None]:
def get_labels(model, data):
  """Collect true and predicted labels for every sample in `data`.

  Args:
    model: module whose output has one score per class along dim 1.
    data: DataLoader yielding (inputs, labels) batches.

  Returns:
    (data_labels, pred_labels): two lists of Python numbers in the order the
    loader produced the samples, so position i in both lists refers to the
    same sample.

  The model is evaluated in eval mode with gradients disabled, and its
  previous train/eval mode is restored afterwards.
  """
  was_training = model.training
  model.eval()
  data_labels = []
  pred_labels = []
  try:
    with torch.no_grad():
      for x, Y in data:
        # predicted class = index of the highest score
        Y_ = model(x).argmax(dim=1)
        data_labels += Y.tolist()
        pred_labels += Y_.tolist()
  finally:
    # put the model back in whichever mode the caller had it in
    model.train(was_training)
  return data_labels, pred_labels

In [None]:
# True vs. predicted labels for both splits
train_labels, train_pred_labels = get_labels(model, train_dataloader)
test_labels, test_pred_labels = get_labels(model, test_dataloader)

# one confusion matrix per split: training first, then test
for true_labels, pred_labels in (
  (train_labels, train_pred_labels),
  (test_labels, test_pred_labels),
):
  display_confusion_matrix(true_labels, pred_labels, display_labels=LFWUtils.LABELS)

In [None]:
# Two-layer classifier with dropout before each linear layer
learning_rate = 1e-6
n_inputs = x_train.shape[1]
n_hidden = n_inputs // 8
n_classes = len(Y_train.unique())

model = nn.Sequential(
  nn.Dropout(p=0.2),
  nn.Linear(n_inputs, n_hidden),
  nn.ReLU(),
  nn.Dropout(p=0.2),
  nn.Linear(n_hidden, n_classes),
)

loss_fn = nn.CrossEntropyLoss()
optim = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

In [None]:
# Train for 32 epochs, reporting accuracy on every other epoch.
for e in range(32):
  # back to training mode at the start of each epoch (the accuracy helper
  #   calls model.eval())
  model.train()
  for x, Y in train_dataloader:
    optim.zero_grad()
    Y_ = model(x)
    loss = loss_fn(Y_, Y)
    loss.backward()
    optim.step()

  # only report on even-numbered epochs
  if e % 2 != 0:
    continue

  train_acc = calc_accuracy(model, train_dataloader)
  test_acc = calc_accuracy(model, test_dataloader)
  # the reported loss is the loss of the last batch of this epoch
  print(f"Epoch: {e} loss: {loss.item():.4f}, train acc: {train_acc:.4f}, test acc: {test_acc:.4f}")

In [None]:
# (train accuracy, test accuracy) of the current model, shown as the cell output
calc_accuracy(model, train_dataloader), calc_accuracy(model, test_dataloader)

In [None]:
# True vs. predicted labels for both splits
train_labels, train_pred_labels = get_labels(model, train_dataloader)
test_labels, test_pred_labels = get_labels(model, test_dataloader)

# one confusion matrix per split: training first, then test
for true_labels, pred_labels in (
  (train_labels, train_pred_labels),
  (test_labels, test_pred_labels),
):
  display_confusion_matrix(true_labels, pred_labels, display_labels=LFWUtils.LABELS)

In [None]:
# NOTE(review): presumably reports the labels with the highest precision on the
#   test set - behavior is defined in data_utils.LFWUtils, TODO confirm
LFWUtils.top_precision(test_labels, test_pred_labels)

In [None]:
# NOTE(review): presumably reports the labels with the highest recall on the
#   test set - behavior is defined in data_utils.LFWUtils, TODO confirm
LFWUtils.top_recall(test_labels, test_pred_labels)