In [1]:
import torch
import torch.nn as nn
import numpy as np
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.utils.data as data_utils
from ucimlrepo import fetch_ucirepo
from torch.utils.data import DataLoader
from model import WideModel
from tqdm.auto import tqdm

In [2]:
# load dataset
# pip install ucimlrepo

# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
# data (as pandas dataframes) 
X = torch.tensor(breast_cancer_wisconsin_diagnostic.data.features.values, dtype=torch.float32)
y = breast_cancer_wisconsin_diagnostic.data.targets
y["Diagnosis"] = y["Diagnosis"].map({"M": 1, "B": 0})
y = torch.tensor(y.values, dtype=torch.float32)

train = data_utils.TensorDataset(X[:455], y[:455])
test = data_utils.TensorDataset(X[455:], y[455:])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y["Diagnosis"] = y["Diagnosis"].map({"M": 1, "B": 0})


In [3]:
# create dataloaders. Batch size must be 1
batch_size = 1
train_dataloader = DataLoader(train, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test, batch_size=batch_size, shuffle=True)

In [4]:
# create device
dev = "cuda" if torch.cuda.is_available() else "cpu"
print("using device", dev)

# create model
model = WideModel(input_dim=30,hidden_dim_scale = 20, output_dim=1).to(dev)

# create optimizer
lr = 1e-3
optimizer = torch.optim.SGD([p for p in model.parameters()], lr=lr)

using device cpu


In [6]:
# get linearized models:
num_params = len(model.flatten_parameters())

# we reduce f(x,w) to Aw+B, where there is a different A,B per x

As = torch.empty((0,num_params))
Bs = torch.empty((0,))
ys = torch.empty((0,))

for x,y in tqdm(train_dataloader):
    x = x.to(dev)
    
    # A = gradient matrix of logits
    A = model.flatten_gradient(x).unsqueeze(0)
    # print(A.shape)
    As = torch.concat((As, A), dim=0)
    
    # B = f(x,w) - A w
    B = model.forward(x) - A @ model.flatten_parameters()
    Bs = torch.concat((Bs, B[:,0]), dim=0)
    
    ys = torch.concat((ys, y[:,0]), dim=0)
    
    model.Adict[x] = A
    model.Bdict[x] = B
    
model.update_stored_linear_tensors(As, Bs)    

  0%|          | 0/455 [00:00<?, ?it/s]

In [7]:
As

tensor([[ -0.5066,  -0.6980,  -3.4752,  ...,  25.8849,  69.7248,   1.0000],
        [ -0.4540,  -0.5858,  -2.8905,  ...,  21.2490,  55.1975,   1.0000],
        [ -0.4959,  -0.6057,  -3.2478,  ...,  20.3505,  75.1512,   1.0000],
        ...,
        [ -0.4689,  -0.5620,  -2.9864,  ...,  21.1079,  61.0999,   1.0000],
        [ -0.7197,  -0.8181,  -4.7032,  ...,  30.5737, 151.9301,   1.0000],
        [ -0.4462,  -0.6774,  -2.9100,  ...,  21.4290,  55.0188,   1.0000]])

In [20]:

# change weights to be a normal scale

f = model.batched_linearized_forward(model.w0)
while f.abs().max() > 1:
    print("updated")
    w = model.flatten_parameters()/2
    model.update_parameters(w)
    f = model.batched_linearized_forward(w)

In [21]:
# get linearized models:
num_params = len(model.flatten_parameters())

# we reduce f(x,w) to Aw+B, where there is a different A,B per x

As = torch.empty((0,num_params))
Bs = torch.empty((0,))
ys = torch.empty((0,))

for x,y in tqdm(train_dataloader):
    x = x.to(dev)
    
    # A = gradient matrix of logits
    A = model.flatten_gradient(x).unsqueeze(0)
    # print(A.shape)
    As = torch.concat((As, A), dim=0)
    
    # B = f(x,w) - A w
    B = model.forward(x) - A @ model.flatten_parameters()
    Bs = torch.concat((Bs, B[:,0]), dim=0)
    
    ys = torch.concat((ys, y[:,0]), dim=0)
    
    model.Adict[x] = A
    model.Bdict[x] = B
    
model.update_stored_linear_tensors(As, Bs)
    

  0%|          | 0/455 [00:00<?, ?it/s]

In [22]:
Bs

tensor([[-0.0311, -0.0472, -0.1993,  ...,  1.4814,  4.3266,  1.0000],
        [-0.0426, -0.0499, -0.2831,  ...,  1.7597,  9.0688,  1.0000],
        [-0.0266, -0.0553, -0.1723,  ...,  1.2860,  3.2890,  1.0000],
        ...,
        [-0.0439, -0.0558, -0.2895,  ...,  1.9796,  8.7738,  1.0000],
        [-0.0280, -0.0461, -0.1824,  ...,  1.3851,  3.3790,  1.0000],
        [-0.0290, -0.0429, -0.1834,  ...,  1.3205,  3.7999,  1.0000]])

In [9]:
w = model.w0
print(w)
for step in range(1):
    print(f"Starting Newton step {step}")
    old_w = w
    w = model.newton_update(w, ys)
    print(w)
    print(f"change in w is: {torch.linalg.norm(w-old_w)}")
    # print(f"Achieved loss")

tensor([-0.0399, -0.0743,  0.0156,  ..., -0.0099,  0.0023, -0.0127],
       grad_fn=<CatBackward0>)
Starting Newton step 0
f is nan any: False
dl[0], 15.01091480255127
dl is nan any: False
invddl[0], tensor([ 1.7200e-06,  7.0556e-06,  7.6109e-06,  ..., -3.0915e-05,
         2.6660e-05, -2.7797e-06], grad_fn=<SelectBackward0>)
invddl is nan any: False
tensor([ 4.7132e+00,  7.5275e+01,  1.1787e+02,  ..., -6.4798e+03,
         8.2259e+03,  3.7564e+03], grad_fn=<SubBackward0>)
change in w is: 797832.0625


In [10]:
f = model.batched_linearized_forward(w)
dl = (torch.exp(f)/(1+torch.exp(f)) - ys) @ model.Atensor

In [19]:
(torch.exp(f[3])/(1+torch.exp(f[3])))

tensor(nan, grad_fn=<DivBackward0>)

In [16]:
f[3]

tensor(784817.9375, grad_fn=<SelectBackward0>)

In [13]:
torch.exp(f)/(1+torch.exp(f))

tensor([0., 0., 0., nan, nan, nan, 0., 0., 0., nan, 0., 0., nan, 0., 0., nan, nan, nan, 0., 0., nan, 0., nan, 0.,
        nan, 0., nan, 0., 0., 0., 0., 0., nan, nan, nan, 0., nan, nan, 0., 0., nan, nan, 0., nan, 0., nan, 0., nan,
        0., 0., 0., nan, nan, 0., 0., nan, nan, nan, 0., 0., 0., nan, nan, 0., nan, nan, 0., 0., nan, 0., 0., nan,
        0., nan, 0., nan, nan, 0., 0., 0., 0., 0., 0., nan, nan, 0., nan, nan, 0., nan, 0., nan, 0., nan, 0., 0.,
        0., 0., 0., 0., nan, 0., 0., 0., 0., 0., nan, nan, nan, 0., nan, nan, 0., nan, nan, nan, 0., nan, nan, nan,
        0., 0., nan, 0., nan, 0., nan, nan, nan, nan, nan, 0., nan, 0., nan, 0., 0., 0., 0., nan, nan, nan, nan, 0.,
        0., 0., nan, 0., 0., nan, nan, nan, 0., nan, nan, 0., 0., nan, 0., 0., 0., nan, nan, 0., nan, nan, 0., 0.,
        nan, nan, nan, nan, 0., 0., nan, 0., nan, 0., 0., nan, 0., nan, nan, 0., nan, nan, 0., 0., nan, nan, 0., nan,
        nan, nan, nan, nan, nan, 0., 0., 0., 0., 0., nan, 0., 0., 0., nan, 

In [12]:
dl

tensor([nan, nan, nan,  ..., nan, nan, nan], grad_fn=<SqueezeBackward3>)

In [None]:
f

In [None]:
(torch.exp(f)/(1+torch.exp(f)) - ys)

In [None]:
torch.exp(f)

In [None]:
ys