In [1]:
import torch
from torch.utils.tensorboard import SummaryWriter

In [56]:
import os

def delete_folder(folder_path):
    if os.path.exists(folder_path):
        for root, dirs, files in os.walk(folder_path, topdown=False):
            for name in files:
                os.remove(os.path.join(root, name))
            for name in dirs:
                os.rmdir(os.path.join(root, name))
        os.rmdir(folder_path)
    else:
        print(f"The folder {folder_path} does not exist")

delete_folder('./runs')

In [3]:
from torchvision import datasets, transforms
from torch.utils.data import Subset

train_dataset_all = datasets.MNIST(root='./data', train=True, download=True,
                               transform=transforms.ToTensor())
test_dataset = datasets.MNIST(root='./data', train=False, download=True,
                              transform=transforms.ToTensor(), )

subset_size = 55000  # The size of the subset I want
indices = torch.randperm(len(train_dataset_all))[:subset_size]
train_dataset = Subset(train_dataset_all, indices)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)
device = torch.device('cpu')

## Model Definition

In [4]:
import torch.nn as nn

class LogisticRegression(nn.Module):
    def __init__(self, input_dim, output_dim, params=None):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        if params:
            self.linear.weight = nn.Parameter(params['weight'])
            self.linear.bias = nn.Parameter(params['bias'])
        # self.linear.weight = nn.Parameter(torch.zeros(output_dim - 1,input_dim))
        # self.linear.bias = nn.Parameter(torch.zeros(output_dim - 1,input_dim))
        
    def forward(self, x):
        x = x.view(-1, self.linear.in_features)
        # outputs = torch.nn.functional.softmax(self.linear(x))
        outputs = self.linear(x)
        return outputs

class LogisticRegressionRestricted(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegressionRestricted, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim - 1)
        # self.linear.weight = nn.Parameter(torch.zeros(output_dim - 1,input_dim))
        # self.linear.bias = nn.Parameter(torch.zeros(output_dim - 1,input_dim))
        
    def forward(self, x):
        x = x.view(-1, self.linear.in_features)
        # outputs = torch.nn.functional.softmax(self.linear(x))
        outputs = self.linear(x)
        zeros_for_last_class = torch.zeros(
            (outputs.shape[0], 1),
            device=x.device,
            dtype=x.dtype
        )
        output_with_zeros = torch.cat((outputs, zeros_for_last_class), dim=1)
        return output_with_zeros

In [66]:
def train(model, optimizer, criterion, train_dataset, writer=None, t=20, leave_out_indices=None):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    batch_num = 0
    if leave_out_indices:
        train_dataset = Subset(train_dataset, [i for i in range(len(train_dataset)) if i not in leave_out_indices])
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=False)
    for epoch in range(t):
        model.train() # Set the model to training mode
        for batch_data, batch_labels in train_loader:
            batch_data, batch_labels = batch_data.to(device), batch_labels.to(device)
            optimizer.zero_grad()
            batch_data = batch_data.view(-1, 28*28)
            output = model(batch_data)
            loss = criterion(output, batch_labels)
            
            if writer:
                writer.add_scalar('training loss', loss, batch_num)
            loss.backward()
            optimizer.step(lambda: loss)
            batch_num += 1
            
    return batch_num

In [6]:
def test(model, criterion, test_dataset, device='cpu'):
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

    val_loss = 0
    val_correct = 0
    incorrect_data_list, incorrect_label_list = [], []
    with torch.no_grad():  # Disable gradient calculation during validation
        for data, labels in test_loader:
            # Transfer data to the appropriate device (CPU or GPU)
            data, labels = data.to(device), labels.to(device)

            # Forward pass
            outputs = model(data.view(-1, 28*28))

            # Calculate loss
            loss = criterion(outputs, labels)

            # Update validation metrics (e.g., accuracy)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            val_correct += (predicted == labels).sum().item()
            incorrect_data_list.append(data[predicted != labels])
            incorrect_label_list.append(labels[predicted != labels])
            
    incorrect_data = torch.cat(incorrect_data_list, dim=0)
    incorrect_label = torch.cat(incorrect_label_list, dim=0)
    val_loss /= len(test_dataset)
    val_accuracy = 100 * val_correct / len(test_dataset)
    return val_loss, val_accuracy, incorrect_data, incorrect_label

## Train the baseline model in Adam Optimizer and calculate $L(z, \hat{\theta})$

In [7]:
model = LogisticRegression(28*28, 10)
initial_params = {"weight": model.linear.weight.data.clone(), "bias": model.linear.bias.data.clone()}
optimizer_adam = torch.optim.Adam(model.parameters())
criterion = torch.nn.CrossEntropyLoss()

In [67]:
writer = SummaryWriter('runs/logistic_regression_10_mnist') 
batch_num = train(model, optimizer_adam, criterion, train_dataset, writer, t=5)

torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.float64
torch.

KeyboardInterrupt: 

In [9]:
val_loss, val_accuracy, incorrect_data, incorrect_label = test(model, criterion, test_dataset, device='cuda:0')
print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')


Validation Loss: 0.0043, Validation Accuracy: 92.23%


## Train the baseline model in L-BFGS Optimizer and calculate $L(z, \hat{\theta})$

In [10]:
def train_lbfgs(model, optimizer, criterion, train_dataset, writer=None, t=20, leave_out_indices=None):
    device = torch.device('cpu')
    model = model.to(device)
    batch_num = 0
    if leave_out_indices:
        train_dataset = Subset(train_dataset, [i for i in range(len(train_dataset)) if i not in leave_out_indices])
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=False)
    for epoch in range(t):
        model.train() # Set the model to training mode
        for batch_data, batch_labels in train_loader:
            batch_data, batch_labels = batch_data.to(device), batch_labels.to(device)
            
            def closure():
                optimizer.zero_grad()
                batch_data_reshaped = batch_data.view(-1, 28*28)
                output = model(batch_data_reshaped)
                loss = criterion(output, batch_labels)
                loss.backward()
                return loss
            
            loss_val = optimizer.step(closure)
            if writer:
                writer.add_scalar('training loss', loss_val, batch_num)
            batch_num += 1
    print(f"epoch {epoch} finished")        
    return batch_num

In [11]:
writer_lbfgs = SummaryWriter('runs/logistic_regression_10_mnist_lbfgs')
params = {"weight": initial_params["weight"].clone(), "bias": initial_params["bias"].clone()}
model_lbfgs = LogisticRegression(28*28, 10, params=params)
optimizer_lbfgs = torch.optim.LBFGS(model_lbfgs.parameters())

def criterion_l2(output, target, model):
    loss = criterion(output, target)
    l2_reg = 0.0
    for param in model.parameters():
        l2_reg += torch.norm(param, 2)**2
    loss += 0.01 * l2_reg
    return loss

batch_num = train_lbfgs(model_lbfgs, optimizer_lbfgs, lambda x, y: criterion_l2(x, y, model_lbfgs), train_dataset, writer=writer_lbfgs, t=1)

epoch 0 finished


In [12]:
val_loss_lbfgs, val_accuracy_lbfgs, incorrect_data_lbfgs, incorrect_label_lbfgs = test(model_lbfgs, lambda x, y: criterion_l2(x, y, model_lbfgs), test_dataset)
print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')

Validation Loss: 0.0043, Validation Accuracy: 92.23%


In [13]:
val_loss, val_accuracy, incorrect_data, incorrect_label = val_loss_lbfgs, val_accuracy_lbfgs, incorrect_data_lbfgs, incorrect_label_lbfgs

## Calculate loss difference of leave-one-out retraining

In [14]:
device = torch.device('cpu')
test_data_index = torch.randint(0, len(incorrect_data), (1,)).item()
x_test, y_test = incorrect_data[test_data_index], incorrect_label[test_data_index]

In [15]:
def calc_leave_one_out_loss_diff(model, initial_params, optimizer, criterion, test_data, test_label, leave_out_indices, t=5):
    params = {"weight": initial_params["weight"].clone(), "bias": initial_params["bias"].clone()}
    loss_z_test_with_z = criterion(model(test_data), test_label.view(1))
    model_leave_one_out = LogisticRegression(28*28, 10, params=params)
    optimizer_leave_one_out = optimizer(model_leave_one_out.parameters())
    train(model_leave_one_out, optimizer_leave_one_out, criterion, train_dataset, t=t, leave_out_indices=leave_out_indices)
    loss_z_test_without_z = criterion(model_leave_one_out(test_data), test_label.view(1))
    leave_one_out_loss_diff = loss_z_test_without_z - loss_z_test_with_z
    print(f"loss_z_test_with_z: {loss_z_test_with_z}, loss_z_test_without_z: {loss_z_test_without_z}, leave_one_out_loss_diff: {leave_one_out_loss_diff}")
    return leave_one_out_loss_diff

In [16]:
writer_lbfgs_retrained = SummaryWriter('runs/logistic_regression_10_mnist_lbfgs_retrained')
def calc_leave_one_out_loss_diff_lbfgs(model, initial_params, criterion, test_data, test_label, leave_out_indices, t=1):
    params = {"weight": initial_params["weight"].clone(), "bias": initial_params["bias"].clone()}
    loss_z_test_with_z = criterion(model(test_data), test_label.view(1))
    model_leave_one_out = LogisticRegression(28*28, 10, params=params)
    optimizer_leave_one_out = torch.optim.LBFGS(model_leave_one_out.parameters())
    train_lbfgs(model_leave_one_out, optimizer_leave_one_out, criterion, train_dataset, writer=writer_lbfgs_retrained, t=t, leave_out_indices=leave_out_indices)
    loss_z_test_without_z = criterion(model_leave_one_out(test_data), test_label.view(1))
    leave_one_out_loss_diff = loss_z_test_without_z - loss_z_test_with_z
    print(f"loss_z_test_with_z: {loss_z_test_with_z}, loss_z_test_without_z: {loss_z_test_without_z}, leave_one_out_loss_diff: {leave_one_out_loss_diff}")
    return leave_one_out_loss_diff

In [17]:
# writer_retrain = SummaryWriter('runs/logistic_regression_10_mnist_retrain') 
x_test, y_test = x_test.to(device), y_test.to(device)
print(calc_leave_one_out_loss_diff_lbfgs(model_lbfgs, initial_params, criterion, x_test, y_test, []))

epoch 0 finished
loss_z_test_with_z: 3.8284571170806885, loss_z_test_without_z: nan, leave_one_out_loss_diff: nan
tensor(nan, grad_fn=<SubBackward0>)


## Influence Function 

In [46]:
def uniform_sample_dataset(dataset, t):
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    
    # Shuffle the indices
    sampler = torch.utils.data.sampler.RandomSampler(dataset, num_samples=t)
    sampled_dataloader = torch.utils.data.DataLoader(dataset, batch_size=500, sampler=sampler)
    return sampled_dataloader

In [19]:
sample_loader = uniform_sample_dataset(train_dataset, 10)
for batch_data, batch_labels in sample_loader:
    print(batch_data.shape, batch_labels)

torch.Size([10, 1, 28, 28]) tensor([7, 5, 4, 5, 1, 7, 9, 7, 1, 2])


In [20]:
def calc_criterion_first_order_derivative(loss, model):
    model.zero_grad()
    # Compute the first-order gradient
    loss.backward(create_graph=True)
    param_grads = [ p.grad.flatten() for p in model.parameters() if p.requires_grad ]
    param_grads = torch.cat(param_grads)
    return param_grads

In [21]:
def calc_loss_second_order_derivative_list(loss, model):
    model.zero_grad()
    # Compute the first-order gradient
    loss.backward(create_graph=True)
    first_grads = [ (p.grad.flatten(), p) for p in model.parameters() if p.requires_grad ]
    # Compute the second-order gradient
    second_grads = []
    for first_grad, p in first_grads:
        sub_matrix = []
        for i in range(first_grad.shape[0]):
            sub_matrix.append(torch.autograd.grad(first_grad[i], p, create_graph=True)[0].flatten())
        sub_matrix = torch.stack(sub_matrix)
        second_grads.append(sub_matrix)
    print(second_grads)
    return second_grads

In [22]:
def calc_criterion_second_order_derivative(loss, model):
    model.zero_grad()
    # Compute the first-order gradient
    loss.backward(create_graph=True)
    first_grads = [ p.grad.flatten() for p in model.parameters() if p.requires_grad ]
    # Compute the second-order gradient
    second_grads = []
    for first_grad in first_grads:
        row = []
        for p in model.parameters():
            sub_matrix = []
            for i in range(first_grad.shape[0]):
                sub_matrix.append(torch.autograd.grad(first_grad[i], p, create_graph=True)[0].flatten())
            sub_matrix = torch.stack(sub_matrix)
            row.append(sub_matrix)
        row = torch.cat(row, dim=1)
        second_grads.append(row)
    second_grads = torch.cat(second_grads, dim=0)
    return second_grads

In [23]:
test_model = torch.nn.Linear(2, 2)
test_model = LogisticRegression(2, 2)

In [24]:
data_tensor = torch.tensor([[1.0, -2.0]], requires_grad=True)
label_tensor = torch.tensor([1.0, 0.0])
label_tensor_long = torch.tensor([0])

print(test_model(data_tensor)-label_tensor)

tensor([[-1.0115,  0.2502]], grad_fn=<SubBackward0>)


In [25]:
print(calc_criterion_first_order_derivative(torch.nn.MSELoss()(test_model(data_tensor), label_tensor), test_model))
hessian = calc_criterion_second_order_derivative(criterion(test_model(data_tensor), label_tensor_long), test_model)
print(hessian, torch.all(torch.linalg.eigvals(hessian).real >= 0))
calc_loss_second_order_derivative_list(torch.nn.MSELoss()(test_model(data_tensor), label_tensor), test_model)

tensor([-1.0115,  2.0230,  0.2502, -0.5004, -1.0115,  0.2502],
       grad_fn=<CatBackward0>)
tensor([[ 0.2458, -0.4915, -0.2458,  0.4915,  0.2458, -0.2458],
        [-0.4915,  0.9831,  0.4915, -0.9831, -0.4915,  0.4915],
        [-0.2458,  0.4915,  0.2458, -0.4915, -0.2458,  0.2458],
        [ 0.4915, -0.9831, -0.4915,  0.9831,  0.4915, -0.4915],
        [ 0.2458, -0.4915, -0.2458,  0.4915,  0.2458, -0.2458],
        [-0.2458,  0.4915,  0.2458, -0.4915, -0.2458,  0.2458]],
       grad_fn=<CatBackward0>) tensor(False)
[tensor([[ 1., -2.,  0., -0.],
        [-2.,  4.,  0., -0.],
        [ 0., -0.,  1., -2.],
        [ 0., -0., -2.,  4.]], grad_fn=<StackBackward0>), tensor([[1., 0.],
        [0., 1.]])]


  return F.mse_loss(input, target, reduction=self.reduction)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


[tensor([[ 1., -2.,  0., -0.],
         [-2.,  4.,  0., -0.],
         [ 0., -0.,  1., -2.],
         [ 0., -0., -2.,  4.]], grad_fn=<StackBackward0>),
 tensor([[1., 0.],
         [0., 1.]])]

In [26]:
device = torch.device('cuda:0')
one_train_dataloader = uniform_sample_dataset(train_dataset, 1)
for batch_data, batch_labels in one_train_dataloader:
    batch_data, batch_labels = batch_data.to(device), batch_labels.to(device)
    print(calc_criterion_first_order_derivative(criterion(model(batch_data), batch_labels), model).shape)
    hessian = calc_criterion_second_order_derivative(torch.nn.MSELoss()(model(batch_data), batch_labels.to(torch.float32)), model)
    print(hessian.shape)

torch.Size([7850])


  return F.mse_loss(input, target, reduction=self.reduction)


torch.Size([7850, 7850])


In [27]:
def hvp(y, w, v):
    # First derivative
    first_grads = torch.autograd.grad(y, w, retain_graph=True, create_graph=True)

    first_grads = [g.flatten() for g in first_grads]
    first_grads = torch.cat(first_grads, dim=0)
    
    # Calculate the element-wise product between the first gradients and the vector v
    elemwise_products = torch.sum(first_grads * v)

    # Second derivative
    second_grads = torch.autograd.grad(elemwise_products, w, create_graph=True)
    second_grads = [g.flatten() for g in second_grads]
    second_grads = torch.cat(second_grads, dim=0)

    return second_grads

In [28]:
params = [p for p in test_model.parameters()]
vector = calc_criterion_first_order_derivative(torch.nn.MSELoss()(test_model(data_tensor), label_tensor), test_model)
expected = torch.matmul(calc_criterion_second_order_derivative(torch.nn.MSELoss()(test_model(data_tensor), label_tensor), test_model), vector.T)
vector._grad_fn = None
test_model.zero_grad()
actual = hvp(torch.nn.MSELoss()(test_model(data_tensor), label_tensor), params, vector)
print(expected, actual)
assert torch.equal(expected, actual)

tensor([-6.0690, 12.1380,  1.5012, -3.0024, -6.0690,  1.5012],
       grad_fn=<MvBackward0>) tensor([-6.0690, 12.1380,  1.5012, -3.0024, -6.0690,  1.5012],
       grad_fn=<CatBackward0>)


  expected = torch.matmul(calc_criterion_second_order_derivative(torch.nn.MSELoss()(test_model(data_tensor), label_tensor), test_model), vector.T)


In [57]:
def ihvp(train_dataset, test_data, test_label, model, criterion, t, r, device='cpu', reg_factor=0.91, scale=0.04, unique_datapoint=None, ihvp_summary_writer=SummaryWriter('runs/ihvp_sum_summary')):
    ihvp_eval_avg = 0
    test_data, test_label = test_data.to(device), test_label.to(device)
    model = model.to(device)
    def scaled_criterion(output, label):
        return criterion(output, label) * scale
    
    vector = calc_criterion_first_order_derivative(scaled_criterion(model(test_data), test_label), model)
    for i in range(r):
        if unique_datapoint is None:
            sampled_train_dataset = [(data, label) for data, label in uniform_sample_dataset(train_dataset, t)]
        else:
            sampled_train_dataset = []
            for _ in range(t):
                sampled_train_dataset.append(unique_datapoint)
        # Step 1. Initialize the evaluation of the Hessian-vector product
        ihvp_eval = vector
        data_number = 0
        for data, label in sampled_train_dataset:
            data, label = data.to(device), label.to(device)
            # Step 2. Compute the second order gradient of the loss w.r.t. the model parameters
            model.zero_grad()
            data_tensor = data.view(-1, 28*28)
            params = [p for p in model.parameters()]
            ihvp_eval._grad_fn = None
            # Step 3. Compute the inner product between the Hessian matrix and the test gradient vector using HVP
            second_grads = hvp(scaled_criterion(model(data_tensor), label), params, ihvp_eval)
            hessian = calc_criterion_second_order_derivative(criterion(model(data_tensor), label), model)
            regularized_hessian = scale * hessian + reg_factor * torch.eye(hessian.shape[0])
            eig_hessian, eig_regularized_hessian = torch.linalg.eigvals(hessian), torch.linalg.eigvals(regularized_hessian)
            # print(torch.linalg.norm(torch.eye(hessian.shape[0]), ord=2))
            norm_diff_I_regularized_hessian = torch.linalg.norm(torch.eye(hessian.shape[0])-regularized_hessian, ord=2)
            # print(norm_diff_I_regularized_hessian.sum())
            # regularized_hessian_eigvals = torch.linalg.eigvals(regularized_hessian).real
            # print(f"is semi positive definite: {torch.all(regularized_hessian_eigvals >= 0)}")
            # print(f"regularized hessain eigvals: {regularized_hessian_eigvals}")
            print(f"norm_diff_I_regularized_hessian: {norm_diff_I_regularized_hessian}, {torch.max(torch.linalg.eigvals(torch.eye(hessian.shape[0])-regularized_hessian).real)}")
            # print(f"max eigval for hessian: {torch.max(torch.abs(torch.linalg.eigvals(hessian).real))}")
            return (eig_hessian, eig_regularized_hessian, hessian)
            # hessian_list = calc_loss_second_order_derivative_list(scaled_criterion(model(data_tensor), label), model)
            # hessian_eigvals = torch.linalg.eigvals(hessian).real
            # # diff = torch.norm(torch.eye(hessian.shape[0])-hessian, p=2)
            # print(f"is semi positive definite: {torch.all(hessian_eigvals >= 0)}")
            # print(f"hessain eigvals: {torch.min(hessian_eigvals)}")
            # return hessian_eigvals
            # print(f"diff: {diff}, l2 for I: {torch.norm(torch.eye(hessian.shape[0]), p=2)}, hes: {hessian.sum()}, eig: {torch.max(torch.abs(torch.linalg.eigvals(hessian)))}")
            # return_grads_validation = torch.matmul(hessian, ihvp_eval.T)
            # print(f"diff: {torch.abs(return_grads.sum() - vector.sum())}, ihvp_eval: {ihvp_eval.sum()}, return_grads: {return_grads.sum()}, vector: {vector.sum()}")
            # print(f"validation: {return_grads_validation.sum()}, hessian: {hessian.sum()}")
            ihvp_eval = (1 - reg_factor) * ihvp_eval + vector - second_grads
            ihvp_summary_writer.add_scalar(f'ihvp_eval_sum_{i}', ihvp_eval.sum(), data_number)
            data_number += 1
        print(f"ihvp iteration {i} done and ihvp sum is {ihvp_eval.sum()}")
        ihvp_eval_avg = i / (i + 1) * ihvp_eval_avg + 1 / (i + 1) * ihvp_eval         
    return ihvp_eval_avg
    

In [30]:
test_dataloader = uniform_sample_dataset(test_dataset, 1)
test_data_list = [(data, label) for data, label in test_dataloader]

In [31]:
device = 'cpu'

In [32]:
uni_data_loader  = uniform_sample_dataset(train_dataset, 1)
uni_data_tuple = ()
for data, label in uni_data_loader:
    uni_data_tuple = (data.to(device), label.to(device))

In [60]:
ihvp_summary_writer = SummaryWriter('runs/ihvp_sum_summary') 
model = model.to(device)
for test_data, test_label in test_data_list:
    test_data, test_label = test_data.to(device), test_label.to(device)
    print(f"Test data value sum: {test_data.sum()}, Test data label: {test_label}")
    model.zero_grad()
    # ihvp_eval = ihvp(train_dataset, test_data, test_label, model, criterion, 20000, 1, reg_factor=5e-4, scale=0.04, ihvp_summary_writer=ihvp_summary_writer, unique_datapoint=uni_data_tuple)
    eig_hessian, eig_reg_hessian, hessian = ihvp(train_dataset, test_data, test_label, model, criterion, 15000, 10, reg_factor=1e-5, scale=1e-3, ihvp_summary_writer=ihvp_summary_writer, unique_datapoint=uni_data_tuple)
    # ihvp(train_dataset, test_data, test_label, model, criterion, 10000, 10, device='cpu', reg_factor=0, scale=0.8, ihvp_summary_writer=ihvp_summary_writer)

Test data value sum: 83.81175994873047, Test data label: tensor([8])
norm_diff_I_regularized_hessian: 0.9999964237213135, 0.9999954104423523


In [62]:
eig_hessian_real = eig_hessian.real
eig_reg_hessian_real = eig_reg_hessian.real

In [41]:
I = torch.eye(hessian.shape[0])

reg_hessian = 0.04 * hessian + 1e-2 * I

In [42]:
torch.max(torch.linalg.eigvals(I-reg_hessian).real)

tensor(0.9900, grad_fn=<MaxBackward1>)

In [63]:
ihvp_eval.dtype

torch.float32

In [50]:
ihvp_eval.sum()

tensor(-0.0238, grad_fn=<SumBackward0>)

In [36]:
def upweighting_loss_influence_function(train_dataset, upweighted_data, upweighted_label, test_data, test_label, model, criterion):
    # Step 1. Compute the Hessian-vector product
    ihvp_eval = ihvp(train_dataset, test_data, test_label, model, criterion)
    # Step 2. Compute the influence function
    influence = torch.dot(-ihvp_eval, calc_criterion_first_order_derivative(upweighted_data, upweighted_label, criterion, model))
    return influence

## Calculate loss difference of leave-one-out retraining

**Here is a validation**

In [45]:
import numpy as np

I = np.identity(2)
A = np.array([[0.5,0.5],[0.5,0.75]])

print(np.linalg.eigvals(A))

print(np.linalg.norm(I - A, ord=2))
U,S,Vh = np.linalg.svd(I - A)
print(S.max())
A_inv = I

A_inv_ = np.linalg.inv(A)

for i in range(100):
    A_inv = A_inv + I - A@A_inv
    
print(A_inv_)
print(A_inv)

[0.1096118 1.1403882]
0.8903882032022074
0.8903882032022074
[[ 6. -4.]
 [-4.  4.]]
[[ 5.99995419 -3.99996423]
 [-3.99996423  3.99997207]]


In [38]:
def is_semi_positive_definite(matrix):
    return torch.all(torch.linalg.eigvals(matrix).real >= -1e-5)

In [39]:
logistic_test_model = LogisticRegression(2, 2)
logistic_test_model.linear.weight.data = torch.tensor([[1.0, 2.0]])
logistic_test_model.linear.bias.data = torch.tensor([-1.0])
print(logistic_test_model.linear.weight, logistic_test_model.linear.bias)
# logistic_test_model.weight.data = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
# logistic_test_model.bias.data = torch.tensor([1.0])

input = torch.tensor([[1.0, 2.0]], requires_grad=True)
output = logistic_test_model(input)
label = torch.tensor([0])
label_float = torch.tensor([1.0, 0.0])
print(output, label)

hessian_expected = torch.sigmoid(output[0][0]) * (1-torch.sigmoid(output[0][0])) * (input.T @ input)
print(f"hessain expected: {hessian_expected}, is semi positive definite: {is_semi_positive_definite(hessian_expected)}")

loss = torch.nn.functional.cross_entropy(output, label) * 1000
loss_MSE = torch.nn.MSELoss()(output, label_float)

# jaccobian = calc_criterion_first_order_derivative(loss, logistic_test_model)
print(torch.softmax(output, dim=1))
# print(jaccobian)

test_hessian = calc_criterion_second_order_derivative(loss, logistic_test_model)
test_hessian_list = calc_loss_second_order_derivative_list(loss, logistic_test_model)
print(test_hessian, is_semi_positive_definite(test_hessian))
test_hessian_MSE = calc_criterion_second_order_derivative(loss_MSE, logistic_test_model)
print(f"test_hessian_MSE: {test_hessian_MSE}, is_semi_positive_definite: {is_semi_positive_definite(test_hessian_MSE)}")
# print(torch.linalg.eigvals(test_hessian_list[0]).real)
print(torch.linalg.eigvals(test_hessian).real)

Parameter containing:
tensor([[1., 2.]], requires_grad=True) Parameter containing:
tensor([-1.], requires_grad=True)
tensor([[4.]], grad_fn=<AddmmBackward0>) tensor([0])
hessain expected: tensor([[0.0177, 0.0353],
        [0.0353, 0.0707]], grad_fn=<MulBackward0>), is semi positive definite: True
tensor([[1.]], grad_fn=<SoftmaxBackward0>)
[tensor([[0., 0.],
        [0., 0.]], grad_fn=<StackBackward0>), tensor([[0.]], grad_fn=<StackBackward0>)]
tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]], grad_fn=<CatBackward0>) tensor(True)
test_hessian_MSE: tensor([[2., 4., 2.],
        [4., 8., 4.],
        [2., 4., 2.]], grad_fn=<CatBackward0>), is_semi_positive_definite: True
tensor([0., 0., 0.], grad_fn=<SelectBackward0>)


  return F.mse_loss(input, target, reduction=self.reduction)
