In [1]:
import torch
from torch.utils.tensorboard import SummaryWriter



In [2]:
import os

def delete_folder(folder_path):
    """Recursively delete ``folder_path`` and everything beneath it.

    Args:
        folder_path: Path of the directory to remove.

    If the path does not exist, a message is printed and nothing else
    happens. Errors raised by the removal itself (for example a permission
    error) propagate to the caller.
    """
    # Local import keeps this cell self-contained.
    import shutil

    if not os.path.exists(folder_path):
        print(f"The folder {folder_path} does not exist")
        return

    # shutil.rmtree unlinks symlinks found inside the tree instead of calling
    # rmdir on them, which is where the previous hand-rolled os.walk loop
    # failed with NotADirectoryError.
    shutil.rmtree(folder_path)

# Remove logs left under './runs' before the SummaryWriter instances below write into 'runs/...'.
delete_folder('./runs')

In [3]:
from torchvision import datasets, transforms
from torch.utils.data import Subset

# MNIST train/test splits, downloaded to ./data on first use and converted to tensors.
train_dataset_all = datasets.MNIST(root='./data', train=True, download=True,
                               transform=transforms.ToTensor())
test_dataset = datasets.MNIST(root='./data', train=False, download=True,
                              transform=transforms.ToTensor(), )

subset_size = 55000  # Number of training images kept for the experiments
# Random subset of the training split. NOTE(review): no seed is set in the visible cells,
# so the chosen subset differs between kernel sessions.
indices = torch.randperm(len(train_dataset_all))[:subset_size]
train_dataset = Subset(train_dataset_all, indices)

# Fixed-order loaders (shuffle=False). train() and test() below build their own loaders
# from the datasets rather than receiving these two objects.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

In [4]:
import torch.nn as nn

class LogisticRegression(nn.Module):
    """Multinomial logistic regression: one linear layer that returns raw logits.

    No softmax is applied in ``forward``; the notebook pairs this model with
    ``CrossEntropyLoss``, which expects unnormalised logits.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of classes (logits).
        params: Optional dict with ``'weight'`` and ``'bias'`` tensors used to
            initialise the layer. The tensors are copied, so later training
            of this model never modifies the caller's tensors.
    """

    def __init__(self, input_dim, output_dim, params=None):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        if params:
            # nn.Parameter(t) shares storage with t, and optimisers update
            # parameters in place. Without the copy, training this model
            # would silently overwrite the saved initial parameters.
            self.linear.weight = nn.Parameter(params['weight'].detach().clone())
            self.linear.bias = nn.Parameter(params['bias'].detach().clone())

    def forward(self, x):
        """Flatten ``x`` to ``(-1, in_features)`` and return the logits."""
        flat = x.view(-1, self.linear.in_features)
        return self.linear(flat)

class LogisticRegressionRestricted(nn.Module):
    """Logistic regression whose last class logit is pinned to zero.

    The linear layer produces ``output_dim - 1`` logits; ``forward`` appends
    a constant zero column so the result still has ``output_dim`` columns.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # One logit fewer than the number of classes; the last one is fixed.
        self.linear = nn.Linear(input_dim, output_dim - 1)

    def forward(self, x):
        """Return logits of shape ``(rows, output_dim)`` with a zero last column."""
        flat_input = x.view(-1, self.linear.in_features)
        free_logits = self.linear(flat_input)
        # The zero column takes its device and dtype from the input.
        pinned_logit = torch.zeros(
            (free_logits.shape[0], 1),
            device=flat_input.device,
            dtype=flat_input.dtype,
        )
        return torch.cat((free_logits, pinned_logit), dim=1)

In [5]:
def train(model, optimizer, criterion, train_dataset, writer, t=20, leave_out_indices=None):
    """Train ``model`` for ``t`` epochs over ``train_dataset`` in fixed order.

    Args:
        model: Module to optimise.
        optimizer: Optimiser bound to ``model``'s parameters. Closure-based
            optimisers such as LBFGS are supported.
        criterion: Callable ``(output, labels) -> scalar loss``.
        train_dataset: Dataset yielding ``(data, label)`` pairs.
        writer: Object with ``add_scalar(tag, value, step)``; receives the
            pre-update loss of every batch under the tag ``'training loss'``.
        t: Number of epochs.
        leave_out_indices: Optional collection of dataset positions to
            exclude from training. ``None`` or empty means use everything.

    Returns:
        The total number of batches processed.
    """
    if leave_out_indices is not None and len(leave_out_indices) > 0:
        # A set gives O(1) membership tests; int() also accepts tensor entries.
        excluded = {int(i) for i in leave_out_indices}
        kept = [i for i in range(len(train_dataset)) if i not in excluded]
        train_dataset = Subset(train_dataset, kept)
    # shuffle=False keeps the batch order identical between runs.
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=False)

    batch_num = 0
    for epoch in range(t):
        model.train()  # Set the model to training mode
        for batch_data, batch_labels in train_loader:
            # One row per sample, whatever the per-sample shape is.
            batch_data = batch_data.flatten(start_dim=1)

            def closure():
                # Optimisers that re-evaluate the objective (LBFGS) call this
                # several times per step, so it must recompute the forward and
                # backward pass rather than return a stale loss tensor.
                optimizer.zero_grad()
                loss = criterion(model(batch_data), batch_labels)
                loss.backward()
                return loss

            # Single-evaluation optimisers (Adam, SGD) call the closure once,
            # which matches a plain zero_grad / forward / backward / step.
            loss = optimizer.step(closure)
            writer.add_scalar('training loss', loss.item(), batch_num)
            batch_num += 1

    return batch_num

In [6]:
def test(model, criterion, test_dataset):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

    val_loss = 0
    val_correct = 0
    incorrect_data_list, incorrect_label_list = [], []
    with torch.no_grad():  # Disable gradient calculation during validation
        for data, labels in test_loader:
            # Transfer data to the appropriate device (CPU or GPU)
            data, labels = data.to(device), labels.to(device)

            # Forward pass
            outputs = model(data.view(-1, 28*28))

            # Calculate loss
            loss = criterion(outputs, labels)

            # Update validation metrics (e.g., accuracy)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            val_correct += (predicted == labels).sum().item()
            incorrect_data_list.append(data[predicted != labels])
            incorrect_label_list.append(labels[predicted != labels])
            
    incorrect_data = torch.cat(incorrect_data_list, dim=0)
    incorrect_label = torch.cat(incorrect_label_list, dim=0)
    val_loss /= len(test_dataset)
    val_accuracy = 100 * val_correct / len(test_dataset)
    return val_loss, val_accuracy, incorrect_data, incorrect_label

In [21]:
# 10-class logistic regression over flattened 28x28 images.
model = LogisticRegression(28*28, 10)
# Snapshot of the freshly initialised weights, cloned so it is independent of `model`;
# later cells rebuild models from this same starting point.
initial_params = {"weight": model.linear.weight.data.clone(), "bias": model.linear.bias.data.clone()}
print(initial_params)
# Two optimisers over the same parameters; the training cell below passes optimizer_adam.
optimizer_lbfgs = torch.optim.LBFGS(model.parameters())
optimizer_adam = torch.optim.Adam(model.parameters())
criterion = torch.nn.CrossEntropyLoss()

def criterion_l2(output, target):
    """Cross-entropy loss plus an L2 penalty on the notebook-level ``model``.

    Uses the module-level ``criterion`` for the data term and adds
    ``0.01 * sum(||p||_2 ** 2)`` over all parameters of the global ``model``.
    """
    squared_norms = (torch.norm(weights, 2)**2 for weights in model.parameters())
    penalty = sum(squared_norms, 0.0)
    total = criterion(output, target)
    total += 0.01 * penalty
    return total

{'weight': tensor([[-0.0160, -0.0071, -0.0005,  ...,  0.0163, -0.0246,  0.0103],
        [ 0.0106,  0.0180,  0.0217,  ...,  0.0085,  0.0312,  0.0261],
        [-0.0100,  0.0209,  0.0121,  ..., -0.0010,  0.0092, -0.0201],
        ...,
        [ 0.0164,  0.0095, -0.0330,  ..., -0.0157, -0.0188, -0.0186],
        [ 0.0119,  0.0315, -0.0328,  ..., -0.0129, -0.0136, -0.0049],
        [-0.0169,  0.0244, -0.0212,  ...,  0.0170, -0.0356,  0.0041]]), 'bias': tensor([ 0.0197,  0.0177,  0.0009,  0.0026, -0.0104, -0.0284,  0.0033, -0.0143,
         0.0084,  0.0055])}


In [22]:
# Train the baseline model for 5 epochs with Adam, logging the per-batch loss to TensorBoard.
writer = SummaryWriter('runs/logistic_regression_10_mnist') 
batch_num = train(model, optimizer_adam, criterion, train_dataset, writer, t=5)

In [24]:
# Evaluate the trained baseline on the MNIST test split; also keeps the misclassified
# samples, which a later cell draws its test point from.
val_loss, val_accuracy, incorrect_data, incorrect_label = test(model, criterion, test_dataset)
print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')


Validation Loss: 0.0043, Validation Accuracy: 92.47%


In [23]:
# Pick one misclassified test sample at random as the test point z_test.
# NOTE(review): the execution counters show this cell ran (In [23]) before the evaluation
# cell printed above it (In [24]), so `incorrect_data` here came from an earlier call to
# test() - confirm the notebook still works under Restart & Run All.
test_data_index = torch.randint(0, len(incorrect_data), (1,)).item()
x_test, y_test = incorrect_data[test_data_index], incorrect_label[test_data_index]

In [25]:
# Print the snapshot again after training; the recorded output matches the values printed
# before training, i.e. the cloned snapshot was not modified.
print(initial_params)

{'weight': tensor([[-0.0160, -0.0071, -0.0005,  ...,  0.0163, -0.0246,  0.0103],
        [ 0.0106,  0.0180,  0.0217,  ...,  0.0085,  0.0312,  0.0261],
        [-0.0100,  0.0209,  0.0121,  ..., -0.0010,  0.0092, -0.0201],
        ...,
        [ 0.0164,  0.0095, -0.0330,  ..., -0.0157, -0.0188, -0.0186],
        [ 0.0119,  0.0315, -0.0328,  ..., -0.0129, -0.0136, -0.0049],
        [-0.0169,  0.0244, -0.0212,  ...,  0.0170, -0.0356,  0.0041]]), 'bias': tensor([ 0.0197,  0.0177,  0.0009,  0.0026, -0.0104, -0.0284,  0.0033, -0.0143,
         0.0084,  0.0055])}


In [26]:
# Sanity check for the retraining pipeline: retrain from the saved initial parameters with
# nothing left out and compare the loss on the chosen test point with the baseline model.
writer_retrain = SummaryWriter('runs/logistic_regression_10_mnist_retrain') 

# Baseline logits and loss on the test point.
print(model(x_test.view(-1, 28*28)), y_test.view(1))
loss_z_test_with_z = criterion(model(x_test), y_test.view(1))

# Fresh model starting from clones of the saved initial weights, with its own Adam optimiser.
params = {"weight": initial_params["weight"].clone(), "bias": initial_params["bias"].clone()}
model_retrained = LogisticRegression(28*28, 10, params=params)
optimizer_adam_retrained = torch.optim.Adam(model_retrained.parameters())
# print(indices)
# subset_size = 55000  # The size of the subset I want
# indices = torch.randperm(len(train_dataset_all))[:subset_size]
# print(indices)
# train_dataset_partial = Subset(train_dataset_all, indices)
# leave_out_indices=[] is falsy inside train(), so the full training set is used.
train(model_retrained, optimizer_adam_retrained, criterion, train_dataset, writer_retrain, t=5, leave_out_indices=[])

loss_z_test_without_z = criterion(model_retrained(x_test), y_test.view(1))

# The recorded output shows the two losses are equal.
print(loss_z_test_with_z, loss_z_test_without_z)

tensor([[ -5.1887,  -6.5113,  -0.4661,   0.5459,  -7.3655,   1.7023,   1.1639,
         -17.2236,  -0.2190, -12.3769]], grad_fn=<AddmmBackward0>) tensor([8])
tensor(2.6916, grad_fn=<NllLossBackward0>) tensor(2.6916, grad_fn=<NllLossBackward0>)


In [31]:
def calc_leave_one_out_loss_diff(model, initial_params, criterion, test_data, test_label, leave_out_indices):
    """Absolute change in test loss after retraining without some training points.

    A new ``LogisticRegression(28*28, 10)`` is built from clones of
    ``initial_params`` and trained for 5 epochs with Adam on the notebook-level
    ``train_dataset`` minus ``leave_out_indices``, logging to the notebook-level
    ``writer_retrain``. Its loss on ``(test_data, test_label)`` is compared
    with the loss of the already trained ``model``.

    Returns:
        ``|loss(model) - loss(retrained model)|`` as a tensor; the three
        quantities are also printed.
    """
    target = test_label.view(1)
    loss_z_test_with_z = criterion(model(test_data), target)

    # Clone so the retrained model never touches the shared snapshot.
    fresh_params = {key: initial_params[key].clone() for key in ("weight", "bias")}
    model_leave_one_out = LogisticRegression(28*28, 10, params=fresh_params)
    train(
        model_leave_one_out,
        torch.optim.Adam(model_leave_one_out.parameters()),
        criterion,
        train_dataset,
        writer_retrain,
        t=5,
        leave_out_indices=leave_out_indices,
    )

    loss_z_test_without_z = criterion(model_leave_one_out(test_data), target)
    leave_one_out_loss_diff = (loss_z_test_with_z - loss_z_test_without_z).abs()
    print(f"loss_z_test_with_z: {loss_z_test_with_z}, loss_z_test_without_z: {loss_z_test_without_z}, leave_one_out_loss_diff: {leave_one_out_loss_diff}")
    return leave_one_out_loss_diff

In [34]:
# Same check through the helper: with an empty leave-out list the recorded difference is 0.
print(calc_leave_one_out_loss_diff(model, initial_params, criterion, x_test, y_test, []))

loss_z_test_with_z: 2.6915714740753174, loss_z_test_without_z: 2.6915714740753174, leave_one_out_loss_diff: 0.0
tensor(0., grad_fn=<AbsBackward0>)


In [27]:
# Check the saved initial parameters once more after the retraining run.
print(initial_params)

{'weight': tensor([[-0.0160, -0.0071, -0.0005,  ...,  0.0163, -0.0246,  0.0103],
        [ 0.0106,  0.0180,  0.0217,  ...,  0.0085,  0.0312,  0.0261],
        [-0.0100,  0.0209,  0.0121,  ..., -0.0010,  0.0092, -0.0201],
        ...,
        [ 0.0164,  0.0095, -0.0330,  ..., -0.0157, -0.0188, -0.0186],
        [ 0.0119,  0.0315, -0.0328,  ..., -0.0129, -0.0136, -0.0049],
        [-0.0169,  0.0244, -0.0212,  ...,  0.0170, -0.0356,  0.0041]]), 'bias': tensor([ 0.0197,  0.0177,  0.0009,  0.0026, -0.0104, -0.0284,  0.0033, -0.0143,
         0.0084,  0.0055])}


In [18]:
# Evaluate the retrained model. This rebinds val_loss, val_accuracy, incorrect_data and
# incorrect_label, replacing the values obtained from the baseline model.
val_loss, val_accuracy, incorrect_data, incorrect_label = test(model_retrained, criterion, test_dataset)
print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')

Validation Loss: 0.0041, Validation Accuracy: 92.81%


In [147]:
def uniform_sample_dataset(dataset, t, batch_size=500):
    """Return a DataLoader that draws ``t`` random samples from ``dataset``.

    Args:
        dataset: Map-style dataset to sample from.
        t: Total number of samples the loader yields per pass.
        batch_size: Samples per batch (default 500, the previously hard-coded
            value). With ``t <= batch_size`` the loader yields a single batch.

    Returns:
        A ``DataLoader`` driven by a ``RandomSampler`` over ``dataset``.
    """
    # The sampler picks the indices itself; no index list is needed here.
    sampler = torch.utils.data.RandomSampler(dataset, num_samples=t)
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=sampler)

In [148]:
# Smoke test of the sampler: 10 random training samples arrive as a single batch.
sample_loader = uniform_sample_dataset(train_dataset, 10)
for batch_data, batch_labels in sample_loader:
    print(batch_data.shape, batch_labels)

torch.Size([10, 1, 28, 28]) tensor([8, 7, 1, 8, 5, 6, 7, 3, 8, 1])


In [149]:
def calc_criterion_first_order_derivative(loss, model):
    """Return d(loss)/d(parameters) as one flat, differentiable vector.

    Args:
        loss: Scalar tensor computed from ``model``.
        model: Module whose trainable parameters are differentiated.

    Returns:
        1-D tensor holding the flattened gradients of all parameters with
        ``requires_grad=True``, concatenated in ``model.parameters()`` order.
        It is built with ``create_graph=True`` and can be differentiated again.

    Existing ``.grad`` fields are cleared, as before, but they are no longer
    populated: ``torch.autograd.grad`` returns the gradients directly. This
    avoids the parameter/gradient reference cycle that
    ``backward(create_graph=True)`` creates.
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    model.zero_grad()
    grads = torch.autograd.grad(loss, trainable, create_graph=True, allow_unused=True)
    # A parameter that does not influence the loss contributes a zero gradient.
    pieces = [
        (torch.zeros_like(p) if g is None else g).flatten()
        for g, p in zip(grads, trainable)
    ]
    return torch.cat(pieces)

In [150]:
def calc_loss_second_order_derivative_list(loss, model):
    """Return the block-diagonal part of the Hessian, one block per parameter.

    Args:
        loss: Scalar tensor computed from ``model``.
        model: Module whose trainable parameters are differentiated.

    Returns:
        List with one ``(p.numel(), p.numel())`` tensor per trainable
        parameter ``p``: the second derivative of ``loss`` w.r.t. ``p`` alone.
        Cross-parameter blocks are not computed. The list is also printed.

    Gradients come from ``torch.autograd.grad`` rather than
    ``backward(create_graph=True)`` plus ``.grad``. A gradient that is
    constant in its parameter gives a zero block instead of an autograd error.
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    model.zero_grad()
    first_grads = torch.autograd.grad(loss, trainable, create_graph=True, allow_unused=True)

    second_grads = []
    for first_grad, p in zip(first_grads, trainable):
        size = p.numel()
        if first_grad is None or not first_grad.requires_grad:
            # The loss is at most linear in p, so the curvature is zero.
            second_grads.append(torch.zeros(size, size, dtype=p.dtype, device=p.device))
            continue
        flat_grad = first_grad.flatten()
        sub_matrix = []
        for i in range(size):
            # create_graph=True keeps the graph alive for the remaining rows.
            (row,) = torch.autograd.grad(flat_grad[i], p, create_graph=True, allow_unused=True)
            sub_matrix.append((torch.zeros_like(p) if row is None else row).flatten())
        second_grads.append(torch.stack(sub_matrix))
    print(second_grads)
    return second_grads

In [None]:
def calc_criterion_second_order_derivative(loss, model):
    """Return the full Hessian of ``loss`` w.r.t. the trainable parameters.

    Args:
        loss: Scalar tensor computed from ``model``.
        model: Module whose trainable parameters are differentiated.

    Returns:
        Square tensor of side ``sum(p.numel())`` over the trainable
        parameters. Rows and columns follow the flattened parameters in
        ``model.parameters()`` order.

    Changes from the earlier implementation:
      * ``torch.autograd.grad`` replaces ``backward(create_graph=True)`` plus
        ``.grad``, so ``.grad`` is not populated and no reference cycle forms;
      * each row needs one backward pass over all parameters instead of one
        per parameter;
      * a gradient entry that does not depend on a parameter gives zeros
        rather than an autograd error;
      * rows and columns both use the trainable parameters only.

    Cost is one backward pass per parameter element, so this is only
    practical for small models.
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    model.zero_grad()

    def _flatten(grads):
        # Concatenate per-parameter gradients, using zeros for unused ones.
        return torch.cat([
            (torch.zeros_like(p) if g is None else g).flatten()
            for g, p in zip(grads, trainable)
        ])

    first_grads = _flatten(
        torch.autograd.grad(loss, trainable, create_graph=True, allow_unused=True)
    )

    rows = []
    for i in range(first_grads.shape[0]):
        entry = first_grads[i]
        if entry.requires_grad:
            # create_graph=True keeps the graph alive for the remaining rows.
            row_grads = torch.autograd.grad(entry, trainable, create_graph=True, allow_unused=True)
        else:
            # The whole gradient is constant, so the Hessian row is zero.
            row_grads = [None] * len(trainable)
        rows.append(_flatten(row_grads))
    return torch.stack(rows, dim=0)

In [152]:
# Tiny 2-input / 2-output model used to hand-check the derivative helpers.
# NOTE(review): the first assignment is overwritten on the next line and is never used.
test_model = torch.nn.Linear(2, 2)
test_model = LogisticRegression(2, 2)

In [153]:
# One 2-feature sample plus two target encodings: float targets for MSELoss and a
# class index for CrossEntropyLoss.
data_tensor = torch.tensor([[1.0, -2.0]], requires_grad=True)
label_tensor = torch.tensor([1.0, 0.0])
label_tensor_long = torch.tensor([0])

# Residual (prediction - target), printed for comparison with the gradients in the next cell.
print(test_model(data_tensor)-label_tensor)
# 2 * (test_model(data_tensor) - label_tensor).item() * data_tensor

tensor([[ 0.4469, -1.5358]], grad_fn=<SubBackward0>)


In [154]:
# Flat gradient of the MSE loss on the tiny model.
print(calc_criterion_first_order_derivative(torch.nn.MSELoss()(test_model(data_tensor), label_tensor), test_model))
# Full Hessian of the cross-entropy loss, followed by a strict `>= 0` test on the real
# parts of its eigenvalues. NOTE(review): the recorded result is False; this is possibly
# rounding noise around zero rather than real negative curvature - confirm with a tolerance.
hessian = calc_criterion_second_order_derivative(criterion(test_model(data_tensor), label_tensor_long), test_model)
print(hessian, torch.all(torch.linalg.eigvals(hessian).real >= 0))
# Per-parameter Hessian blocks of the MSE loss. The helper prints the list and the notebook
# also displays the returned value, so the blocks appear twice in the output.
calc_loss_second_order_derivative_list(torch.nn.MSELoss()(test_model(data_tensor), label_tensor), test_model)


tensor([ 0.4469, -0.8939, -1.5358,  3.0717,  0.4469, -1.5358],
       grad_fn=<CatBackward0>)
tensor([[ 0.0459, -0.0918, -0.0459,  0.0918,  0.0459, -0.0459],
        [-0.0918,  0.1835,  0.0918, -0.1835, -0.0918,  0.0918],
        [-0.0459,  0.0918,  0.0459, -0.0918, -0.0459,  0.0459],
        [ 0.0918, -0.1835, -0.0918,  0.1835,  0.0918, -0.0918],
        [ 0.0459, -0.0918, -0.0459,  0.0918,  0.0459, -0.0459],
        [-0.0459,  0.0918,  0.0459, -0.0918, -0.0459,  0.0459]],
       grad_fn=<CatBackward0>) tensor(False)
[tensor([[ 1., -2.,  0., -0.],
        [-2.,  4.,  0., -0.],
        [ 0., -0.,  1., -2.],
        [ 0., -0., -2.,  4.]], grad_fn=<StackBackward0>), tensor([[1., 0.],
        [0., 1.]])]


  return F.mse_loss(input, target, reduction=self.reduction)


[tensor([[ 1., -2.,  0., -0.],
         [-2.,  4.,  0., -0.],
         [ 0., -0.,  1., -2.],
         [ 0., -0., -2.,  4.]], grad_fn=<StackBackward0>),
 tensor([[1., 0.],
         [0., 1.]])]

In [155]:
# Gradient and full Hessian of the trained MNIST model on one randomly drawn training sample.
one_train_dataloader = uniform_sample_dataset(train_dataset, 1)
for batch_data, batch_labels in one_train_dataloader:
    # Cross-entropy gradient: one entry per model parameter (7850 in the recorded output).
    print(calc_criterion_first_order_derivative(criterion(model(batch_data), batch_labels), model).shape)
    # Hessian of an MSE loss between the logits and the class index cast to float.
    # NOTE(review): the label is not one-hot encoded here, and the recorded output includes
    # a warning raised from F.mse_loss - confirm this target is what was intended.
    # `hessian` stays bound at notebook level and is reused by the eigenvalue cell below.
    hessian = calc_criterion_second_order_derivative(torch.nn.MSELoss()(model(batch_data), batch_labels.to(torch.float32)), model)
    print(hessian.shape)
    # hessian = calc_loss_second_order_derivative_list(criterion(model(batch_data), batch_labels), model)
    # hessian_1 = hessian[0]
    # hessian_2 = hessian[1]

torch.Size([7850])


  return F.mse_loss(input, target, reduction=self.reduction)


torch.Size([7850, 7850])


In [156]:
# Real parts of the eigenvalues of the 7850 x 7850 Hessian computed in the previous cell.
eigvals = torch.linalg.eigvals(hessian).real

In [157]:
def hvp(y, w, v):
    """Hessian-vector product ``H @ v`` where ``H`` is the Hessian of ``y`` w.r.t. ``w``.

    Uses double back-propagation: ``H @ v`` is the gradient of ``grad(y) . v``
    with ``v`` held constant, so the Hessian is never materialised.

    Args:
        y: Scalar tensor.
        w: A tensor, or an iterable of tensors, that ``y`` depends on.
        v: 1-D tensor whose length is the total number of elements in ``w``.
            It is detached internally; if it stayed attached to the graph
            (for example when it is itself a gradient of ``w``), its
            dependence on ``w`` would leak into the result.

    Returns:
        1-D tensor with the flattened product, in the same parameter order as
        ``w``. It is built with ``create_graph=True``.
    """
    # Materialise once: a generator such as model.parameters() would otherwise
    # be exhausted by the first autograd.grad call.
    params = (w,) if isinstance(w, torch.Tensor) else tuple(w)
    direction = v.detach()

    # First derivative, kept differentiable.
    first_grads = torch.autograd.grad(y, params, retain_graph=True, create_graph=True)
    flat_first = torch.cat([g.flatten() for g in first_grads], dim=0)

    # Scalar grad(y) . v; differentiating it again yields H @ v.
    directional = torch.sum(flat_first * direction)

    second_grads = torch.autograd.grad(directional, params, create_graph=True)
    return torch.cat([g.flatten() for g in second_grads], dim=0)

In [158]:
# Cross-check hvp() against the explicit Hessian on the tiny model: H @ v computed as a
# dense matrix product must match the double-backprop result.
params = [p for p in test_model.parameters()]
vector = calc_criterion_first_order_derivative(torch.nn.MSELoss()(test_model(data_tensor), label_tensor), test_model)
print(vector)
# vector = torch.tensor([-3.0305, -6.0611, -3.0305])
expected = torch.matmul(calc_criterion_second_order_derivative(torch.nn.MSELoss()(test_model(data_tensor), label_tensor), test_model), vector.T)
# NOTE(review): this appears intended to cut `vector` out of the autograd graph so that hvp
# treats it as a constant; it relies on a private attribute - confirm, or use .detach().
vector._grad_fn = None
# print(vector)
test_model.zero_grad()
actual = hvp(torch.nn.MSELoss()(test_model(data_tensor), label_tensor), params, vector)
print(expected, actual)
# Exact, bitwise comparison of two floating-point results.
assert torch.equal(expected, actual)

tensor([ 0.4469, -0.8939, -1.5358,  3.0717,  0.4469, -1.5358],
       grad_fn=<CatBackward0>)
tensor([ 2.6817, -5.3633, -9.2151, 18.4302,  2.6817, -9.2151],
       grad_fn=<MvBackward0>) tensor([ 2.6817, -5.3633, -9.2151, 18.4302,  2.6817, -9.2151],
       grad_fn=<CatBackward0>)


In [159]:
# hessian_eigvals = None
def ihvp(train_dataset, test_data, test_label, model, criterion, t, r, reg_factor=0.91, scale=0.04, unique_datapoint=None, ihvp_summary_writer=None):
    """Stochastic estimate of an inverse-Hessian-vector product for a test point.

    The vector ``v`` is the gradient of the scaled loss at
    ``(test_data, test_label)``. Each of the ``r`` repetitions starts from
    ``h = v`` and applies ``h <- (1 - reg_factor) * h + v - H_batch @ h``
    once per sampled batch, where ``H_batch`` is the Hessian of the scaled
    loss on that batch (obtained through ``hvp``). The ``r`` final iterates
    are averaged.

    Args:
        train_dataset: Dataset to draw training batches from.
        test_data, test_label: The test point defining ``v``.
        model: Model whose parameters are differentiated.
        criterion: Callable ``(output, label) -> scalar loss``.
        t: With ``unique_datapoint is None``, the number of samples drawn by
            ``uniform_sample_dataset`` (delivered in batches, so the number of
            recursion steps is the number of batches). Otherwise the number of
            recursion steps.
        r: Number of independent repetitions to average.
        reg_factor: Damping term; the previous iterate is multiplied by
            ``1 - reg_factor``.
        scale: Factor applied to every loss value before differentiation.
        unique_datapoint: Optional ``(data, label)`` pair used for every
            recursion step instead of random batches.
        ihvp_summary_writer: Writer receiving ``ihvp_eval.sum()`` after each
            step. When ``None``, a ``SummaryWriter('runs/ihvp_sum_summary')``
            is created at call time. Previously that writer was created once,
            when the function was defined.

    Returns:
        Flat tensor with the averaged estimate, or ``0`` when ``r == 0``.
    """
    if ihvp_summary_writer is None:
        ihvp_summary_writer = SummaryWriter('runs/ihvp_sum_summary')

    def scaled_criterion(output, label):
        return criterion(output, label) * scale

    # Same parameter set that the flat gradient vector is built from.
    params = [p for p in model.parameters() if p.requires_grad]

    # v is a constant of the recursion, so detach it once up front.
    vector = calc_criterion_first_order_derivative(
        scaled_criterion(model(test_data), test_label), model
    ).detach()

    ihvp_eval_avg = 0
    for i in range(r):
        if unique_datapoint is None:
            sampled_train_dataset = list(uniform_sample_dataset(train_dataset, t))
        else:
            sampled_train_dataset = [unique_datapoint] * t

        # Step 1. Initialise the estimate with the test gradient.
        ihvp_eval = vector
        for data_number, (data, label) in enumerate(sampled_train_dataset):
            model.zero_grad()
            data_tensor = data.view(-1, 28*28)
            # Step 2. Drop the graph built by the previous iteration so that
            # memory stays bounded and hvp sees a constant vector.
            ihvp_eval = ihvp_eval.detach()
            # Step 3. Hessian-vector product on this batch; no explicit Hessian.
            second_grads = hvp(scaled_criterion(model(data_tensor), label), params, ihvp_eval)
            ihvp_eval = (1 - reg_factor) * ihvp_eval + vector - second_grads
            ihvp_summary_writer.add_scalar(f'ihvp_eval_sum_{i}', ihvp_eval.sum().item(), data_number)
        # Running mean over the repetitions completed so far.
        ihvp_eval_avg = i / (i + 1) * ihvp_eval_avg + 1 / (i + 1) * ihvp_eval
    return ihvp_eval_avg
    

In [160]:
# Draw one random test sample and keep it as a one-element list of (data, label) batches.
test_dataloader = uniform_sample_dataset(test_dataset, 1)
test_data_list = [(data, label) for data, label in test_dataloader]

In [161]:
# Draw one random training sample; it is passed to ihvp() below as `unique_datapoint`.
uni_data_loader  = uniform_sample_dataset(train_dataset, 1)
uni_data_tuple = ()
for data, label in uni_data_loader:
    # The loader yields a single batch, so this keeps that one (data, label) pair.
    uni_data_tuple = (data, label)

In [162]:
# Run the inverse-HVP estimate for the sampled test point: 10 repetitions of 5000
# recursion steps each, every step reusing the single training sample drawn above.
ihvp_summary_writer = SummaryWriter('runs/ihvp_sum_summary') 
for test_data, test_label in test_data_list:
    print(f"Test data value sum: {test_data.sum()}, Test data label: {test_label}")
    model.zero_grad()
    ihvp_eval = ihvp(train_dataset, test_data, test_label, model, criterion, 5000, 10, reg_factor=0.96, scale=0.01, ihvp_summary_writer=ihvp_summary_writer, unique_datapoint=uni_data_tuple)

Test data value sum: 91.50196075439453, Test data label: tensor([8])


In [163]:
# Sum of all entries of the estimate, as a single-number summary of the result.
ihvp_eval.sum()

tensor(-6.1118e-09, grad_fn=<SumBackward0>)

In [164]:
def upweighting_loss_influence_function(train_dataset, upweighted_data, upweighted_label, test_data, test_label, model, criterion, t=5000, r=10, **ihvp_kwargs):
    """Influence of up-weighting one training point on the loss at a test point.

    Computes ``-s_test . grad_theta L(z)``, where ``s_test`` is the
    inverse-Hessian-vector product estimated by ``ihvp`` for the test point
    and ``z = (upweighted_data, upweighted_label)``.

    Args:
        train_dataset: Dataset used by ``ihvp`` to sample Hessian batches.
        upweighted_data, upweighted_label: Training point being up-weighted.
        test_data, test_label: Test point whose loss change is estimated.
        model: Model under analysis.
        criterion: Callable ``(output, label) -> scalar loss``.
        t, r: Forwarded to ``ihvp`` (samples or steps per repetition, and
            number of repetitions). The defaults are the values used by the
            notebook's own ``ihvp`` call.
        **ihvp_kwargs: Further keyword arguments for ``ihvp`` (``reg_factor``,
            ``scale``, ``unique_datapoint``, ``ihvp_summary_writer``).

    Returns:
        Scalar tensor with the influence estimate.

    NOTE(review): ``ihvp`` applies its ``scale`` factor internally, while the
    training-point gradient here is unscaled. Confirm the overall scaling is
    what the analysis expects.
    """
    # Step 1. Inverse-Hessian-vector product for the test point. The earlier
    # call omitted the required t and r arguments and raised TypeError.
    ihvp_eval = ihvp(train_dataset, test_data, test_label, model, criterion, t, r, **ihvp_kwargs)
    # Step 2. Gradient of the loss at the up-weighted point. The helper takes
    # (loss, model), so the loss has to be evaluated first.
    upweighted_loss = criterion(model(upweighted_data), upweighted_label)
    upweighted_grad = calc_criterion_first_order_derivative(upweighted_loss, model)
    # Step 3. Combine the two into the influence value.
    influence = torch.dot(-ihvp_eval, upweighted_grad)
    return influence

## Calculate loss difference of leave-one-out retraining

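**Validation: a small NumPy check of the Neumann-series iteration for approximating a matrix inverse, compared against `np.linalg.inv`.**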

In [225]:
import numpy as np

I = np.identity(2)
# Symmetric test matrix with a negative determinant (2e-2 * 4e-2 - (3e-2)**2 < 0),
# so one of its eigenvalues is negative.
A = np.array([[2e-2,3e-2],[3e-2,4e-2]])

# Spectral norm of (I - A). The recursion below only converges when the spectral radius
# of (I - A) is below 1; the recorded value is slightly above 1.
print(np.linalg.norm(I - A, ord=2))
A_inv = I

# Reference inverse for comparison.
A_inv_ = np.linalg.inv(A)

# Neumann-series recursion X <- I + (I - A) X, the same update shape ihvp() uses.
for i in range(100):
    A_inv = A_inv + I - A@A_inv

# Exact inverse first, then the iterate after 100 steps; in the recorded output they
# do not agree.
print(A_inv_)
print(A_inv)

1.0016227766016836
[[-400.  300.]
 [ 300. -200.]]
[[ 77.70247895 -44.32744073]
 [-44.32744073  48.1508518 ]]


In [166]:
def is_semi_positive_definite(matrix, tol=1e-5):
    """Check whether every eigenvalue of ``matrix`` has real part ``>= -tol``.

    Args:
        matrix: Square tensor accepted by ``torch.linalg.eigvals``.
        tol: Non-negative slack that absorbs rounding error around zero
            (default ``1e-5``, the previously hard-coded value).

    Returns:
        0-dim boolean tensor.
    """
    return torch.all(torch.linalg.eigvals(matrix).real >= -tol)

In [167]:
# Hand-check of the Hessian helpers against the closed-form binary logistic regression
# Hessian sigma(z) * (1 - sigma(z)) * x^T x.
logistic_test_model = LogisticRegression(2, 2)
# Overwrite the parameters with a single row, so the model now emits ONE logit per sample.
logistic_test_model.linear.weight.data = torch.tensor([[1.0, 2.0]])
logistic_test_model.linear.bias.data = torch.tensor([-1.0])
print(logistic_test_model.linear.weight, logistic_test_model.linear.bias)
# logistic_test_model.weight.data = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
# logistic_test_model.bias.data = torch.tensor([1.0])

# NOTE(review): `input` shadows the Python builtin of the same name.
input = torch.tensor([[1.0, 2.0]], requires_grad=True)
output = logistic_test_model(input)
label = torch.tensor([0])
label_float = torch.tensor([1.0, 0.0])
print(output, label)

# Closed-form Hessian w.r.t. the two weights only (no bias row or column).
hessian_expected = torch.sigmoid(output[0][0]) * (1-torch.sigmoid(output[0][0])) * (input.T @ input)
print(f"hessain expected: {hessian_expected}, is semi positive definite: {is_semi_positive_definite(hessian_expected)}")

# NOTE(review): with a single logit the softmax is identically 1 (see the printed softmax),
# so this cross-entropy loss and its Hessian are exactly zero and cannot match
# `hessian_expected`. A two-logit model with the second logit pinned to zero
# (presumably LogisticRegressionRestricted(2, 2)) would reproduce the sigmoid form - confirm.
loss = torch.nn.functional.cross_entropy(output, label) * 1000
loss_MSE = torch.nn.MSELoss()(output, label_float)

# jaccobian = calc_criterion_first_order_derivative(loss, logistic_test_model)
print(torch.softmax(output, dim=1))
# print(jaccobian)

# Autograd Hessians: full matrix and per-parameter blocks for cross-entropy, then the
# full matrix for MSE.
test_hessian = calc_criterion_second_order_derivative(loss, logistic_test_model)
test_hessian_list = calc_loss_second_order_derivative_list(loss, logistic_test_model)
print(test_hessian, is_semi_positive_definite(test_hessian))
test_hessian_MSE = calc_criterion_second_order_derivative(loss_MSE, logistic_test_model)
print(f"test_hessian_MSE: {test_hessian_MSE}, is_semi_positive_definite: {is_semi_positive_definite(test_hessian_MSE)}")
# print(torch.linalg.eigvals(test_hessian_list[0]).real)
print(torch.linalg.eigvals(test_hessian).real)

Parameter containing:
tensor([[1., 2.]], requires_grad=True) Parameter containing:
tensor([-1.], requires_grad=True)
tensor([[4.]], grad_fn=<AddmmBackward0>) tensor([0])
hessain expected: tensor([[0.0177, 0.0353],
        [0.0353, 0.0707]], grad_fn=<MulBackward0>), is semi positive definite: True
tensor([[1.]], grad_fn=<SoftmaxBackward0>)
[tensor([[0., 0.],
        [0., 0.]], grad_fn=<StackBackward0>), tensor([[0.]], grad_fn=<StackBackward0>)]
tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]], grad_fn=<CatBackward0>) tensor(True)
test_hessian_MSE: tensor([[2., 4., 2.],
        [4., 8., 4.],
        [2., 4., 2.]], grad_fn=<CatBackward0>), is_semi_positive_definite: True
tensor([0., 0., 0.], grad_fn=<SelectBackward0>)


  return F.mse_loss(input, target, reduction=self.reduction)
