In [18]:
import argparse
import copy
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.parallel
import torch.optim as optim
import torchvision.transforms as transforms
from matplotlib import pyplot as plt
from scipy.stats import mode
from termcolor import cprint
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader

from Models import FMNIST_classifier
from fmnist_prepare_subset_select_train_val_test import F_MNIST
from noise import noisify_with_P, noisify_cifar10_asymmetric, noisify_cifar100_asymmetric, noisify_pairflip, noisify_modelnet40_asymmetric
from subset_select_ipot_non_uniform_git import subset_select_ipot as ss_ipot
# from network.resnet import resnet18, resnet34
# from network.pointnet import PointNetCls
# from knn_utils import calc_knn_graph, calc_topo_weights_with_components_idx

# Use the first CUDA device when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else "cpu"


# Clean Data

In [14]:
def _init_fn(worker_id):
    np.random.seed(77 + worker_id)

# Folder the checkpoint of this cell is written to.
# NOTE(review): absolute, machine-specific Windows path.
models_path = "G:\\My Drive\\Research Codes\\Subset Selection Paper\\Neural Network Classifier\\models\\UCI-subset-select\\Fashion-MNIST\\"

# Forwarded to F_MNIST as train_ratio / trust_prop.
train_val_ratio, trust_prop = 0.8, 0.5

# Forwarded to noisify_with_P as noise / random_state / nb_classes.
noise_level, random_seed, num_classes = 0.8, 42, 10

# Optimisation settings: batch size for the training loader and the number
# of passes over the training set.
batch_size_train = 512
max_epochs = 200

# Preprocessing: convert images to tensors, then normalise with mean 0.5 / std 0.5.
# NOTE(review): the other training cells of this notebook normalise with
# (0.1307,), (0.3081,) instead -- confirm which statistics are intended, since
# the runs are presumably meant to be compared.
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

# Training split. The loader now honours batch_size_train (it used to hard-code
# 512, so editing the constant had no effect).
trainset = F_MNIST(root='./data', split='train', train_ratio=train_val_ratio,
                   trust_prop=trust_prop, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size_train, shuffle=True, num_workers=0, worker_init_fn=_init_fn)

# Corrupt the labels returned by get_noisy_labels_with_indices() and write them
# back into the training set.
# NOTE(review): this section is headed "Clean Data" and the checkpoint is named
# "FMNIST_clean", yet labels are corrupted here with noise_level = 0.8 exactly
# as in the "Noisy Data" cell -- confirm that this is intended.
noise_y_train_labels0, noise_y_train_indices = trainset.get_noisy_labels_with_indices()
noise_y_train, p, _ = noisify_with_P(
    noise_y_train_labels0, nb_classes=num_classes, noise=noise_level, random_state=random_seed)
trainset.update_corrupted_label(noise_y_train, noise_y_train_indices)

# Validation and test splits (fixed order, evaluation batch size 512).
valset = F_MNIST(root='./data', split='val', train_ratio=train_val_ratio,
                 trust_prop=trust_prop, download=True, transform=transform)
valloader = torch.utils.data.DataLoader(valset, batch_size=512, shuffle=False, num_workers=0)

testset = F_MNIST(root='./data', split='test', download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=512, shuffle=False, num_workers=0)

# Integer class ids 0 .. num_classes - 1.
unique_labels = list(range(num_classes))

# Fix the torch RNG before the model is created.
torch.manual_seed(0)

net = FMNIST_classifier().to(device)
criterion = nn.CrossEntropyLoss(reduction='mean')
optimizer = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-4)

# Per-epoch accuracy histories and the best validation accuracy seen so far.
train_acc = []
val_acc = []
best_accuracy = 0

# String names of the class labels. Nothing in this cell reads them; the name
# is kept defined in case a later cell relies on it.
classes = tuple(str(label) for label in unique_labels)

# The best-so-far checkpoint and the final history update share this file.
net_path = models_path + "FMNIST_clean"+".pth"

for epoch in range(max_epochs):  # loop over the dataset multiple times

    # The accuracy passes at the end of every epoch put the network in eval
    # mode; switch back so that each epoch's weight updates run in training
    # mode (previously only the very first epoch did).
    net.train()

    running_loss = []
    # Each batch is a 5-tuple; only the images and labels are used here.
    for images, labels, _, _, _ in trainloader:
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs, _ = net(images.double().to(device))
        loss = criterion(outputs, labels.long().to(device))
        loss.backward()
        optimizer.step()

        running_loss.append(loss.item())
    print("epoch={0:d},  avg_loss = {1:0.4f}".format(
        epoch, torch.tensor(running_loss).mean()))

    # ---- validation accuracy ----
    total = 0
    correct = 0
    net.eval()
    with torch.no_grad():
        for images, labels, _, _, _ in valloader:
            outputs, _ = net(images.double().to(device))
            _, predictions = torch.max(outputs, dim=1)
            total += labels.size(0)
            correct += (predictions.cpu() == labels).sum().item()
        print(f'val set accuraccy: {100 * correct / total} %')
        val_acc.append(100 * correct / total)

    # Checkpoint whenever validation accuracy sets a new record. train_acc
    # does not yet contain this epoch's entry at this point; the complete
    # histories are written into the file after the loop.
    if val_acc[-1] > best_accuracy:
        best_accuracy = val_acc[-1]
        torch.save({
            'epoch': epoch,
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val-accuracy': val_acc,
            'train-accuracy': train_acc}, net_path)

    # ---- training accuracy, measured against the labels stored in trainset ----
    total = 0
    correct = 0
    net.eval()
    with torch.no_grad():
        for images, labels, _, _, _ in trainloader:
            outputs, _ = net(images.double().to(device))
            _, predictions = torch.max(outputs, dim=1)
            total += labels.size(0)
            correct += (predictions.cpu() == labels).sum().item()
    print(f'train set accuracy: {100 * correct / total} %')
    train_acc.append(100 * correct / total)


print('Finished Training')
# saving training accuracy list along with model with best accuracy
if best_accuracy > 0:
    best_net_dict = torch.load(net_path)
    best_net_dict['val-accuracy'] = val_acc
    best_net_dict['train-accuracy'] = train_acc
    torch.save(best_net_dict, net_path)
else:
    # A checkpoint is only written when validation accuracy exceeds 0, so in
    # this case there is no file from this run to update.
    print('No checkpoint was written during this run; accuracy histories not saved.')


Actual noise 0.80


In [None]:
from matplotlib import pyplot as plt

# Reload the checkpoint of the "FMNIST_clean" run and plot its accuracy curves.
net_path = models_path + "FMNIST_clean"+".pth"
best_net_dict = torch.load(net_path)

val_curve = best_net_dict['val-accuracy']
train_curve = best_net_dict['train-accuracy']

# The x axis is derived from the stored histories. It used to be a fixed
# range(0, 100), which does not match the 200 epochs this run is trained for
# and makes axs.plot fail on mismatched lengths.
iters = list(range(len(val_curve)))

fig, axs = plt.subplots()
axs.plot(iters, val_curve,
         "-.", color="r", label="validation")
# The training curve gets its own range in case the two histories differ in length.
axs.plot(list(range(len(train_curve))), train_curve,
         "-.", color="b", label="train")
axs.set_xlabel("iters.")
axs.set_ylabel("accuracy")
fig.legend(ncol=3, loc=(0.4, 0.13))


# Noisy Data

In [None]:
def _init_fn(worker_id):
    np.random.seed(77 + worker_id)


# Folder the checkpoint of this cell is written to.
# NOTE(review): absolute, machine-specific Windows path.
models_path = "G:\\My Drive\\Research Codes\\Subset Selection Paper\\Neural Network Classifier\\models\\UCI-subset-select\\Fashion-MNIST\\"

# Forwarded to F_MNIST as train_ratio / trust_prop.
train_val_ratio, trust_prop = 0.8, 0.5

# Forwarded to noisify_with_P as nb_classes / noise / random_state.
num_classes, noise_level, random_seed = 10, 0.8, 42

# Batch size for the training loader.
batch_size_train = 512

# Preprocessing shared by the train, validation and test splits of this cell.
# NOTE(review): 0.1307 / 0.3081 are the values usually quoted for the MNIST
# digits dataset rather than Fashion-MNIST -- confirm they are intended.
transform_train = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])

# Training split. The loader now honours batch_size_train (it used to hard-code
# 512, so editing the constant had no effect).
trainset = F_MNIST(root='./data', split='train', train_ratio=train_val_ratio,
                   trust_prop=trust_prop, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size_train, shuffle=True, num_workers=0, worker_init_fn=_init_fn)

# Corrupt the labels returned by get_noisy_labels_with_indices() and write them
# back into the training set.
noise_y_train_labels0, noise_y_train_indices = trainset.get_noisy_labels_with_indices()
noise_y_train, p, _ = noisify_with_P(
    noise_y_train_labels0, nb_classes=num_classes, noise=noise_level, random_state=random_seed)
trainset.update_corrupted_label(noise_y_train, noise_y_train_indices)

# Validation and test splits (fixed order, evaluation batch size 512).
valset = F_MNIST(root='./data', split='val', train_ratio=train_val_ratio,
                 trust_prop=trust_prop, download=True, transform=transform_train)
valloader = torch.utils.data.DataLoader(
    valset, batch_size=512, shuffle=False, num_workers=0)

testset = F_MNIST(root='./data', split='test',
                  download=True, transform=transform_train)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=512, shuffle=False, num_workers=0)

max_epochs = 100

# Integer class ids 0 .. num_classes - 1.
unique_labels = list(range(num_classes))

# Fix the torch RNG before the model is created.
torch.manual_seed(0)

net = FMNIST_classifier().to(device)
criterion = nn.CrossEntropyLoss(reduction='mean')
optimizer = optim.Adam(net.parameters(), lr=0.001, betas=(
    0.9, 0.999), eps=1e-08, weight_decay=1e-4)

# Per-epoch accuracy histories and the best validation accuracy seen so far.
train_acc = []
val_acc = []
best_accuracy = 0

# String names of the class labels. Nothing in this cell reads them; the name
# is kept defined in case a later cell relies on it.
classes = tuple(str(label) for label in unique_labels)

# The best-so-far checkpoint and the final history update share this file.
net_path = models_path + "FMNIST_noisy"+".pth"

for epoch in range(max_epochs):  # loop over the dataset multiple times

    # The accuracy passes at the end of every epoch put the network in eval
    # mode; switch back so that each epoch's weight updates run in training
    # mode (previously only the very first epoch did).
    net.train()

    running_loss = []
    # Each batch is a 5-tuple; only the images and labels are used here.
    for images, labels, _, _, _ in trainloader:
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs, _ = net(images.double().to(device))
        loss = criterion(outputs, labels.long().to(device))
        loss.backward()
        optimizer.step()

        running_loss.append(loss.item())
    print("epoch={0:d},  avg_loss = {1:0.4f}".format(
        epoch, torch.tensor(running_loss).mean()))

    # ---- validation accuracy ----
    total = 0
    correct = 0
    net.eval()
    with torch.no_grad():
        for images, labels, _, _, _ in valloader:
            outputs, _ = net(images.double().to(device))
            _, predictions = torch.max(outputs, dim=1)
            total += labels.size(0)
            correct += (predictions.cpu() == labels).sum().item()
        print(f'val set accuraccy: {100 * correct / total} %')
        val_acc.append(100 * correct / total)

    # Checkpoint whenever validation accuracy sets a new record. train_acc
    # does not yet contain this epoch's entry at this point; the complete
    # histories are written into the file after the loop.
    if val_acc[-1] > best_accuracy:
        best_accuracy = val_acc[-1]
        torch.save({
            'epoch': epoch,
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val-accuracy': val_acc,
            'train-accuracy': train_acc}, net_path)

    # ---- training accuracy, measured against the labels stored in trainset ----
    total = 0
    correct = 0
    net.eval()
    with torch.no_grad():
        for images, labels, _, _, _ in trainloader:
            outputs, _ = net(images.double().to(device))
            _, predictions = torch.max(outputs, dim=1)
            total += labels.size(0)
            correct += (predictions.cpu() == labels).sum().item()
    print(f'train set accuracy: {100 * correct / total} %')
    train_acc.append(100 * correct / total)


print('Finished Training')
# saving training accuracy list along with model with best accuracy
if best_accuracy > 0:
    best_net_dict = torch.load(net_path)
    best_net_dict['val-accuracy'] = val_acc
    best_net_dict['train-accuracy'] = train_acc
    torch.save(best_net_dict, net_path)
else:
    # A checkpoint is only written when validation accuracy exceeds 0, so in
    # this case there is no file from this run to update.
    print('No checkpoint was written during this run; accuracy histories not saved.')


In [None]:
from matplotlib import pyplot as plt

# Reload the checkpoint of the "FMNIST_noisy" run and plot its accuracy curves.
net_path = models_path + "FMNIST_noisy"+".pth"
best_net_dict = torch.load(net_path)

val_curve = best_net_dict['val-accuracy']
train_curve = best_net_dict['train-accuracy']

# The x axis is derived from the stored histories instead of a fixed
# range(0, 100), so the plot keeps working when max_epochs changes.
iters = list(range(len(val_curve)))

fig, axs = plt.subplots()
axs.plot(iters, val_curve, "-.", color="r", label="validation")
# The training curve gets its own range in case the two histories differ in length.
axs.plot(list(range(len(train_curve))), train_curve, "-.", color="b", label="train")
axs.set_xlabel("iters.")
axs.set_ylabel("accuracy")
fig.legend(ncol=3, loc=(0.4, 0.13))

# Subset Selection

In [None]:
def _init_fn(worker_id):
    np.random.seed(77 + worker_id)


# Folder that receives one checkpoint per value of c.
# NOTE(review): absolute, machine-specific Windows path.
models_path = "G:\\My Drive\\Research Codes\\Subset Selection Paper\\Neural Network Classifier\\models\\UCI-subset-select\\Fashion-MNIST\\"

# Forwarded to F_MNIST as train_ratio / trust_prop.
train_val_ratio, trust_prop = 0.8, 0.5

# Forwarded to noisify_with_P as nb_classes / noise / random_state.
num_classes, noise_level, random_seed = 10, 0.8, 42

# Batch size for the training loader and number of epochs per value of c.
batch_size_train = 512
max_epochs = 200

# Values of c to sweep; each one is passed, together with gamma_ss, to ss_ipot.
cs = [1, 2, 3, 4, 5, 6, 7, 8, 20]
gamma_ss = 0.01

# models_path + "ss_1 " +  "cifar10 " + "c={cs:2d}".format(cs=c)+".pth"

# Train one model per value of c. Every run rebuilds the data, re-seeds torch,
# trains with soft labels derived from the ss_ipot transport plan and keeps
# the checkpoint with the best validation accuracy.
for c in cs:
    print("c = ", c)

    torch.manual_seed(0)

    transform_train = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
    trainset = F_MNIST(root='./data', split='train', train_ratio=train_val_ratio,
                       trust_prop=trust_prop, download=True, transform=transform_train)
    # The loader now honours batch_size_train (it used to hard-code 512).
    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=batch_size_train, shuffle=True, num_workers=0, worker_init_fn=_init_fn)

    # Corrupt the labels returned by get_noisy_labels_with_indices() and write
    # them back into the training set.
    noise_y_train_labels0, noise_y_train_indices = trainset.get_noisy_labels_with_indices()
    noise_y_train, p, _ = noisify_with_P(
        noise_y_train_labels0, nb_classes=num_classes, noise=noise_level, random_state=random_seed)
    trainset.update_corrupted_label(noise_y_train, noise_y_train_indices)

    valset = F_MNIST(root='./data', split='val', train_ratio=train_val_ratio,
                     trust_prop=trust_prop, download=True, transform=transform_train)
    valloader = torch.utils.data.DataLoader(
        valset, batch_size=512, shuffle=False, num_workers=0)

    testset = F_MNIST(root='./data', split='test',
                      download=True, transform=transform_train)
    testloader = torch.utils.data.DataLoader(
        testset, batch_size=512, shuffle=False, num_workers=0)

    # Integer class ids 0 .. num_classes - 1 and their string names. The names
    # are not read in this cell; they stay defined in case a later cell relies
    # on them.
    unique_labels = list(range(num_classes))
    classes = tuple(str(label) for label in unique_labels)
    torch.manual_seed(0)

    net = FMNIST_classifier().to(device)
    criterion = nn.CrossEntropyLoss(reduction='mean')
    optimizer = optim.Adam(net.parameters(), lr=0.001, betas=(
        0.9, 0.999), eps=1e-08, weight_decay=1e-4)

    train_acc = []
    val_acc = []
    best_accuracy = 0

    # Checkpoint file of this run; shared by the best-so-far save and the
    # final history update.
    net_path = models_path + "ss_1 " + \
        "fmnist " + "c={cs:2d}".format(cs=c)+".pth"

    for epoch in range(max_epochs):  # loop over the dataset multiple times

        # The accuracy passes at the end of every epoch put the network in
        # eval mode; switch back so that each epoch's weight updates run in
        # training mode (previously only the very first epoch did).
        net.train()

        running_loss = []
        for images, labels, weights, cd_y, idx in trainloader:
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward pass; the network returns (logits, encodings)
            outputs, encodings = net(images.double().to(device))

            # Rows with cd_y == 1 supply the labels that are transported;
            # rows with cd_y == 0 receive transported labels.
            # NOTE(review): presumably cd_y == 1 marks the trusted subset --
            # confirm against F_MNIST.
            src_mask = cd_y == 1
            tgt_mask = cd_y == 0

            x = encodings[src_mask].detach().cpu()
            y = encodings[tgt_mask].detach().cpu()
            n_src = x.shape[0]
            n_tgt = y.shape[0]

            if n_src == 0:
                # No labelled source rows in this batch: there is nothing to
                # transport and 1 / n_src would divide by zero. Skip the batch.
                continue

            x_labels = labels[src_mask].detach().cpu()
            x_labels_onehot = (F.one_hot(x_labels.long(), num_classes=num_classes)).to(
                device).double()

            # Soft targets: source rows get their one-hot label scaled by
            # 1 / n_src, target rows get P^T @ one_hot(source labels).
            input_labels_hat = torch.zeros_like(
                outputs, device=device, dtype=torch.double)
            input_labels_hat[src_mask] = 1 / n_src * x_labels_onehot

            if n_tgt > 0:
                # Uniform marginals over the source and target rows.
                mu = 1/n_src*np.ones(n_src)
                nu = 1/n_tgt*np.ones(n_tgt)
                _, P, _, _ = ss_ipot(x, y, mu, nu, c,  gamma_ss, max_outer_iter=20,
                                     max_inner_iter=20, wd=2, disp_iter=False, return_map=True)

                P_torch = torch.from_numpy(P).to(device)
                y_labels_hat = (P_torch.T@x_labels_onehot).to(device)
                input_labels_hat[tgt_mask] = y_labels_hat
            # else: no target rows, so only the source rows contribute
            # (an empty target set used to raise ZeroDivisionError on 1 / n_tgt).

            # Cross-entropy against the soft targets, summed over the batch.
            loss = - torch.sum(input_labels_hat*F.log_softmax(outputs, dim=1))
            loss.backward()
            optimizer.step()
            running_loss.append(loss.item())
        print("epoch={0:d},  avg_loss = {1:0.4f}".format(
            epoch, torch.tensor(running_loss).mean()))

        # ---- validation accuracy ----
        total = 0
        correct = 0
        net.eval()
        with torch.no_grad():
            for images, labels, _, _, _ in valloader:
                outputs, _ = net(images.double().to(device))
                _, predictions = torch.max(outputs, dim=1)
                total += labels.size(0)
                correct += (predictions.cpu() == labels).sum().item()
            print(f'val set accuraccy: {100 * correct / total} %')
            val_acc.append(100 * correct / total)

        # Checkpoint whenever validation accuracy sets a new record. train_acc
        # does not yet contain this epoch's entry at this point; the complete
        # histories are written into the file after the epoch loop.
        if val_acc[-1] > best_accuracy:
            best_accuracy = val_acc[-1]
            torch.save({
                'epoch': epoch,
                'model_state_dict': net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val-accuracy': val_acc,
                'train-accuracy': train_acc}, net_path)

        # ---- training accuracy, measured against the labels stored in trainset ----
        total = 0
        correct = 0
        net.eval()
        with torch.no_grad():
            for images, labels, _, _, _ in trainloader:
                outputs, _ = net(images.double().to(device))
                _, predictions = torch.max(outputs, dim=1)
                total += labels.size(0)
                correct += (predictions.cpu() == labels).sum().item()
        print(f'train set accuracy: {100 * correct / total} %')
        train_acc.append(100 * correct / total)

    print('Finished Training')
    # saving training accuracy list along with model with best accuracy
    if best_accuracy > 0:
        best_net_dict = torch.load(net_path)
        best_net_dict['val-accuracy'] = val_acc
        best_net_dict['train-accuracy'] = train_acc
        torch.save(best_net_dict, net_path)
    else:
        # A checkpoint is only written when validation accuracy exceeds 0, so
        # in this case there is no file from this run to update.
        print('No checkpoint was written during this run; accuracy histories not saved.')
