In [11]:
import warnings
import os
import sys
# Import peal. If it is not installed as a package, fall back to a local
# checkout located in the parent directory of this notebook.
try:
    import peal

except ImportError:
    # peal is not installed, but the project may have been downloaded locally:
    # make the parent directory importable ...
    module_path = os.path.abspath('..')
    if module_path not in sys.path:
        sys.path.append(module_path)
    # ... and retry, so that the name `peal` is bound on both paths (later
    # cells reference `peal.data.datasets`). If the package cannot be found in
    # the parent directory either, this fails right here with an ImportError.
    import peal

# import basic libraries needed for sure and set the device depending on whether cuda is available or not
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

# set autoreload for more convenient development
%load_ext autoreload
%autoreload 2

from peal.global_utils import request
# check and set that the right gpu is used
if device == 'cuda':
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    !nvidia-smi
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    print('Currently used device: ' + str(os.environ["CUDA_VISIBLE_DEVICES"]))
    os.environ["CUDA_VISIBLE_DEVICES"] = request(
        'cuda_visible_devices', default="0")
    torch.cuda.set_device(int(os.environ["CUDA_VISIBLE_DEVICES"]))
    import math
    import nvidia_smi
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    gigabyte_vram = info.total / math.pow(10, 9)
    print("Total memory:", gigabyte_vram)

else:
    gigabyte_vram = None

    
is_asking = request('asking', default = True)#
unrestricted_unpoisened = request('unrestricted_unpoisened', default = False, is_asking=is_asking)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# create the datasets
from peal.data.dataset_factory import get_datasets
from peal.data.dataset_generators import CircleDatasetGenerator
from peal.global_utils import load_yaml_config
import copy

# Start from the YAML defaults and override the fields this experiment changes.
unpoisened_dataset_config = load_yaml_config('<PEAL_BASE>/configs/data/symbolic_circle.yaml')
for field_name, field_value in (
    ('num_samples', 1088),
    ('noise_scale', 0),
    ('set_negative_to_zero', False),
):
    setattr(unpoisened_dataset_config, field_name, field_value)

# Build the circle generator from the (overridden) config values and let it
# generate the dataset.
circle_generator_kwargs = {
    'dataset_name': 'circle',
    'num_samples': unpoisened_dataset_config.num_samples,
    'radius': unpoisened_dataset_config.radius,
    'noise_scale': unpoisened_dataset_config.noise_scale,
    'seed': unpoisened_dataset_config.seed,
}
dg = CircleDatasetGenerator(**circle_generator_kwargs)
dg.generate_dataset()

# The student's task decides which input columns are used; if any are
# selected, the input size becomes the number of selected columns.
student_config = load_yaml_config('<PEAL_BASE>/configs/models/symbolic_circle_classifier.yaml')
selected_inputs = student_config.task['x_selection']
if len(selected_inputs) > 0:
    unpoisened_dataset_config.input_size = [len(selected_inputs)]

unpoisened_splits = get_datasets(
    config=unpoisened_dataset_config,
    base_dir=dg.label_dir,
    task_config=student_config.task,
)
unpoisened_dataset_train, unpoisened_dataset_val, unpoisened_dataset_test = unpoisened_splits

# The poisoned config is an independent copy of the unpoisoned one, with half
# the number of samples and a user-chosen confounder probability.
poisened_dataset_config = copy.deepcopy(unpoisened_dataset_config)
poisened_dataset_config.num_samples = int(unpoisened_dataset_config.num_samples / 2)

# The value is requested as a percentage and stored in the config as a fraction.
confounder_probability = request('confounder_probability', '100')
poisened_dataset_config.confounder_probability = float(confounder_probability) / 100

# Create the poisoned splits from the changed data config.
poisened_splits = get_datasets(
    config=poisened_dataset_config,
    base_dir=dg.label_dir,
    task_config=student_config.task,
)
poisened_dataset_train, poisened_dataset_val, poisened_dataset_test = poisened_splits

In [6]:
# Display the student configuration loaded from the classifier YAML file.
student_config

namespace(training={'train_batch_size': 100,
                    'val_batch_size': 100,
                    'test_batch_size': 100,
                    'max_epochs': 12,
                    'learning_rate': 0.0001,
                    'optimizer': 'adam',
                    'global_train_step': 0,
                    'global_validation_step': 0,
                    'epoch': -1,
                    'steps_per_epoch': 1000,
                    'verbosity': 1},
          architecture={'layers': [['fc', 512, 0.25], ['fc', 512, 0.5]],
                        'activation': 'ReLU'},
          task={'criterions': {'ce': 1.0, 'l1': 10000.0},
                'output_type': 'multiclass',
                'output_size': 2,
                'x_selection': ['x1', 'x2'],
                'y_selection': ['Target'],
                'selection': ['x1', 'x2']})

In [2]:
# Either train a VAE generator on the unpoisoned data or load a saved one.
# Relies on names created by earlier cells: request, load_yaml_config, device,
# gigabyte_vram, confounder_probability and the unpoisoned datasets.
is_train_generator = request('is_train_generator', True)
if is_train_generator:
    # Train the generator from scratch.
    from peal.generators.variational_autoencoders import VAE
    from peal.training.trainers import ModelTrainer
    from peal.data.dataset_wrappers import VAEDatasetWrapper
    generator_config = load_yaml_config('<PEAL_BASE>/configs/models/symbolic_circle_vae.yaml')
    generator_config.data = unpoisened_dataset_train.config
    generator_config.training['max_epochs'] = 5
    generator = VAE(generator_config).to(device)

    dataset_train = VAEDatasetWrapper(unpoisened_dataset_train)
    dataset_val = VAEDatasetWrapper(unpoisened_dataset_val)

    generator_trainer = ModelTrainer(
        config=generator_config,
        model=generator,
        datasource=(dataset_train, dataset_val),
        model_name=request(
            'generator_model_name',
            'artificial_symbolic_' + confounder_probability + '_generator'
        ),
        gigabyte_vram=gigabyte_vram
    )
    generator_trainer.fit()

else:
    # Use a previously saved generator.
    generator_path = request(
        'generator_path',
        'peal_runs/artificial_symbolic_' + confounder_probability + '_generator/model.cpl'
    )
    # map_location lets a checkpoint that was saved on a GPU be loaded on a
    # CPU-only machine (device == 'cpu').
    # NOTE(review): torch.load unpickles the file - only load trusted
    # checkpoints. Recent PyTorch releases may additionally require
    # weights_only=False to load a whole pickled model; confirm for the
    # installed version.
    generator = torch.load(generator_path, map_location=device).to(device)


NameError: name 'request' is not defined

In [12]:
# Either train the initial student classifier on the poisoned data or load a
# saved one. Relies on names created by earlier cells: request,
# load_yaml_config, device, gigabyte_vram, confounder_probability and the
# poisoned datasets.
is_train_student = request('is_train_student', True)
if is_train_student:
    # Train your own initial student model.
    from peal.architectures.downstream_models import Symbolic2VectorModel
    from peal.training.trainers import ModelTrainer
    student_config = load_yaml_config('<PEAL_BASE>/configs/models/symbolic_circle_classifier.yaml')
    student_config.data = poisened_dataset_train.config
    student_config.training['max_epochs'] = 5
    # create and train the student model
    student = Symbolic2VectorModel(student_config).to(device)
    student_trainer = ModelTrainer(
        config=student_config,
        model=student,
        datasource=(poisened_dataset_train, poisened_dataset_val),
        model_name=request(
            'student_model_name',
            'artificial_symbolic_' + confounder_probability + '_classifier'
        ),
        gigabyte_vram=gigabyte_vram
    )
    student_trainer.fit()

else:
    # Load a previously saved initial student model.
    student_path = request(
        'student_path',
        'peal_runs/artificial_symbolic_' + confounder_probability + '_classifier/model.cpl'
    )
    # map_location lets a checkpoint that was saved on a GPU be loaded on a
    # CPU-only machine (device == 'cpu').
    # NOTE(review): torch.load unpickles the file - only load trusted
    # checkpoints. Recent PyTorch releases may additionally require
    # weights_only=False to load a whole pickled model; confirm for the
    # installed version.
    student = torch.load(student_path, map_location=device).to(device)


Do you want to change value of is_train_student==True? [y/n] n
Do you want to change value of student_model_name==artificial_symbolic_100_classifier? [y/n] n


Training Config: {'training': {'train_batch_size': 100, 'val_batch_size': 100, 'test_batch_size': 100, 'max_epochs': 5, 'learning_rate': 0.0001, 'optimizer': 'adam', 'global_train_step': 0, 'global_validation_step': 0, 'epoch': -1, 'iterations_per_episode': 1000, 'verbosity': 1}, 'architecture': {'activation': 'ReLU', 'neuron_numbers_encoder': [512], 'neuron_numbers_decoder': [512]}, 'task': {'criterions': {'ce': 1.0, 'l1': 10000.0}, 'output_type': 'multiclass', 'output_size': 2, 'x_selection': ['x1', 'x2'], 'y_selection': ['Target'], 'selection': ['x1', 'x2']}, 'data': {'radius': 1, 'noise_scale': 0, 'num_samples': 250, 'split': [0.8, 0.9], 'input_type': 'symbolic', 'input_size': [2], 'label_noise': 0.0, 'seed': 0, 'output_type': 'multiclass', 'output_size': 2, 'confounder_probability': 1.0, 'confounding_factors': ['Target', 'Confounder'], 'known_confounder': True, 'set_negative_to_zero': False}}


Model Training: validation_0_it: 0, loss: 1.211959215652314e-06Epoch: 4, train_loss_accumulated: 1.525195102658472e-06, train_accuracy: 1.0, train_predicted_classes: tensor([0.5000, 0.5000]), train_targets: tensor([0.5000, 0.5000]), train_classes_difference: tensor([0., 0.]), validation_0_loss_accumulated: 2.582859679023386e-06, validation_0_accuracy: 1.0, validation_0_predicted_classes: tensor([0.5000, 0.5000]), validation_0_targets: tensor([0.5000, 0.5000]), validation_0_classes_difference: tensor([0., 0.]): : 5005it [03:30, 23.81it/s]                   


In [15]:
# Choose the teacher used for knowledge distillation:
#   'train' - train a new classifier on the unpoisoned data,
#   'load'  - load a previously saved classifier,
#   other   - the requested value itself is handed through as the teacher.
# In the 'train' and 'load' cases teacher_type is set to 'oracle' afterwards.
teacher_type = request('teacher_type', 'train')  # change this to 'load' to reuse a saved teacher
if teacher_type == 'train':
    # Train and use a new model for knowledge distillation.
    from peal.architectures.downstream_models import Symbolic2VectorModel
    from peal.training.trainers import ModelTrainer
    teacher_config = load_yaml_config('<PEAL_BASE>/configs/models/symbolic_circle_classifier.yaml')
    # The teacher is trained on the unpoisoned data, so it gets that dataset's config.
    teacher_config.data = unpoisened_dataset_train.config
    teacher_config.training['max_epochs'] = 5

    # create and train teacher model
    teacher = Symbolic2VectorModel(teacher_config).to(device)
    teacher_trainer = ModelTrainer(
        config=teacher_config,
        model=teacher,
        datasource=(unpoisened_dataset_train, unpoisened_dataset_val),
        model_name=request('teacher_model_name', 'artificial_symbolic_unpoisened_classifier'),
        gigabyte_vram=gigabyte_vram
    )
    teacher_trainer.fit()
    teacher_type = 'oracle'

elif teacher_type == 'load':
    # Use an existing model for knowledge distillation.
    teacher_path = request(
        'teacher_path', 'peal_runs/artificial_symbolic_unpoisened_classifier/model.cpl')
    # map_location lets a checkpoint that was saved on a GPU be loaded on a
    # CPU-only machine (device == 'cpu').
    # NOTE(review): torch.load unpickles the file - only load trusted
    # checkpoints. Recent PyTorch releases may additionally require
    # weights_only=False to load a whole pickled model; confirm for the
    # installed version.
    teacher = torch.load(teacher_path, map_location=device).to(device)
    teacher_type = 'oracle'

else:
    # Any other value is passed on unchanged as the teacher.
    teacher = teacher_type

Do you want to change value of teacher_type==train? [y/n] n
Do you want to change value of teacher_model_name==artificial_symbolic_unpoisened_classifier? [y/n] n


Training Config: {'training': {'train_batch_size': 100, 'val_batch_size': 100, 'test_batch_size': 100, 'max_epochs': 5, 'learning_rate': 0.0001, 'optimizer': 'adam', 'global_train_step': 0, 'global_validation_step': 0, 'epoch': -1, 'iterations_per_episode': 1000, 'verbosity': 1}, 'architecture': {'activation': 'ReLU', 'neuron_numbers_encoder': [512], 'neuron_numbers_decoder': [512]}, 'task': {'criterions': {'ce': 1.0, 'l1': 10000.0}, 'output_type': 'multiclass', 'output_size': 2, 'x_selection': ['x1', 'x2'], 'y_selection': ['Target'], 'selection': ['x1', 'x2']}, 'data': {'radius': 1, 'noise_scale': 0, 'num_samples': 500, 'split': [0.8, 0.9], 'input_type': 'symbolic', 'input_size': [2], 'label_noise': 0.0, 'seed': 0, 'output_type': 'multiclass', 'output_size': 2, 'confounder_probability': 0.5, 'confounding_factors': ['Target', 'Confounder'], 'known_confounder': True, 'set_negative_to_zero': False}}


Model Training: validation_0_it: 0, loss: 0.009123411029577255Epoch: 4, train_loss_accumulated: 0.00103426119312644, train_accuracy: 1.0, train_predicted_classes: tensor([0.5000, 0.5000]), train_targets: tensor([0.5000, 0.5000]), train_classes_difference: tensor([0., 0.]), validation_0_loss_accumulated: 0.01188803743571043, validation_0_accuracy: 1.0, validation_0_predicted_classes: tensor([0.5000, 0.5000]), validation_0_targets: tensor([0.5000, 0.5000]), validation_0_classes_difference: tensor([0., 0.]): : 5005it [02:42, 30.76it/s] 


In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
def plot_decision_boundary(model, dataset):
    """Scatter-plot the samples of a symbolic dataset, coloured by their target.

    Despite its name, this function does not draw a decision boundary yet:
    `model` is accepted to keep the call signature stable but is not used.
    TODO: evaluate `model` on a grid spanning the plotted columns and overlay
    the resulting contour.

    Args:
        model: Currently unused; may be None.
        dataset: A `peal.data.datasets.SymbolicDataset`. Its `data` mapping is
            iterated key by key and each value is written into one row of
            width `len(dataset.attributes)`.
            NOTE(review): assumes every value of `dataset.data` is a
            1-D tensor with one entry per attribute - confirm.

    Raises:
        AssertionError: If `dataset` is not a SymbolicDataset.
    """
    # Imported locally so the function does not depend on a global `peal`
    # name (with the `data.datasets` submodule loaded) being present.
    from peal.data.datasets import SymbolicDataset
    assert isinstance(dataset, SymbolicDataset)

    # Collect all samples into one (num_samples, num_attributes) tensor.
    data = torch.zeros([len(dataset.data), len(dataset.attributes)], dtype=torch.float16)
    for idx, key in enumerate(dataset.data):
        data[idx] = dataset.data[key]

    # Column(s) holding the target, used to colour the points.
    target_idx = [
        idx for idx, element in enumerate(dataset.attributes)
        if element in dataset.task_config.y_selection
    ]

    fig, ax = plt.subplots()
    # The first two attribute columns are used as the plot axes.
    ax.scatter(data[:, 0], data[:, 1], c=data[:, target_idx])
    ax.set_xlabel(str(dataset.attributes[0]))
    ax.set_ylabel(str(dataset.attributes[1]))
    ax.grid()

In [8]:
import peal
# The model argument is not used for plotting, so None is passed here.
plot_decision_boundary(None, poisened_dataset_train)

NameError: name 'poisened_dataset_train' is not defined

In [None]:
# Plot the poisoned training split again, this time passing the teacher as the model.
plot_decision_boundary(teacher, poisened_dataset_train)

In [None]:
# Improve the student with counterfactual knowledge distillation (CFKD).
from peal.adaptors.counterfactual_knowledge_distillation import CounterfactualKnowledgeDistillation

# Train and validate on the poisoned splits, test on the unpoisoned test split.
cfkd_kwargs = dict(
    student=student,
    datasource=(
        poisened_dataset_train,
        poisened_dataset_val,
        unpoisened_dataset_test,
    ),
    output_size=2,
    generator=generator,
    teacher=teacher,
    # Default run directory: inside the classifier's folder, suffixed with the teacher type.
    base_dir=request(
        'cfkd_base_dir',
        'peal_runs/artificial_symbolic_' + confounder_probability +
        '_classifier/cfkd_' + teacher_type
    ),
    gigabyte_vram=gigabyte_vram,
    overwrite=False,
)
cfkd = CounterfactualKnowledgeDistillation(**cfkd_kwargs)

# Override two explainer settings before starting the run.
for explainer_option, explainer_value in (
    ('learning_rate', 1.0),
    ('gradient_steps', 51),
):
    cfkd.adaptor_config.explainer[explainer_option] = explainer_value

cfkd.run()

Do you want to change value of cfkd_base_dir==peal_runs/artificial_symbolic_100_classifier/cfkd_oracle? [y/n] n


Adaptor Config: {'architecture': {}, 'assumed_input_size': [3, 64, 64], 'attribution_threshold': 0.5, 'batch_size': 1, 'continuos_learning': False, 'current_iteration': 0, 'data': {'radius': 1, 'noise_scale': 0, 'num_samples': 250, 'split': [0.8, 0.9], 'input_type': 'symbolic', 'input_size': [2], 'label_noise': 0.0, 'seed': 0, 'output_type': 'multiclass', 'output_size': 2, 'confounder_probability': 1.0, 'confounding_factors': ['Target', 'Confounder'], 'known_confounder': True, 'set_negative_to_zero': False, 'has_hint': False}, 'explainer': {'explanation_style': 'counterfactual', 'gradient_steps': 51, 'img_noise_injection': 0.01, 'img_regularization': 0.0, 'l1_regularization': 1.0, 'learning_rate': 1.0, 'log_prob_regularization': 0.0, 'optimizer': 'Adam', 'use_masking': True, 'y_target_goal_confidence': 0.65}, 'fa_1sided_prime': 0.0, 'finetune_iterations': 5, 'gigabyte_vram': 40, 'max_train_samples': 2, 'max_validation_samples': 2, 'min_start_target_percentile': 0.0, 'mixing_ratio': 0.5

ipdb>  self


<peal.adaptors.counterfactual_knowledge_distillation.CounterfactualKnowledgeDistillation object at 0x7f8a24652bb0>


ipdb>  dataloader.dataset.attributes


['# x1', 'x2', 'Target']


In [None]:
# NOTES:
# 1. When the student cell is run without generating the data first, confounder_probability is not defined and, therefore, the logic for loading a pre-existing model fails (the data currently has to be loaded beforehand; skipping that step should be allowed)
# 2. The folder structure for cfkd is unclear
# 3. 

In [None]:
# Snippet intended for dataloaders.py. It is an indented fragment and
# `val_dataloader` is not defined in the visible cells, so it is not meant to
# be run from this notebook as-is.
    # Only act on real torch DataLoader objects.
    if isinstance(val_dataloader, torch.utils.data.dataloader.DataLoader):
        # If the entry under the last key of dataset.data has fewer than one
        # element, remove an entry via popitem().
        # NOTE(review): presumably dataset.data is a dict, in which case
        # popitem() removes that same last entry - confirm.
        if len(val_dataloader.dataset.data[list(val_dataloader.dataset.data.keys())[-1]]) < 1:
            val_dataloader.dataset.data.popitem()


In [None]:
# Inspect the `data` attribute of the poisoned validation split.
poisened_dataset_val.data

In [None]:
# Inspect the task configuration attached to the unpoisoned training split.
unpoisened_dataset_train.task_config