In [1]:
import warnings
import os
import sys
# try to import peal and if not installed, add the parent directory to the path
try:
    import peal

except ImportError:
    # if peal not installed, but project downloaded locally
    module_path = os.path.abspath(os.path.join('..'))
    if module_path not in sys.path:
        sys.path.append(module_path)

# import basic libraries needed for sure and set the device depending on whether cuda is available or not
import torch
from peal.utils import request
device = 'cuda' if torch.cuda.is_available() else 'cpu'

warnings.filterwarnings('ignore')

# set autoreload for more convinient development
%load_ext autoreload
%autoreload 2

# check and set that the right gpu is used
if device == 'cuda':
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    !nvidia-smi
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    print('Currently used device: ' + str(os.environ["CUDA_VISIBLE_DEVICES"]))
    os.environ["CUDA_VISIBLE_DEVICES"] = request(
        'cuda_visible_devices', default="0")
    torch.cuda.set_device(int(os.environ["CUDA_VISIBLE_DEVICES"]))
    import math
    import nvidia_smi
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    gigabyte_vram = info.total / math.pow(10, 9)
    print("Total memory:", gigabyte_vram)

else:
    gigabyte_vram = None


Mon Jul 31 17:16:58 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03    Driver Version: 460.91.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 970     Off  | 00000000:01:00.0  On |                  N/A |
|  0%   55C    P0    50W / 200W |    442MiB /  4034MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
# Build two families of dataset splits: one from the data config exactly as it
# is loaded from YAML ("unpoisened") and one from a modified copy of that config
# with a user-chosen confounder probability ("poisened").
import copy

from peal.configs.data.template import DataConfig
from peal.configs.models.template import ModelConfig
from peal.data.dataset_factory import get_datasets
from peal.data.dataset_generators import ArtificialConfounderTabularDatasetGenerator
from peal.utils import load_yaml_config

# Generate the artificial tabular data described by the data config.
unpoisened_dataset_config = load_yaml_config(
    '<PEAL_BASE>/configs/data/wave_circle.yaml',
    DataConfig,
)
dg = ArtificialConfounderTabularDatasetGenerator(
    dataset_name='artificial_symbolic',
    num_samples=unpoisened_dataset_config.num_samples,
    input_size=unpoisened_dataset_config.input_size[0],
    label_noise=unpoisened_dataset_config.label_noise,
    seed=unpoisened_dataset_config.seed,
)
dg.generate_dataset()

# When the classifier task selects a subset of input columns, the dataset's
# input size is set to the number of selected columns.
student_config = load_yaml_config(
    '<PEAL_BASE>/configs/models/symbolic_classifier.yaml',
    ModelConfig,
)
if student_config.task.x_selection is not None:
    unpoisened_dataset_config.input_size = [len(student_config.task.x_selection)]

(
    unpoisened_dataset_train,
    unpoisened_dataset_val,
    unpoisened_dataset_test,
) = get_datasets(
    config=unpoisened_dataset_config,
    base_dir=dg.label_dir,
    task_config=student_config.task,
)

# Derive the config for the poisened variant: half as many samples and a
# confounder probability given in percent (default '100').
poisened_dataset_config = copy.deepcopy(unpoisened_dataset_config)
poisened_dataset_config.num_samples = int(unpoisened_dataset_config.num_samples / 2)

confounder_probability = request('confounder_probability', '100')
poisened_dataset_config.confounder_probability = float(confounder_probability) / 100

# Create the splits from the modified data config.
(
    poisened_dataset_train,
    poisened_dataset_val,
    poisened_dataset_test,
) = get_datasets(
    config=poisened_dataset_config,
    base_dir=dg.label_dir,
    task_config=student_config.task,
)

In [None]:
# Obtain the generator: either train a VAE on the poisened training data or
# load a previously saved generator from disk.
is_train_generator = request('is_train_generator', True)
if is_train_generator:
    # Train the generator from scratch.
    from peal.generators.variational_autoencoders import VAE
    from peal.configs.models.template import ModelConfig
    from peal.training.trainers import ModelTrainer
    from peal.data.dataset_wrappers import VAEDatasetWrapper
    generator_config = load_yaml_config('<PEAL_BASE>/configs/models/symbolic_vae.yaml', ModelConfig)
    # Reuse the data config of the dataset the generator is trained on.
    generator_config.data = poisened_dataset_train.config
    generator = VAE(generator_config.architecture, generator_config.data.input_size[0]).to(device)

    # Wrap the train/val splits for VAE training.
    dataset_train = VAEDatasetWrapper(poisened_dataset_train)
    dataset_val = VAEDatasetWrapper(poisened_dataset_val)

    generator_trainer = ModelTrainer(
        config=generator_config,
        model=generator,
        datasource=(dataset_train, dataset_val),
        model_name=request(
            'generator_model_name',
            'artificial_symbolic_' + confounder_probability + '_generator'
        ),
        gigabyte_vram=gigabyte_vram
    )
    generator_trainer.fit()

else:
    # Use a previously saved generator.
    generator_path = request(
        'generator_path',
        'peal_runs/artificial_symbolic_' + confounder_probability + '_generator/model.cpl'
    )
    # NOTE: torch.load unpickles the file, so only load checkpoints you trust.
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU can still be loaded on a CPU-only machine.
    generator = torch.load(generator_path, map_location=device).to(device)


In [4]:
# Obtain the initial student classifier: either train it on the poisened data
# or load a previously saved model from disk.
is_train_student = request('is_train_student', True)
if is_train_student:
    # Train a new initial student model.
    from peal.architectures.downstream_models import SequentialModel
    from peal.configs.models.template import ModelConfig
    from peal.training.trainers import ModelTrainer
    student_config = load_yaml_config('<PEAL_BASE>/configs/models/symbolic_classifier.yaml', ModelConfig)
    student_config.data = poisened_dataset_train.config

    # Settings from the task config take precedence; fall back to the sizes
    # stored in the data config when the task does not specify them.
    if student_config.task.x_selection is not None:
        input_channels = len(student_config.task.x_selection)
    else:
        input_channels = student_config.data.input_size[0]
    if student_config.task.output_channels is not None:
        output_channels = student_config.task.output_channels
    else:
        output_channels = student_config.data.output_size[0]

    # Create and train the student model.
    student = SequentialModel(student_config.architecture, input_channels, output_channels).to(device)
    student_trainer = ModelTrainer(
        config=student_config,
        model=student,
        datasource=(poisened_dataset_train, poisened_dataset_val),
        model_name=request(
            'student_model_name',
            'artificial_symbolic_' + confounder_probability + '_classifier'
        ),
        gigabyte_vram=gigabyte_vram
    )
    student_trainer.fit()
else:
    # Load a previously saved initial student model.
    student_path = request(
        'student_path',
        'peal_runs/artificial_symbolic_' + confounder_probability + '_classifier/model.cpl'
    )
    # NOTE: torch.load unpickles the file, so only load checkpoints you trust.
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU can still be loaded on a CPU-only machine.
    student = torch.load(student_path, map_location=device).to(device)


Training Config: <peal.configs.models.template.ModelConfig object at 0x7fa098190d00>


Model Training: train_it: 741, loss: 0.0006215108442120254Epoch: 0:   8%|▊         | 741/9612 [00:10<01:48, 81.51it/s]

KeyboardInterrupt: 

In [None]:
# Obtain the teacher used for knowledge distillation. 'train' fits a new
# classifier on the unpoisened data, 'load' restores a saved one; in both cases
# `teacher_type` is then set to 'oracle'. Any other value is passed on
# unchanged as the teacher itself.
teacher_type = request('teacher_type', 'train')
if teacher_type == 'train':
    # Train a new model and use it for knowledge distillation.
    from peal.architectures.downstream_models import SequentialModel
    from peal.configs.models.template import ModelConfig
    from peal.training.trainers import ModelTrainer
    teacher_config = load_yaml_config('<PEAL_BASE>/configs/models/symbolic_classifier.yaml', ModelConfig)
    teacher_config.data = unpoisened_dataset_train.config

    # Settings from the task config take precedence; fall back to the sizes
    # stored in the data config when the task does not specify them.
    if teacher_config.task.x_selection is not None:
        input_channels = len(teacher_config.task.x_selection)
    else:
        input_channels = teacher_config.data.input_size[0]
    if teacher_config.task.output_channels is not None:
        output_channels = teacher_config.task.output_channels
    else:
        output_channels = teacher_config.data.output_size[0]

    # Create and train the teacher model.
    teacher = SequentialModel(teacher_config.architecture, input_channels, output_channels).to(device)
    teacher_trainer = ModelTrainer(
        config=teacher_config,
        model=teacher,
        datasource=(unpoisened_dataset_train, unpoisened_dataset_val),
        model_name=request('teacher_model_name', 'artificial_symbolic_unpoisened_classifier'),
        gigabyte_vram=gigabyte_vram
    )
    teacher_trainer.fit()
    teacher_type = 'oracle'

elif teacher_type == 'load':
    # Use an existing model for knowledge distillation.
    teacher_path = request(
        'teacher_path', 'peal_runs/artificial_symbolic_unpoisened_classifier/model.cpl')
    # NOTE: torch.load unpickles the file, so only load checkpoints you trust.
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU can still be loaded on a CPU-only machine.
    teacher = torch.load(teacher_path, map_location=device).to(device)
    teacher_type = 'oracle'

else:
    # No model object: hand the requested teacher type string through as-is.
    teacher = teacher_type

In [None]:
# Debug view of the task settings held by the CFKD adaptor.
# `cfkd` is only created by the counterfactual-knowledge-distillation cell at
# the end of this notebook, so on a fresh kernel this cell runs before the
# object exists. Show a hint in that case instead of failing with NameError.
(
    vars(cfkd.adaptor_config.task)
    if 'cfkd' in globals()
    else "`cfkd` is not defined yet - run the counterfactual knowledge distillation cell first"
)

In [None]:
# Display the repr of the student model obtained in the student cell above.
student

In [None]:
# Display the repr of the generator obtained in the generator cell above.
generator

In [None]:
# Improve the student with counterfactual knowledge distillation (CFKD).
from peal.adaptors.counterfactual_knowledge_distillation import CounterfactualKnowledgeDistillation

# Collect the constructor arguments first. The datasource holds the poisened
# train/val datasets followed by the unpoisened test dataset.
cfkd_kwargs = dict(
    student=student,
    datasource=(poisened_dataset_train, poisened_dataset_val, unpoisened_dataset_test),
    output_size=2,
    generator=generator,
    teacher=teacher,
    # The default output directory encodes the confounder probability and the
    # teacher type.
    base_dir=request(
        'cfkd_base_dir',
        'peal_runs/artificial_symbolic_' + confounder_probability + '_classifier/cfkd_' + teacher_type,
    ),
    gigabyte_vram=gigabyte_vram,
    task_config=student_config.task,
    overwrite=True,
)

cfkd = CounterfactualKnowledgeDistillation(**cfkd_kwargs)
cfkd.run()