In [None]:
# Drive version:
# !pip uninstall TabularExperimentTrackerClient --y
# !pip install git+https://github.com/DanielWarfield1/TabularExperimentTrackerClient
# !pip uninstall NeuralNetworksTrainingPackage --y
# !pip install git+https://github.com/Bartosz-G/NeuralNetworksTrainingPackage

In [2]:
%pip uninstall NeuralNetworksTrainingPackage --y
%pip uninstall TabularExperimentTrackerClient --y
%pip install git+https://github.com/DanielWarfield1/TabularExperimentTrackerClient
%pip install git+https://github.com/Bartosz-G/NeuralNetworksTrainingPackage

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
Collecting git+https://github.com/DanielWarfield1/TabularExperimentTrackerClient
  Cloning https://github.com/DanielWarfield1/TabularExperimentTrackerClient to /tmp/pip-req-build-tsdna2jc
  Running command git clone --filter=blob:none --quiet https://github.com/DanielWarfield1/TabularExperimentTrackerClient /tmp/pip-req-build-tsdna2jc
  Resolved https://github.com/DanielWarfield1/TabularExperimentTrackerClient to commit 5e738443d55b9637454776bd740a8083af70272d
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting openml (from TabularExperimentTrackerClient==0.0.1)
  Using cached openml-0.14.1-py3-none-any.whl
Collecting liac-arff>=2.4.0 (from openml->TabularExperimentTrackerClient==0.0.1)
  Using cached liac_arff-2.5.0-py3-none-any.whl
Collecting xmltodict (from openml->TabularExperimentTrackerClient==0.0.1)
  Using cached xmltodict-0.13.

In [3]:
import numpy as np
import pandas as pd
import sklearn
import torch
import time

In [4]:
import os
from TabularExperimentTrackerClient.ExperimentClient import ExperimentClient

# Credentials live outside the notebook, in plain-text files under `path`.
path = '../creds/'
creds_orch_file = "creds-orch.txt"
creds_openml_file = "creds-openml.txt"

# Orchestrator credentials: line 1 is the user name, line 2 is the secret.
with open(os.path.join(path, creds_orch_file), 'r') as file:
    lines = file.readlines()
    orchname = lines[0].strip()
    orchsecret = lines[1].strip()

# OpenML API key: strip it like the orchestrator credentials above, otherwise a
# trailing newline in the file becomes part of the key that is sent to the API.
with open(os.path.join(path, creds_openml_file), "r") as myfile:
    openMLAPIKey = myfile.read().strip()

ex = ExperimentClient(verbose=True)

# Hand both sets of credentials to the experiment client.
ex.define_orch_cred(orchname, orchsecret)
ex.define_opml_cred(openMLAPIKey)

# Colab version
# ex.define_opml_cred_drive('/My Drive/research/non-homogenous-data/creds/creds-openml.txt')
# ex.define_orch_cred_drive('bart', '/My Drive//research/non-homogenous-data/creds/creds-colab.txt')

# 1. Defining the experiment

In [5]:
# Name under which the experiment is registered.
experiment_name = 'test_Mlp_1'


def _categorical(values):
    """Build a 'categorical' distribution spec over the given list of values."""
    return {'distribution': 'categorical', 'values': values}


def _constant(value):
    """Build a 'constant' distribution spec holding a single value."""
    return {'distribution': 'constant', 'value': value}


# Mlp parameters
depth = _categorical(list(range(2, 13)))
hidden_dim = _categorical([64, 128, 256, 512])
seed = _constant(42)
# float implies dropout
regularize = _categorical([None] * 3 + [0.25, 0.5, 0.75] + ['bn'] * 3)
# 'sqrt' implies embedding is sqrt smaller than the number of categories
embd_size = _categorical([None] * 3 + ['sqrt', 64, 128, 256])
optimizer = _categorical(['SGD'] * 4 + ['Adam'])
batch_size = _categorical([64, 128, 256, 512, 1024])
epochs = _categorical([60, 90, 120, 150])
# The only spec that is neither categorical nor constant, so it is spelled out.
lr = {'distribution': 'log_uniform', 'min': 1e-5, 'max': 1e-2}
momentum = _categorical([0, 0.5, 0.9])
no_cuda = _constant(False)
lr_step_size = _categorical([0, 20, 30])
gamma = _categorical([0.2, 0.1, 0.05])




In [6]:
def _mlp_search_space(activation, embd_size_spec):
    """Assemble the hyperparameter search space for one MLP variant.

    The four variants share every module-level spec (depth, seed, regularize,
    hidden_dim, optimizer, batch_size, epochs, lr, momentum, no_cuda,
    lr_step_size, gamma) and differ only in the two arguments below.

    Args:
        activation: value stored in the constant 'activation' spec
            ('relu' or 'sigmoid' in this notebook).
        embd_size_spec: distribution spec placed under the 'embd_size' key.

    Returns:
        dict mapping each hyperparameter name to its distribution spec, with
        the same keys and key order for every variant.
    """
    return {
        'depth': depth,
        'seed': seed,
        'regularize': regularize,
        'embd_size': embd_size_spec,
        'activation': {'distribution': 'constant', 'value': activation},
        'hidden_dim': hidden_dim,
        'optimizer': optimizer,
        'batch_size': batch_size,
        'epochs': epochs,
        'lr': lr,
        'momentum': momentum,
        'no_cuda': no_cuda,
        'lr_step_size': lr_step_size,
        'gamma': gamma,
    }


#============================================================
# MLP on datasets containing categorical variables
#============================================================
# These variants search over the shared `embd_size` spec.
Mlp_relu_cat_space = _mlp_search_space('relu', embd_size)
Mlp_sigm_cat_space = _mlp_search_space('sigmoid', embd_size)

#============================================================
# MLP on continuous predictors
#============================================================
# These variants pin 'embd_size' to a constant None.
Mlp_relu_cont_space = _mlp_search_space('relu', {'distribution': 'constant', 'value': None})
Mlp_sigm_cont_space = _mlp_search_space('sigmoid', {'distribution': 'constant', 'value': None})

In [7]:
# Each model group is keyed by its own name and carries that name plus its
# search space; the mapping is then handed to the experiment client.
_search_space_by_model = {
    'Mlp_relu_cat': Mlp_relu_cat_space,
    'Mlp_sigm_cat': Mlp_sigm_cat_space,
    'Mlp_relu_cont': Mlp_relu_cont_space,
    'Mlp_sigm_cont': Mlp_sigm_cont_space,
}

model_groups = {
    group_name: {'model': group_name, 'hype': space}
    for group_name, space in _search_space_by_model.items()
}

ex.def_model_groups(model_groups)

In [22]:
# Let the client define its OpenML data groups, then show which groups exist.
ex.def_data_groups_opml()
print(f'automatically defined data groups: {ex.data_groups.keys()}')

# Model groups are split by a substring of their name: '_cat' vs '_cont'.
categorical_models = [k for k in model_groups if '_cat' in k]
continous_models = [k for k in model_groups if '_cont' in k]

# Which model groups are applied to which data group: the pure-numerical groups
# get the '_cont' models, the mixed numerical/categorical groups the '_cat' ones.
applications = {'opml_reg_purnum_group': continous_models,
                'opml_reg_numcat_group': categorical_models,
                'opml_class_purnum_group': continous_models,
                'opml_class_numcat_group': categorical_models}

ex.def_applications(applications)
ex.reg_experiment(experiment_name)


# Required to distinguish between classification or regression tasks.
# Uses the public `data_groups` attribute (already read by the print above)
# instead of reaching into ex.__dict__.
regression_tasks = ex.data_groups['opml_reg_purnum_group'] + ex.data_groups['opml_reg_numcat_group']

automatically defined data groups: dict_keys(['opml_reg_purnum_group', 'opml_class_purnum_group', 'opml_reg_numcat_group', 'opml_class_numcat_group'])
existing experiment found


In [9]:
# Query the experiment summary and keep both run counters; only the required
# count is printed here.
exp_info = ex.experiment_info()
successful_runs, required_runs = exp_info['successful_runs'], exp_info['required_runs']
print('total required runs: {}'.format(required_runs))

total required runs: 7080


# 2. General Data pre-processing

In [10]:
from NeuralNetworksTrainingPackage.event_handler import dataPreProcessingEventEmitter
from NeuralNetworksTrainingPackage.dataprocessing.basic_pre_processing import filterCardinality, quantileTransform, truncateData ,balancedTruncateData, oneHotEncodePredictors, oneHotEncodeTargets, toDataFrame, splitTrainValTest, balancedSplitTrainValTest

# Settings shared by the steps below.
n_sample = 20000
split = [0.5, 0.25, 0.25]
quantile_transform_distribution = 'normal'

data_pre_processing = dataPreProcessingEventEmitter()

# Step objects; the ones taking `transform` are all configured with 'all'.
_on_all = {'transform': 'all'}
filter_cardinality = filterCardinality(**_on_all)
truncate_data = truncateData(n=n_sample, **_on_all)
balanced_truncate_data = balancedTruncateData(n=n_sample, **_on_all)  # Ensures balance of classes
one_hot_encode_predictors = oneHotEncodePredictors(**_on_all)
one_hot_encode_targets = oneHotEncodeTargets(**_on_all)
to_data_frame = toDataFrame(**_on_all)
split_train_val_test = splitTrainValTest(split=split)
balanced_split_train_val_test = balancedSplitTrainValTest(split=split)
quantile_transform = quantileTransform(output_distribution=quantile_transform_distribution, **_on_all)

# Transformations will be called in the order they're added to data_pre_processing,
# so the tuples below are written in execution order.
_steps_by_task = {
    'regression': (
        filter_cardinality,
        truncate_data,
        one_hot_encode_predictors,
        to_data_frame,
        split_train_val_test,
        quantile_transform,
    ),
    'classification': (
        filter_cardinality,
        balanced_truncate_data,
        one_hot_encode_predictors,
        one_hot_encode_targets,
        to_data_frame,
        balanced_split_train_val_test,
        quantile_transform,
    ),
}

for _task_name, _task_steps in _steps_by_task.items():
    for _step in _task_steps:
        data_pre_processing.add_pre_processing_step(_task_name, _step)

# 3. Model Specific Data pre-processing

In [11]:
from NeuralNetworksTrainingPackage.dataprocessing.basic_pre_processing import CustomCategoricalSplitDataset, toPyTorchDatasets

# All four MLP variants share one conversion step built around the same
# dataset wrapper class.
pytorch_data_class = CustomCategoricalSplitDataset
to_pytorch_datasets = toPyTorchDatasets(wrapper=pytorch_data_class)


# Transformations will be called after general pre-processing steps, and in order they're added
for _model_group in ('Mlp_relu_cat', 'Mlp_sigm_cat', 'Mlp_relu_cont', 'Mlp_sigm_cont'):
    data_pre_processing.add_pre_processing_step(_model_group, to_pytorch_datasets)

# 4. Model Metrics

In [12]:
from metrics.MlpMetrics import MlpMetricsClassification, MlpMetricsRegression

# One metrics calculator per task type.
mlp_metrics_regression = MlpMetricsRegression()
mlp_metrics_classification = MlpMetricsClassification()

mlp_metrics = dict(regression=mlp_metrics_regression,
                   classification=mlp_metrics_classification)

# Every MLP variant points at the same task-type -> calculator mapping.
metric_model_pairs = dict.fromkeys(
    ('Mlp_relu_cat', 'Mlp_sigm_cat', 'Mlp_relu_cont', 'Mlp_sigm_cont'),
    mlp_metrics,
)

# 5. Model Training routine

In [13]:
from models.MlpNetwork import init_MlpNetwork
from training.MlpTrain import MlpTrainingRoutine

# Model factory and training-routine class used by all MLP variants.
mlp_model_and_training = dict(model_init=init_MlpNetwork, training_routine=MlpTrainingRoutine)

# Every MLP variant points at the same factory/routine pair.
model_training_pairs = dict.fromkeys(
    ('Mlp_relu_cat', 'Mlp_sigm_cat', 'Mlp_relu_cont', 'Mlp_sigm_cont'),
    mlp_model_and_training,
)

# 6. Main Experiment loop

In [24]:
torch_models = ('Mlp_relu_cat', 'Mlp_sigm_cat', 'Mlp_relu_cont', 'Mlp_sigm_cont')

# Must be a real tuple (note the trailing comma). As a bare parenthesised string,
# `model_name in sklearn_models` would silently become a substring test.
sklearn_models = ('no_sklearn_models_in_this_training',)

# The only task types that the pre-processing steps and metric calculators are keyed by.
_task_types = ('regression', 'classification')


# Upper bound on the number of runs attempted from this kernel.
for i in range(14160):
    print(f'==== Begin run:{i} ====')
    run_info = ex.begin_run_sticky()

    hyperparameters = run_info['hyp']
    model_name = run_info['model']

    # hyp['task'] can be missing, and a run record displayed later in this notebook
    # holds the OpenML task id there instead of a task type. Only trust it when it
    # is one of the known task types; otherwise derive it from the task id.
    if hyperparameters.get('task') not in _task_types:
        if run_info['task'] in regression_tasks:
            hyperparameters['task'] = 'regression'
        else:
            hyperparameters['task'] = 'classification'
    task = hyperparameters['task']
    # NOTE: this rebinds the module-level name `seed`, which the search-space cells
    # use as a distribution spec. Re-run the hyperparameter cell before rebuilding
    # the search spaces after this loop has executed.
    seed = hyperparameters['seed']

    torch.manual_seed(seed)
    np.random.seed(seed)

    print('---- Loading datasets ----')
    X, y, categorical_indicator, attribute_names = ex.opml_load_task(run_info['mtpair_task'])

    # Pre-processing: task-type steps first, then the model-specific steps.
    data_pre_processing.set_seed_for_all(seed)
    data_pre_processing.set_dataset(X, y, categorical_indicator, attribute_names)
    data_pre_processing.apply(task)
    data_pre_processing.apply(model_name)
    train_data, val_data, test_data = data_pre_processing.get_train_val_test()

    # Getting appropriate metrics
    metrics_calculator = metric_model_pairs[model_name][task]

    if model_name in sklearn_models:
        # Placeholder branch: nothing is trained for sklearn models in this notebook.
        pass

    elif model_name in torch_models:
        # hyperparameters will be updated with {'input_dim': num_columns_X, 'output_dim':num_columns_Y}
        hyperparameters.update(train_data.get_dims())

        train_batch_size = hyperparameters['batch_size']
        train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=train_batch_size, shuffle=True)
        # Validation and test are each served as one batch covering the whole split.
        val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=len(val_data), shuffle=True)
        test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=len(test_data), shuffle=True)

        init_model = model_training_pairs[model_name]['model_init']
        TrainingRoutine = model_training_pairs[model_name]['training_routine']

        model = init_model(**hyperparameters)
        training_routine = TrainingRoutine(**hyperparameters)

        training_routine.set_optimizer_scheduler(model)

        start_epoch = 1  # start from epoch 1 or last checkpoint epoch
        total_epochs = hyperparameters['epochs']
        start_time = time.time()

        for epoch in range(start_epoch, total_epochs + start_epoch):
            print(f"----{epoch}th training epoch ----")
            epoch_metrics = {}

            training_routine.scheduler_step(epoch)
            train_loss = training_routine.train(model, train_dataloader)

            # A None loss is treated as a NaN loss: report it and abandon this run.
            # The `break` also skips the final-metrics `else` block below.
            if train_loss is None:
                print('---Stopping training due to loss being nan!---')
                epoch_metrics = {'train_loss': train_loss, 'epoch': epoch}
                ex.update_run(epoch_metrics)
                break

            # The epoch numbered `total_epochs` is not reported here; when the loop
            # then finishes, the `else` block reports it with the final metrics.
            if epoch == total_epochs:
                continue

            epoch_metrics.update(train_loss)
            epoch_metrics.update({'epoch': epoch})
            ex.update_run(epoch_metrics)
            print(epoch_metrics)

        else:
            # Runs only when the epoch loop finished without `break`.
            final_metrics = {}
            training_time = time.time() - start_time

            train_metrics = metrics_calculator.get_metrics(model, train_dataloader, hyperparameters, 'train')
            val_all_metrics = metrics_calculator.get_all(model, val_dataloader, hyperparameters, 'val')
            test_all_metrics = metrics_calculator.get_all(model, test_dataloader, hyperparameters, 'test')

            final_metrics.update(train_loss)
            final_metrics.update(train_metrics)
            final_metrics.update(val_all_metrics)
            final_metrics.update(test_all_metrics)
            final_metrics.update({'epoch': epoch})
            final_metrics.update({'epoch_time': training_time})

            ex.update_run(final_metrics)
            print(final_metrics)

    # Close the run whichever branch was taken, including after a NaN abort.
    ex.end_run()

==== Begin run:0 ====
{'_id': '650f249a1e88c2d3788d960f', 'metrics_per_epoch': [], 'experiment_id': '650f22271e88c2d3788cb61b', 'experiment_name': 'test_Mlp_1', 'mtpair_index': 20, 'mtpair_model': 'Mlp_relu_cont', 'mtpair_task': '336-361083', 'is_completed': False, 'user_id': '64d3a7457658d6ec6db139d0', 'user_name': 'bart', 'hyp': {'depth': 2, 'seed': 42, 'regularize': None, 'embd_size': None, 'activation': 'relu', 'hidden_dim': 256, 'optimizer': 'Adam', 'batch_size': 64, 'epochs': 90, 'lr': 5.379641495585897e-05, 'momentum': 0.5, 'no_cuda': False, 'lr_step_size': 20, 'gamma': 0.1}, 'model': 'Mlp_relu_cont', 'task': '336-361083'}
50f249a1e88c2d3788d960
---- Loading datasets ----
downloading task 336-361083
using values from previous task load, skipped download


TypeError: 'NoneType' object is not subscriptable

## Checking whether the code works as intended

In [None]:
# torch_models = ('LCN_reg', 'LCN_cls', 'LLN_reg', 'LLN_cls')

# sklearn_models = ('no_sklearn_models_in_this_training')


# for i in range(1):
#     print(f'==== Begin run:{i} ====')
#     run_info = ex.begin_run()

#     hyperparameters = run_info['hyp']
#     model_name = run_info['model']

#     task = hyperparameters['task']
#     seed = hyperparameters['seed']



#     torch.manual_seed(seed)
#     np.random.seed(seed)


#     print('---- Loading datasets ----')
#     X, y, categorical_indicator, attribute_names = ex.opml_load_task(run_info['mtpair_task'])

#     # Pre-processing
#     data_pre_processing.set_seed_for_all(seed)
#     data_pre_processing.set_dataset(X, y, categorical_indicator, attribute_names)
#     data_pre_processing.apply(task)
#     data_pre_processing.apply(model_name)
#     train_data, val_data, test_data = data_pre_processing.get_train_val_test()


#     # Getting appropriate metrics
#     metrics_calculator = metric_model_pairs[model_name][task]


#     match model_name:
#         case _ if model_name in sklearn_models:
#             pass

#         case _ if model_name in torch_models:
#             # hyperparameters will be updated with {'input_dim': num_columns_X, 'output_dim':num_columns_Y}
#             hyperparameters.update(train_data.get_dims())

#             train_batch_size = hyperparameters['batch_size']
#             train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=train_batch_size,shuffle= True)
#             val_dataloader = torch.utils.data.DataLoader(val_data,batch_size=len(val_data),shuffle= True)
#             test_dataloader = torch.utils.data.DataLoader(test_data,batch_size=len(test_data),shuffle= True)


#             init_model = model_training_pairs[model_name]['model_init']
#             TrainingRoutine = model_training_pairs[model_name]['training_routine']


#             model = init_model(**hyperparameters)
#             training_routine = TrainingRoutine(**hyperparameters)
            
#             training_routine.set_optimizer_scheduler(model)

#             start_epoch = 1  # start from epoch 1 or last checkpoint epoch
#             total_epochs = hyperparameters['epochs']
#             start_time = time.time()

#             for epoch in range(start_epoch, total_epochs + start_epoch):
#                 print(f"----{epoch}th training epoch ----")
#                 epoch_metrics = {}

#                 training_routine.scheduler_step(epoch)
#                 train_loss = training_routine.train(model, train_dataloader)


#                 if train_loss is None:
#                     print('---Stopping training due to loss being nan!---')
#                     epoch_metrics = {'train_loss': train_loss, 'epoch': epoch}
#                     ex.update_run(epoch_metrics)
#                     break

#                 if epoch == total_epochs:
#                     continue

#                 epoch_metrics.update(train_loss)
#                 epoch_metrics.update({'epoch': epoch})
#                 ex.update_run(epoch_metrics)
#                 print(epoch_metrics)

#             else:
#                 final_metrics = {}
#                 training_time = time.time()-start_time

#                 train_metrics = metrics_calculator.get_metrics(model, train_dataloader, hyperparameters, 'train')
#                 val_all_metrics = metrics_calculator.get_all(model, val_dataloader, hyperparameters, 'val')
#                 test_all_metrics = metrics_calculator.get_all(model, test_dataloader, hyperparameters, 'test')

#                 final_metrics.update(train_loss)
#                 final_metrics.update(train_metrics)
#                 final_metrics.update(val_all_metrics)
#                 final_metrics.update(test_all_metrics)
#                 final_metrics.update({'epoch': epoch})
#                 final_metrics.update({'epoch_time': training_time})

#                 ex.update_run(final_metrics)
#                 print(final_metrics)

#     ex.end_run()

In [None]:
# torch_models = ('LCN_reg', 'LCN_cls', 'LLN_reg', 'LLN_cls')

# sklearn_models = ('no_sklearn_models_in_this_training')


# for i in range(1):
#     print(f'==== Begin run:{i} ====')
# #     run_info = ex.begin_run()

#     hyperparameters = run_info['hyp']
#     model_name = run_info['model']

#     task = hyperparameters['task']
#     seed = hyperparameters['seed']



#     torch.manual_seed(seed)
#     np.random.seed(seed)


#     print('---- Loading datasets ----')
#     X, y, categorical_indicator, attribute_names = ex.opml_load_task(run_info['mtpair_task'])

#     # Pre-processing
#     data_pre_processing.set_seed_for_all(seed)
#     data_pre_processing.set_dataset(X, y, categorical_indicator, attribute_names)
#     data_pre_processing.apply(task)
#     data_pre_processing.apply(model_name)
#     train_data, val_data, test_data = data_pre_processing.get_train_val_test()


#     # Getting appropriate metrics
#     metrics_calculator = metric_model_pairs[model_name][task]


#     match model_name:
#         case _ if model_name in sklearn_models:
#             pass

#         case _ if model_name in torch_models:
#             # hyperparameters will be updated with {'input_dim': num_columns_X, 'output_dim':num_columns_Y}
#             hyperparameters.update(train_data.get_dims())

#             train_batch_size = hyperparameters['batch_size']
#             train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=train_batch_size,shuffle= True)
#             val_dataloader = torch.utils.data.DataLoader(val_data,batch_size=len(val_data),shuffle= True)
#             test_dataloader = torch.utils.data.DataLoader(test_data,batch_size=len(test_data),shuffle= True)


#             init_model = model_training_pairs[model_name]['model_init']
#             TrainingRoutine = model_training_pairs[model_name]['training_routine']


#             model = init_model(**hyperparameters)
#             training_routine = TrainingRoutine(**hyperparameters)
            
#             training_routine.set_optimizer_scheduler(model)

#             start_epoch = 1  # start from epoch 1 or last checkpoint epoch
#             total_epochs = hyperparameters['epochs']
#             start_time = time.time()

#             for epoch in range(start_epoch, total_epochs + start_epoch):
#                 print(f"----{epoch}th training epoch ----")
#                 epoch_metrics = {}

#                 training_routine.scheduler_step(epoch)
#                 train_loss = training_routine.train(model, train_dataloader)


#                 if train_loss is None:
#                     print('---Stopping training due to loss being nan!---')
#                     epoch_metrics = {'train_loss': train_loss, 'epoch': epoch}
#                     print(epoch_metrics)
#                     # ex.update_run(epoch_metrics)
#                     break

#                 if epoch == total_epochs:
#                     continue

#                 epoch_metrics.update(train_loss)
#                 epoch_metrics.update({'epoch': epoch})
#                 # ex.update_run(epoch_metrics)
#                 print(epoch_metrics)

#             else:
#                 final_metrics = {}
#                 training_time = time.time()-start_time

#                 train_metrics = metrics_calculator.get_metrics(model, train_dataloader, hyperparameters, 'train')
#                 val_all_metrics = metrics_calculator.get_all(model, val_dataloader, hyperparameters, 'val')
#                 test_all_metrics = metrics_calculator.get_all(model, test_dataloader, hyperparameters, 'test')

#                 final_metrics.update(train_loss)
#                 final_metrics.update(train_metrics)
#                 final_metrics.update(val_all_metrics)
#                 final_metrics.update(test_all_metrics)
#                 final_metrics.update({'epoch': epoch})
#                 final_metrics.update({'epoch_time': training_time})

#                 # ex.update_run(final_metrics)
#                 print(final_metrics)

#     # ex.end_run()

In [None]:
# class oneHotEncodeTargets():
#     def __init__(self, transform = 'all'):
#         self.parent = None
#         self.transform = transform

#     def apply(self, X, y, categorical_indicator, attribute_names):
        
#         y = pd.get_dummies(y, dtype=int)

#         # if isinstance(y, pd.DataFrame):
#         #     is_categorical = any(y[col].dtype.name == 'category' for col in y.columns)
#         #     if is_categorical:
#         #         y = pd.get_dummies(y)
#         # 
#         # if isinstance(y, pd.Series):
#         #     is_categorical = y.dtype.name == 'category'
#         # 
#         #     if is_categorical:
#         #         y = pd.get_dummies(y)

#         return X, y, categorical_indicator, attribute_names
    
# one_hot_encode_targets = oneHotEncodeTargets()

In [None]:
#     hyperparameters = run_info['hyp']
#     model_name = run_info['model']

#     task = hyperparameters['task']
#     seed = hyperparameters['seed']



#     torch.manual_seed(seed)
#     np.random.seed(seed)


#     print('---- Loading datasets ----')
#     X, y, categorical_indicator, attribute_names = ex.opml_load_task(run_info['mtpair_task'])
#     print(f'dtype: {y.dtype}')
    
#     X_o, y_o, categorical_indicator_o, attribute_names_0 = one_hot_encode_targets.apply(X, y, categorical_indicator, attribute_names)

#     print(y_o)
    
#     # Pre-processing
# #     data_pre_processing.set_seed_for_all(seed)
# #     data_pre_processing.set_dataset(X, y, categorical_indicator, attribute_names)
# #     data_pre_processing.apply(task)
# #     data_pre_processing.apply(model_name)
# #     train_data, val_data, test_data = data_pre_processing.get_train_val_test()
    
# #     print(train_data.Y)



In [2]:
# data_pre_processing.events

In [None]:
# hyperparameters

In [None]:
from copy import deepcopy
# Debug helper with two modes, toggled by commenting one line or the other:
#   - the commented-out line snapshots X, y, ... into the *_c names;
#   - the active line restores X, y, ... from that snapshot.
# In the visible notebook the *_c names are only assigned by the commented-out
# line, so it has to be run once before the restore line can work.
#X_c, y_c, categorical_indicator_c, attribute_names_c = deepcopy(X), deepcopy(y), deepcopy(categorical_indicator), deepcopy(attribute_names)
X, y, categorical_indicator, attribute_names = deepcopy(X_c), deepcopy(y_c), deepcopy(categorical_indicator_c), deepcopy(attribute_names_c)

In [None]:
# from NeuralNetworksTrainingPackage.event_handler import dataPreProcessingEventEmitter
# from NeuralNetworksTrainingPackage.dataprocessing.basic_pre_processing import filterCardinality, quantileTransform, truncateData ,balancedTruncateData, oneHotEncodePredictors, toDataFrame, splitTrainValTest, balancedSplitTrainValTest

# n_sample = 20000
# split = [0.5, 0.25, 0.25]
# quantile_transform_distribution='normal'

# class oneHotEncodeTargets():
#     def __init__(self, transform = 'all'):
#         self.parent = None
#         self.transform = transform

#     def apply(self, X, y, categorical_indicator, attribute_names):
        
#         y = pd.get_dummies(y, dtype=int)

#         # if isinstance(y, pd.DataFrame):
#         #     is_categorical = any(y[col].dtype.name == 'category' for col in y.columns)
#         #     if is_categorical:
#         #         y = pd.get_dummies(y)
#         # 
#         # if isinstance(y, pd.Series):
#         #     is_categorical = y.dtype.name == 'category'
#         # 
#         #     if is_categorical:
#         #         y = pd.get_dummies(y)

#         return X, y, categorical_indicator, attribute_names


# data_pre_processing = dataPreProcessingEventEmitter()

# filter_cardinality = filterCardinality(transform = 'all')
# truncate_data = truncateData(n = n_sample, transform = 'all')
# balanced_truncate_data = balancedTruncateData(n = n_sample, transform = 'all') # Ensures balance of classes
# one_hot_encode_predictors = oneHotEncodePredictors(transform = 'all')
# one_hot_encode_targets = oneHotEncodeTargets(transform = 'all')
# to_data_frame = toDataFrame(transform = 'all')
# split_train_val_test = splitTrainValTest(split = split)
# balanced_split_train_val_test = balancedSplitTrainValTest(split = split)
# quantile_transform = quantileTransform(output_distribution = quantile_transform_distribution, transform = 'all')


# # Transformations will be called in the order they're added to data_pre_processing
# data_pre_processing.add_pre_processing_step('regression', filter_cardinality)
# data_pre_processing.add_pre_processing_step('regression', truncate_data)
# data_pre_processing.add_pre_processing_step('regression', one_hot_encode_predictors)
# data_pre_processing.add_pre_processing_step('regression', to_data_frame)
# data_pre_processing.add_pre_processing_step('regression', split_train_val_test)
# data_pre_processing.add_pre_processing_step('regression', quantile_transform)


# data_pre_processing.add_pre_processing_step('classification', filter_cardinality)
# data_pre_processing.add_pre_processing_step('classification', balanced_truncate_data)
# data_pre_processing.add_pre_processing_step('classification', one_hot_encode_predictors)
# data_pre_processing.add_pre_processing_step('classification', one_hot_encode_targets)
# data_pre_processing.add_pre_processing_step('classification', to_data_frame)
# data_pre_processing.add_pre_processing_step('classification', balanced_split_train_val_test)
# data_pre_processing.add_pre_processing_step('classification', quantile_transform)

# from NeuralNetworksTrainingPackage.dataprocessing.basic_pre_processing import CustomDataset, toPyTorchDatasets

# to_pytorch_datasets = toPyTorchDatasets(wrapper = CustomDataset)

# # Transformations will be called after general pre-processing steps, and in order they're added
# data_pre_processing.add_pre_processing_step('LCN_reg', to_pytorch_datasets)

# data_pre_processing.add_pre_processing_step('LCN_cls', to_pytorch_datasets)

# data_pre_processing.add_pre_processing_step('LLN_reg', to_pytorch_datasets)

# data_pre_processing.add_pre_processing_step('LLN_cls', to_pytorch_datasets)


In [None]:
# Manual re-run of the pre-processing pipeline with the task type forced to
# 'classification'. Relies on `seed`, `model_name` and the dataset variables
# (X, y, categorical_indicator, attribute_names) left in the kernel by earlier cells.
data_pre_processing.set_seed_for_all(seed)
data_pre_processing.set_dataset(X, y, categorical_indicator, attribute_names)
data_pre_processing.apply('classification')
data_pre_processing.apply(model_name)
train_data, val_data, test_data = data_pre_processing.get_train_val_test()

In [None]:
# Rebuild the three loaders outside the main loop. `train_batch_size` is not set
# in this cell; it must already exist in the kernel (the main loop assigns it).
# Validation and test loaders use one batch covering the whole split.
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=train_batch_size,shuffle= True)
val_dataloader = torch.utils.data.DataLoader(val_data,batch_size=len(val_data),shuffle= True)
test_dataloader = torch.utils.data.DataLoader(test_data,batch_size=len(test_data),shuffle= True)

In [None]:
# Unpack train_data into the dataset variable names, overwriting X, y,
# categorical_indicator and attribute_names.
# NOTE(review): assumes train_data unpacks into exactly four items here - confirm
# which cell is expected to have produced it.
X, y, categorical_indicator, attribute_names = train_data

In [None]:
# Compare the shape of the current X with the X_c snapshot.
print(X.shape)
print(X_c.shape)

In [None]:
# For the current X: pick the attributes whose categorical_indicator entry is
# falsy, then print their per-column mean and number of distinct values.
non_categorical_columns = [attr for attr, is_cat in zip(attribute_names, categorical_indicator) if not is_cat]
print(X[non_categorical_columns].mean())
print(X[non_categorical_columns].nunique())

In [None]:
# Same check on the snapshot copy (X_c and its *_c metadata): per-column mean
# and number of distinct values of the attributes not flagged as categorical.
non_categorical_columns = [attr for attr, is_cat in zip(attribute_names_c, categorical_indicator_c) if not is_cat]
print(X_c[non_categorical_columns].mean())
print(X_c[non_categorical_columns].nunique())

In [None]:
# Number of distinct values per selected column, for the current X and for the
# X_c snapshot.
# NOTE(review): assumes each indicator is a boolean mask aligned with the
# frame's columns - confirm.
print(X.loc[:, categorical_indicator].nunique())
print(X_c.loc[:, categorical_indicator_c].nunique())

In [None]:
# Frequency of each value in the current y.
y.value_counts()

In [None]:
# Fetch the 'val' and 'test' entries from the pre-processing emitter and print
# the value counts of element [1] of each.
# NOTE(review): element [1] is presumably the target - verify against the emitter.
# This rebinds val_data / test_data in the kernel.
val_data = data_pre_processing.get('val')
test_data = data_pre_processing.get('test')
print(val_data[1].value_counts())
print(test_data[1].value_counts())

In [None]:
# Show the type of the Y attribute of train_data.
type(train_data.Y)

In [None]:
# Iterate over train_dataloader and print the target part of every batch.
for data, target in train_dataloader:
    print(target)

In [21]:
# Sanity check: is this task id listed in regression_tasks?
'336-361072' in regression_tasks

True

In [15]:
# Display the hyperparameters dict left in the kernel by the main loop.
hyperparameters

{'depth': 6,
 'seed': 42,
 'regularize': 0.5,
 'embd_size': None,
 'activation': 'sigmoid',
 'hidden_dim': 64,
 'optimizer': 'SGD',
 'batch_size': 256,
 'epochs': 150,
 'lr': 0.004464764641809309,
 'momentum': 0,
 'no_cuda': False,
 'lr_step_size': 0,
 'gamma': 0.1,
 'task': '336-361083'}

In [16]:
# Display the run record left in the kernel by the main loop.
run_info

{'_id': '650f222909a509ad7a989bcd',
 'metrics_per_epoch': [],
 'experiment_id': '650f22271e88c2d3788cb61b',
 'experiment_name': 'test_Mlp_1',
 'mtpair_index': 21,
 'mtpair_model': 'Mlp_sigm_cont',
 'mtpair_task': '336-361083',
 'is_completed': False,
 'user_id': '64d3a7457658d6ec6db139d0',
 'user_name': 'bart',
 'hyp': {'depth': 6,
  'seed': 42,
  'regularize': 0.5,
  'embd_size': None,
  'activation': 'sigmoid',
  'hidden_dim': 64,
  'optimizer': 'SGD',
  'batch_size': 256,
  'epochs': 150,
  'lr': 0.004464764641809309,
  'momentum': 0,
  'no_cuda': False,
  'lr_step_size': 0,
  'gamma': 0.1,
  'task': '336-361083'},
 'model': 'Mlp_sigm_cont',
 'task': '336-361083'}

In [18]:
# Display the client's data groups (group name -> list of task ids).
ex.__dict__['data_groups']

{'opml_reg_purnum_group': ['336-361072',
  '336-361073',
  '336-361074',
  '336-361076',
  '336-361077',
  '336-361078',
  '336-361079',
  '336-361080',
  '336-361081',
  '336-361082',
  '336-361083',
  '336-361084',
  '336-361085',
  '336-361086',
  '336-361087',
  '336-361088',
  '336-361279',
  '336-361280',
  '336-361281'],
 'opml_class_purnum_group': ['337-361055',
  '337-361060',
  '337-361061',
  '337-361062',
  '337-361063',
  '337-361065',
  '337-361066',
  '337-361068',
  '337-361069',
  '337-361070',
  '337-361273',
  '337-361274',
  '337-361275',
  '337-361276',
  '337-361277',
  '337-361278'],
 'opml_reg_numcat_group': ['335-361093',
  '335-361094',
  '335-361096',
  '335-361097',
  '335-361098',
  '335-361099',
  '335-361101',
  '335-361102',
  '335-361103',
  '335-361104',
  '335-361287',
  '335-361288',
  '335-361289',
  '335-361291',
  '335-361292',
  '335-361293',
  '335-361294'],
 'opml_class_numcat_group': ['334-361110',
  '334-361111',
  '334-361113',
  '334-361282