In [None]:
# Drive version:
# !pip uninstall TabularExperimentTrackerClient --y
# !pip install git+https://github.com/DanielWarfield1/TabularExperimentTrackerClient
# !pip uninstall NeuralNetworksTrainingPackage --y
# !pip install git+https://github.com/Bartosz-G/NeuralNetworksTrainingPackage

In [2]:
# Install the experiment-tracker client and the training package straight from GitHub.
# NOTE(review): neither URL is pinned to a commit or tag, so a re-run may pull different code.
# The saved output of this cell shows the tracker client resolving to commit
# 780933411aa8c4e394478a26dec4a39447f8f012; consider pinning both installs.
%pip install git+https://github.com/DanielWarfield1/TabularExperimentTrackerClient
%pip install git+https://github.com/Bartosz-G/NeuralNetworksTrainingPackage

Collecting git+https://github.com/DanielWarfield1/TabularExperimentTrackerClient
  Cloning https://github.com/DanielWarfield1/TabularExperimentTrackerClient to /tmp/pip-req-build-9jn0v3pk
  Running command git clone --filter=blob:none --quiet https://github.com/DanielWarfield1/TabularExperimentTrackerClient /tmp/pip-req-build-9jn0v3pk
  Resolved https://github.com/DanielWarfield1/TabularExperimentTrackerClient to commit 780933411aa8c4e394478a26dec4a39447f8f012
  Preparing metadata (setup.py) ... [?25ldone
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Collecting git+https://github.com/Bartosz-G/NeuralNetworksTrainingPackage
  Cloning https://github.com/Bartosz-G/NeuralNetworksTrainingPackage to /tmp/pip-req-build-

In [3]:
import numpy as np
import pandas as pd
import sklearn
import torch
import time

In [None]:
# Importing NeuralNetworksPackage
#from NeuralNetworksTrainingPackage.dataprocessing.basic_pre_processing import *
# Global namespace:
# Hyperparams(**run_info.get('hyp'))
# CustomDataset(X, Y, relative_indices, tensor_type=torch.float)
# CustomDatasetWrapper(train_dataset, relative_indices)
# kfold_dataloader_iterator(dataset, n_splits=10, random_state=42, batch_size=16, shuffle_kfold=True, shuffle_dataloader=True)
# get_train_test(X, y, categorical_indicator, attribute_names, train_split, seed)
# get_train_val_test
#from NeuralNetworksTrainingPackage.metrics.basic_metrics import *
# test(args, model, device, test_loader, test_set_name)
# train(args, model, device, train_loader, optimizer, epoch, anneal, alpha=1)
# calc_metrics(y, yhat, is_categorical)

In [4]:
import os
from TabularExperimentTrackerClient.ExperimentClient import ExperimentClient

# Credentials are read from text files under `path` rather than being written in the notebook.
path =  '../creds/'
creds_orch_file = "creds-orch.txt"
creds_openml_file = "creds-openml.txt"

# Orchestrator credentials: line 1 is the user name, line 2 is the secret.
with open(os.path.join(path, creds_orch_file), 'r') as file:
    lines = file.readlines()
if len(lines) < 2:
    # Fail with a readable message instead of a bare IndexError on a truncated file.
    raise ValueError(
        f"{os.path.join(path, creds_orch_file)} must contain the orchestrator name on line 1 "
        "and the secret on line 2"
    )
orchname = lines[0].strip()
orchsecret = lines[1].strip()

# OpenML API key: strip surrounding whitespace so a trailing newline in the file does not
# become part of the key (the orchestrator credentials above are stripped the same way).
with open(os.path.join(path, creds_openml_file), "r") as myfile:
    openMLAPIKey = myfile.read().strip()

ex = ExperimentClient(verbose = True)


ex.define_orch_cred(orchname, orchsecret)
ex.define_opml_cred(openMLAPIKey)

# Colab version
# ex.define_opml_cred_drive('/My Drive/research/non-homogenous-data/creds/creds-openml.txt')
# ex.define_orch_cred_drive('bart', '/My Drive//research/non-homogenous-data/creds/creds-colab.txt')

# 1. Data pre-processing

In [5]:
from NeuralNetworksTrainingPackage.event_handler import dataPreProcessingEventEmitter
from NeuralNetworksTrainingPackage.dataprocessing.basic_pre_processing import filterCardinality, quantileTransform, trunctuateData
from NeuralNetworksTrainingPackage.dataprocessing.basic_pre_processing import oneHotEncodePredictors, oneHotEncodeTargets, toDataFrame

# Tunables for the pipeline below.
n_sample = 20000
quantile_transform_distribution='uniform'

data_pre_processing = dataPreProcessingEventEmitter()


def _register(step, tasks=('regression', 'classification')):
    """Attach `step` to the emitter for each task in `tasks` (in that order) and return it."""
    for task_name in tasks:
        data_pre_processing.add_pre_processing(task_name, step)
    return step


# Steps are registered in the order written here; each one is instantiated and then attached
# to 'regression' before 'classification', exactly one step at a time.
filter_cardinality = _register(filterCardinality())
quantile_transform = _register(quantileTransform(output_distribution = quantile_transform_distribution))
trunctuate_data = _register(trunctuateData(n = n_sample))
one_hot_encode_predictors = _register(oneHotEncodePredictors())
# Target encoding is registered for classification only.
one_hot_encode_targets = _register(oneHotEncodeTargets(), tasks=('classification',))
to_data_frame = _register(toDataFrame())

# 2. Defining the experiment

In [6]:
experiment_name = 'experiment_LCN_1'


def _constant(value):
    """Search-space entry with a single fixed value."""
    return {'distribution': 'constant', 'value': value}


def _categorical(*choices):
    """Search-space entry drawn from the listed choices (duplicates are kept as given)."""
    return {'distribution': 'categorical', 'values': list(choices)}


def _ranged(kind, low, high):
    """Search-space entry of distribution `kind` bounded by `low` and `high`."""
    return {'distribution': kind, 'min': low, 'max': high}


# LCN and LLN Parameters
# NOTE(review): several categorical lists repeat a value (e.g. 64 in batch_size); presumably
# this up-weights that choice when the tracker samples from the list — confirm.
depth = _ranged('int_uniform', 1, 11)
seed = _constant(42)
drop_type = _categorical('node_dropconnect', 'none')
p = _ranged('float_uniform', 0.25, 0.75)
back_n = _categorical(0, 0, 0, 1)
hidden_dim = _constant(1) # Assertion error coming from Net if not 1
anneal = _categorical('interpolation', 'none', 'approx')
batch_size = _categorical(16, 32, 64, 64, 64, 128, 256)
epochs = _constant(30)
lr = _ranged('log_uniform', 0.05, 0.2) # yields mean = 0.1082, median 0.1
momentum = _constant(0.9)
no_cuda = _constant(False)
lr_step_size = _categorical(10, 10, 15, 20)
gamma = _constant(0.1)

In [7]:
#============================================================
# Search-space builder shared by every (network, optimizer, task) combination
#============================================================
def _build_search_space(net_type, optimizer, task):
    """Return the hyperparameter search space for one model group.

    All eight model groups share the same distributions (the module-level `depth`, `seed`,
    `drop_type`, ... dicts defined in the previous cell) and differ only in three constants.

    Parameters
    ----------
    net_type : str
        Value stored under 'net_type' ('locally_constant' or 'locally_linear' below).
    optimizer : str
        Value stored under 'optimizer' ('SGD' or 'AMSGrad' below).
    task : str
        Value stored under 'task' ('regression' or 'classification' below).

    Returns
    -------
    dict
        Mapping of hyperparameter name to its distribution description; key order matches
        the original hand-written dictionaries.
    """
    return {
        'depth': depth,
        'seed': seed,
        'drop_type': drop_type,
        'p': p,
        'ensemble_n': {'distribution': 'constant', 'value': 1},
        'shrinkage': {'distribution': 'constant', 'value': 1},
        'back_n': back_n,
        'net_type': {'distribution': 'constant', 'value': net_type},
        'hidden_dim': hidden_dim,
        'anneal': anneal,
        'optimizer': {'distribution': 'constant', 'value': optimizer},
        'batch_size': batch_size,
        'epochs': epochs,
        'lr': lr,
        'momentum': momentum,
        'no_cuda': no_cuda,
        'lr_step_size': lr_step_size,
        'gamma': gamma,
        'task': {'distribution': 'constant', 'value': task}
    }


#============================================================
# Regression: Locally Constant Networks
#============================================================
LCN_reg_SGD_space = _build_search_space('locally_constant', 'SGD', 'regression')
LCN_reg_AMSGrad_space = _build_search_space('locally_constant', 'AMSGrad', 'regression')

#============================================================
# Regression: Locally Linear Networks
#============================================================
LLN_reg_SGD_space = _build_search_space('locally_linear', 'SGD', 'regression')
LLN_reg_AMSGrad_space = _build_search_space('locally_linear', 'AMSGrad', 'regression')

#============================================================
# Classification: Locally Constant Networks
#============================================================
LCN_cls_SGD_space = _build_search_space('locally_constant', 'SGD', 'classification')
LCN_cls_AMSGrad_space = _build_search_space('locally_constant', 'AMSGrad', 'classification')

#============================================================
# Classification: Locally Linear Networks
#============================================================
LLN_cls_SGD_space = _build_search_space('locally_linear', 'SGD', 'classification')
LLN_cls_AMSGrad_space = _build_search_space('locally_linear', 'AMSGrad', 'classification')

In [8]:
# Group name -> search space, listed in the order the groups are registered.
_search_space_by_group = {
    'LCN_reg_SGD': LCN_reg_SGD_space,
    'LCN_reg_AMSGrad': LCN_reg_AMSGrad_space,
    'LLN_reg_SGD': LLN_reg_SGD_space,
    'LLN_reg_AMSGrad': LLN_reg_AMSGrad_space,
    'LCN_cls_SGD': LCN_cls_SGD_space,
    'LCN_cls_AMSGrad': LCN_cls_AMSGrad_space,
    'LLN_cls_SGD': LLN_cls_SGD_space,
    'LLN_cls_AMSGrad': LLN_cls_AMSGrad_space,
}

# Every group uses its own name as the 'model' label and its search space as 'hype'.
model_groups = {
    group_name: {'model': group_name, 'hype': search_space}
    for group_name, search_space in _search_space_by_group.items()
}

ex.def_model_groups(model_groups)

In [9]:
ex.def_data_groups_opml()
print('automatically defined data groups: {}'.format(ex.data_groups.keys()))


def _groups_tagged(tag):
    """Names of the model groups whose key contains `tag`, in registration order."""
    return [group_name for group_name in model_groups if tag in group_name]


classification_models = _groups_tagged('_cls')
regression_models = _groups_tagged('_reg')

# Pair each data group with the model groups that run on it:
# regression data with the '_reg' groups, classification data with the '_cls' groups.
applications = {
    'opml_reg_purnum_group': regression_models,
    'opml_reg_numcat_group': regression_models,
    'opml_class_purnum_group': classification_models,
    'opml_class_numcat_group': classification_models,
}

ex.def_applications(applications)
ex.reg_experiment(experiment_name)

automatically defined data groups: dict_keys(['opml_reg_purnum_group', 'opml_class_purnum_group', 'opml_reg_numcat_group', 'opml_class_numcat_group'])
existing experiment found


'existing experiment found'

In [36]:
# Fetch the experiment summary from the tracker and report how many runs it requires.
exp_info = ex.experiment_info()
successful_runs, required_runs = (
    exp_info[field] for field in ('successful_runs', 'required_runs')
)
print(f'total required runs: {required_runs}')

total required runs: 14160


# 3. Main training loop:

In [37]:
from NeuralNetworksTrainingPackage.dataprocessing.basic_pre_processing import get_train_val_test, CustomDataset
from NeuralNetworksTrainingPackage.metrics.basic_metrics import calc_metrics
from models.LcnNetwork import *
from training.LcnTrain import *


# One tracked run per iteration. The iteration count comes from `required_runs` (read from the
# tracker in the cell above) instead of a hard-coded copy of that number.
for i in range(required_runs):
    print(f'==== Begin run:{i} ====')
    print('---- Initialising parameters for the run ----')
    # The tracker returns the run description; 'hyp' holds the sampled hyperparameters,
    # 'model' the model-group name and 'mtpair_task' the OpenML task identifier.
    run_info = ex.begin_run_sticky()
    args = Hyperparams(**run_info.get('hyp')) # hyperparameters for LCN need to be in form of an object (you can ignore this)
    print(run_info)

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Seed the pre-processing pipeline, torch and numpy from the run's seed.
    data_pre_processing.set_seed_for_all(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    print('---- Loading datasets ----')
    X, y, categorical_indicator, attribute_names = ex.opml_load_task(run_info['mtpair_task'])
    # NOTE(review): `args.input_dim` / `args.output_dim` are read when building the model below;
    # presumably get_train_val_test sets them on `args` — confirm.
    train_data, val_data, test_data = get_train_val_test(X, y, categorical_indicator, attribute_names,
                                                         data_pre_processing,
                                                         task = args.task,
                                                         model = run_info.get('model'),
                                                         split = [0.5, 0.25, 0.25],
                                                         args = args) # Returns CustomDataset obj instances

    # Mini-batch loader used for optimisation and for the per-epoch loss.
    train_dataloader = torch.utils.data.DataLoader(train_data, # CustomDataset obj can be directly passed to dataloader
                                                   batch_size=args.batch_size,
                                                   shuffle= True)

    # Full-dataset loaders: each yields its whole split as a single batch.
    train_eval_dataloader = torch.utils.data.DataLoader(train_data, # CustomDataset obj can be directly passed to dataloader
                                                        batch_size=len(train_data),
                                                        shuffle= True) # Required for test_metrics()

    val_dataloader = torch.utils.data.DataLoader(val_data, # CustomDataset obj can be directly passed to dataloader
                                                 batch_size=len(val_data),
                                                 shuffle= True)

    test_dataloader = torch.utils.data.DataLoader(test_data, # CustomDataset obj can be directly passed to dataloader
                                                  batch_size=len(test_data),
                                                  shuffle= True)

    model = Net(input_dim= args.input_dim,
                output_dim= args.output_dim,
                hidden_dim= args.hidden_dim,
                num_layer= args.depth,
                num_back_layer= args.back_n,
                dense= True,
                drop_type= args.drop_type,
                net_type= args.net_type,
                approx= args.anneal).to(device)

    if args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, nesterov=True)
    elif args.optimizer == 'AMSGrad':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
    else:
        # Without this guard an unknown name would fail with a NameError on the first run, or
        # silently reuse the previous run's optimizer (bound to the previous model) afterwards.
        raise ValueError(f"Unsupported optimizer: {args.optimizer!r}")
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.gamma)

    metrics = {}
    start_epoch = 1  # start from epoch 1 or last checkpoint epoch
    # The net_type is renamed only after the model has been built with the original value.
    if args.anneal == 'approx':
        args.net_type = 'approx_' + args.net_type

    start_time = time.time()
    for epoch in range(start_epoch, args.epochs + start_epoch):
        print(f"----{epoch}th training epoch ----")
        epoch_metrics = {}

        # NOTE(review): passing the epoch to `step()` and stepping before the optimizer is
        # deprecated in recent PyTorch; left unchanged so the learning-rate schedule of
        # already-recorded runs is not altered.
        scheduler.step(epoch)

        alpha = get_alpha(epoch, args.epochs)
        train_approximate_loss = train(args, model, device, train_dataloader, optimizer, epoch, args.anneal, alpha)

        train_loss = get_loss(args, model, device, train_dataloader, 'train')
        val_loss = get_loss(args, model, device, val_dataloader, 'valid')
        test_loss = get_loss(args, model, device, test_dataloader, 'test')

        # The last epoch is not reported per-epoch; its losses go into the final `metrics`
        # in the `else` branch below. This also skips the NaN check for that epoch.
        if epoch == args.epochs:
            continue

        epoch_metrics['train_loss'] = train_loss
        epoch_metrics['val_loss'] = val_loss
        epoch_metrics['test_loss'] = test_loss
        epoch_metrics['epoch'] = epoch

        ex.update_run(epoch_metrics)

        # NOTE(review): breaking here skips the `else` branch, so `ex.end_run()` is never
        # called for a diverged run — confirm that this is what the tracker expects.
        if torch.isnan(torch.tensor(train_loss)).item():
            print('---Stopping training due to loss being nan!---')
            break

    else:
        # Reached only when the epoch loop finished without `break`.
        # Despite the key name, this is the wall-clock time of the whole epoch loop.
        metrics['epoch_time'] = time.time()-start_time

        train_metrics = get_metrics(args, model, device, train_eval_dataloader, calc_metrics, 'train') #Requires batch_size to be entire dataset
        val_metrics = get_metrics(args, model, device, val_dataloader, calc_metrics, 'valid')
        test_metrics = get_metrics(args, model, device, test_dataloader, calc_metrics, 'test')

        metrics['train_loss'] = train_loss
        metrics['val_loss'] = val_loss
        metrics['test_loss'] = test_loss
        metrics['train_metrics'] = train_metrics
        metrics['validate_metrics'] = val_metrics
        metrics['test_metrics'] = test_metrics

        ex.update_run(metrics)
        ex.end_run()

==== Begin run:0 ====
---- Initialising parameters for the run ----
{'_id': '64f848b2ec39ca1aa41427d6', 'metrics_per_epoch': [], 'experiment_id': '64f0f2556e02727fe9a6ff59', 'experiment_name': 'experiment_LCN_1', 'mtpair_index': 211, 'mtpair_model': 'LLN_cls_AMSGrad', 'mtpair_task': '334-361110', 'is_completed': False, 'user_id': '64d3a7457658d6ec6db139d0', 'user_name': 'bart', 'hyp': {'depth': 11, 'seed': 42, 'drop_type': 'node_dropconnect', 'p': 0.5279647480386398, 'ensemble_n': 1, 'shrinkage': 1, 'back_n': 1, 'net_type': 'locally_linear', 'hidden_dim': 1, 'anneal': 'approx', 'optimizer': 'AMSGrad', 'batch_size': 64, 'epochs': 30, 'lr': 0.17946939538041282, 'momentum': 0.9, 'no_cuda': False, 'lr_step_size': 10, 'gamma': 0.1, 'task': 'classification'}, 'model': 'LLN_cls_AMSGrad', 'task': '334-361110'}
4f848b2ec39ca1aa41427d
{'_id': '64f848b2ec39ca1aa41427d6', 'metrics_per_epoch': [], 'experiment_id': '64f0f2556e02727fe9a6ff59', 'experiment_name': 'experiment_LCN_1', 'mtpair_index': 21

KeyboardInterrupt: 

## Checking whether the code works as intended

In [11]:
# Load one fixed OpenML task through the tracker client for manual inspection.
X, y, categorical_indicator, attribute_names = ex.opml_load_task('335-361103')

downloading task 335-361103
task different than previous task, downloading...


In [27]:
from copy import deepcopy

# Keep an untouched copy of the loaded task under the `*_c` names and always restore the
# working variables from it, so this cell can be re-run after cells that overwrite X / y.
# The snapshot is taken only if it does not exist yet; previously it lived in a commented-out
# line, so a fresh kernel failed here with a NameError.
# NOTE(review): the snapshot captures whatever X / y hold at that moment — run this right
# after loading the task, before any pre-processing cell.
_snapshot_names = ('X_c', 'y_c', 'categorical_indicator_c', 'attribute_names_c')
if not all(name in globals() for name in _snapshot_names):
    X_c, y_c, categorical_indicator_c, attribute_names_c = deepcopy(X), deepcopy(y), deepcopy(categorical_indicator), deepcopy(attribute_names)
X, y, categorical_indicator, attribute_names = deepcopy(X_c), deepcopy(y_c), deepcopy(categorical_indicator_c), deepcopy(attribute_names_c)

In [None]:
# Select row 5 of the training features as a one-row frame (list indexer keeps it 2-D).
some_val = train_data.X.iloc[[5], :]
print(some_val)
# Drop the leading row axis and convert the values to a float tensor; shown as the cell output.
_row_values = some_val.values.squeeze(axis=0)
torch.tensor(_row_values, dtype=torch.float)

In [None]:
# Pull a single batch from `train_dataloader` to inspect what the loader yields.
next(iter(train_dataloader))

In [18]:
# Run the pre-processing registered under the task key, then under the model-group key.
# NOTE(review): the cell overwrites its own inputs, so running it twice processes the data twice.
for _stage_key in ('regression', 'LCN_reg_AMSGrad'):
    X, y, categorical_indicator, attribute_names = data_pre_processing.apply(
        _stage_key, X, y, categorical_indicator, attribute_names
    )

In [None]:
# Check which container type holds the training features.
type(train_data.X)

In [None]:
# Re-load the task of the most recently fetched run.
# Relies on `run_info` from the training loop still being in the kernel namespace.
X, y, categorical_indicator, attribute_names = ex.opml_load_task(run_info['mtpair_task'])

In [30]:
# Build the splits for a fixed regression configuration; only the training split is inspected.
# The unused splits get explicit throwaway names: assigning to `_` / `__` would shadow
# IPython's output-history variables for the rest of the session.
train_data, _val_unused, _test_unused = get_train_val_test(
    X, y, categorical_indicator, attribute_names,
    data_pre_processing,
    task = 'regression',
    model = 'LCN_reg_AMSGrad',
    split = [0.5, 0.25, 0.25],
    args = None,
)

In [32]:
# Print the container type of the features and then of the targets.
for _split_part in (train_data.X, train_data.Y):
    print(type(_split_part))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [34]:
# Shuffled loader with batches of 10 rows over the training split, for manual inspection.
_loader_options = {'batch_size': 10, 'shuffle': True}
train_dataloader = torch.utils.data.DataLoader(train_data, **_loader_options) # CustomDataset obj can be directly passed to dataloader

In [35]:
# Pull one batch from `train_dataloader` and display the tensors it yields.
next(iter(train_dataloader))

[tensor([[0.2698, 0.7518, 0.7553, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000],
         [0.8068, 0.1346, 0.2913, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000],
         [0.5160, 0.3098, 0.9389, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000],
         [0.8068, 0.4479, 0.8700, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 1.0000, 0.0000, 0.0000],
         [0.7267, 0.1717, 0.3554, 0.0000, 0.

In [31]:
# Display the training features.
# NOTE(review): this renders the whole frame; `train_data.X.head()` would keep the saved output small.
train_data.X

Unnamed: 0,Hour,Altitude..m.,PM.sub.2.5..sub..particulate.matter..Hourly.measured.,Month_0,Month_1,Month_10,Month_11,Month_2,Month_3,Month_4,...,DayofWeek_1,DayofWeek_2,DayofWeek_3,DayofWeek_4,DayofWeek_5,DayofWeek_6,Environment.Type_0,Environment.Type_1,Environment.Type_2,Environment.Type_3
0,0.148148,0.552052,0.217217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.107608,0.873373,0.355355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.000000,0.349349,0.158158,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.726727,0.171672,0.143143,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.432432,0.751752,0.453954,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0.432432,0.171672,0.415916,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
19996,0.936436,0.873373,0.247247,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19997,0.806807,0.256256,0.570070,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
19998,0.642643,0.309810,0.094595,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [20]:
# Display the current target values.
y

Unnamed: 0,PM.sub.10..sub..particulate.matter..Hourly.measured.
0,2.795817
1,2.557227
2,2.572612
3,2.079442
4,2.517696
...,...
19995,2.803360
19996,2.928524
19997,1.686399
19998,3.353407


In [19]:
# Display the current feature frame.
# NOTE(review): the saved output has doubly suffixed columns such as `Month_0_0.0`, which
# suggests one-hot encoding was applied twice to the same X — confirm and re-run from raw data.
X

Unnamed: 0,Hour,Altitude..m.,PM.sub.2.5..sub..particulate.matter..Hourly.measured.,Month_0_0.0,Month_0_1.0,Month_1_0.0,Month_1_1.0,Month_10_0.0,Month_10_1.0,Month_11_0.0,...,DayofWeek_6_0.0,DayofWeek_6_1.0,Environment.Type_0_0.0,Environment.Type_0_1.0,Environment.Type_1_0.0,Environment.Type_1_1.0,Environment.Type_2_0.0,Environment.Type_2_1.0,Environment.Type_3_0.0,Environment.Type_3_1.0
0,0.520020,0.544545,0.767267,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
1,0.564064,0.461461,0.592092,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,0.150651,0.811311,0.258258,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
3,0.606106,0.321321,0.344845,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,0.564064,0.228228,0.334835,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1.000000,0.811311,0.267267,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
19996,0.938939,0.758759,0.604605,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
19997,0.606106,0.172172,0.150150,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
19998,0.394394,0.587087,0.963660,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [25]:
# Display the current attribute names.
attribute_names

['Hour',
 'Altitude..m.',
 'PM.sub.2.5..sub..particulate.matter..Hourly.measured.',
 'Month_0_0.0',
 'Month_0_1.0',
 'Month_1_0.0',
 'Month_1_1.0',
 'Month_10_0.0',
 'Month_10_1.0',
 'Month_11_0.0',
 'Month_11_1.0',
 'Month_2_0.0',
 'Month_2_1.0',
 'Month_3_0.0',
 'Month_3_1.0',
 'Month_4_0.0',
 'Month_4_1.0',
 'Month_5_0.0',
 'Month_5_1.0',
 'Month_6_0.0',
 'Month_6_1.0',
 'Month_7_0.0',
 'Month_7_1.0',
 'Month_8_0.0',
 'Month_8_1.0',
 'Month_9_0.0',
 'Month_9_1.0',
 'DayofWeek_0_0.0',
 'DayofWeek_0_1.0',
 'DayofWeek_1_0.0',
 'DayofWeek_1_1.0',
 'DayofWeek_2_0.0',
 'DayofWeek_2_1.0',
 'DayofWeek_3_0.0',
 'DayofWeek_3_1.0',
 'DayofWeek_4_0.0',
 'DayofWeek_4_1.0',
 'DayofWeek_5_0.0',
 'DayofWeek_5_1.0',
 'DayofWeek_6_0.0',
 'DayofWeek_6_1.0',
 'Environment.Type_0_0.0',
 'Environment.Type_0_1.0',
 'Environment.Type_1_0.0',
 'Environment.Type_1_1.0',
 'Environment.Type_2_0.0',
 'Environment.Type_2_1.0',
 'Environment.Type_3_0.0',
 'Environment.Type_3_1.0']

In [26]:
# Display the attribute names stored in the `_c` snapshot.
# Relies on `attribute_names_c` already existing in the kernel namespace.
attribute_names_c

['Hour',
 'Month',
 'DayofWeek',
 'Environment.Type',
 'Altitude..m.',
 'PM.sub.2.5..sub..particulate.matter..Hourly.measured.']