<a href="https://colab.research.google.com/github/DarkEol/AutoML/blob/main/AutoML-Ontology/Code/AutoH2O_MetaOnto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Script for model search by H2O AutoML constrained by an ontology

In [None]:
!pip install h2o

In [None]:
!pip install owlready2

In [None]:
import time
import h2o
import numpy as np
import pandas as pd
import pickle
from owlready2 import *
from h2o.automl import H2OAutoML
import sklearn.metrics
from sklearn.datasets import fetch_openml

In [None]:
# start (or attach to) the H2O backend that the H2OFrame and AutoML calls in the later cells rely on
h2o.init()

In [None]:
# Notebook-wide settings shared by the cells below.

# Ontology class name of each algorithm -> identifier passed to H2O AutoML.
dict_algos = dict(
    Distributed_Random_Forest='DRF',
    Generalized_Linear_Model='GLM',
    XGBoost='XGBoost',
    Gradient_Boosting_Machine='GBM',
    Deep_Learning='DeepLearning',
    Stacked_Ensembles='StackedEnsemble',
)

# Every algorithm identifier, in the same order as the mapping above.
full_set = list(dict_algos.values())

# Load the base ontology from a local file.
onto = get_ontology("OntologyEmpty.owl").load()

# Meta-feature key -> class stored in the ontology.
# Most keys map to the class of the same name; the negated size keys reuse the
# opposite class (e.g. 'NoManyFeatures' resolves to FewFeatures).
# The spelling 'UnaryAttibutes' is the ontology's own and must stay as is.
dict_features = {
    key: getattr(onto, class_name)
    for key, class_name in (
        ('BinaryClass', 'BinaryClass'),
        ('NoBinaryClass', 'NoBinaryClass'),
        ('StringClass', 'StringClass'),
        ('NoStringClass', 'NoStringClass'),
        ('UnaryAttibutes', 'UnaryAttibutes'),
        ('NoUnaryAttibutes', 'NoUnaryAttibutes'),
        ('ManyFeatures', 'ManyFeatures'),
        ('FewFeatures', 'FewFeatures'),
        ('NoManyFeatures', 'FewFeatures'),
        ('NoFewFeatures', 'ManyFeatures'),
        ('ManyInstances', 'ManyInstances'),
        ('FewInstances', 'FewInstances'),
        ('NoManyInstances', 'FewInstances'),
        ('NoFewInstances', 'ManyInstances'),
        ('BinaryAttributes', 'BinaryAttributes'),
        ('NoBinaryAttributes', 'NoBinaryAttributes'),
        ('NumericAttributes', 'NumericAttributes'),
        ('NoNumericAttributes', 'NoNumericAttributes'),
        ('NominalAttributes', 'NominalAttributes'),
        ('NoNominalAttributes', 'NoNominalAttributes'),
        ('StringAttributes', 'StringAttributes'),
        ('NoStringAttributes', 'NoStringAttributes'),
        ('MissingValues', 'MissingValues'),
        ('NoMissingValues', 'NoMissingValues'),
        ('NumericClass', 'NumericClass'),
        ('NoNumericClass', 'NoNumericClass'),
    )
}

# Empty table that collects one row per AutoML run.
results = pd.DataFrame(
    columns=[
        "Dataset",
        "AutoML library",
        "Limit",
        "Num. DataSets",
        "Data characteristic",
        "Selected algorithm",
        "Accuracy",
        "Precision",
        "Recall",
        "F-measure",
        "AUC",
        "Time elapsed",
    ]
)

In [None]:
#query the ontology for the algorithms linked to one meta-feature
def queryOntology(num, data_char):
  """Return the H2O algorithm ids that the ontology marks as suitable for a meta-feature.

  Loads the file ``ontology-<num>-h2o.owl``, goes through the subclasses of its
  ``H2OAlgorithm`` class and keeps those whose ``suitableFor`` contains
  ``dict_features[data_char]``. The name of every match is printed.

  Args:
    num: value inserted into the ontology file name.
    data_char: key of ``dict_features`` naming the meta-feature to look up.

  Returns:
    List of algorithm identifiers taken from ``dict_algos``, in the order the
    subclasses were returned.

  Raises:
    KeyError: if ``data_char`` is not a key of ``dict_features`` (only reached
      when at least one subclass exists) or a matching class name is not a key
      of ``dict_algos``.
  """
  loaded_onto = get_ontology(f"ontology-{num}-h2o.owl").load()

  # materialise the subclasses first, then filter them
  candidates = list(loaded_onto.H2OAlgorithm.subclasses())
  matched = []
  for algorithm_cls in candidates:
    if dict_features[data_char] not in algorithm_cls.suitableFor:
      continue
    print(algorithm_cls.name)
    matched.append(dict_algos[algorithm_cls.name])
  return matched

In [None]:
# Meta-feature flags of the dataset that is loaded next.
# Every flag starts as False and is switched on by the extraction code below.
features = pd.Series(
    False,
    index=[
        # dataset size
        'ManyInstances', 'FewInstances', 'ManyFeatures', 'FewFeatures',
        # target (class) column
        'BinaryClass', 'DateClass', 'MissingClassValues', 'NominalClass',
        'NumericClass', 'NoClass', 'UnaryClass', 'EmptyNominalClass',
        'StringClass', 'RelationalClass', 'OnlyMulti-InstanceData',
        # attribute columns
        'BinaryAttributes', 'DateAttributes', 'EmptyNominalAttributes',
        'MissingValues', 'NominalAttributes', 'NumericAttributes',
        'UnaryAttibutes', 'RelationalAttributes', 'StringAttributes',
    ],
)

# Fetch the dataset from OpenML by its numeric id, as pandas objects.
opml = fetch_openml(data_id=41169, as_frame=True, parser='auto')

# Pieces used for meta-feature extraction: attributes, target and dataset name.
X, y = opml.data, opml.target
name = opml.details['name']

# Size of the dataset: rows are instances, columns are features.
shape = X.shape
n_instances, n_features = shape[0], shape[1]

# Number of distinct values in the target column.
n_target_values = len(y.unique())

# Attribute columns grouped by dtype.
numeric_data = X.select_dtypes(include=[np.number])
not_numeric_data = X.select_dtypes(exclude=[np.number])
category_data = X.select_dtypes(include=['category'])

#extracting information about the target class and the dataset size

# exactly two distinct target values -> binary classification
binary_class = n_target_values == 2
if binary_class:
  features['BinaryClass'] = True
  print("Binary")
else:
  print("MultiClass")

# The labels are compared as text: the .str accessor raises AttributeError on
# non-string values (e.g. integer categories), so they are cast to str first.
# The class counts as numeric as soon as any label consists of digits only.
if y.astype(str).str.isnumeric().any():
  features['NumericClass'] = True
  string_class = False
  print("No string class")
else:
  string_class = True
  features['StringClass'] = True
  print("String class")

if y.dtype=='category':
  features['NominalClass'] = True

# more than 3200 rows counts as "many instances"
many_instances = n_instances > 3200
if many_instances:
  features['ManyInstances'] = True
  print("Many instances")
else:
  features['FewInstances'] = True
  print("Few instances")

# more than 100 columns counts as "many features"
if n_features > 100:
  # set the flag as well as printing, mirroring the ManyInstances branch above
  features['ManyFeatures'] = True
  print("Many features")
else:
  features['FewFeatures'] = True
  print("Few features")

#extracting information about dataset attributes
if numeric_data.shape[1] > 0:
  features['NumericAttributes'] = True

if category_data.shape[1] > 0:
  features['NominalAttributes'] = True

binary_attributes = False
string_attributes = False
missing_values = False

for fname in opml.feature_names:
  column = X[fname]
  n_distinct = len(column.unique())  #computed once, used by both checks below

  if n_distinct == 2:
    binary_attributes = True
    features['BinaryAttributes'] = True

  if n_distinct == 1:
    # The label is spelled 'UnaryAttibutes' in the `features` index and in the
    # ontology mapping; any other spelling would append a new entry to the
    # Series instead of switching this flag on.
    features['UnaryAttibutes'] = True

  if column.isna().sum() > 0:
    missing_values = True
    features['MissingValues'] = True

  # NOTE(review): every non-numeric column, categorical ones included, is
  # reported as a string attribute - confirm this is intended
  if not pd.api.types.is_numeric_dtype(column):
    string_attributes = True
    features['StringAttributes'] = True

#summary of what was found
if binary_attributes:
  print("Binary attributes")
else:
  print("No binary attributes")
if missing_values:
  print("Missing values")
else:
  print("No missing values")
if string_attributes:
  print("String attributes")
else:
  print("No string attributes")

# Column names handed to H2O: predictors and response.
# From here on `y` holds the *name* of the target column, not the pandas Series.
x=opml.feature_names
y=opml.target.name

# Move the data into H2O and split it into a 75% training part and the remaining test part.
frame = h2o.H2OFrame(opml.frame)
train, test = frame.split_frame(ratios=[.75])

# H2OFrame.isnumeric() returns a list with one boolean per column. A non-empty
# list is always truthy, so the flag of the single target column is read
# explicitly; otherwise the else branch could never run.
if train[y].isnumeric()[0]:
  # a numeric target has to become a factor for the classification metrics used later
  train[y] = train[y].asfactor()
  test[y] = test[y].asfactor()
  print("convert numeric target to categorial")
else:
  print("target remained the same")

In [None]:
#performing AutoML search for a model under different constraints

cost_lim = 0.70 #threshold for the early stopping (misclassification)
num_ds = 35     #number of datasets used for ontology creation
runtime = 900   #amount of time allocated for the search (in seconds)
d_char = "NoBinaryAttributes" #meta-feature looked up in the ontology

for i in range(0,5):
  # i == 0 : search with the time limit only
  # i == 1 : time limit plus early stopping
  # i > 1  : time limit, early stopping and the algorithms selected through the ontology

  # the clock starts before configuration, so the elapsed time also covers the ontology query
  start_time = time.time()
  num = results.shape[0]  #index of the result row written by this run

  #AutoML settings: the time limit is shared by every run, the rest depends on the mode
  automl_args = {"max_runtime_secs": runtime}
  if i == 0:
    data_char = "No characteristics"
    limits = "No limits"
  else:
    limits = "misclas=" + str(cost_lim)
    automl_args.update(
      stopping_metric="misclassification",
      stopping_rounds=1,
      stopping_tolerance=cost_lim)
    if i == 1:
      data_char = "No characteristics"
    else:
      results.at[num, "Num. DataSets"] = num_ds
      data_char = d_char
      list_algos = queryOntology(num_ds, data_char)
      automl_args["include_algos"] = list_algos

  automl = h2o.automl.H2OAutoML(**automl_args)

  print("searching for model")
  automl.train(x=x, y=y, training_frame=train)
  print("model found")
  elapsed = time.time() - start_time
  print("elapsed: ", elapsed)

  #info about found models
  print("leaderboard")
  print("leaderboard:", h2o.automl.get_leaderboard(automl, extra_columns = "ALL"))

  algo = automl.leader.key

  if binary_class: #metrics for binary classifications
    # NOTE(review): each metric is requested without `thresholds`, so H2O picks
    # the threshold per metric - confirm that accuracy/precision/recall/F1 are
    # meant to come from possibly different thresholds
    perf = automl.leader.model_performance(test)
    accuracy = perf.accuracy()[0][1]
    precision = perf.precision()[0][1]
    recall = perf.recall()[0][1]
    f1 = perf.F1()[0][1]
    auc = perf.auc()
  else: #metrics for multiclass problems, computed with scikit-learn on the test predictions
    y_test = test[y].as_data_frame()
    y_pred_0 = automl.leader.predict(test)  #calculate predictions
    y_pred = y_pred_0['predict'].as_data_frame() #take from table only predictions
    accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
    precision = sklearn.metrics.precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = sklearn.metrics.recall_score(y_test, y_pred, average='macro', zero_division=0)
    # zero_division=0 matches precision/recall above, so classes without predictions do not raise warnings here either
    f1 = sklearn.metrics.f1_score(y_test, y_pred, average='macro', zero_division=0)
    preds_0 = y_pred_0.as_data_frame() #convert H2OFrame to DataFrame
    preds = preds_0.drop('predict', axis=1)  #drop the predicted labels, keep the remaining columns as scores
    y_test_p = y_test[y]  #convert DataFrame to Series
    auc = sklearn.metrics.roc_auc_score(y_test_p, preds, multi_class='ovr')

  print("algo: ",algo)
  print("Accuracy", accuracy)
  print("Precision", precision)
  print("Recall", recall)
  print("F-measure", f1)
  print("AUC", auc)

  #store the outcome of this run as row `num` of the results table
  results.at[num,'Dataset'] = name
  results.at[num,'AutoML library'] = "H2O"
  results.at[num,'Limit'] = limits
  results.at[num,'Data characteristic'] = data_char
  results.at[num,'Selected algorithm'] = algo
  results.at[num,'Accuracy'] = accuracy
  results.at[num,'Precision'] = precision
  results.at[num,'Recall'] = recall
  results.at[num,'F-measure'] = f1
  results.at[num,'AUC'] = auc
  results.at[num,'Time elapsed'] = elapsed

In [None]:
# Report how many runs were recorded, show the key columns, then save the whole table to Excel.
print(len(results.index))
print(results.loc[:, ['Dataset', 'Num. DataSets', 'Accuracy', 'AUC', 'Time elapsed']])
results.to_excel('Experiments-h2o.xlsx')