<a href="https://colab.research.google.com/github/DarkEol/AutoML/blob/main/AutoML-Ontology/Code/AutoSki_MetaOnto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Script for model search with Auto-sklearn constrained by an ontology

In [None]:
# Downgrade build-time dependencies before auto-sklearn is installed in the next cell.
# NOTE(review): presumably these exact pins are what lets auto-sklearn build on the
# Colab image - confirm they are still required when the base image changes.
!pip install Cython==0.29.36
!pip uninstall scipy -y
!pip install scipy==1.9
!pip uninstall pyparsing -y
!pip install pyparsing==2.4

In [None]:
# Remove any existing scikit-learn, install the pinned 0.24.2 release, then install auto-sklearn.
# NOTE(review): auto-sklearn itself is not version-pinned here - consider pinning it
# so a fresh run resolves the same release.
!pip uninstall scikit_learn -y
!pip install scikit-learn==0.24.2 --no-build-isolation
!pip install auto-sklearn

In [None]:
# owlready2 is imported below and used to load the OWL ontology files.
!pip install owlready2

In [4]:
#making imports
from __future__ import annotations
import autosklearn.classification
import sklearn.model_selection
import numpy as np
import pandas as pd
#import pickle
#import graphviz
from sklearn import tree
from owlready2 import *
from sklearn.datasets import fetch_openml
#from sklearn.datasets import load_iris
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.tree import export_text
from sklearn import preprocessing
from smac.optimizer.smbo import SMBO
from smac.runhistory.runhistory import RunInfo, RunValue
from autosklearn.metrics import balanced_accuracy, precision, recall, f1
import time

In [6]:
# Early-stopping hook passed to Auto-sklearn through `get_trials_callback`
cost_lim = 0.10   # a single run with cost at or below this value ends the search
def callback(
    smbo: SMBO,
    run_info: RunInfo,
    result: RunValue,
    time_left: float,
) -> bool | None:
    """Tell SMAC to stop once a single run reaches a sufficiently low cost.

    Parameters
    ----------
    smbo, time_left
        Part of the callback signature; not used by this implementation.
    run_info
        Description of the finished run; printed when the search is stopped.
    result
        Outcome of the finished run; ``result.cost`` is compared to ``cost_lim``.

    Returns
    -------
    bool | None
        ``False`` when ``result.cost <= cost_lim`` (SMAC stops the search on
        ``False``), otherwise ``None`` so the search continues.
    """
    # The callback parameters are described in the SMAC documentation:
    # https://automl.github.io/SMAC3/main/
    reached_limit = result.cost <= cost_lim
    if not reached_limit:
        return None
    for item in ("Stopping!", run_info, result):
        print(item)
    return False

In [21]:
# Auto-sklearn classifier component names.
# NOTE(review): presumably the complete, unrestricted search space - it is not
# referenced anywhere else in the visible cells.
full_set = [
    'adaboost',
    'bernoulli_nb',
    'decision_tree',
    'extra_trees',
    'gaussian_nb',
    'gradient_boosting',
    'k_nearest_neighbors',
    'lda',
    'liblinear_svc',
    'libsvm_svc',
    'mlp',
    'multinomial_nb',
    'passive_aggressive',
    'qda',
    'random_forest',
    'sgd',
]

# Translates the name of an ontology algorithm class into the Auto-sklearn
# classifier name that is passed to `include={'classifier': ...}`.
dict_algos = dict(
    AdaBoostClassifier='adaboost',
    ExtraTreesClassifier='extra_trees',
    GradientBoostingClassifier='gradient_boosting',
    KNeighborsClassifier='k_nearest_neighbors',
    PassiveAggressiveClassifier='passive_aggressive',
    LIN_SVC='liblinear_svc',
    MLPClassifier='mlp',
    RandomForestClassifier='random_forest',
    SVM_SVC='libsvm_svc',
)

# Base ontology; its classes are the values stored in `dict_features`
onto = get_ontology("OntologyEmpty.owl").load()

# Keys that resolve to an ontology class with a different name: the negated
# size characteristics are mapped onto their positive counterpart.
_feature_class_alias = {
    'NoManyFeatures': 'FewFeatures',
    'NoFewFeatures': 'ManyFeatures',
    'NoManyInstances': 'FewInstances',
    'NoFewInstances': 'ManyInstances',
}

# Characteristic names, in the order they appear in `dict_features`.
# 'UnaryAttibutes' is spelled this way on purpose: it is the attribute name
# looked up on the ontology object.
_feature_keys = [
    'BinaryClass', 'NoBinaryClass',
    'StringClass', 'NoStringClass',
    'UnaryAttibutes', 'NoUnaryAttibutes',
    'ManyFeatures', 'FewFeatures', 'NoManyFeatures', 'NoFewFeatures',
    'ManyInstances', 'FewInstances', 'NoManyInstances', 'NoFewInstances',
    'BinaryAttributes', 'NoBinaryAttributes',
    'NumericAttributes', 'NoNumericAttributes',
    'NominalAttributes', 'NoNominalAttributes',
    'StringAttributes', 'NoStringAttributes',
    'MissingValues', 'NoMissingValues',
    'NumericClass', 'NoNumericClass',
]

# Data-characteristic name -> ontology class used when querying `suitableFor`
dict_features = {
    key: getattr(onto, _feature_class_alias.get(key, key))
    for key in _feature_keys
}

# One row per AutoML run. The model-name column is declared as "Algorithm" because
# that is the label the search loop writes and the summary cell selects; a different
# header would stay empty and make the summary fail when no run has been recorded.
results = pd.DataFrame(columns=[
    "Dataset", "AutoML library", "Limit", "Num. DataSets", "Data characteristic",
    "Algorithm", "Accuracy", "Precision", "Recall", "F-measure", "AUC", "Time elapsed",
])

In [22]:
#function for querying the ontology with a specific meta-feature
def queryOntology(num, data_char):
  """Return the Auto-sklearn classifiers the ontology marks as suitable.

  Loads ``ontology-<num>-as.owl`` and collects every subclass of
  ``AutoSklearnAlgorithm`` whose ``suitableFor`` contains the ontology class
  registered for ``data_char`` in ``dict_features``.

  Args:
    num: Value inserted into the ontology file name (``ontology-<num>-as.owl``).
    data_char: Key of ``dict_features`` naming the data characteristic.

  Returns:
    Sorted list of unique Auto-sklearn classifier names (values of
    ``dict_algos``). Sorting keeps the result identical between kernel
    restarts, which plain set iteration does not guarantee for strings.

  Raises:
    KeyError: If ``data_char`` is not a key of ``dict_features``, or a matching
      ontology class has no entry in ``dict_algos``.
  """
  ontologyName = "ontology-"+str(num)+"-as.owl"
  onto = get_ontology(ontologyName).load()

  # Resolve the characteristic once, before scanning the algorithm classes
  if data_char not in dict_features:
    raise KeyError(
        "Unknown data characteristic " + repr(data_char)
        + "; expected one of " + str(sorted(dict_features)))
  wanted = dict_features[data_char]

  # querying ontology
  matched = set()
  for algo_cls in onto.AutoSklearnAlgorithm.subclasses():
    if wanted in algo_cls.suitableFor:
      matched.add(dict_algos[algo_cls.name])
  return sorted(matched)

In [None]:
# Every meta-feature flag starts switched off; the checks below turn flags on
features = pd.Series(
    False,
    index=[
        'ManyInstances', 'FewInstances', 'ManyFeatures', 'FewFeatures',
        'BinaryClass', 'DateClass', 'MissingClassValues', 'NominalClass',
        'NumericClass', 'NoClass', 'UnaryClass', 'EmptyNominalClass',
        'StringClass', 'RelationalClass', 'OnlyMulti-InstanceData',
        'BinaryAttributes', 'DateAttributes', 'EmptyNominalAttributes',
        'MissingValues', 'NominalAttributes', 'NumericAttributes',
        'UnaryAttibutes', 'RelationalAttributes', 'StringAttributes',
    ],
)

# Fetch the dataset from OpenML by its numeric ID
opml = fetch_openml(data_id=1461)
X = data = opml.data
y = opml.target
name = opml.details['name']

# Basic dimensions of the dataset
shape = X.shape
n_instances, n_features = shape[0], shape[1]
n_target_values = len(y.unique())

# Column subsets grouped by dtype
category_data = X.select_dtypes(include=['category'])
numeric_data = X.select_dtypes(include=[np.number])
not_numeric_data = X.select_dtypes(exclude=[np.number])

# A target with exactly two distinct values is a binary classification task
binary_class = n_target_values == 2
if binary_class:
  features['BinaryClass'] = True
print("Binary" if binary_class else "MultiClass")

# The class counts as numeric when at least one label is a numeric string,
# otherwise it is flagged as a string class
string_class = not y.str.isnumeric().any()
features['StringClass' if string_class else 'NumericClass'] = True
print("String class" if string_class else "No string class")

# More than 3200 rows counts as "many instances"
if n_instances > 3200:
  many_instances = True
  features['ManyInstances'] = True
  print("Many instances")
else:
  many_instances = False
  features['FewInstances'] = True
  print("Few instances")

# More than 100 columns counts as "many features"
if n_features > 100:
  # Record the flag as well as printing it, mirroring the instances check above;
  # without this assignment neither ManyFeatures nor FewFeatures would be set.
  features['ManyFeatures'] = True
  print("Many features")
else:
  features['FewFeatures'] = True
  print("Few features")

# At least one numeric column
if numeric_data.shape[1] > 0:
  features['NumericAttributes'] = True

# At least one column of pandas 'category' dtype
if category_data.shape[1] > 0:
  features['NominalAttributes'] = True

binary_attributes = False
string_attributes = False
missing_values = False

# Inspect every input column once and raise the matching meta-feature flags
for fname in opml.feature_names:
  column = X[fname]
  n_distinct = len(column.unique())   # computed once, used by both checks below

  if n_distinct == 2:
    binary_attributes = True
    features['BinaryAttributes'] = True

  if n_distinct == 1:
    # The label is spelled 'UnaryAttibutes' in the `features` index and in
    # `dict_features`; any other spelling would append a new entry to the
    # Series and leave the real flag at False.
    features['UnaryAttibutes'] = True

  if column.isna().sum() > 0:
    missing_values = True
    features['MissingValues'] = True

  # Every non-numeric column (including 'category' dtype) is flagged here
  if not pd.api.types.is_numeric_dtype(column):
    string_attributes = True
    features['StringAttributes'] = True

if binary_attributes:
  print("Binary attributes")
else:
  print("No binary attributes")
if missing_values:
  print("Missing values")
else:
  print("No missing values")
if string_attributes:
  print("String attributes")
else:
  print("No string attributes")

# A target of pandas 'category' dtype is flagged as nominal and replaced by
# the integer codes produced by LabelEncoder
target_is_categorical = (y.dtype == 'category')
print("categorial class encoding" if target_is_categorical else "not categorial class")
if target_is_categorical:
  features['NominalClass'] = True
  le = preprocessing.LabelEncoder()
  y = le.fit_transform(y)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = sklearn.model_selection.train_test_split(
    X, y, test_size=0.20, random_state=1
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
num_ds = 35     #number of datasets used for ontology creation (35 or 50); selects the file ontology-<num_ds>-as.owl
runtime = 900   #amount of time allocated for each search (in seconds)
d_char = "StringAttributes" #data characteristic to look up in the ontology; must be a key of dict_features - pick one reported by the previous cell

#running search for best model
if __name__ == '__main__':
  for i in range(5):
    # 0 - search with time constraint
    # 1 - search with time constraint and early stopping
    # >1 - search with time constraint, early stopping and use of ontology

    start_time = time.time()
    num = results.shape[0]   # label of the row this run will fill
    data_char = "No characteristics"
    limits = "cost <= " + str(cost_lim)
    use_ontology = i > 1

    #AutoML settings: start from what every run shares, then add per-mode options
    automl_kwargs = {
      'ensemble_class': None,
      'time_left_for_this_task': runtime,
    }
    if i == 0:
      limits = "No limits"
    else:
      automl_kwargs['get_trials_callback'] = callback

    if use_ontology:
      data_char = d_char
      list_algos = queryOntology(num_ds, data_char)
      print(list_algos)
      automl_kwargs['include'] = {'classifier': list_algos}

    cls = autosklearn.classification.AutoSklearnClassifier(**automl_kwargs)

    print("searching for model")
    cls.fit(X_train, y_train)
    print("model found")
    elapsed = time.time() - start_time
    print("elapsed: ", elapsed)

    #info about found models
    leaderboard = cls.leaderboard()   # built once and reused below
    print("leaderboard")
    print(leaderboard)

    # Classifier name of the top leaderboard row, selected by column name
    # instead of by position
    algo = leaderboard["type"].iloc[0]

    #predictions and metrics
    # The metric values use *_score names so they do not overwrite the
    # `precision`, `recall` and `f1` scorers imported from autosklearn.metrics.
    predictions = cls.predict(X_test)
    accuracy = cls.score(X_test, y_test)
    if binary_class:
      precision_score = sklearn.metrics.precision_score(y_test, predictions)
      recall_score = sklearn.metrics.recall_score(y_test, predictions)
      f1_score = sklearn.metrics.f1_score(y_test, predictions)
      # AUC needs a continuous score: use the probability of the class with the
      # greater label (second column), not the hard 0/1 predictions.
      positive_proba = cls.predict_proba(X_test)[:, 1]
      auc = sklearn.metrics.roc_auc_score(y_test, positive_proba)
    else:
      precision_score = sklearn.metrics.precision_score(y_test, predictions, average='macro', zero_division=0) #macro for multiclass
      recall_score = sklearn.metrics.recall_score(y_test, predictions, average='macro') #macro for multiclass
      f1_score = sklearn.metrics.f1_score(y_test, predictions, average='macro')
      pred_proba = cls.predict_proba(X_test)  # for multiclass tasks
      auc = sklearn.metrics.roc_auc_score(y_test, pred_proba, multi_class='ovr') #pred_proba for multiclass

    print("algo: ",algo)
    print("Accuracy", accuracy)
    print("Precision", precision_score)
    print("Recall", recall_score)
    print("F-measure", f1_score)
    print("AUC", auc)

    # The row is written only after the run succeeded, so a failed ontology
    # query or fit does not leave a half-filled row behind.
    record = {
      'Dataset': name,
      'AutoML library': "Auto-Sklearn",
      'Limit': limits,
      'Data characteristic': data_char,
      'Algorithm': algo,   # the column the summary cell prints
      'Accuracy': accuracy,
      'Precision': precision_score,
      'Recall': recall_score,
      'F-measure': f1_score,
      'AUC': auc,
      'Time elapsed': elapsed,
    }
    if use_ontology:
      record['Num. DataSets'] = num_ds
    for field, value in record.items():
      results.at[num, field] = value

In [None]:
# Print the size of the results table and its key columns, then export all of it to Excel
summary_columns = [
    'Dataset', 'Num. DataSets', 'Limit', 'Data characteristic',
    'Algorithm', 'Accuracy', 'Time elapsed',
]
print(results.shape)
print(results[summary_columns])
results.to_excel('Experiments-as.xlsx')