## Script for building a decision tree and ontology for AutoML-H2O from meta-features

In [None]:
!pip install owlready2

Collecting owlready2
  Downloading owlready2-0.45.tar.gz (27.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.3/27.3 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: owlready2
  Building wheel for owlready2 (pyproject.toml) ... [?25l[?25hdone
  Created wheel for owlready2: filename=owlready2-0.45-cp310-cp310-linux_x86_64.whl size=24077432 sha256=a14f99661bd25ab6082c5e1f268cf2e1f39844a23f3736c1aaa53112274c7c89
  Stored in directory: /root/.cache/pip/wheels/5c/f4/9d/249b1671d391e3feddd443c7d0eb79a732afabc9f370591271
Successfully built owlready2
Installing collected packages: owlready2
Successfully installed owlready2-0.45


In [None]:
import pandas as pd
import graphviz
import pickle
from sklearn import tree
from sklearn.tree import export_text
from owlready2 import *

In [None]:
# Build the decision tree that maps dataset meta-features to an algorithm label.

# The CSV is semicolon-delimited; the last column is the target, all others are features.
info = pd.read_csv('features-h2o-40-semi.csv', delimiter=';')

X = info.iloc[:, :-1]
y = info.iloc[:, -1]
feature_names = list(X.columns)
print("x len: ", len(X))

# A fixed seed makes the fitted tree reproducible across runs
# (scikit-learn permutes the features randomly at each split).
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X, y)

print("save decision tree into file")
# NOTE: the file content is a binary pickle even though the name ends in .txt.
with open('decisionTree.txt', 'wb') as file:
    pickle.dump(clf, file)
print("file saved")

r = export_text(clf, feature_names=feature_names)
print(r)

# class_names must list the distinct classes in the classifier's own order
# (clf.classes_); passing the raw target column would label the nodes with
# per-row values instead of class names.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=feature_names,
    class_names=[str(c) for c in clf.classes_],
)
graph = graphviz.Source(dot_data)
graph.render("tree")

x len:  35
save decision tree into file
file saved
|--- NumericAttributes <= 0.50
|   |--- ManyFeatures <= 0.50
|   |   |--- MissingValues <= 0.50
|   |   |   |--- BinaryClass <= 0.50
|   |   |   |   |--- class: StackedEnsemble
|   |   |   |--- BinaryClass >  0.50
|   |   |   |   |--- ManyInstances <= 0.50
|   |   |   |   |   |--- class: GBM
|   |   |   |   |--- ManyInstances >  0.50
|   |   |   |   |   |--- class: StackedEnsemble
|   |   |--- MissingValues >  0.50
|   |   |   |--- class: GBM
|   |--- ManyFeatures >  0.50
|   |   |--- class: DRF
|--- NumericAttributes >  0.50
|   |--- FewFeatures <= 0.50
|   |   |--- UnaryAttributes <= 0.50
|   |   |   |--- BinaryAttributes <= 0.50
|   |   |   |   |--- BinaryClass <= 0.50
|   |   |   |   |   |--- class: StackedEnsemble
|   |   |   |   |--- BinaryClass >  0.50
|   |   |   |   |   |--- class: StackedEnsemble
|   |   |   |--- BinaryAttributes >  0.50
|   |   |   |   |--- class: StackedEnsemble
|   |   |--- UnaryAttributes >  0.50
|   |   

'tree.pdf'

In [None]:
#traverse the tree and add the path to leaf to ontology

def extendOntology(list_of_nodes):
  """Add one root-to-leaf path of the decision tree to the ontology.

  list_of_nodes holds the feature keys met along the path (keys of
  dict_features) and, as its last element, the algorithm key (a key of
  dict_algos). The algorithm's ontology class receives a new entry in is_a:
  onto.H2OAlgorithm intersected with one suitableFor.some(...) restriction
  per feature key on the path.

  The argument is left unmodified. Raises IndexError for an empty list and
  KeyError when a key is missing from dict_features or dict_algos.
  """
  # Index instead of pop() so the caller's list is not altered.
  algo = list_of_nodes[-1]
  info_to_add = onto.H2OAlgorithm
  for node in list_of_nodes[:-1]:
    info_to_add = info_to_add & onto.suitableFor.some(dict_features[node])
  dict_algos[algo].is_a.append(info_to_add)

def travelTree(clf, node_id, list_of_nodes):
  """Walk the fitted tree depth-first and pass every root-to-leaf path to extendOntology.

  clf: fitted classifier; its tree_ arrays and classes_ are read.
  node_id: index of the node to visit (0 is the root).
  list_of_nodes: feature keys collected on the way to node_id; not modified.

  Going to the left child adds "No" + feature name to the path, going to the
  right child adds the plain feature name. At a leaf, the class with the
  largest entry in tree_.value is appended and the path is handed over.

  NOTE: a later cell defines another function named travelTree (it prints
  the paths). Once that cell has run, re-run this cell before calling this
  version again.
  """
  left_id = clf.tree_.children_left[node_id]
  right_id = clf.tree_.children_right[node_id]

  if left_id == -1 and right_id == -1:
    # Leaf: there is no split feature here (tree_.feature is a negative
    # sentinel), so the column lookup below must not run.
    leaf_class = clf.classes_[clf.tree_.value[node_id].argmax()]
    extendOntology(list_of_nodes + [leaf_class])
    return

  # tree_.feature indexes the training columns, which are the leading columns of info.
  feature = info.columns[clf.tree_.feature[node_id]]
  if left_id != -1:
    travelTree(clf, left_id, list_of_nodes + ["No" + feature])
  if right_id != -1:
    travelTree(clf, right_id, list_of_nodes + [feature])

# Load the base ontology, map tree labels to ontology classes, then add every tree path to it.
onto = get_ontology("ClassOntologyEmpty.owl").load()

# Algorithm label (leaf class of the tree) -> ontology class.
dict_algos = {
'DeepLearning' : onto.Deep_Learning,
'DRF' : onto.Distributed_Random_Forest,
'GBM' : onto.Gradient_Boosting_Machine,
'GLM' : onto.Generalized_Linear_Model,
'StackedEnsemble' : onto.Stacked_Ensembles,
'XGBoost' : onto.XGBoost }

# Path key -> ontology class. A "No" prefix marks the left branch of a split.
# For the Few/Many pairs the negated key is mapped to the opposite class.
dict_features = {
    'BinaryClass' : onto.BinaryClass,
    'NoBinaryClass' : onto.NoBinaryClass,
    'StringClass' : onto.StringClass,
    'NoStringClass' : onto.NoStringClass,
    'UnaryAttributes' : onto.UnaryAttributes,
    'NoUnaryAttributes' : onto.NoUnaryAttributes,
    'ManyFeatures' : onto.ManyFeatures,
    'NoManyFeatures' : onto.FewFeatures,
    'FewFeatures' : onto.FewFeatures,
    'NoFewFeatures' : onto.ManyFeatures,
    'ManyInstances' : onto.ManyInstances,
    'FewInstances' : onto.FewInstances,
    'NoManyInstances' : onto.FewInstances,
    'NoFewInstances' : onto.ManyInstances,
    'BinaryAttributes' : onto.BinaryAttributes,
    'NoBinaryAttributes' : onto.NoBinaryAttributes,
    'NumericAttributes' : onto.NumericAttributes,
    'NoNumericAttributes' : onto.NoNumericAttributes,
    'NominalAttributes' : onto.NominalAttributes,
    'NoNominalAttributes' : onto.NoNominalAttributes,
    'StringAttributes' : onto.StringAttributes,
    'NoStringAttributes' : onto.NoStringAttributes,
    'MissingValues' : onto.MissingValues,
    'NoMissingValues' : onto.NoMissingValues,
    'NumericClass' : onto.NumericClass,
    'NoNumericClass' : onto.NoNumericClass
    }

# owlready2 resolves a name that is absent from the ontology to None instead of
# raising, which would silently produce wrong restrictions. Report such entries.
for _table_name, _table in (("dict_algos", dict_algos), ("dict_features", dict_features)):
  _unresolved = [key for key, entity in _table.items() if entity is None]
  if _unresolved:
    print("WARNING: no ontology entity found for", _table_name, "keys:", _unresolved)

# Start at the root (node 0) with an empty path.
travelTree(clf, 0, [])

onto.save(file = "ontoUpdated.owl")

# Scratch check of how restrictions compose. It runs after onto.save and appends
# nothing to any class, so it does not affect ontoUpdated.owl.
class_name = 'BinaryClass'
# Look the entity up by the value of class_name; onto.class_name would search
# for an entity literally called "class_name".
info_to_insert = onto.AutoSklearnAlgorithm & onto.suitableFor.some(getattr(onto, class_name))
print(type(info_to_insert))
info_to_insert = info_to_insert & onto.suitableFor.some(onto.ManyFeatures)
print(info_to_insert)

<class 'owlready2.class_construct.And'>
ClassOntologyEmpty.AutoSklearnAlgorithm & ClassOntologyEmpty.suitableFor.some(owl.Thing) & ClassOntologyEmpty.suitableFor.some(ClassOntologyEmpty.ManyFeatures)


In [None]:
#traverse the tree and print the path

def travelTree(clf, node_id, str_path):
  """Print every root-to-leaf path of the fitted tree, one line per leaf.

  clf: fitted classifier; its tree_ arrays and classes_ are read.
  node_id: index of the node to visit (0 is the root).
  str_path: text accumulated for the path leading to node_id.

  Going to the left child adds "NO <feature> - " to the text, going to the
  right child adds "<feature> - ". At a leaf, the class with the largest
  entry in tree_.value is appended and the line is printed.

  NOTE: this definition replaces the earlier travelTree that feeds the
  ontology; the two take a different third argument.
  """
  left_id = clf.tree_.children_left[node_id]
  right_id = clf.tree_.children_right[node_id]

  if left_id == -1 and right_id == -1:
    # Leaf: there is no split feature here (tree_.feature is a negative
    # sentinel), so the column lookup below must not run.
    print(str_path + clf.classes_[clf.tree_.value[node_id].argmax()])
    return

  # tree_.feature indexes the training columns, which are the leading columns of info.
  feature = info.columns[clf.tree_.feature[node_id]]
  if left_id != -1:
    travelTree(clf, left_id, str_path + "NO " + feature + " - ")
  if right_id != -1:
    travelTree(clf, right_id, str_path + feature + " - ")

# Print all tree paths, starting from the root with an empty prefix,
# framed by a separator line before and after.
separator = "========="
print(separator)
travelTree(clf, 0, "")
print(separator)

['libsvm_svc' 'liblinear_svc' 'lda' 'mlp' 'random_forest'
 'gradient_boosting' 'adaboost' 'passive_aggressive' 'k_nearest_neighbors'
 'extra_trees']
0              libsvm_svc
1           liblinear_svc
2              libsvm_svc
3              libsvm_svc
4              libsvm_svc
5                     lda
6                     mlp
7           random_forest
8              libsvm_svc
9              libsvm_svc
10             libsvm_svc
11          random_forest
12                    mlp
13      gradient_boosting
14      gradient_boosting
15      gradient_boosting
16          liblinear_svc
17          random_forest
18      gradient_boosting
19          random_forest
20             libsvm_svc
21             libsvm_svc
22               adaboost
23          random_forest
24             libsvm_svc
25                    mlp
26          random_forest
27                    lda
28     passive_aggressive
29               adaboost
30    k_nearest_neighbors
31          random_forest
32      gradient_bo