<a href="https://colab.research.google.com/github/DarkEol/AutoML/blob/main/AutoML-Ontology/Code/AutoH2O_Meta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Script for extracting meta-features from datasets, together with the algorithm selected by H2O AutoML for each dataset

In [None]:
!pip install h2o

In [3]:
import h2o
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn import preprocessing
from h2o.automl import H2OAutoML
import sklearn.metrics

In [None]:
# Initialise the H2O connection; the H2OFrame and AutoML calls in the later
# cells need it to be up before they run.
h2o.init()

In [5]:
# Column layout of the meta-feature table: identification and size columns,
# flags describing the target class, flags describing the attributes, and
# finally the AutoML result (selected algorithm and its accuracy).
# "UnaryAttibutes" keeps its original spelling because the other cells index
# the table by this exact name.
meta_columns = [
    "ID", "Name", "NumberOfInstances", "NumberOfFeatures",
    "ManyInstances", "FewInstances", "ManyFeatures", "FewFeatures",
    "BinaryClass", "DateClass", "MissingClassValues", "NominalClass",
    "NumericClass", "NoClass", "UnaryClass", "EmptyNominalClass",
    "StringClass", "RelationalClass", "OnlyMulti-InstanceData",
    "BinaryAttributes", "DateAttributes", "EmptyNominalAttributes",
    "MissingValues", "NominalAttributes", "NumericAttributes",
    "UnaryAttibutes", "RelationalAttributes", "StringAttributes",
    "Algorithm", "Accuracy",
]

# Empty result table, filled with one row per processed dataset.
mfeatures = pd.DataFrame(columns=meta_columns)

# Template row: one zero per column.
empty_list = [0] * len(meta_columns)

# Running index of the row currently being filled.
i = 0

In [None]:
# OpenML dataset IDs to download; each one is passed to fetch_openml(data_id=...)
# for meta-feature extraction and an AutoML run.
datasets_nums=[
2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14,
15, 16, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 48,
49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 151,
152, 155, 161, 163, 164, 181, 182, 184, 185, 187, 188, 194, 198, 201, 205,
209, 301, 311, 312, 314, 316, 372, 373,
# NOTE: ID 374 is deliberately left out here (marked by the author as a bad example).
375, 376, 377, 378, 380, 381,
382, 427, 443, 444, 451, 452, 454, 455, 461, 463, 464, 465, 466, 467, 468,
469, 470, 472, 473, 475, 476, 478, 479, 480, 481, 488, 498, 510, 516, 523,
524, 688, 720, 722, 725, 727, 734, 735, 752, 761, 803, 807, 816, 819, 821,
823, 833, 843, 846, 847, 871, 881, 901, 923, 959, 976, 977, 979, 980, 981,
993, 1000, 1002, 1019, 1021, 1037, 1038, 1039, 1040, 1041, 1042, 1044, 1046, 1053, 1056,
1069, 1116, 1119, 1120, 1169, 1217, 1236, 1240, 1459, 1460, 1471, 1475, 1476, 1478, 1481,
1482, 1483, 1484, 1485, 1486, 1487, 1488, 1489, 1490, 1491, 1492, 1493, 1494, 1495, 1496,
1497, 1498, 1501, 1502, 1503, 1504, 1505, 1506, 1507, 1508, 1509, 1510, 1511, 1512, 1513,
1514, 1515, 1516, 1517, 1518, 1519, 1520, 1523, 1524, 6332
]

le = preprocessing.LabelEncoder()


def _extract_meta_features(columns, dset_num, name, data, target, feature_names):
  """Build the meta-feature row for one dataset.

  Args:
    columns: column names of the result table; every one of them gets an
      entry in the returned dict, defaulting to 0.
    dset_num: OpenML dataset ID.
    name: dataset name.
    data: pandas DataFrame holding the feature columns.
    target: pandas Series holding the target column.
    feature_names: names of the feature columns to inspect in ``data``.

  Returns:
    dict mapping column name -> value. 'Algorithm' and 'Accuracy' are left
    at 0 and are expected to be filled in by the caller after the AutoML run.
  """
  row = dict.fromkeys(columns, 0)

  n_instances, n_features = data.shape
  row['ID'] = dset_num
  row['Name'] = name
  row['NumberOfInstances'] = n_instances
  row['NumberOfFeatures'] = n_features

  # size of the dataset
  if n_instances > 3200:
    row['ManyInstances'] = 1
  else:
    row['FewInstances'] = 1

  if n_features > 100:
    row['ManyFeatures'] = 1
  else:
    row['FewFeatures'] = 1

  # information about the target class
  if len(target.unique()) == 2:
    row['BinaryClass'] = 1

  if pd.api.types.is_numeric_dtype(target) or target.str.isnumeric().any():
    row['NumericClass'] = 1

  if pd.api.types.is_string_dtype(target):
    row['StringClass'] = 1

  if target.dtype == 'category':
    row['NominalClass'] = 1

  # information about dataset attributes
  if data.select_dtypes(include=[np.number]).shape[1] > 0:
    row['NumericAttributes'] = 1

  if data.select_dtypes(include=['category']).shape[1] > 0:
    row['NominalAttributes'] = 1

  for fname in feature_names:
    column = data[fname]
    n_distinct = len(column.unique())  # computed once; NaN counts as a value

    if n_distinct == 2:
      row['BinaryAttributes'] = 1

    if n_distinct == 1:
      row['UnaryAttibutes'] = 1

    if column.isna().sum() > 0:
      row['MissingValues'] = 1

    # any non-numeric attribute (including categorical ones) sets this flag
    if not pd.api.types.is_numeric_dtype(column):
      row['StringAttributes'] = 1

  return row


for dset_num in datasets_nums:
  opml = fetch_openml(data_id=dset_num, as_frame=True, parser='auto')
  data = opml.data

  #preparing data for H2O
  x = opml.feature_names
  y = opml.target.name

  # Categorical and float targets are label-encoded to integer codes before
  # the frame is handed to H2O. (The encoder returns integers, so a single
  # combined check is equivalent to two consecutive ones.)
  if opml.frame[y].dtype == 'category' or pd.api.types.is_float_dtype(opml.frame[y]):
    opml.frame[y] = le.fit_transform(opml.frame[y])

  frame = h2o.H2OFrame(opml.frame)
  train, test = frame.split_frame(ratios=[.75])

  name = opml.details['name']
  target = opml.target
  n_target_values = len(target.unique())

  # The row is assembled in a plain dict and appended to the table only once
  # it is complete. A failure partway through an iteration therefore cannot
  # leave a half-filled row behind or shift later rows against the counter.
  row = _extract_meta_features(mfeatures.columns, dset_num, name, data, target, x)

  #preparing target column for autoML
  if pd.api.types.is_integer_dtype(train[y]):
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()
    print("convert numeric target to categorial")
  else:
    print("target remained the same")

  automl = h2o.automl.H2OAutoML(max_runtime_secs=600)

  print("searching for model...")
  automl.train(x=x, y=y, training_frame=train)

  algo = automl.leader.key
  if n_target_values == 2: #metrics for binary classifications
    perf = automl.leader.model_performance(test)
    accuracy = perf.accuracy()[0][1]
  else:
    y_test = test[y].as_data_frame()
    y_pred_0 = automl.leader.predict(test)  #calculate predictions
    y_pred = y_pred_0['predict'].as_data_frame() #take from table only predictions
    accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)

  print("algo: ",algo)
  print("score: ",accuracy)
  row['Algorithm'] = algo
  row['Accuracy'] = accuracy

  # Append the finished row in table column order, then advance the counter
  # that the final cell prints.
  mfeatures.loc[len(mfeatures)] = [row[col] for col in mfeatures.columns]
  i+=1

  # Everything needed from this run (algo, accuracy) is already held in plain
  # Python values, so release this dataset's frames and models from the H2O
  # cluster; otherwise they accumulate across all datasets in the list.
  h2o.remove_all()

print(mfeatures[['Name','ManyInstances', 'FewInstances', 'ManyFeatures', 'FewFeatures', 'BinaryClass','NumericClass','StringClass','NominalClass','BinaryAttributes','MissingValues','NominalAttributes','NumericAttributes','StringAttributes','UnaryAttibutes']])

In [None]:
# Columns shown in the console summary: the dataset name, every extracted
# flag, and the AutoML outcome.
report_columns = [
    'Name',
    'ManyInstances', 'FewInstances', 'ManyFeatures', 'FewFeatures',
    'BinaryClass', 'NumericClass', 'StringClass', 'NominalClass',
    'BinaryAttributes', 'MissingValues', 'NominalAttributes',
    'NumericAttributes', 'StringAttributes', 'UnaryAttibutes',
    'Algorithm', 'Accuracy',
]
summary = mfeatures[report_columns]
print(summary)
print(mfeatures.shape)

# Persist the full table (all columns) in both CSV and Excel form.
mfeatures.to_csv('features.csv')
mfeatures.to_excel('features.xlsx')

# Number of datasets processed by the extraction loop.
print(i)