<a href="https://colab.research.google.com/github/DarkEol/AutoML/blob/main/AutoML-Ontology/Code/AutoSki_Meta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Script for extracting meta-features from datasets, together with information about the algorithms selected for them by Auto-sklearn

In [None]:
#downgrade to install auto-sklearn
#Pin Cython, SciPy and pyparsing to specific older releases. For SciPy and
#pyparsing any existing copy is uninstalled first, then the pinned version is
#installed.
!pip install Cython==0.29.36
!pip uninstall scipy -y
!pip install scipy==1.9
!pip uninstall pyparsing -y
!pip install pyparsing==2.4

In [None]:
#Remove the installed scikit-learn and install release 0.24.2 instead
#(--no-build-isolation makes pip build against the packages already installed
#in the environment), then install auto-sklearn itself.
#NOTE(review): auto-sklearn is installed without a version pin, unlike
#scikit-learn -- confirm whether a specific release is expected.
!pip uninstall scikit_learn -y
!pip install scikit-learn==0.24.2 --no-build-isolation
!pip install auto-sklearn

In [3]:
import autosklearn.classification
import sklearn.model_selection
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn import preprocessing

In [4]:
#meta-feature table: one row per dataset, starts out with no rows
mfeatures = pd.DataFrame(columns=[
    #identification and size
    "Name", "NumberOfInstances", "NumberOfFeatures",
    "ManyInstances", "FewInstances", "ManyFeatures", "FewFeatures",
    #target (class) flags
    "BinaryClass", "DateClass", "MissingClassValues", "NominalClass",
    "NumericClass", "NoClass", "UnaryClass", "EmptyNominalClass",
    "StringClass", "RelationalClass", "OnlyMulti-InstanceData",
    #attribute flags
    "BinaryAttributes", "DateAttributes", "EmptyNominalAttributes",
    "MissingValues", "NominalAttributes", "NumericAttributes",
    "UnaryAttibutes", "RelationalAttributes", "StringAttributes",
    #AutoML outcome
    "Algorithm", "Accuracy",
])

#template for a freshly added row: one zero per column of the table
empty_list = [0] * len(mfeatures.columns)

#index of the row that is currently being filled
i = 0

In [None]:
if __name__ == '__main__':
  # For every listed OpenML dataset: download it, record its meta-features as
  # one row of `mfeatures`, run an auto-sklearn search on an 80/20 split and
  # store the top leaderboard entry together with its score on the test part.
  # Relies on `mfeatures`, `empty_list` and `i` created in an earlier cell.

  #ID of datasets at OpenML for retrieval and meta-features extraction
  datasets_nums = [13, 14, 15, 16, 18] #17 bad

  for dset_num in datasets_nums:
    opml = fetch_openml(data_id=dset_num)
    X = opml.data
    y = opml.target

    name = opml.details['name']

    n_instances, n_features = X.shape
    n_target_values = len(y.unique())

    numeric_data = X.select_dtypes(include=[np.number])
    category_data = X.select_dtypes(include=['category'])

    # Create (or reset) row i with zeros. The row is addressed by `i` here as
    # well as in every write below, so the two can never drift apart: if a
    # previous run of this cell failed half-way through a dataset, its
    # partially filled row is overwritten instead of leaving an orphan row.
    mfeatures.loc[i] = empty_list

    mfeatures.at[i,'Name'] = name
    mfeatures.at[i,'NumberOfInstances'] = n_instances
    mfeatures.at[i,'NumberOfFeatures'] = n_features

    #storing info about the size of the dataset
    if n_instances > 3200:
      mfeatures.at[i,'ManyInstances'] = 1
    else:
      mfeatures.at[i,'FewInstances'] = 1

    if n_features > 100:
      mfeatures.at[i,'ManyFeatures'] = 1
    else:
      mfeatures.at[i, 'FewFeatures'] = 1

    #storing information about the target class
    if n_target_values == 2:
      mfeatures.at[i, 'BinaryClass'] = 1
      print("Binary")
    else:
      print("MultiClass")

    # The target counts as numeric when its dtype is numeric or when at least
    # one label is a string of digits. Labels are converted to str before the
    # .str accessor is used, because .str raises AttributeError on targets
    # whose values are not strings (e.g. integer labels or integer categories).
    target_is_numeric = (
        pd.api.types.is_numeric_dtype(y)
        or y.astype(str).str.isnumeric().any()
    )

    if target_is_numeric:
      mfeatures.at[i, 'NumericClass'] = 1
      le = preprocessing.LabelEncoder()
      y = le.fit_transform(y)  # y is a NumPy array from here on
    else:
      mfeatures.at[i, 'StringClass'] = 1
      # Only checked here, where y is still the pandas target: after label
      # encoding y is a NumPy array whose dtype is never 'category'.
      if y.dtype=='category':
        mfeatures.at[i, 'NominalClass'] = 1

    #storing information about dataset attributes
    if numeric_data.shape[1] > 0:
      mfeatures.at[i, 'NumericAttributes'] = 1

    if category_data.shape[1] > 0:
      mfeatures.at[i, 'NominalAttributes'] = 1

    for fname in opml.feature_names:
      column = X[fname]
      n_distinct = len(column.unique())  # computed once; unique() keeps NaN as a value

      if n_distinct==2:
        mfeatures.at[i, 'BinaryAttributes'] = 1

      if n_distinct==1:
        mfeatures.at[i, 'UnaryAttibutes'] = 1

      if column.isna().any():
        mfeatures.at[i, 'MissingValues'] = 1

      # every non-numeric column (categorical ones included) sets this flag
      if not pd.api.types.is_numeric_dtype(column):
        mfeatures.at[i, 'StringAttributes'] = 1

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.20, random_state=1)

    #AutoML settings
    cls = autosklearn.classification.AutoSklearnClassifier(ensemble_class=None, time_left_for_this_task=600)

    #running search for best model
    print("searching for model", i)
    cls.fit(X_train, y_train)
    print("model found")

    #info about found models
    # first leaderboard row, third column -- presumably the model type;
    # TODO confirm against the leaderboard column order of the installed release
    algo = cls.leaderboard().iat[0,2]
    score = cls.score(X_test, y_test)

    print("algo: ",algo)
    print("score: ",score)
    mfeatures.at[i, 'Algorithm'] = algo
    mfeatures.at[i, 'Accuracy'] = score
    i+=1

In [None]:
#console summary: only the meta-feature columns this script fills in,
#followed by the dimensions of the full table
print(mfeatures.loc[:, [
    'Name',
    'ManyInstances', 'FewInstances', 'ManyFeatures', 'FewFeatures',
    'BinaryClass', 'NumericClass', 'StringClass', 'NominalClass',
    'BinaryAttributes', 'MissingValues', 'NominalAttributes',
    'NumericAttributes', 'StringAttributes', 'UnaryAttibutes',
    'Algorithm', 'Accuracy',
]])
print(mfeatures.shape)

#write the complete table (all columns) to the working directory, once as CSV
#and once as an Excel workbook
mfeatures.to_csv('features.csv')
mfeatures.to_excel('features.xlsx')