In [None]:
# auto-sklearn provides the `autosklearn.classification` module imported in the imports cell.
!pip install auto-sklearn

In [None]:
# ucimlrepo provides `fetch_ucirepo`, used by runAutoML for source=='uci'.
!pip install ucimlrepo

In [None]:
#making imports
import threading
import time

import autosklearn.classification
import h2o
import pandas as pd
import sklearn.metrics
import sklearn.model_selection
from colorama import Fore
from sklearn import preprocessing
from sklearn.datasets import fetch_openml
from ucimlrepo import fetch_ucirepo

In [None]:
# Empty table that collects the benchmark output; the helper functions below
# address its rows by the label i*numl+st.
results = pd.DataFrame(
    columns=[
        "ID", "DataSet", "AutoML lib.", "Time lim.", "Algorithm",
        "Accuracy", "Precision", "Recall", "F-measure", "AUC", "Time",
    ]
)
# Run counter handed to runAutoML; starts at zero.
i = 0

In [None]:
class Info:
    """Plain container bundling the pieces of a dataset.

    It exposes the attribute names that runAutoSki and runH2O read from the
    dataset object they receive: ``data``, ``target``, ``frame`` and
    ``feature_names``. The constructor stores each argument unchanged.
    """

    def __init__(self, data, target, frame, feature_names):
        # Each argument is kept verbatim under the attribute of the same name.
        self.data, self.target = data, target
        self.frame, self.feature_names = frame, feature_names

In [None]:
def calcMetrics(y_test, y_pred, results, i, numl, st, isBin, y_test_p=0, preds=0):
  """Write Precision, Recall, F-measure and AUC for one run into ``results``.

  All four values go to the row labelled ``i*numl+st``.

  Args:
    y_test: true labels, used for precision, recall and F1 (and for the
      binary AUC).
    y_pred: predicted labels.
    results: table written through ``results.at[row, column]``; modified in
      place.
    i, numl, st: run number, number of libraries and library offset; only
      used to build the row label.
    isBin: True selects the binary scorers, False the macro-averaged scorers
      plus a one-vs-rest AUC.
    y_test_p: true labels handed to ``roc_auc_score`` in the multiclass case.
      Ignored for binary tasks.
    preds: scores handed to ``roc_auc_score``. Required for multiclass tasks
      (per-class probabilities). Optional for binary tasks: when supplied
      (positive-class scores) the AUC is computed from them, otherwise it
      falls back to the hard predictions ``y_pred`` as before.

  Raises:
    ValueError: for a multiclass task when ``preds`` was not supplied.
  """
  row = i*numl+st
  # Both optional arguments default to the int 0, so anything that is not an
  # int means the caller really passed scores.
  scores_given = not isinstance(preds, int)

  if isBin: #binary
    print(Fore.RED + "Binary metrics"+Fore.BLACK)
    averaging = {}
  else: #multiclass
    print(Fore.RED + "MultiClass metrics"+Fore.BLACK)
    averaging = {'average': 'macro'}

  # zero_division=0 is applied to all three scorers: it yields the same 0.0
  # the default would, but without an UndefinedMetricWarning per call.
  results.at[row,'Precision'] = sklearn.metrics.precision_score(y_test, y_pred, zero_division=0, **averaging)
  results.at[row,'Recall'] = sklearn.metrics.recall_score(y_test, y_pred, zero_division=0, **averaging)
  results.at[row,'F-measure'] = sklearn.metrics.f1_score(y_test, y_pred, zero_division=0, **averaging)

  if isBin:
    # Prefer real scores: an AUC computed from hard labels collapses the ROC
    # curve to a single operating point.
    auc_scores = preds if scores_given else y_pred
    results.at[row,'AUC'] = sklearn.metrics.roc_auc_score(y_test, auc_scores)
  else:
    if not scores_given:
      raise ValueError("multiclass AUC needs per-class probabilities: pass y_test_p and preds")
    results.at[row,'AUC'] = sklearn.metrics.roc_auc_score(y_test_p, preds, multi_class='ovr')

In [None]:
def fillCommon(dataset, name, time, autoML, results, i, num, st):
  """Fill the run-identifying columns of one row of ``results``.

  The row label is ``i*num+st``. The columns written, in order, are
  'ID' (dataset), 'DataSet' (name), 'Time lim.' (time) and
  'AutoML lib.' (autoML). ``results`` is modified in place through ``.at``.
  """
  row_label = i*num+st
  common_cells = (
    ('ID', dataset),
    ('DataSet', name),
    ('Time lim.', time),
    ('AutoML lib.', autoML),
  )
  for column, value in common_cells:
    results.at[row_label, column] = value

In [None]:
#method for starting AutoSklearn
#dataset - data to process, timelim - running time limit, ratio - test share of the train/test split, i - number of run, numl - number of used libraries, st - start (in table of results)
def runAutoSki(dataset, timelim, ratio, i, numl, st):   #i*numl+st
  """Fit auto-sklearn on ``dataset`` and store its scores in the global ``results``.

  Args:
    dataset: object exposing ``data`` (features) and ``target`` (labels; must
      offer ``unique()`` and ``dtype``).
    timelim: value passed as ``time_left_for_this_task``.
    ratio: value passed as ``test_size`` to ``train_test_split``.
    i, numl, st: run number, number of libraries and library offset; the
      result row label is ``i*numl+st``.

  Writes 'Algorithm', 'Accuracy', 'Time' and, through calcMetrics,
  'Precision', 'Recall', 'F-measure' and 'AUC' for that row.
  """
  # NOTE(review): the function is a silent no-op unless the defining module is
  # run as __main__ (true inside a notebook). Presumably copied from a
  # multiprocessing guard - confirm whether it is still wanted.
  if __name__ != '__main__':
    return

  row = i*numl+st
  X = dataset.data
  y = dataset.target
  print("type", y.dtype)
  print("A-S X ", X)
  print("A-S y ", y)

  # Decide the task type on the raw labels, before any encoding below.
  is_binary = len(y.unique()) == 2
  if is_binary:
    print(Fore.RED + "running AutoSklearn Binary"+Fore.BLACK)
  else:
    print(Fore.RED + "running AutoSklearn MultiClass"+Fore.BLACK)

  if y.dtype=='category' or y.dtype=='object':
    print("categorial class encoding")
    # Text / categorical labels are mapped to integer codes.
    y = preprocessing.LabelEncoder().fit_transform(y)

  X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=ratio, random_state=1)
  automl = autosklearn.classification.AutoSklearnClassifier(ensemble_class=None, time_left_for_this_task=timelim)

  print(Fore.RED + "Search for AutoSklearn model"+Fore.BLACK)
  search_started = time.time()
  automl.fit(X_train, y_train)
  elapsed = time.time() - search_started
  print(Fore.RED + "AutoSklearn model found"+Fore.BLACK)

  predictions = automl.predict(X_test)

  # First leaderboard row, third column (presumably the model type - confirm
  # against the auto-sklearn leaderboard layout).
  results.at[row,'Algorithm'] = automl.leaderboard().iat[0,2]
  results.at[row,'Accuracy'] = automl.score(X_test, y_test)
  results.at[row,'Time'] = elapsed

  pred_proba = automl.predict_proba(X_test)
  if is_binary:
    calcMetrics(y_test, predictions, results, i, numl, st, True)
    # Set the AUC from the probability of the second class so it does not
    # depend on a decision threshold; an AUC taken from hard labels only
    # reflects a single operating point.
    results.at[row,'AUC'] = sklearn.metrics.roc_auc_score(y_test, pred_proba[:, 1])
  else:
    calcMetrics(y_test, predictions, results, i, numl, st, False, y_test, pred_proba)

In [None]:
#method for starting H2O
def runH2O(dataset, timelim, ratio, i, numl, st):
  """Run H2O AutoML on ``dataset`` and store its scores in the global ``results``.

  Args:
    dataset: object exposing ``feature_names`` (predictor columns),
      ``target`` (labels with ``name`` and ``unique()``) and ``frame``
      (the table converted to an H2OFrame).
    timelim: value passed as ``max_runtime_secs``.
    ratio: single split ratio passed to ``split_frame``; the first returned
      part is used for training, the second for testing.
    i, numl, st: run number, number of libraries and library offset; the
      result row label is ``i*numl+st``.

  Writes 'Algorithm', 'Time', 'Accuracy', 'Precision', 'Recall', 'F-measure'
  and 'AUC' for that row. h2o.init() is not called here; the cluster must
  already be running.
  """
  x=dataset.feature_names
  y=dataset.target.name
  row = i*numl+st
  is_binary = len(dataset.target.unique())==2

  if is_binary:
    print(Fore.RED + "running H2O Binary"+Fore.BLACK)
  else:
    print(Fore.RED + "running H2O MultiClass"+Fore.BLACK)

  frame = h2o.H2OFrame(dataset.frame)
  frame[y] = frame[y].asfactor()  # classification needs a categorical response
  # Fixed seed so the train/test split is the same on every run.
  train, test = frame.split_frame(ratios=[ratio], seed=1)

  automl = h2o.automl.H2OAutoML(max_runtime_secs=timelim)
  print(Fore.RED + "Search for H2O model"+Fore.BLACK)
  search_started = time.time()
  automl.train(x=x, y=y, training_frame=train)
  elapsed = time.time() - search_started
  print(Fore.RED + "H2O model found"+Fore.BLACK)

  perf = automl.leader.model_performance(test)
  print("perf type:", type(perf))

  results.at[row,'Algorithm'] = automl.leader.algo
  results.at[row,'Time'] = elapsed

  if is_binary: #binary
    # Without explicit thresholds each of these accessors reports the value
    # at the threshold that maximises *that* metric, so the four numbers
    # would describe four different classifiers. Evaluate all of them at the
    # single F1-optimal threshold instead.
    at_threshold = [perf.find_threshold_by_max_metric('f1')]
    results.at[row,'Accuracy'] = perf.accuracy(thresholds=at_threshold)[0][1]
    results.at[row,'Precision'] = perf.precision(thresholds=at_threshold)[0][1]
    results.at[row,'Recall'] = perf.recall(thresholds=at_threshold)[0][1]
    results.at[row,'F-measure'] = perf.F1(thresholds=at_threshold)[0][1]
    results.at[row,'AUC'] = perf.auc()  # threshold-independent
  else: #multiclass
    y_test = test[y].as_data_frame() #for accuracy calculation
    y_pred_0 = automl.leader.predict(test)  #calculate predictions
    y_pred = y_pred_0['predict'].as_data_frame() #convert column with predictions to dataframe
    results.at[row,'Accuracy'] = sklearn.metrics.accuracy_score(y_test, y_pred)

    preds_0 = y_pred_0.as_data_frame() #convert H2OFrame to DataFrame
    preds = preds_0.drop('predict', axis=1)  #drop the predicted-label column, keep the remaining columns
    y_test_p = y_test[y]  #convert DataFrame to Series
    calcMetrics(y_test, y_pred, results, i, numl, st, False, y_test_p, preds)

In [None]:
#helper: load a dataset for runAutoML and return it together with its display name
def _loadDataset(dataset, source):
  """Return ``(data_object, name)`` for the requested dataset.

  ``source`` selects the loader: 'oml' fetches from OpenML by id, 'uci'
  fetches from the UCI repository by id, anything else except 'kag' reads
  ``dataset`` as a ';'-separated CSV file whose target column is
  "PerformanceRating".

  Raises:
    NotImplementedError: for source 'kag', which has no loader.
    ValueError: when a file source is requested but ``dataset`` is an int.
  """
  if source=='oml':
    print("OpenML. fetch by id")
    opml = fetch_openml(data_id=dataset, as_frame=True)
    return opml, opml.details['name']

  if source=='uci':
    print("UCI Repository")
    ucir = fetch_ucirepo(id=dataset)
    # Only the first declared target column is used.
    target = ucir.data.targets[ucir.metadata['target_col'][0]]
    opml = Info(ucir.data.features, target, ucir.data.original, list(ucir.data.features.columns))
    return opml, ucir.metadata['name']

  if source=='kag':
    print("kaggle")
    raise NotImplementedError("source 'kag' is recognised but no Kaggle loader is implemented")

  if type(dataset) == int:
    raise ValueError("source %r expects a file path, got the integer id %r" % (source, dataset))

  print("text. fetch from file")
  #load dataset from file
  info = pd.read_csv(dataset, delimiter=';')
  target_col = "PerformanceRating"
  y = info[target_col]
  X = info.drop(target_col, axis=1)
  print("X: ", X)
  print("y: ", y)
  feature_names = list(info.columns.drop(target_col))
  return Info(X, y, info, feature_names), dataset

#method for starting AutoML libs in parallel
def runAutoML(dataset, source, time, ratio, i, libs=()):
  """Load a dataset and benchmark the selected AutoML libraries in parallel threads.

  Args:
    dataset: dataset id (for 'oml' / 'uci') or CSV file path.
    source: 'oml', 'uci', 'kag' (not implemented) or anything else for a file.
    time: time limit handed to each library and stored in 'Time lim.'.
    ratio: training share; runH2O receives ``ratio``, runAutoSki receives
      ``1-ratio`` as its test share.
    i: run number, used to compute the result rows.
    libs: sequence with any of 'AS' (auto-sklearn) and 'H2O'. A library's
      position in the sequence is its row offset. Empty means both, with
      auto-sklearn at offset 0 and H2O at offset 1.

  Raises:
    NotImplementedError, ValueError: when the dataset cannot be loaded for the
      given ``source``.

  Results are written into the module-level ``results`` table, which is
  printed once all started threads have been joined.
  """
  print(Fore.RED + "starting AutoML num ", i)
  print(Fore.BLACK)

  opml, name = _loadDataset(dataset, source)

  print("X:", len(opml.feature_names))
  print("y:", opml.target)

  run_all = len(libs)==0
  selected = ['AS', 'H2O'] if run_all else list(libs)
  numl = len(selected)
  if run_all:
    print(Fore.RED + 'Start All'+Fore.BLACK)

  # (key in libs, label for 'AutoML lib.', start banner, thread tag, runner, split argument)
  launchers = (
    ('AS', 'AutoSklearn', 'Start Auto-sklearn', 't1', runAutoSki, 1-ratio),
    ('H2O', 'H2O', 'Start H2O', 't2', runH2O, ratio),
  )
  #FLAML not suitable!!

  started = []
  for key, autoML, banner, tag, runner, split in launchers:
    if key not in selected:
      continue
    st = selected.index(key)
    fillCommon(dataset, name, time, autoML, results, i, numl, st)
    if not run_all:
      print(Fore.RED + banner, st)
      print(Fore.BLACK)
    # creating thread
    worker = threading.Thread(target=runner, args=(opml, time, split, i, numl, st))
    worker.start()
    started.append((tag, worker))

  # wait until threads are completely executed
  for tag, worker in started:
    print(Fore.RED + tag + " exists and needs to join!"+Fore.BLACK)
    worker.join()

  print("results: ", results)

In [None]:
# Initialise H2O once for the whole notebook: runH2O has its own h2o.init()
# call commented out, so this cell has to run before any runAutoML call that uses H2O.
h2o.init()

In [None]:
# One benchmark run: OpenML dataset 1590, 300 as the time limit, 0.75 as the
# split ratio, both libraries (auto-sklearn at offset 0, H2O at offset 1).
runAutoML(dataset=1590, source="oml", time=300, ratio=0.75, i=i, libs=['AS', 'H2O'])
i += 1 #for filling next rows in table with results

In [None]:
# Show the collected results table and export it as an Excel file under /kaggle/working.
print(results)
results.to_excel('/kaggle/working/AutoSklearn-H2O.FW.results.xlsx')