In [1]:
import numpy as np
import pathlib
import pandas as pd

import sklearn.datasets
import sklearn.metrics
from sklearn.utils.multiclass import type_of_target

import autosklearn.classification
import tensorflow as tf

In [2]:
# Read the manually cleaned CTG workbook; row 0 of the sheet supplies the column names.
cleaned_data_path = pathlib.Path("../data/interim/ctg_cleaned_manually.xlsx")
data = pd.read_excel(cleaned_data_path, header=0)

# The two candidate target columns, pulled out by name.
y_raw_nsp = data["NSP"].to_numpy()
y_raw_class = data["CLASS"].to_numpy()

# Feature matrix: every column from "Min" through "SUSP" (a label slice, both ends inclusive).
# NOTE(review): this slice also picks up whatever columns sit between "Min" and "SUSP".
# If any of them are indicator columns derived from CLASS, the features leak the label.
# Confirm against the workbook's column order before trusting the scores below.
x_raw = data.loc[:, "Min":"SUSP"].to_numpy()

In [3]:
def _one_hot_labels(labels):
    """One-hot encode an integer label array, one column per value in [min, max].

    tf.one_hot treats indices as 0-based and returns an all-zero row for any
    index >= depth. Passing the raw labels with depth=max(labels) therefore
    always loses the highest label. Shifting by the smallest label and using
    depth = max - min + 1 gives every observed label its own column.

    Args:
        labels: integer NumPy array of class labels.

    Returns:
        NumPy array of shape (len(labels), max - min + 1) holding 0.0 / 1.0.
    """
    lowest = labels.min()
    n_classes = int(labels.max() - lowest) + 1
    return tf.one_hot(indices=labels - lowest, depth=n_classes).numpy()


y_class_one_hot = _one_hot_labels(y_raw_class)
y_nsp_one_hot = _one_hot_labels(y_raw_nsp)

In [36]:
# Hold out the trailing 10% of rows as the test set.
# The boundary used to be the literal 1913, which is int(0.9 * 2126) for the
# 2126 rows implied by the recorded run (1913 train + 213 test). Deriving it
# from the data keeps the same split today and stays valid if the row count changes.
TRAIN_FRACTION = 0.9
n_train = int(len(x_raw) * TRAIN_FRACTION)

x_train, x_test = x_raw[:n_train], x_raw[n_train:]
y_train, y_test = y_raw_nsp[:n_train], y_raw_nsp[n_train:]

# Fail early instead of handing an empty split to the model or the metrics.
assert len(x_train) > 0 and len(x_test) > 0, "train/test split produced an empty partition"
assert len(x_train) == len(y_train) and len(x_test) == len(y_test)

# NOTE(review): rows are split in file order, with no shuffling or stratification.
# If the workbook is ordered in any meaningful way, the held-out class mix may not
# match the training mix. Consider a seeded, stratified split; confirm with the data owner.

In [37]:
# Search budget for auto-sklearn, gathered in one place so it is easy to tune.
# Per the auto-sklearn docs both time limits are in seconds. Meta-learning
# warm starts are switched off and SMAC is capped at a single target-algorithm run.
automl_settings = dict(
    time_left_for_this_task=60,
    per_run_time_limit=30,
    initial_configurations_via_metalearning=0,
    smac_scenario_args={"runcount_limit": 1},
)
automl = autosklearn.classification.AutoSklearnClassifier(**automl_settings)

In [38]:
# Run the auto-sklearn search on the training split.
# The call is the cell's last expression, so the fitted estimator's repr is displayed.
automl.fit(X=x_train, y=y_train)



AutoSklearnClassifier(initial_configurations_via_metalearning=0,
                      per_run_time_limit=30,
                      smac_scenario_args={'runcount_limit': 1},
                      time_left_for_this_task=60)

In [39]:
# Print the final ensemble; it prints as a list of (weight, pipeline) entries.
ensemble_description = automl.show_models()
print(ensemble_description)

[(1.000000, SimpleClassificationPipeline({'balancing:strategy': 'none', 'classifier:__choice__': 'random_forest', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'mean', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'standardize', 'feature_preprocessor:__choice__': 'no_preprocessing', 'classifier:random_forest:bootstrap': 'True', 'classifier:random_forest:criterion': 'gini', 'classifier:random_forest:max_depth': 'None', 'classifier:random_forest:max_features': 0.5, 'classifier:random_forest:max_leaf_nodes': 'None', 'classifier:random_forest:min_impurity_decrease': 0.0, 'classifier:random_forest:min_samples_leaf': 1, 'classifier:random_forest:min_samples_split': 2, 'classifier:random_forest:min_weight_fraction_leaf': 0.0, 'data_preprocessing:categorical_tran

In [40]:
# Print the search summary: metric, best validation score, and run counts.
search_statistics = automl.sprint_statistics()
print(search_statistics)

auto-sklearn results:
  Dataset name: 1d410eda-e871-11eb-aea7-51a0d96ebc6e
  Metric: accuracy
  Best validation score: 0.985759
  Number of target algorithm runs: 1
  Number of successful target algorithm runs: 1
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 0
  Number of target algorithms that exceeded the memory limit: 0



In [41]:
# Score the fitted ensemble on the held-out rows.
predictions = automl.predict(x_test)
test_accuracy = sklearn.metrics.accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy score", test_accuracy)

Accuracy score 0.9953051643192489


In [42]:
# Per-class breakdown of the held-out predictions.
# Per the scikit-learn docs, rows are true labels and columns are predicted labels.
sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=predictions)

array([[171,   0,   0],
       [  1,   4,   0],
       [  0,   0,  37]])