In [3]:
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection

import autosklearn.classification

# Load the breast-cancer data and hold out a test split; random_state is fixed
# so the split is the same on every run.
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=1
)

# Restrict the search to random-forest pipelines.
# Other estimator names the author experimented with: "decision_tree",
# "gradient_boosting", "libsvm_svc".
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,  # overall search budget
    per_run_time_limit=30,        # budget for a single candidate pipeline
    include_estimators=["random_forest"],
)

# Kept as the last expression so the notebook displays the fitted estimator.
automl.fit(X_train, y_train, dataset_name='breast_cancer')






AutoSklearnClassifier(include_estimators=['random_forest'],
                      per_run_time_limit=30, time_left_for_this_task=120)

In [4]:
# Describe the models in the final ensemble.
# Author's note: not available for f1 scoring.
ensemble_description = automl.show_models()
print(ensemble_description)

[(0.160000, SimpleClassificationPipeline({'balancing:strategy': 'none', 'classifier:__choice__': 'random_forest', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'mean', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'none', 'feature_preprocessor:__choice__': 'polynomial', 'classifier:random_forest:bootstrap': 'False', 'classifier:random_forest:criterion': 'entropy', 'classifier:random_forest:max_depth': 'None', 'classifier:random_forest:max_features': 0.5899611783591084, 'classifier:random_forest:max_leaf_nodes': 'None', 'classifier:random_forest:min_impurity_decrease': 0.0, 'classifier:random_forest:min_samples_leaf': 5, 'classifier:random_forest:min_samples_split': 10, 'classifier:random_forest:min_weight_fraction_leaf': 0.0, 'data_preprocessing:categoric

In [5]:
# Summarise the search (metric, best validation score, run counts).
# Author's note: not available for f1 scoring.
statistics_report = automl.sprint_statistics()
print("autosklearn statistics: ", statistics_report)

autosklearn statistics:  auto-sklearn results:
  Dataset name: breast_cancer
  Metric: accuracy
  Best validation score: 0.985816
  Number of target algorithm runs: 23
  Number of successful target algorithm runs: 22
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 1
  Number of target algorithms that exceeded the memory limit: 0



In [6]:
# Rank every recorded run by (cost, config id) and report the cheapest one.
run_history = automl.automl_.runhistory_
losses_and_configurations = sorted(
    (run_value.cost, run_key.config_id)
    for run_key, run_value in run_history.data.items()
)
lowest_loss, best_config_id = losses_and_configurations[0]
print("Lowest loss:", lowest_loss)
print("Best configuration:", run_history.ids_config[best_config_id])

Lowest loss: 0.014184397163120588
Best configuration: Configuration:
  balancing:strategy, Value: 'none'
  classifier:__choice__, Value: 'random_forest'
  classifier:random_forest:bootstrap, Value: 'False'
  classifier:random_forest:criterion, Value: 'entropy'
  classifier:random_forest:max_depth, Constant: 'None'
  classifier:random_forest:max_features, Value: 0.5899611783591084
  classifier:random_forest:max_leaf_nodes, Constant: 'None'
  classifier:random_forest:min_impurity_decrease, Constant: 0.0
  classifier:random_forest:min_samples_leaf, Value: 5
  classifier:random_forest:min_samples_split, Value: 10
  classifier:random_forest:min_weight_fraction_leaf, Constant: 0.0
  data_preprocessing:categorical_transformer:categorical_encoding:__choice__, Value: 'one_hot_encoding'
  data_preprocessing:categorical_transformer:category_coalescence:__choice__, Value: 'minority_coalescer'
  data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction, Val

In [7]:
import autosklearn.classification
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import ast

class Autosklearn:
    """Fit auto-sklearn on a CSV dataset and print the best configuration found.

    The dataset must contain a target column named 'Class'. Every other column
    is treated as a feature and mean-imputed before fitting.
    """

    def __init__(self, datafile):
        """Load the dataset.

        Parameters
        ----------
        datafile : str or path-like
            CSV file, read with ``pandas.read_csv`` into ``self.df``.
        """
        self.df = pd.read_csv(datafile)

    @staticmethod
    def _best_run(runhistory):
        """Return ``(loss, configuration)`` for the lowest-cost run, or None if there are no runs."""
        losses_and_configurations = [
            (run_value.cost, run_key.config_id)
            for run_key, run_value in runhistory.data.items()
        ]
        if not losses_and_configurations:
            return None
        # Tuples compare by cost first, then by config id (same ordering the
        # previous sort-and-take-first approach used).
        lowest_loss, best_config_id = min(losses_and_configurations)
        return lowest_loss, runhistory.ids_config[best_config_id]

    def run_autosklearn(
        self,
        Fullyqualified_op_filename,
        maxtime,
        per_run_time_limit,
        scoring,
    ):
        """Impute the features, fit auto-sklearn and print the winning configuration.

        Parameters
        ----------
        Fullyqualified_op_filename : str or None
            Currently unused. Per the original author's comment it is meant to
            be the full output path of a generated winning-pipeline script,
            e.g. ``"C:\\\\Users\\\\SG\\\\Desktop\\\\winning_pipeline.py"``.
        maxtime : int
            Forwarded as ``time_left_for_this_task``.
        per_run_time_limit : int
            Forwarded as ``per_run_time_limit``.
        scoring : str
            Forwarded, wrapped in a list, as ``scoring_functions``
            (e.g. "f1", "balanced_accuracy", "precision", "recall").

        Raises
        ------
        KeyError
            If the dataset has no 'Class' column.
        """
        # The target label column must be named 'Class'.
        df_without_class = self.df.drop(["Class"], axis=1)

        # Replace missing feature values (NaN) with the column mean.
        imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
        imputer.fit(df_without_class)
        impute_df = imputer.transform(df_without_class)
        X = pd.DataFrame(impute_df, columns=df_without_class.columns)
        y = self.df["Class"]

        # Search over a fixed set of classifiers, with feature preprocessing disabled.
        autosklearn_est = autosklearn.classification.AutoSklearnClassifier(
            time_left_for_this_task=maxtime,
            per_run_time_limit=per_run_time_limit,
            include_estimators=["random_forest", "decision_tree", "libsvm_svc"],
            include_preprocessors=["no_preprocessing"],
            # NOTE(review): `scoring` is forwarded unchanged as a string. Confirm
            # that the installed auto-sklearn accepts metric names here rather than
            # autosklearn.metrics scorer objects, and whether `metric=` (the
            # optimisation target) was what was actually intended.
            scoring_functions=[scoring],
        )
        autosklearn_est.fit(X, y)

        # Report the winning pipeline of the estimator fitted above. This must
        # read autosklearn_est, not a notebook-level `automl` variable, which
        # would describe a different model or not exist on a fresh kernel.
        best_run = self._best_run(autosklearn_est.automl_.runhistory_)
        if best_run is None:
            print("No runs were recorded in the run history.")
            return
        lowest_loss, best_configuration = best_run
        print("Lowest loss:", lowest_loss)
        print("Best configuration:", best_configuration)

        # Author's note: autosklearn_est.sprint_statistics(), .cv_results_ and
        # .show_models() can additionally be printed when no scoring function
        # is specified.

    def apply_autosklearn(
        self,
        Fullyqualified_op_filename=None,
        maxtime=7200,
        per_run_time_limit=1800,
        scoring="f1",  # alternatives: balanced_accuracy, precision, recall
    ):
        """Run :meth:`run_autosklearn` with defaults, printing assertion failures.

        Parameters
        ----------
        Fullyqualified_op_filename : str or None, default None
            Passed through to ``run_autosklearn`` (currently unused there).
        maxtime : int, default 7200
            Overall search budget passed to ``run_autosklearn``.
        per_run_time_limit : int, default 1800
            Per-candidate budget passed to ``run_autosklearn``.
        scoring : str, default "f1"
            Scoring function name passed to ``run_autosklearn``.

        Only ``AssertionError`` is caught (and printed); any other exception
        propagates to the caller.
        """
        try:
            self.run_autosklearn(
                Fullyqualified_op_filename,
                maxtime,
                per_run_time_limit,
                scoring,
            )
        except AssertionError as error:
            print(error)


In [8]:
# Load the transformed dataset; it must contain a 'Class' target column.
a = Autosklearn(datafile="data_transformed.csv")

In [None]:
# Run the search with the method's defaults (maxtime=7200,
# per_run_time_limit=1800, scoring="f1").
a.apply_autosklearn()





