In [77]:
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
import pandas as pd
import autosklearn.classification
import os
import featuretools as ft
import warnings
warnings.filterwarnings('ignore')

In [83]:
def run_as(X, y, target_ft, time_budget=30, include_preprocessors = None):
    try:
        os.remove('/tmp/autosklearn_regression_example_tmp')
        os.remove('/tmp/autosklearn_regression_example_out')
    except:
        pass
    #X = df.drop(columns=target_ft)
    #y = df[target_ft]
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=time_budget,
        per_run_time_limit=30,
        tmp_folder='./tmp/autosklearn_regression_example_tmp',
        output_folder='./tmp/autosklearn_regression_example_out',
        include_preprocessors = include_preprocessors
    )
    automl.fit(X_train, y_train)
    y_hat = automl.predict(X_test)
    return "Accuracy score - " + str(sklearn.metrics.accuracy_score(y_test, y_hat))
    
    
def gen_feats(df):
    es = ft.EntitySet(id = 'df')
    es.entity_from_dataframe(entity_id = 'data', dataframe = df, 
                         make_index = True, index = 'index')
    feature_matrix, feature_defs = ft.dfs(entityset = es, target_entity = 'data',
                                      trans_primitives = ['add_numeric', 'multiply_numeric'])
    return feature_matrix

def run_test(df,target_ft, mode = 0, time_budget=30):
    results = []
    X = df.drop(columns=target_ft)
    y = df[target_ft]
    if mode!= 1:
        rs = run_as(X,y,target_ft, include_preprocessors=None )   
        results.append("Autosk Only: " + rs)
    if mode!=0:
        print("")
        X = gen_feats(X)
        rs = run_as(X,y,target_ft, include_preprocessors =["no_preprocessing"])
        results.append("Autosk with Featuretools: " + rs)
    
    print("===================================")
    [print(x) for x in results]

In [84]:
df = pd.read_csv("blood.csv")
target_ft = "class"
run_test(df, target_ft, mode=2, time_budget=60)


Autosk Only: Accuracy score - 0.7754010695187166
Autosk with Featuretools: Accuracy score - 0.732620320855615


In [85]:
df = pd.read_csv("winequality-red.csv")
target_ft = "quality"
run_test(df, target_ft, mode=2 ,time_budget=60)


Autosk Only: Accuracy score - 0.6825
Autosk with Featuretools: Accuracy score - 0.6625


In [42]:
df = pd.read_csv("blood.csv")
target_ft = "class"
df = gen_feats(df.drop(columns=target_ft))

In [73]:
!rm -r tmp