In [77]:
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
import pandas as pd
import autosklearn.classification
import os
import featuretools as ft
import warnings
warnings.filterwarnings('ignore')

In [96]:
def run_as(X, y, target_ft, time_budget=30, include_preprocessors = None):
    try:
        os.remove('/tmp/autosklearn_regression_example_tmp')
        os.remove('/tmp/autosklearn_regression_example_out')
    except:
        pass
    #X = df.drop(columns=target_ft)
    #y = df[target_ft]
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=time_budget,
        per_run_time_limit=30,
        tmp_folder='./tmp/autosklearn_regression_example_tmp',
        output_folder='./tmp/autosklearn_regression_example_out',
        include_preprocessors = include_preprocessors
    )
    automl.fit(X_train, y_train)
    y_hat = automl.predict(X_test)
    return "Accuracy score - " + str(sklearn.metrics.accuracy_score(y_test, y_hat))
    
    
def gen_feats(df):
    es = ft.EntitySet(id = 'df')
    es.entity_from_dataframe(entity_id = 'data', dataframe = df, 
                         make_index = True, index = 'index')
    feature_matrix, feature_defs = ft.dfs(entityset = es, target_entity = 'data',
                                      agg_primitives=["mean", "max", "min", "std", "skew"],
                                      trans_primitives = ['add_numeric', 'multiply_numeric'])
    return feature_matrix

def run_test(df,target_ft, mode = 0, time_budget=30):
    results = []
    X = df.drop(columns=target_ft)
    y = df[target_ft]
    if mode!= 1:
        rs = run_as(X,y,target_ft, include_preprocessors=None )   
        results.append("Autosk Only with Preprocessing: " + rs)
    if mode!=0:
        rs = run_as(X,y,target_ft, include_preprocessors =["no_preprocessing"])
        results.append("Autosk Only without Preprocessing: " + rs)
    if mode!=0:
        X = gen_feats(X)
        rs = run_as(X,y,target_ft, include_preprocessors =["no_preprocessing"])
        results.append("Autosk with Featuretools: " + rs)    
    
    
    print("===================================")
    [print(x) for x in results]

In [93]:
df = pd.read_csv("blood.csv")
target_ft = "class"
run_test(df, target_ft, mode=2, time_budget=60)

Autosk Only with Preprocessing: Accuracy score - 0.7754010695187166
Autosk Only without Preprocessing: Accuracy score - 0.732620320855615
Autosk with Featuretools: Accuracy score - 0.732620320855615


In [94]:
df = pd.read_csv("winequality-red.csv")
target_ft = "quality"
run_test(df, target_ft, mode=2 ,time_budget=60)

Autosk Only with Preprocessing: Accuracy score - 0.6825
Autosk Only without Preprocessing: Accuracy score - 0.69
Autosk with Featuretools: Accuracy score - 0.6625


In [97]:
df = pd.read_csv("blood.csv")
target_ft = "class"
df = gen_feats(df.drop(columns=target_ft))

In [98]:
df

Unnamed: 0_level_0,V1,V2,V3,V4,V1 + V3,V2 + V4,V3 + V4,V1 + V4,V2 + V3,V1 + V2,...,V1 * V2 + V4,V1 + V4 * V4,V2 * V2 + V3,V1 + V3 * V4,V1 + V3 * V3 + V4,V2 + V4 * V3,V1 + V4 * V3 + V4,V2 + V3 * V3,V1 + V4 * V3,V1 * V2 + V3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,50,12500,98,12502,148,12598,100,12550,52,...,296,9800,627500,1225196,157500196,1850000,1259800,156875000,1250000,25100
1,0,13,3250,28,3250,41,3278,28,3263,13,...,0,784,42419,91000,10653500,133250,91784,10604750,91000,0
2,1,16,4000,35,4001,51,4035,36,4016,17,...,51,1260,64256,140035,16144035,204000,145260,16064000,144000,4016
3,2,20,5000,45,5002,65,5045,47,5020,22,...,130,2115,100400,225090,25235090,325000,237115,25100000,235000,10040
4,1,24,6000,77,6001,101,6077,78,6024,25,...,101,6006,144576,462077,36468077,606000,474006,36144000,468000,6024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
743,23,2,500,38,523,40,538,61,502,25,...,920,2318,1004,19874,281374,20000,32818,251000,30500,11546
744,21,2,500,52,521,54,552,73,502,23,...,1134,3796,1004,27092,287592,27000,40296,251000,36500,10542
745,23,3,750,62,773,65,812,85,753,26,...,1495,5270,2259,47926,627676,48750,69020,564750,63750,17319
746,39,1,250,39,289,40,289,78,251,40,...,1560,3042,251,11271,83521,10000,22542,62750,19500,9789


In [90]:
!rm -r tmp