In [1]:
import os
import shutil
import warnings

import autofeat
import autosklearn.classification
import featuretools as ft
import pandas as pd
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
warnings.filterwarnings('ignore')

In [6]:
def run_as(X, y, target_ft, time_budget=30, include_preprocessors = None):
    """Fit an auto-sklearn classifier on a train split and score the held-out split.

    Parameters
    ----------
    X : features, split with ``train_test_split(random_state=1)``.
    y : class labels, split together with ``X``.
    target_ft : not used by this function; kept so existing callers still work.
    time_budget : passed to auto-sklearn as ``time_left_for_this_task``.
    include_preprocessors : passed through to ``AutoSklearnClassifier``.

    Returns
    -------
    str
        ``str()`` of a two-element list holding the accuracy message and the
        macro-averaged F1 message for the test split.
    """
    tmp_folder = './tmp/autosklearn_regression_example_tmp'
    output_folder = './tmp/autosklearn_regression_example_out'

    # Best-effort cleanup of the folders handed to auto-sklearn below. They are
    # directories, so os.remove cannot delete them; rmtree with ignore_errors
    # also tolerates the folders not existing yet.
    # NOTE(review): presumably auto-sklearn wants fresh folders (the notebook
    # also runs `rm -r tmp` before each experiment) - confirm for the installed version.
    for folder in (tmp_folder, output_folder):
        shutil.rmtree(folder, ignore_errors=True)

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=time_budget,
        per_run_time_limit=30,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        include_preprocessors = include_preprocessors
    )
    automl.fit(X_train, y_train)
    y_hat = automl.predict(X_test)

    accuracy = sklearn.metrics.accuracy_score(y_test, y_hat)
    macro_f1 = sklearn.metrics.f1_score(y_test, y_hat, average='macro')
    metrs = [
        "Accuracy score - " + str(accuracy),
        "F1 score - " + str(macro_f1),
    ]
    return str(metrs)

    
    
def gen_feats_featools(df):
    """Run featuretools deep feature synthesis on one table.

    ``df`` is registered as the single entity ``'data'`` of a new EntitySet and
    ``ft.dfs`` is called with that entity as the target. Only the feature
    matrix is returned; the feature definitions are discarded.
    """
    entity_set = ft.EntitySet(id = 'df')
    # NOTE(review): make_index=True presumably adds a new 'index' key column - confirm.
    entity_set.entity_from_dataframe(
        entity_id = 'data',
        dataframe = df,
        make_index = True,
        index = 'index',
    )

    aggregations = ["mean", "max", "min", "std", "skew"]
    transforms = ['add_numeric', 'multiply_numeric']
    matrix, _definitions = ft.dfs(
        entityset = entity_set,
        target_entity = 'data',
        agg_primitives = aggregations,
        trans_primitives = transforms,
    )
    return matrix

def gen_feats_autofeat(X,y):
    """Fit autofeat's ``FeatureSelector`` on ``(X, y)`` and return the transformed ``X``."""
    selector = autofeat.FeatureSelector(verbose=1)
    return selector.fit_transform(X, y)
    

    
    
def run_test(df,target_ft, mode = 0, time_budget=30):
    """Compare auto-sklearn runs with and without generated features, printing the scores.

    Parameters
    ----------
    df : table holding both the features and the target column.
    target_ft : name of the target column; it is dropped from the features.
    mode : selects which experiments run.
        ``0`` - only the run with auto-sklearn's own preprocessing.
        ``1`` - only the three runs restricted to ``"no_preprocessing"``
        (raw features, featuretools features, autofeat features).
        any other value - all four runs.
    time_budget : forwarded to every ``run_as`` call.

    Returns
    -------
    None
        A separator line and one line per experiment are printed.
    """
    X = df.drop(columns=target_ft)
    y = df[target_ft]
    no_prep = ["no_preprocessing"]
    results = []

    if mode != 1:
        rs = run_as(X, y, target_ft, time_budget=time_budget, include_preprocessors=None)
        results.append("Autosk Only with Preprocessing: " + rs)

    if mode != 0:
        rs = run_as(X, y, target_ft, time_budget=time_budget, include_preprocessors=no_prep)
        results.append("Autosk Only without Preprocessing: " + rs)

        X_new = gen_feats_featools(X)
        rs = run_as(X_new, y, target_ft, time_budget=time_budget, include_preprocessors=no_prep)
        results.append("Autosk with Featuretools: " + rs)

        X_new = gen_feats_autofeat(X, y)
        rs = run_as(X_new, y, target_ft, time_budget=time_budget, include_preprocessors=no_prep)
        results.append("Autosk with Autofeat: " + rs)

    print("===================================")
    # Plain loop: a list comprehension would build a throwaway list of None values.
    for line in results:
        print(line)

In [7]:
# Clear ./tmp first: run_as places auto-sklearn's tmp_folder and output_folder inside it.
!rm -r tmp
df = pd.read_csv("blood.csv")
target_ft = "class"
# mode=2 runs every experiment in run_test (with preprocessing, without, featuretools, autofeat).
run_test(df, target_ft, mode=2, time_budget=30)

[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 4 features after 5 feature selection runs
[featsel] 4 features after correlation filtering
[featsel] 4 features after noise filtering
Autosk Only with Preprocessing: ['Accuracy score - 0.7486631016042781', 'F1 score - 0.4833950508434726']
Autosk Only without Preprocessing: ['Accuracy score - 0.732620320855615', 'F1 score - 0.4228395061728395']
Autosk with Featuretools: ['Accuracy score - 0.732620320855615', 'F1 score - 0.4228395061728395']
Autosk with Autofeat: ['Accuracy score - 0.7647058823529411', 'F1 score - 0.6563648513197461']


In [41]:
# Same comparison on the red-wine table, using "quality" as the class label and time_budget=60.
df = pd.read_csv("winequality-red.csv")
target_ft = "quality"
run_test(df, target_ft, mode=2 ,time_budget=60)

Autosk Only with Preprocessing: ['Accuracy score - 0.675', 'F1 score - 0.317634996582365']
Autosk Only without Preprocessing: ['Accuracy score - 0.68', 'F1 score - 0.32347120204787116']
Autosk with Featuretools: ['Accuracy score - 0.67', 'F1 score - 0.32518177724324654']


In [42]:
# Build the featuretools feature matrix for the blood data, without the target column.
df = pd.read_csv("blood.csv")
target_ft = "class"
# `gen_feats` is not defined in this notebook; the single-table generator is gen_feats_featools.
df = gen_feats_featools(df.drop(columns=target_ft))

In [17]:
# Load the full airlines table (all columns) for inspection before modelling.
df = pd.read_csv("data/airlines.csv")

In [18]:
# List the column names to see which ones can be used as features and target.
df.columns

Index(['Airline', 'Flight', 'AirportFrom', 'AirportTo', 'DayOfWeek', 'Time',
       'Length', 'Delay'],
      dtype='object')

In [19]:
# Bare `df` renders the frame; the recorded output shows only its first and last rows.
df

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,CO,269,SFO,IAH,3,15,205,1
1,US,1558,PHX,CLT,3,15,222,1
2,AA,2400,LAX,DFW,3,20,165,1
3,AA,2466,SFO,DFW,3,20,195,1
4,AS,108,ANC,SEA,3,30,202,0
...,...,...,...,...,...,...,...,...
539378,CO,178,OGG,SNA,5,1439,326,0
539379,FL,398,SEA,ATL,5,1439,305,0
539380,FL,609,SFO,MKE,5,1439,255,0
539381,UA,78,HNL,SFO,5,1439,313,1


In [34]:
# Clear ./tmp first: run_as places auto-sklearn's tmp_folder and output_folder inside it.
!rm -r tmp
# Drop Airline, AirportFrom and AirportTo, which hold text codes in the preview of this table.
df = pd.read_csv("data/airlines.csv").drop(columns=["Airline","AirportFrom","AirportTo"])
target_ft = "Delay"
# mode=2 runs every experiment in run_test; time_budget=180 is forwarded to each run_as call.
run_test(df, target_ft, mode=2 ,time_budget=180)

Autosk Only with Preprocessing: ['Accuracy score - 0.5546178603740564', 'F1 score - 0.35675510651898074']
Autosk Only without Preprocessing: ['Accuracy score - 0.5754267831452176', 'F1 score - 0.5525031803266209']
Autosk with Featuretools: ['Accuracy score - 0.574299571362888', 'F1 score - 0.5707862559168639']
