In [2]:
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
import pandas as pd
import autosklearn.classification
import os
import featuretools as ft
import warnings
import autofeat 
from tpot import TPOTClassifier

# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so warnings raised by any of the
# libraries imported above are hidden as well.
warnings.filterwarnings('ignore')



In [18]:
def run_as(X, y, target_ft, time_budget=30, include_preprocessors = None):
    """Fit an auto-sklearn classifier on a train split and score it on the held-out split.

    The data is split with ``train_test_split(X, y, random_state=1)``, an
    ``AutoSklearnClassifier`` is fitted on the training part, and accuracy and
    macro-averaged F1 are computed on the test part.

    Parameters
    ----------
    X : feature matrix passed to ``train_test_split``.
    y : target values passed to ``train_test_split``.
    target_ft : unused; kept so existing callers keep working.
    time_budget : forwarded as ``time_left_for_this_task``.
    include_preprocessors : forwarded unchanged to ``AutoSklearnClassifier``.

    Returns
    -------
    str
        ``str()`` of a two-item list:
        ``['Accuracy score - <value>', 'F1 score - <value>']``.
    """
    # Local import so this block does not depend on a change to the import cell.
    import shutil

    tmp_folder = './tmp/autosklearn_regression_example_tmp'
    output_folder = './tmp/autosklearn_regression_example_out'

    # Clear leftovers from a previous run. The folders removed here must be the
    # very same paths handed to the classifier below; they are directories, so
    # rmtree is required, and ignore_errors covers the "does not exist" case.
    for folder in (tmp_folder, output_folder):
        shutil.rmtree(folder, ignore_errors=True)

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=time_budget,
        per_run_time_limit=30,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        include_preprocessors=include_preprocessors,
    )
    automl.fit(X_train, y_train)
    y_hat = automl.predict(X_test)

    metrs = [
        "Accuracy score - " + str(sklearn.metrics.accuracy_score(y_test, y_hat)),
        "F1 score - " + str(sklearn.metrics.f1_score(y_test, y_hat, average='macro')),
    ]
    return str(metrs)

def run_tpot(X,y, target_ft,time_budget=30, include_preprocessors=None ):
    """Fit a TPOT classifier on a train split and score it on the held-out split.

    ``time_budget`` is divided by 60 and passed to TPOT as ``max_time_mins``;
    ``generations`` is set to ``None``. ``target_ft`` and
    ``include_preprocessors`` are accepted but not used by this function.

    Returns ``str()`` of the list
    ``['Accuracy score - <value>', 'F1 score - <value>']`` (F1 is macro-averaged).
    """
    split = sklearn.model_selection.train_test_split(X, y, random_state=1)
    train_X, test_X, train_y, test_y = split

    budget_minutes = time_budget/60
    tpot = TPOTClassifier(max_time_mins=budget_minutes, generations=None)
    tpot.fit(train_X, train_y)
    predictions = tpot.predict(test_X)

    accuracy = sklearn.metrics.accuracy_score(test_y, predictions)
    macro_f1 = sklearn.metrics.f1_score(test_y, predictions, average='macro')
    return str(["Accuracy score - " + str(accuracy), "F1 score - " + str(macro_f1)])

    
def gen_feats_featools(df, agg_primitives=None, trans_primitives=None):
    """Run featuretools Deep Feature Synthesis over a single dataframe.

    ``df`` is registered as the only entity (``'data'``) of a fresh EntitySet,
    with a generated index column named ``'index'``, and ``ft.dfs`` is run with
    that entity as the target.

    Parameters
    ----------
    df : dataframe to synthesize features from.
    agg_primitives : optional list forwarded to ``ft.dfs``; ``None`` keeps the
        library default.
    trans_primitives : optional list forwarded to ``ft.dfs``; ``None`` keeps the
        library default.

    Returns
    -------
    The feature matrix returned by ``ft.dfs`` (the feature definitions are
    discarded).

    NOTE(review): the saved run on gina.csv reports "Built 970 features" for a
    970-column input, so the defaults appear to add nothing for a single table.
    Pass e.g. ``trans_primitives=['add_numeric', 'multiply_numeric']`` to
    actually synthesize new columns.
    """
    es = ft.EntitySet(id = 'df')
    es.entity_from_dataframe(entity_id = 'data', dataframe = df,
                             make_index = True, index = 'index')
    feature_matrix, _feature_defs = ft.dfs(
        entityset = es,
        target_entity = 'data',
        agg_primitives = agg_primitives,
        trans_primitives = trans_primitives,
        verbose = 1,
    )
    return feature_matrix

def gen_feats_autofeat(X,y):
    """Return ``X`` after fitting and applying autofeat's ``FeatureSelector`` against ``y``.

    The selector is created with ``verbose=1`` and its ``fit_transform`` result
    is returned as-is.
    """
    selector = autofeat.FeatureSelector(verbose=1)
    return selector.fit_transform(X, y)
    
    
def run_test(df,target_ft, mode = 0, time_budget=30):
    """Run the selected AutoML experiments on ``df`` and print their scores.

    ``df`` is split into features (every column except ``target_ft``) and the
    target column. ``mode`` picks the experiment:

    * 0 - all of the experiments below, in this order
    * 1 - TPOT
    * 2 - auto-sklearn, ``include_preprocessors=None``
    * 3 - auto-sklearn, ``include_preprocessors=["no_preprocessing"]``
    * 4 - featuretools features, then auto-sklearn as in mode 3
    * 5 - autofeat feature selection, then auto-sklearn as in mode 3

    Any other ``mode`` runs nothing and only the header lines are printed.
    ``time_budget`` is handed to every experiment individually. Nothing is
    returned; the separator, the budget and one line per experiment are printed.

    NOTE(review): in modes 4 and 5 the features are generated/selected on the
    full ``X`` (and ``y`` for autofeat) before ``run_as`` performs its
    train/test split, so held-out rows influence the features.
    """
    X = df.drop(columns=target_ft)
    y = df[target_ft]

    def wanted(experiment):
        # mode 0 enables every experiment; otherwise only the matching one runs.
        return mode == 0 or mode == experiment

    report = []

    if wanted(1):
        outcome = run_tpot(X, y, target_ft, time_budget=time_budget, include_preprocessors=None)
        report.append("TPOT Only with Preprocessing: " + outcome)

    if wanted(2):
        outcome = run_as(X, y, target_ft, time_budget=time_budget, include_preprocessors=None)
        report.append("Autosk Only with Preprocessing: " + outcome)

    if wanted(3):
        outcome = run_as(X, y, target_ft, time_budget=time_budget,
                         include_preprocessors=["no_preprocessing"])
        report.append("Autosk Only without Preprocessing: " + outcome)

    if wanted(4):
        X_featools = gen_feats_featools(X)
        outcome = run_as(X_featools, y, target_ft, time_budget=time_budget,
                         include_preprocessors=["no_preprocessing"])
        report.append("Autosk with Featuretools: " + outcome)

    if wanted(5):
        X_autofeat = gen_feats_autofeat(X, y)
        outcome = run_as(X_autofeat, y, target_ft, time_budget=time_budget,
                         include_preprocessors=["no_preprocessing"])
        report.append("Autosk with Autofeat: " + outcome)

    print("===================================")
    print("Time budeget: ",time_budget)
    for line in report:
        print(line)

In [3]:
# Blood dataset, target column "class". mode=2 selects only the
# "Autosk Only with Preprocessing" experiment of run_test.
# NOTE(review): the saved output under this cell lists all five experiments
# plus autofeat logs, which corresponds to mode=0, not mode=2 - re-run to refresh.
# Remove the ./tmp working directory that run_as hands to auto-sklearn.
!rm -r tmp
df = pd.read_csv("blood.csv")
target_ft = "class"
run_test(df, target_ft, mode=2, time_budget=60)

[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 5 features after 5 feature selection runs
[featsel] 4 features after correlation filtering
[featsel] 4 features after noise filtering
Time budeget:  60
TPOT Only with Preprocessing: ['Accuracy score - 0.7967914438502673', 'F1 score - 0.7082922824302135']
Autosk Only with Preprocessing: ['Accuracy score - 0.7486631016042781', 'F1 score - 0.4833950508434726']
Autosk Only without Preprocessing: ['Accuracy score - 0.7486631016042781', 'F1 score - 0.6361114561338136']
Autosk with Featuretools: ['Accuracy score - 0.7486631016042781', 'F1 score - 0.6422145328719723']
Autosk with Autofeat: ['Accuracy score - 0.7700534759358288', 'F1 score - 0.6179139938227607']


In [29]:
# Red-wine quality dataset, target column "quality"; mode=2 runs only the
# "Autosk Only with Preprocessing" experiment.
# NOTE(review): the saved output under this cell again shows all five
# experiments (mode=0 behaviour), so it predates the current mode argument.
df = pd.read_csv("winequality-red.csv")
target_ft = "quality"
run_test(df, target_ft, mode=2 ,time_budget=30)

[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 6 features after 5 feature selection runs
[featsel] 6 features after correlation filtering
[featsel] 6 features after noise filtering
Time budeget:  30
TPOT Only with Preprocessing: ['Accuracy score - 0.7025', 'F1 score - 0.40879106424185435']
Autosk Only with Preprocessing: ['Accuracy score - 0.6825', 'F1 score - 0.32619453808332716']
Autosk Only without Preprocessing: ['Accuracy score - 0.675', 'F1 score - 0.34061263100501576']
Autosk with Featuretools: ['Accuracy score - 0.665', 'F1 score - 0.3233982170695826']
Autosk with Autofeat: ['Accuracy score - 0.655', 'F1 score - 0.3321084775392869']


In [17]:
# Load the raw airlines table so it can be inspected in the next cells.
df = pd.read_csv("data/airlines.csv")

In [18]:
# List the column names of the airlines frame.
df.columns

Index(['Airline', 'Flight', 'AirportFrom', 'AirportTo', 'DayOfWeek', 'Time',
       'Length', 'Delay'],
      dtype='object')

In [19]:
# Display the airlines frame (the saved output shows only the first and last rows).
df

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,CO,269,SFO,IAH,3,15,205,1
1,US,1558,PHX,CLT,3,15,222,1
2,AA,2400,LAX,DFW,3,20,165,1
3,AA,2466,SFO,DFW,3,20,195,1
4,AS,108,ANC,SEA,3,30,202,0
...,...,...,...,...,...,...,...,...
539378,CO,178,OGG,SNA,5,1439,326,0
539379,FL,398,SEA,ATL,5,1439,305,0
539380,FL,609,SFO,MKE,5,1439,255,0
539381,UA,78,HNL,SFO,5,1439,313,1


In [None]:
# Airlines dataset, target column "Delay". The Airline, AirportFrom and
# AirportTo columns (text codes in the preview above) are dropped, which
# leaves Flight, DayOfWeek, Time and Length as features.
# mode=2 runs only the "Autosk Only with Preprocessing" experiment.
# Remove the ./tmp working directory that run_as hands to auto-sklearn.
!rm -r tmp
df = pd.read_csv("data/airlines.csv").drop(columns=["Airline","AirportFrom","AirportTo"])
target_ft = "Delay"
run_test(df, target_ft, mode=2 ,time_budget=30)



In [None]:
# Gina dataset, target column "class"; mode=2 runs only the
# "Autosk Only with Preprocessing" experiment.
# NOTE(review): the saved traceback under this cell passes through
# tpot/decorators.py, which mode=2 never reaches - the output is stale.
# Remove the ./tmp working directory that run_as hands to auto-sklearn.
!rm -r tmp
df = pd.read_csv("data/gina.csv")
target_ft = "class"
run_test(df, target_ft, mode=2 ,time_budget=30)

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/stopit/utils.py", line 145, in wrapper
    result = func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tpot/decorators.py", line 57, in time_limited_call
    func(*args)
  File "/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py", line 354, in fit
    self._final_estimator.fit(Xt, y, **fit_params)
  File "/usr/local/lib/python3.6/dist-packages/sklearn/ensemble/_forest.py", line 383, in fit
    for i, t in enumerate(trees))
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 1032, in __call__
    while self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 847, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 765, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/usr/local/lib/python3.6/dist-packages/joblib/_p

In [None]:
# Gina dataset, all five experiments (mode=0). run_test passes the 1800 budget
# to each experiment separately, so the whole cell can take several times that.
# Remove the ./tmp working directory that run_as hands to auto-sklearn.
!rm -r tmp
df = pd.read_csv("data/gina.csv")
target_ft = "class"
run_test(df, target_ft, mode=0 ,time_budget=1800)



In [19]:
# Load gina and split it into features X and target y by hand, so the
# feature-generation helpers can be inspected directly in the cells below.
df = pd.read_csv("data/gina.csv")
target_ft = "class"
X = df.drop(columns=target_ft)
y = df[target_ft]

In [4]:
# Display the gina feature matrix (every column except "class").
X

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V961,V962,V963,V964,V965,V966,V967,V968,V969,V970
0,0,100,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,222,0,0,0,0,0,0,0,175,...,0,0,0,0,0,0,0,0,0,0
2,236,0,0,0,0,0,0,0,252,0,...,0,0,0,0,0,0,209,0,0,0
3,0,0,0,211,0,0,0,0,0,252,...,87,0,0,0,0,0,132,0,0,0
4,0,0,196,0,0,0,0,0,0,0,...,0,0,185,0,0,0,253,0,0,247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3148,69,0,0,0,0,0,0,0,0,0,...,26,0,0,0,0,0,196,0,137,0
3149,0,0,252,0,0,0,0,0,0,0,...,253,0,0,0,0,0,252,0,0,252
3150,233,0,53,106,0,0,213,0,0,0,...,203,0,0,0,0,145,0,0,0,0
3151,0,0,220,0,0,0,0,0,254,0,...,0,0,0,0,0,0,254,0,0,0


In [7]:
# Apply autofeat's FeatureSelector to the gina features; needs X and y from the
# cell that loads data/gina.csv. The saved log ends with 61 features kept.
new_X = gen_feats_autofeat(X,y)

[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 115 features after 5 feature selection runs
[featsel] 115 features after correlation filtering
[featsel] 61 features after noise filtering


In [9]:
# Shape after autofeat feature selection.
new_X.shape

(3153, 61)

In [10]:
# Shape of the original gina feature matrix, for comparison.
X.shape

(3153, 970)

In [20]:
# Run featuretools Deep Feature Synthesis over the same gina feature matrix X.
feat_X = gen_feats_featools(X)

Built 970 features
Elapsed: 00:00 | Progress: 100%|██████████


In [16]:
# Shape after featuretools. The saved result has the same 970 columns as X,
# i.e. no new features were added with the default primitives.
# NOTE(review): this cell's execution count (16) is lower than that of the cell
# defining feat_X (20), so the saved output may come from an earlier run.
feat_X.shape

(3153, 970)