In [40]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#models
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import GridSearchCV
import pandas as pd
from datetime import datetime

from sklearn.metrics import accuracy_score, f1_score, auc, average_precision_score, roc_auc_score,roc_curve

import json  

import warnings
warnings.filterwarnings('ignore')

#### Gini

In [5]:
def gini_normalized(y_actual, y_pred):
    """Return the Gini coefficient of ``y_pred`` divided by the Gini of a perfect scorer.

    The Gini coefficient is computed from ROC AUC as ``2 * AUC - 1``. The value
    obtained for the predictions is normalised by the value obtained when the
    true labels are scored against themselves.
    """
    def _gini(truth, scores):
        return 2 * roc_auc_score(truth, scores) - 1

    achieved = _gini(y_actual, y_pred)
    best_possible = _gini(y_actual, y_actual)
    return achieved / best_possible

### Metrics

In [13]:
def measure_error(actual, predicted):
    """Score ``predicted`` against ``actual`` with several classification metrics.

    Returns a dict that maps a short metric name to a one-element list, so the
    result can be handed directly to ``pd.DataFrame`` to build a one-row frame.
    """
    scores = {}
    scores['as'] = [accuracy_score(actual, predicted)]
    # The 'auc' metric is currently disabled.
    scores['apc'] = [average_precision_score(actual, predicted)]
    scores['f1'] = [f1_score(actual, predicted)]
    scores['roc_auc'] = [roc_auc_score(actual, predicted)]
    # Translated author note: this entry may cause a crash.
    scores['roc_cur'] = [roc_curve(actual, predicted)]
    scores['gini'] = [gini_normalized(actual, predicted)]
    return scores


### Config

In [14]:
def configuration():
    """Return the hyper-parameter grids used for the initial model comparison.

    The result is a list of seven dicts in this order: SVC, gradient boosting,
    random forest, SGD, Bernoulli naive Bayes, MLP, linear SVC. Every parameter
    maps to a list holding a single candidate value (presumably the estimator
    defaults of the scikit-learn version in use -- verify when upgrading).
    A grid for StackingClassifier is not included.
    """
    svc_grid = {
        'C': [1.0],
        'kernel': ['rbf'],
        'degree': [3],
        'gamma': ['scale'],
        'coef0': [0.0],
        'shrinking': [True],
        'probability': [False],
        'tol': [0.001],
        'cache_size': [200],
        'class_weight': [None],
        'verbose': [False],
        'max_iter': [-1],
        'decision_function_shape': ['ovr'],
        'break_ties': [False],
        'random_state': [None],
    }

    gradient_boosting_grid = {
        'loss': ['deviance'],
        'learning_rate': [0.1],
        'n_estimators': [100],
        'subsample': [1.0],
        'criterion': ['friedman_mse'],
        'min_samples_split': [2],
        'min_samples_leaf': [1],
        'min_weight_fraction_leaf': [0.0],
        'max_depth': [3],
        'min_impurity_decrease': [0.0],
        'min_impurity_split': [None],
        'init': [None],
        'random_state': [None],
        'max_features': [None],
        'verbose': [0],
        'max_leaf_nodes': [None],
        'warm_start': [False],
        'presort': ['deprecated'],
        'validation_fraction': [0.1],
        'n_iter_no_change': [None],
        'tol': [0.0001],
        'ccp_alpha': [0.0],
    }

    random_forest_grid = {
        'n_estimators': [100],
        'criterion': ['gini'],
        'max_depth': [None],
        'min_samples_split': [2],
        'min_samples_leaf': [1],
        'min_weight_fraction_leaf': [0.0],
        'max_features': ['auto'],
        'max_leaf_nodes': [None],
        'min_impurity_decrease': [0.0],
        'min_impurity_split': [None],
        'bootstrap': [True],
        'oob_score': [False],
        'n_jobs': [None],
        'random_state': [None],
        'verbose': [0],
        'warm_start': [False],
        'class_weight': [None],
        'ccp_alpha': [0.0],
        'max_samples': [None],
    }

    sgd_grid = {
        'loss': ['hinge'],
        'penalty': ['l2'],
        'alpha': [0.0001],
        'l1_ratio': [0.15],
        'fit_intercept': [True],
        'max_iter': [1000],
        'tol': [0.001],
        'shuffle': [True],
        'verbose': [0],
        'epsilon': [0.1],
        'n_jobs': [None],
        'random_state': [None],
        'learning_rate': ['optimal'],
        'eta0': [0.0],
        'power_t': [0.5],
        'early_stopping': [False],
        'validation_fraction': [0.1],
        'n_iter_no_change': [5],
        'class_weight': [None],
        'warm_start': [False],
        'average': [False],
    }

    bernoulli_nb_grid = {
        'alpha': [1.0],
        'binarize': [0.0],
        'fit_prior': [True],
        'class_prior': [None],
    }

    mlp_grid = {
        'hidden_layer_sizes': [(100, )],
        'activation': ['relu'],
        'solver': ['adam'],
        'alpha': [0.0001],
        'batch_size': ['auto'],
        'learning_rate': ['constant'],
        'learning_rate_init': [0.001],
        'power_t': [0.5],
        'max_iter': [200],
        'shuffle': [True],
        'random_state': [None],
        'tol': [0.0001],
        'verbose': [False],
        'warm_start': [False],
        'momentum': [0.9],
        'nesterovs_momentum': [True],
        'early_stopping': [False],
        'validation_fraction': [0.1],
        'beta_1': [0.9],
        'beta_2': [0.999],
        'epsilon': [1e-08],
        'n_iter_no_change': [10],
        'max_fun': [15000],
    }

    linear_svc_grid = {
        'penalty': ['l2'],
        'loss': ['squared_hinge'],
        'dual': [True],
        'tol': [0.0001],
        'C': [1.0],
        'multi_class': ['ovr'],
        'fit_intercept': [True],
        'intercept_scaling': [1],
        'class_weight': [None],
        'verbose': [0],
        'random_state': [None],
        'max_iter': [1000],
    }

    # The order must stay aligned with the estimators returned by models().
    return [
        svc_grid,
        gradient_boosting_grid,
        random_forest_grid,
        sgd_grid,
        bernoulli_nb_grid,
        mlp_grid,
        linear_svc_grid,
    ]

### Models

In [15]:
def models():
    """Return freshly constructed classifiers with default settings.

    The estimators are listed in the same order as the grids returned by
    ``configuration()`` so the two lists can be zipped together.
    StackingClassifier is not included.
    """
    return [
        SVC(),
        GradientBoostingClassifier(),
        RandomForestClassifier(),
        SGDClassifier(),
        BernoulliNB(),
        MLPClassifier(),
        LinearSVC(),
    ]

### Data

In [16]:
# Column names for the alternative Skin_Non.txt dataset; only the commented-out
# loader below refers to them.
colnames = ['X', 'Y', 'Z', 'Value']
# series = pd.read_csv('Skin_Non.txt', sep='\t', names=colnames, header=None, index_col=False)
series = pd.read_csv('wine.data', header=None)

# Placeholder binary target: random 0/1 labels appended as an extra column.
# A dedicated seeded generator is used so the labels (and every metric computed
# downstream) are the same on each Restart & Run All; the global NumPy RNG
# state is left untouched.
series['value'] = np.random.RandomState(0).randint(0, 2, series.shape[0])
data = series.values
# X, y = data[:, :-1], data[:, -1]
# Columns 0-13 are used as features and column 14 as the target.
# NOTE(review): this assumes wine.data has exactly 14 columns so that column 14
# is the random 'value' column added above -- confirm against the file.
X, y = data[:, :14], data[:, 14]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

# Fit the scaler on the training split only, then apply it to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

### Initial run

In [17]:
def initial_run(config, models):
    """Fit each model with its parameter grid and score it on the test split.

    Parameters
    ----------
    config : iterable of dict
        Parameter grids, paired positionally with ``models``.
    models : iterable of estimators
        Estimators to wrap in ``GridSearchCV``.

    Returns
    -------
    (pandas.DataFrame, dict)
        A frame with one row of metrics per model (plus ``model`` and ``date``
        columns, indexed 0..n-1) and a dict mapping the model label
        ``"<repr>-1"`` to its fitted ``GridSearchCV`` object. When no
        model/grid pairs are supplied the frame is empty.

    Notes
    -----
    Reads the notebook globals ``X_train_std``, ``y_train``, ``X_test_std`` and
    ``y_test``.
    """
    score_frames = []
    list_of_models = {}

    for grid, model in zip(config, models):
        clf = GridSearchCV(model, grid)
        clf.fit(X_train_std, y_train)

        # Every model fitted here is version 1 of its estimator.
        label = str(model) + "-" + str(1)
        scores = pd.DataFrame(measure_error(y_test, clf.predict(X_test_std)))
        scores['model'] = label
        scores['date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        score_frames.append(scores)
        list_of_models[label] = clf

    # pd.concat raises on an empty list, so handle "nothing to fit" explicitly.
    if not score_frames:
        return pd.DataFrame(), list_of_models

    # One concat at the end (instead of growing the frame inside the loop)
    # with a fresh 0..n-1 index.
    return pd.concat(score_frames, ignore_index=True), list_of_models

In [18]:
# Fit every model from models() with its matching grid from configuration() and
# keep both the score table and the fitted search objects.
df,list_of_model_and_configs = initial_run(configuration(),models())

In [19]:
def _next_model_version(dataframe, estimator_name):
    """Return the next unused version number for ``estimator_name`` (1 if unseen)."""
    if 'model' not in dataframe.columns:
        return 1

    prefix = estimator_name + "-"
    versions = []
    for label in dataframe['model']:
        label = str(label)
        # Exact prefix match: "SVC()-" must not match "LinearSVC()-1", and the
        # estimator repr must not be interpreted as a regular expression.
        if label.startswith(prefix):
            suffix = label[len(prefix):]
            if suffix.isdigit():
                versions.append(int(suffix))
    return max(versions, default=0) + 1


def add_model(dataframe, model, config, list_of_models):
    """Fit one more model/grid pair and append its scores to ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Existing score table with a ``model`` column of ``"<repr>-<version>"``
        labels. It is not modified; a new frame is returned.
    model : estimator
        Estimator to wrap in ``GridSearchCV``.
    config : dict
        Parameter grid for the search.
    list_of_models : dict
        Registry of fitted searches keyed by model label; updated in place.

    Returns
    -------
    (pandas.DataFrame, dict)
        The extended score table (indexed 0..n-1) and the updated registry.
        The new label is ``"<repr>-<k>"`` where ``k`` is one more than the
        highest version already present for exactly that estimator repr, or 1
        if the estimator has not been seen before.

    Notes
    -----
    Reads the notebook globals ``X_train_std``, ``y_train``, ``X_test_std`` and
    ``y_test``.
    """
    clf = GridSearchCV(model, config)
    clf.fit(X_train_std, y_train)

    estimator_name = str(clf.estimator)
    label = estimator_name + "-" + str(_next_model_version(dataframe, estimator_name))

    scores = pd.DataFrame(measure_error(y_test, clf.predict(X_test_std)))
    scores['model'] = label
    scores['date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    list_of_models[label] = clf

    dataframe = pd.concat([dataframe, scores], ignore_index=True)
    return dataframe, list_of_models

In [20]:
# Add a linear-kernel SVC (C=0.025) to the comparison. Note that this rebinds
# `df`, so re-running the cell appends another row for the same estimator.
df, list_of_model_and_configs = add_model(df,SVC(),{'kernel':['linear'],'C':[0.025]},list_of_model_and_configs)

In [21]:
def find_max(df,stat):
    """Return the ``model`` label of the row with the largest value in column ``stat``."""
    best_index = df[stat].idxmax()
    best_row = df.loc[best_index]
    return best_row['model']

In [22]:
## Look up a fitted search object by the label of the best model
def get_best_model(models,key):
    """Return the entry stored under ``key`` in ``models``.

    Parameters
    ----------
    models : dict
        Registry mapping model labels to fitted search objects.
    key : str
        Model label, e.g. the value returned by ``find_max``.

    Returns
    -------
    The stored object, or ``None`` when ``key`` is not present.
    """
    # Use the registry that was passed in rather than a notebook global, so the
    # function works with any registry and on a fresh kernel.
    return models.get(key)

In [23]:
# Show the fitted search object of the model with the highest test-set F1 score.
print(get_best_model(list_of_model_and_configs,find_max(df,'f1')))

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [True], 'ccp_alpha': [0.0],
                         'class_weight': [None], 'criterion': ['gini'],
                         'max_depth': [None], 'max_features': ['auto'],
                         'max_leaf_nodes': [None], 'max_samples': [None],
                         'min_impurity_decrease': [0.0],
                         'min_impurity_split': [None], 'min_samples_leaf': [1],
                         'min_samples_split': [2],
                         'min_weight_fraction_leaf': [0.0],
                         'n_estimators': [100], 'n_jobs': [None],
                         'oob_score': [False], 'random_state': [None],
                         'verbose': [0], 'warm_start': [False]})


In [24]:
def convert_models_toDataframe(models):
    """Build one parameter-grid DataFrame per estimator.

    Parameters
    ----------
    models : dict
        Maps model labels of the form ``"<estimator repr>-<version>"`` to
        objects exposing a ``param_grid`` attribute.

    Returns
    -------
    list of pandas.DataFrame
        One frame per distinct estimator repr, in order of first appearance.
        Each frame holds the ``param_grid`` of every version of that estimator
        (one block of rows per version, original row indices kept) plus a
        ``model`` column with the full label. Empty input yields ``[]``.
    """
    frames_by_estimator = {}
    for label, search in models.items():
        # Strip only the trailing "-<version>": the estimator repr itself may
        # contain hyphens (e.g. "1e-08"), so splitting on the first hyphen
        # would merge unrelated estimators into one group.
        estimator_name = label.rsplit('-', 1)[0]

        grid_frame = pd.DataFrame(search.param_grid)
        grid_frame['model'] = label
        frames_by_estimator.setdefault(estimator_name, []).append(grid_frame)

    return [
        frames[0] if len(frames) == 1 else pd.concat(frames)
        for frames in frames_by_estimator.values()
    ]

In [25]:
# One parameter-grid DataFrame per estimator, built from the fitted searches.
testDataFrame =convert_models_toDataframe(list_of_model_and_configs) 

In [26]:
# TODO: convert this to a dictionary
testDataFrame[5].head()

Unnamed: 0,hidden_layer_sizes,activation,solver,alpha,batch_size,learning_rate,learning_rate_init,power_t,max_iter,shuffle,...,momentum,nesterovs_momentum,early_stopping,validation_fraction,beta_1,beta_2,epsilon,n_iter_no_change,max_fun,model
0,"(100,)",relu,adam,0.0001,auto,constant,0.001,0.5,200,True,...,0.9,True,False,0.1,0.9,0.999,1e-08,10,15000,MLPClassifier()-1


In [45]:
def convert_configs_to_json(configs):
    """Serialise every stored search object to a JSON string.

    ``configs`` maps model labels to objects exposing ``estimator`` and
    ``param_grid``. The result is a list, in the mapping's iteration order, of
    4-space-indented JSON documents with the keys ``estimator`` (the
    estimator's string form) and ``param_grid``.
    """
    return [
        json.dumps(
            {'estimator': str(search.estimator), 'param_grid': search.param_grid},
            indent=4,
        )
        for search in configs.values()
    ]

In [46]:
# Display the JSON form of every fitted search (estimator name + parameter grid).
convert_configs_to_json(list_of_model_and_configs)

['{\n    "estimator": "SVC()",\n    "param_grid": {\n        "C": [\n            1.0\n        ],\n        "kernel": [\n            "rbf"\n        ],\n        "degree": [\n            3\n        ],\n        "gamma": [\n            "scale"\n        ],\n        "coef0": [\n            0.0\n        ],\n        "shrinking": [\n            true\n        ],\n        "probability": [\n            false\n        ],\n        "tol": [\n            0.001\n        ],\n        "cache_size": [\n            200\n        ],\n        "class_weight": [\n            null\n        ],\n        "verbose": [\n            false\n        ],\n        "max_iter": [\n            -1\n        ],\n        "decision_function_shape": [\n            "ovr"\n        ],\n        "break_ties": [\n            false\n        ],\n        "random_state": [\n            null\n        ]\n    }\n}',
 '{\n    "estimator": "GradientBoostingClassifier()",\n    "param_grid": {\n        "loss": [\n            "deviance"\n        ],\n  

# TODO
- measure metrics
- json
- models

In [None]:
from sklearn.base import ClassifierMixin
from sklearn.utils import all_estimators

# Keep every (name, class) entry reported by scikit-learn whose class derives
# from ClassifierMixin.
classifiers = [
    entry
    for entry in all_estimators()
    if issubclass(entry[1], ClassifierMixin)
]
print(classifiers[0])

In [None]:
# Inspect the full registry of fitted searches (label -> GridSearchCV object).
list_of_model_and_configs