Dataset for classification

In [9]:
from collections import Counter
from sklearn.datasets import make_classification
# Build a synthetic classification dataset: X holds the feature matrix and
# y the class labels (both are returned by make_classification).
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=1,
)
# Report the dataset dimensions ...
print(f'''Main dataframe:
Number of samples: {X.shape[0]}
Number of features: {X.shape[1]}
Samples by class:''')
# ... and how the samples are split between the classes.
counter = Counter(y)
for label, count in counter.items():
    share = count / len(y) * 100
    print('Class=%d, Count=%d, Percentage=%.1f%%' % (label, count, share))
    
# Main dataframe:
# Number of samples: 1000
# Number of features: 20
# Samples by class:
# Class=0, Count=501, Percentage=50.1%
# Class=1, Count=499, Percentage=49.9%

Cross-validation

In [10]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Cross-validate `model` on (X, y) and return the per-fold accuracy scores.

    The splitter is a RepeatedStratifiedKFold created with random_state=1 and
    otherwise library-default settings (5 splits x 10 repeats in current
    scikit-learn releases -- confirm for the installed version).

    Parameters
    ----------
    model : estimator or pipeline accepted by cross_val_score
    X : feature matrix
    y : class labels

    Returns
    -------
    The value returned by cross_val_score (one accuracy score per fold run).
    """
    splitter = RepeatedStratifiedKFold(random_state=1)
    # n_jobs=-1 asks scikit-learn to run the folds in parallel
    return cross_val_score(
        model, X, y,
        scoring='accuracy',
        cv=splitter,
        n_jobs=-1,
    )

List of classifiers

In [11]:
# dummy
from sklearn.dummy import DummyClassifier
# ensemble
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
# gaussian_process
from sklearn.gaussian_process import GaussianProcessClassifier
# linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
# naive_bayes
from sklearn.naive_bayes import BernoulliNB
# neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
# neural_network
from sklearn.neural_network import MLPClassifier
# tree
from sklearn.tree import DecisionTreeClassifier
# support vectors
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
# discriminant analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Registry of classifiers to benchmark, keyed by a short label that is used
# in the progress log and in the summary table. Every estimator that accepts
# a random_state is seeded with 1 so repeated runs are comparable.
models = {
    # baseline
    'dummy': DummyClassifier(random_state=1),
    # ensembles
    'adaboost': AdaBoostClassifier(random_state=1),
    'bagging': BaggingClassifier(random_state=1),
    'extrs': ExtraTreesClassifier(random_state=1),
    'grdboost': GradientBoostingClassifier(random_state=1),
    'randfor': RandomForestClassifier(random_state=1),
    'histgrdboost': HistGradientBoostingClassifier(random_state=1),
    'xgboost': XGBClassifier(random_state=1),
    # gaussian process
    'gausproc': GaussianProcessClassifier(random_state=1),
    # linear models
    'logreg': LogisticRegression(random_state=1),
    'pasagr': PassiveAggressiveClassifier(random_state=1),
    'prcptr': Perceptron(random_state=1),
    'ridge': RidgeClassifier(random_state=1),
    'sgd': SGDClassifier(random_state=1),
    # naive bayes
    'bernnb': BernoulliNB(),
    # neighbors
    'kneighb': KNeighborsClassifier(),
    'nearcent': NearestCentroid(),
    # neural network (iteration cap set explicitly to 2000)
    'mlp': MLPClassifier(max_iter=2000, random_state=1),
    # tree
    'dtree': DecisionTreeClassifier(random_state=1),
    # support vector machines (iteration cap set explicitly to 5000 for LinearSVC)
    'svc': SVC(random_state=1),
    'lsvc': LinearSVC(max_iter=5000, random_state=1),
    # discriminant analysis
    'ldiscranal': LinearDiscriminantAnalysis(),
}

Data Transformers

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
# define data transformation techniques
def get_pipelines(model):
    """Wrap `model` in one two-step pipeline per data-scaling technique.

    Each pipeline has a scaler step named 's' followed by the model step
    named 'm'. The same `model` object is placed in every pipeline.

    Returns
    -------
    list of (label, Pipeline) tuples, in the order:
    'norm' (MinMaxScaler), 'std' (StandardScaler),
    'quan' (QuantileTransformer), 'pow' (PowerTransformer).
    """
    scalers = [
        ('norm', MinMaxScaler()),
        ('std', StandardScaler()),
        # n_quantiles=800 presumably matches the training-fold size
        # (1000 samples, 5 folds) -- confirm if the dataset or CV changes
        ('quan', QuantileTransformer(n_quantiles=800, random_state=1)),
        ('pow', PowerTransformer()),
    ]
    return [
        (label, Pipeline([('s', scaler), ('m', model)]))
        for label, scaler in scalers
    ]

Grid search for the best models and best data preparation techniques

In [13]:
from datetime import datetime
from numpy import mean, std
# evaluate the models and store results
def pipeline_grid_searcher():
    """Cross-validate every model, alone and behind each preprocessing pipeline.

    Reads the module-level `models`, `X` and `y`. For each model it scores
    the bare estimator with evaluate_model, then every pipeline produced by
    get_pipelines, printing a timestamped line with the mean and standard
    deviation of the scores after each evaluation.

    Returns
    -------
    (results, m_names, p_names)
        results : score sets in evaluation order -- for each model, the bare
                  model first, then its pipelines
        m_names : one label per model
        p_names : one label per evaluated pipeline (so several per model)
    """
    all_scores, model_labels, prep_labels = [], [], []

    def _now():
        # wall-clock time of day, used only for the progress log
        return datetime.now().strftime("%H:%M:%S")

    for label, estimator in models.items():
        print('\nmodel:', type(estimator).__name__)
        # bare model, no preprocessing
        baseline = evaluate_model(estimator, X, y)
        all_scores.append(baseline)
        model_labels.append(label)
        print('%s %s %.3f %.3f' % (_now(), label, mean(baseline), std(baseline)))
        # same model behind each scaler
        for prep_label, pipe in get_pipelines(estimator):
            piped = evaluate_model(pipe, X, y)
            all_scores.append(piped)
            prep_labels.append(prep_label)
            print('%s %s %s %.3f %.3f' % (_now(), prep_label, label, mean(piped), std(piped)))
    return all_scores, model_labels, prep_labels

Call grid searcher

In [14]:
# call the function
results, m_names, p_names = pipeline_grid_searcher()

# model: DummyClassifier
# 08:07:03 dummy 0.501 0.002
# 08:07:03 norm dummy 0.501 0.002
# 08:07:03 std dummy 0.501 0.002
# 08:07:04 quan dummy 0.501 0.002
# 08:07:06 pow dummy 0.501 0.002
#
# model: AdaBoostClassifier
# 08:07:10 adaboost 0.857 0.022
# 08:07:13 norm adaboost 0.857 0.022
# 08:07:17 std adaboost 0.857 0.022
# 08:07:22 quan adaboost 0.857 0.022
# 08:07:27 pow adaboost 0.857 0.022
#
# model: BaggingClassifier
# 08:07:29 bagging 0.882 0.021
# 08:07:31 norm bagging 0.882 0.021
# 08:07:33 std bagging 0.882 0.021
# 08:07:35 quan bagging 0.881 0.020
# 08:07:38 pow bagging 0.882 0.021
#
# model: ExtraTreesClassifier
# 08:07:42 extrs 0.935 0.018
# 08:07:46 norm extrs 0.935 0.018
# 08:07:50 std extrs 0.935 0.018
# 08:07:55 quan extrs 0.931 0.018
# 08:08:01 pow extrs 0.937 0.018
#
# model: GradientBoostingClassifier
# 08:08:11 grdboost 0.915 0.021
# 08:08:21 norm grdboost 0.915 0.021
# 08:08:31 std grdboost 0.915 0.021
# 08:08:41 quan grdboost 0.915 0.021
# 08:08:53 pow grdboost 0.915 0.021
#
# model: RandomForestClassifier
# 08:09:00 randfor 0.918 0.018
# 08:09:06 norm randfor 0.918 0.018
# 08:09:13 std randfor 0.918 0.018
# 08:09:21 quan randfor 0.918 0.017
# 08:09:28 pow randfor 0.917 0.018
#
# model: HistGradientBoostingClassifier
# 08:09:38 histgrdboost 0.926 0.017
# 08:09:47 norm histgrdboost 0.926 0.017
# 08:09:57 std histgrdboost 0.926 0.017
# 08:10:07 quan histgrdboost 0.926 0.017
# 08:10:18 pow histgrdboost 0.926 0.017
#
# model: XGBClassifier
# 08:10:30 xgboost 0.921 0.019
# 08:10:42 norm xgboost 0.921 0.019
# 08:10:54 std xgboost 0.921 0.019
# 08:11:06 quan xgboost 0.921 0.019
# 08:11:19 pow xgboost 0.921 0.019
#
# model: GaussianProcessClassifier
# 08:11:25 gausproc 0.909 0.017
# 08:11:33 norm gausproc 0.894 0.020
# 08:11:38 std gausproc 0.958 0.012
# 08:11:46 quan gausproc 0.933 0.017
# 08:11:53 pow gausproc 0.959 0.012
#
# model: LogisticRegression
# 08:11:53 logreg 0.867 0.019
# 08:11:54 norm logreg 0.867 0.019
# 08:11:54 std logreg 0.867 0.019
# 08:11:55 quan logreg 0.869 0.019
# 08:11:57 pow logreg 0.866 0.019
#
# model: PassiveAggressiveClassifier
# 08:11:57 pasagr 0.806 0.039
# 08:11:57 norm pasagr 0.804 0.077
# 08:11:57 std pasagr 0.810 0.033
# 08:11:58 quan pasagr 0.819 0.059
# 08:11:59 pow pasagr 0.808 0.033
#
# model: Perceptron
# 08:12:00 prcptr 0.820 0.031
# 08:12:00 norm prcptr 0.806 0.068
# 08:12:00 std prcptr 0.820 0.038
# 08:12:01 quan prcptr 0.824 0.047
# 08:12:02 pow prcptr 0.820 0.035
#
# model: RidgeClassifier
# 08:12:02 ridge 0.866 0.018
# 08:12:02 norm ridge 0.866 0.018
# 08:12:03 std ridge 0.867 0.018
# 08:12:03 quan ridge 0.873 0.021
# 08:12:05 pow ridge 0.869 0.019
#
# model: SGDClassifier
# 08:12:05 sgd 0.814 0.037
# 08:12:05 norm sgd 0.842 0.036
# 08:12:05 std sgd 0.838 0.028
# 08:12:06 quan sgd 0.851 0.026
# 08:12:08 pow sgd 0.836 0.029
#
# model: BernoulliNB
# 08:12:08 bernnb 0.793 0.023
# 08:12:08 norm bernnb 0.507 0.010
# 08:12:08 std bernnb 0.779 0.028
# 08:12:09 quan bernnb 0.507 0.010
# 08:12:10 pow bernnb 0.778 0.026
#
# model: KNeighborsClassifier
# 08:12:11 kneighb 0.925 0.016
# 08:12:11 norm kneighb 0.953 0.010
# 08:12:11 std kneighb 0.954 0.012
# 08:12:12 quan kneighb 0.952 0.012
# 08:12:14 pow kneighb 0.956 0.011
#
# model: NearestCentroid
# 08:12:14 nearcent 0.714 0.030
# 08:12:14 norm nearcent 0.818 0.025
# 08:12:14 std nearcent 0.820 0.025
# 08:12:15 quan nearcent 0.817 0.023
# 08:12:17 pow nearcent 0.821 0.025
#
# model: MLPClassifier
# 08:12:38 mlp 0.950 0.013
# 08:14:00 norm mlp 0.947 0.013
# 08:14:26 std mlp 0.954 0.012
# 08:15:33 quan mlp 0.948 0.014
# 08:16:01 pow mlp 0.954 0.013
#
# model: DecisionTreeClassifier
# 08:16:01 dtree 0.815 0.029
# 08:16:02 norm dtree 0.815 0.029
# 08:16:02 std dtree 0.815 0.029
# 08:16:03 quan dtree 0.815 0.028
# 08:16:05 pow dtree 0.815 0.029
#
# model: SVC
# 08:16:05 svc 0.950 0.017
# 08:16:06 norm svc 0.963 0.013
# 08:16:07 std svc 0.966 0.012
# 08:16:08 quan svc 0.959 0.014
# 08:16:10 pow svc 0.966 0.012
#
# model: LinearSVC
# 08:16:13 lsvc 0.867 0.020
# 08:16:14 norm lsvc 0.868 0.017
# 08:16:15 std lsvc 0.867 0.020
# 08:16:16 quan lsvc 0.871 0.020
# 08:16:19 pow lsvc 0.867 0.019
#
# model: LinearDiscriminantAnalysis
# 08:16:19 ldiscranal 0.866 0.018
# 08:16:19 norm ldiscranal 0.866 0.018
# 08:16:19 std ldiscranal 0.866 0.018
# 08:16:20 quan ldiscranal 0.872 0.020
# 08:16:22 pow ldiscranal 0.869 0.019

Sorted results (top 20 by mean accuracy)

In [15]:
from numpy import repeat
from pandas import DataFrame
# Rebuild the row labels for `results`. pipeline_grid_searcher() appends, for
# every model, one baseline score set followed by one score set per pipeline,
# so `results` holds (1 + n_prep) consecutive rows per model. The group sizes
# are derived from the returned lists instead of being hard-coded.
n_prep = len(p_names) // max(len(m_names), 1)  # preprocessing variants per model
rows_per_model = n_prep + 1                    # baseline row + pipeline rows
# 'row' labels the baseline evaluation (no preprocessing); the remaining labels
# are that model's slice of p_names
prep = [label
        for k in range(len(m_names))
        for label in ['row'] + p_names[k * n_prep:(k + 1) * n_prep]]
# One row per evaluation. Building from named columns makes pandas raise if the
# columns ever get out of step, instead of silently truncating as zip() would.
# Then sort by mean accuracy and keep the 20 best rows.
df_results = DataFrame({'model': repeat(m_names, rows_per_model),
                        'prep': prep,
                        'mean': [round(mean(scores), 3) for scores in results],
                        'std': [round(std(scores), 3) for scores in results]}
                       ).sort_values(by=['mean'], ascending=False).head(n=20)
print('\nSummary of the best models:')
print(df_results)

# Summary of the best models:
#        model  prep   mean    std
# 99       svc   pow  0.966  0.012
# 97       svc   std  0.966  0.012
# 96       svc  norm  0.963  0.013
# 44  gausproc   pow  0.959  0.012
# 98       svc  quan  0.959  0.014
# 42  gausproc   std  0.958  0.012
# 79   kneighb   pow  0.956  0.011
# 77   kneighb   std  0.954  0.012
# 89       mlp   pow  0.954  0.013
# 87       mlp   std  0.954  0.012
# 76   kneighb  norm  0.953  0.010
# 78   kneighb  quan  0.952  0.012
# 95       svc   row  0.950  0.017
# 85       mlp   row  0.950  0.013
# 88       mlp  quan  0.948  0.014
# 86       mlp  norm  0.947  0.013
# 19     extrs   pow  0.937  0.018
# 16     extrs  norm  0.935  0.018
# 17     extrs   std  0.935  0.018
# 15     extrs   row  0.935  0.018