In [1]:
from lazypredict.Supervised import LazyClassifier
from lazypredict.Supervised import LazyRegressor
import utils

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import NearestCentroid, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

import joblib

import os

from utils import ALL_DATASETS

In [2]:
def train_models(X_train, X_val, y_train, y_val, models, is_dataset_classification):
    """Fit the requested estimator classes with LazyPredict and return the fitted models.

    Args:
        X_train: Training features, forwarded unchanged to LazyPredict.
        X_val: Validation features, forwarded unchanged to LazyPredict.
        y_train: Training targets.
        y_val: Validation targets.
        models: Estimator classes to train. When empty (or None) nothing is
            trained and an empty mapping is returned.
        is_dataset_classification: Truthy selects ``LazyClassifier``; falsy
            selects ``LazyRegressor``.

    Returns:
        The object returned by ``provide_models`` (consumed by the caller as a
        name -> fitted model mapping via ``.items()``), or ``{}`` when there is
        nothing to train.

    Side effects:
        Prints the score table returned by ``fit``.
    """
    # Return an empty mapping (not a list) so the result type is the same on
    # every path and callers can always iterate it with ``.items()``.
    if not models:
        return {}

    if is_dataset_classification:
        lazy_ = LazyClassifier(
            verbose=0,
            ignore_warnings=True,
            custom_metric=None,
            classifiers=models,
        )
    else:
        # NOTE(review): ignore_warnings is False here but True for the
        # classifier branch; confirm the asymmetry is intentional.
        lazy_ = LazyRegressor(
            verbose=0,
            ignore_warnings=False,
            custom_metric=None,
            regressors=models,
        )

    # The predictions returned by fit are not used; only the score table is shown.
    scores, _predictions = lazy_.fit(X_train, X_val, y_train, y_val)
    print(scores)

    return lazy_.provide_models(X_train, X_val, y_train, y_val)

In [3]:
# Estimator classes that the training loop considers for classification
# datasets; each class's __name__ is also used to build its saved file name.
CLASSIFIERS = [
    GaussianNB,
    RandomForestClassifier,
    SVC,
    NearestCentroid,
    LogisticRegression,
    DecisionTreeClassifier,
]

# Estimator classes considered instead when the dataset is not a
# classification dataset.
REGRESSORS = [
    RandomForestRegressor,
    SVR,
    KNeighborsRegressor,
    LinearRegression,
    DecisionTreeRegressor
]

In [4]:
# Suffix appended to a model's name to form the file name it is saved under
# with joblib, and to detect which models already have a saved file.
FILE_EXTENSION = ".joblib"

In [5]:
# For every dataset: train the estimators that do not yet have a saved file in
# the dataset's results directory, then persist each fitted model with joblib.
for dataset_name in ALL_DATASETS:
    print(dataset_name)

    results_path = utils.get_classicdescriptors_path(dataset_name)

    # Evaluated once per dataset: it selects both the candidate list and the
    # LazyPredict wrapper used by train_models.
    is_classification = utils.is_dataset_classification(dataset_name)

    # List the results directory a single time instead of once per candidate.
    # os.listdir raises if the directory is missing, so a bad path fails here,
    # before any training starts.
    already_saved = set(os.listdir(results_path))

    # Select the models to train: every candidate except the ones whose model
    # file already exists.
    candidates = CLASSIFIERS if is_classification else REGRESSORS
    models_to_train = [
        model_class
        for model_class in candidates
        if model_class.__name__ + FILE_EXTENSION not in already_saved
    ]

    if models_to_train:
        X, y = utils.get_X_y(dataset_name)
        indices_train, indices_val = utils.get_indices_train_eval(dataset_name)
        X_train, X_val = X[indices_train], X[indices_val]
        y_train, y_val = y[indices_train], y[indices_val]
        # Only the train/validation splits are used from here on.
        del X
        del y
        del indices_train
        del indices_val

        models = train_models(
            X_train,
            X_val,
            y_train,
            y_val,
            models_to_train,
            is_classification,
        )

        for model_name, model in models.items():
            model_path = os.path.join(results_path, model_name + FILE_EXTENSION)

            # Never overwrite a previously saved model. An explicit exception
            # is used because assert statements are removed under `python -O`.
            if os.path.exists(model_path):
                raise FileExistsError(
                    f"Refusing to overwrite existing model file: {model_path}"
                )

            # Save it with joblib
            joblib.dump(model, model_path)

    print("#"*50)

HELOC


100%|██████████| 6/6 [00:04<00:00,  1.28it/s]


                        Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                    
SVC                         0.75               0.75     0.75      0.75   
RandomForestClassifier      0.75               0.75     0.75      0.75   
NearestCentroid             0.74               0.74     0.74      0.74   
LogisticRegression          0.73               0.73     0.73      0.73   
GaussianNB                  0.71               0.72     0.72      0.71   
DecisionTreeClassifier      0.63               0.63     0.63      0.63   

                        Time Taken  
Model                               
SVC                           2.63  
RandomForestClassifier        1.60  
NearestCentroid               0.23  
LogisticRegression            0.07  
GaussianNB                    0.02  
DecisionTreeClassifier        0.12  
##################################################
California_Housing


100%|██████████| 5/5 [00:30<00:00,  6.10s/it]


                       Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                 
RandomForestRegressor                0.82       0.82  0.49       17.13
SVR                                  0.75       0.75  0.57       12.88
KNeighborsRegressor                  0.68       0.68  0.65        0.18
DecisionTreeRegressor                0.63       0.64  0.69        0.26
LinearRegression                     0.62       0.62  0.71        0.02
##################################################
Dengue_Chikungunya


100%|██████████| 6/6 [00:05<00:00,  1.12it/s]


                        Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                    
SVC                         0.79               0.79     0.79      0.79   
LogisticRegression          0.78               0.78     0.78      0.78   
RandomForestClassifier      0.76               0.76     0.76      0.76   
NearestCentroid             0.74               0.74     0.74      0.74   
GaussianNB                  0.73               0.73     0.73      0.73   
DecisionTreeClassifier      0.67               0.67     0.67      0.67   

                        Time Taken  
Model                               
SVC                           4.16  
LogisticRegression            0.03  
RandomForestClassifier        1.05  
NearestCentroid               0.03  
GaussianNB                    0.03  
DecisionTreeClassifier        0.05  
##################################################
Covertype


100%|██████████| 6/6 [2:49:03<00:00, 1690.66s/it]  


                        Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                   
RandomForestClassifier      0.95               0.91    None      0.95   
DecisionTreeClassifier      0.94               0.90    None      0.94   
SVC                         0.79               0.60    None      0.79   
NearestCentroid             0.55               0.59    None      0.58   
LogisticRegression          0.72               0.50    None      0.71   
GaussianNB                  0.09               0.45    None      0.05   

                        Time Taken  
Model                               
RandomForestClassifier      185.40  
DecisionTreeClassifier        7.47  
SVC                        9927.87  
NearestCentroid               1.76  
LogisticRegression           19.31  
GaussianNB                    2.13  
##################################################
