In [12]:
from lazypredict.Supervised import LazyClassifier
from lazypredict.Supervised import LazyRegressor
import utils

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import NearestCentroid, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from joblib import dump, load

import os

from utils import HELOC_NAME, ADULT_INCOME_NAME, HIGGS_NAME, COVERTYPE_NAME, CALIFORNIA_HOUSING_NAME, ARBOVIRUSES_NAME

In [13]:
def train_models(X_train, X_val, y_train, y_val, models, is_dataset_classification):
    """Fit the given estimator classes with lazypredict and return the fitted models.

    Parameters
    ----------
    X_train, X_val : training and validation features, forwarded to lazypredict.
    y_train, y_val : training and validation targets, forwarded to lazypredict.
    models : collection of estimator classes to evaluate.
    is_dataset_classification : truthy -> LazyClassifier, falsy -> LazyRegressor.

    Returns
    -------
    An empty list when ``models`` is empty; otherwise whatever
    ``provide_models`` of the lazypredict wrapper returns (per the lazypredict
    docs, a dict keyed by model name - verify against the installed version).

    Side effect: prints the score table produced by ``fit``.
    """
    if not len(models):
        return []

    # Options shared by both wrappers; ignore_warnings deliberately stays
    # per-wrapper because the two branches use different values.
    shared_options = {"verbose": 0, "custom_metric": None}

    if is_dataset_classification:
        runner = LazyClassifier(
            ignore_warnings=True, classifiers=models, **shared_options
        )
    else:
        runner = LazyRegressor(
            ignore_warnings=False, regressors=models, **shared_options
        )

    # fit also returns predictions, which are not needed here.
    scores, _ = runner.fit(X_train, X_val, y_train, y_val)
    print(scores)

    return runner.provide_models(X_train, X_val, y_train, y_val)

In [14]:
# Estimator classes (not instances) tried on classification datasets.
CLASSIFIERS = [
    GaussianNB, RandomForestClassifier, SVC,
    NearestCentroid, LogisticRegression, DecisionTreeClassifier,
]

# Estimator classes (not instances) tried on regression datasets.
REGRESSORS = [
    RandomForestRegressor, SVR, KNeighborsRegressor,
    LinearRegression, DecisionTreeRegressor,
]

In [15]:
# Root directory for every model saved by this notebook:
# <results path>/classic_descriptors
common_path = utils.get_results_path()
base_path = os.path.join(common_path, "classic_descriptors")

# makedirs creates missing parent directories too and, with exist_ok=True,
# is a no-op when the directory is already there (safe to re-run).
os.makedirs(base_path, exist_ok=True)

In [16]:
# Suffix of every saved model file; also used to detect models already on disk.
FILE_EXTENSION = ".joblib"

# HELOC

In [17]:
name = HELOC_NAME

# Resolve the task type once; it selects the candidate list here and the
# lazypredict wrapper inside train_models.
is_classification = utils.is_dataset_classification(name)

# Per-dataset output directory; exist_ok makes re-running the cell safe.
dataset_path = os.path.join(base_path, name)
os.makedirs(dataset_path, exist_ok=True)

# Skip every estimator whose "<ClassName>.joblib" file is already on disk.
# NOTE: files written by the earlier version of this cell hold only the
# model-name string; delete them so the model is retrained and saved properly.
already_saved = set(os.listdir(dataset_path))
models_to_train = [
    c for c in (CLASSIFIERS if is_classification else REGRESSORS)
    if c.__name__ + FILE_EXTENSION not in already_saved
]

if models_to_train:
    X, y = utils.get_X_y(name)
    indices_train, indices_val = utils.get_indices_train_eval(name)
    X_train, X_val = X[indices_train], X[indices_val]
    y_train, y_val = y[indices_train], y[indices_val]
    # Release the names of the full dataset; only the splits are used below.
    del X
    del y

    models = train_models(X_train, X_val, y_train, y_val, models_to_train, is_classification)

    if models:
        # train_models hands back lazypredict's name -> fitted-model mapping.
        # Iterating the mapping directly yields only the names, so unpack
        # items() to persist the fitted object rather than its name string.
        for model_name, model in models.items():
            model_path = os.path.join(dataset_path, str(model_name) + FILE_EXTENSION)
            dump(model, model_path)

# Adult Income

In [18]:
name = ADULT_INCOME_NAME

# Resolve the task type once; it selects the candidate list here and the
# lazypredict wrapper inside train_models.
is_classification = utils.is_dataset_classification(name)

# Per-dataset output directory; exist_ok makes re-running the cell safe.
dataset_path = os.path.join(base_path, name)
os.makedirs(dataset_path, exist_ok=True)

# Skip every estimator whose "<ClassName>.joblib" file is already on disk.
# NOTE: files written by the earlier version of this cell hold only the
# model-name string; delete them so the model is retrained and saved properly.
already_saved = set(os.listdir(dataset_path))
models_to_train = [
    c for c in (CLASSIFIERS if is_classification else REGRESSORS)
    if c.__name__ + FILE_EXTENSION not in already_saved
]

if models_to_train:
    X, y = utils.get_X_y(name)
    indices_train, indices_val = utils.get_indices_train_eval(name)
    X_train, X_val = X[indices_train], X[indices_val]
    y_train, y_val = y[indices_train], y[indices_val]
    # Release the names of the full dataset; only the splits are used below.
    del X
    del y

    models = train_models(X_train, X_val, y_train, y_val, models_to_train, is_classification)

    if models:
        # train_models hands back lazypredict's name -> fitted-model mapping.
        # Iterating the mapping directly yields only the names, so unpack
        # items() to persist the fitted object rather than its name string.
        for model_name, model in models.items():
            model_path = os.path.join(dataset_path, str(model_name) + FILE_EXTENSION)
            dump(model, model_path)

# California Housing

In [19]:
name = CALIFORNIA_HOUSING_NAME

# Resolve the task type once; it selects the candidate list here and the
# lazypredict wrapper inside train_models.
is_classification = utils.is_dataset_classification(name)

# Per-dataset output directory; exist_ok makes re-running the cell safe.
dataset_path = os.path.join(base_path, name)
os.makedirs(dataset_path, exist_ok=True)

# Skip every estimator whose "<ClassName>.joblib" file is already on disk.
# NOTE: files written by the earlier version of this cell hold only the
# model-name string; delete them so the model is retrained and saved properly.
already_saved = set(os.listdir(dataset_path))
models_to_train = [
    c for c in (CLASSIFIERS if is_classification else REGRESSORS)
    if c.__name__ + FILE_EXTENSION not in already_saved
]

if models_to_train:
    X, y = utils.get_X_y(name)
    indices_train, indices_val = utils.get_indices_train_eval(name)
    X_train, X_val = X[indices_train], X[indices_val]
    y_train, y_val = y[indices_train], y[indices_val]
    # Release the names of the full dataset; only the splits are used below.
    del X
    del y

    models = train_models(X_train, X_val, y_train, y_val, models_to_train, is_classification)

    if models:
        # train_models hands back lazypredict's name -> fitted-model mapping.
        # Iterating the mapping directly yields only the names, so unpack
        # items() to persist the fitted object rather than its name string.
        for model_name, model in models.items():
            model_path = os.path.join(dataset_path, str(model_name) + FILE_EXTENSION)
            dump(model, model_path)

# Arboviruses

In [20]:
name = ARBOVIRUSES_NAME

# Resolve the task type once; it selects the candidate list here and the
# lazypredict wrapper inside train_models.
is_classification = utils.is_dataset_classification(name)

# Per-dataset output directory; exist_ok makes re-running the cell safe.
dataset_path = os.path.join(base_path, name)
os.makedirs(dataset_path, exist_ok=True)

# Skip every estimator whose "<ClassName>.joblib" file is already on disk.
# NOTE: files written by the earlier version of this cell hold only the
# model-name string; delete them so the model is retrained and saved properly.
already_saved = set(os.listdir(dataset_path))
models_to_train = [
    c for c in (CLASSIFIERS if is_classification else REGRESSORS)
    if c.__name__ + FILE_EXTENSION not in already_saved
]

if models_to_train:
    X, y = utils.get_X_y(name)
    indices_train, indices_val = utils.get_indices_train_eval(name)
    X_train, X_val = X[indices_train], X[indices_val]
    y_train, y_val = y[indices_train], y[indices_val]
    # Release the names of the full dataset; only the splits are used below.
    del X
    del y

    models = train_models(X_train, X_val, y_train, y_val, models_to_train, is_classification)

    if models:
        # train_models hands back lazypredict's name -> fitted-model mapping.
        # Iterating the mapping directly yields only the names, so unpack
        # items() to persist the fitted object rather than its name string.
        for model_name, model in models.items():
            model_path = os.path.join(dataset_path, str(model_name) + FILE_EXTENSION)
            dump(model, model_path)

# Covertype

In [21]:
name = COVERTYPE_NAME

# Resolve the task type once; it selects the candidate list here and the
# lazypredict wrapper inside train_models.
is_classification = utils.is_dataset_classification(name)

# Per-dataset output directory; exist_ok makes re-running the cell safe.
dataset_path = os.path.join(base_path, name)
os.makedirs(dataset_path, exist_ok=True)

# Skip every estimator whose "<ClassName>.joblib" file is already on disk.
# NOTE: files written by the earlier version of this cell hold only the
# model-name string; delete them so the model is retrained and saved properly.
already_saved = set(os.listdir(dataset_path))
models_to_train = [
    c for c in (CLASSIFIERS if is_classification else REGRESSORS)
    if c.__name__ + FILE_EXTENSION not in already_saved
]

if models_to_train:
    X, y = utils.get_X_y(name)
    indices_train, indices_val = utils.get_indices_train_eval(name)
    X_train, X_val = X[indices_train], X[indices_val]
    y_train, y_val = y[indices_train], y[indices_val]
    # Release the names of the full dataset; only the splits are used below.
    del X
    del y

    models = train_models(X_train, X_val, y_train, y_val, models_to_train, is_classification)

    if models:
        # train_models hands back lazypredict's name -> fitted-model mapping.
        # Iterating the mapping directly yields only the names, so unpack
        # items() to persist the fitted object rather than its name string.
        for model_name, model in models.items():
            model_path = os.path.join(dataset_path, str(model_name) + FILE_EXTENSION)
            dump(model, model_path)

  0%|          | 0/1 [00:00<?, ?it/s]

# HIGGS

In [None]:
name = HIGGS_NAME

# SVC is left out for this dataset only. A filtered copy is used instead of
# rebinding the global CLASSIFIERS, so re-running an earlier dataset cell
# after this one still sees the full classifier list.
higgs_classifiers = [c for c in CLASSIFIERS if c is not SVC]

# Resolve the task type once; it selects the candidate list here and the
# lazypredict wrapper inside train_models.
is_classification = utils.is_dataset_classification(name)

# Per-dataset output directory; exist_ok makes re-running the cell safe.
dataset_path = os.path.join(base_path, name)
os.makedirs(dataset_path, exist_ok=True)

# Skip every estimator whose "<ClassName>.joblib" file is already on disk.
# NOTE: files written by the earlier version of this cell hold only the
# model-name string; delete them so the model is retrained and saved properly.
already_saved = set(os.listdir(dataset_path))
models_to_train = [
    c for c in (higgs_classifiers if is_classification else REGRESSORS)
    if c.__name__ + FILE_EXTENSION not in already_saved
]

if models_to_train:
    X, y = utils.get_X_y(name)
    indices_train, indices_val = utils.get_indices_train_eval(name)
    X_train, X_val = X[indices_train], X[indices_val]
    y_train, y_val = y[indices_train], y[indices_val]
    # Release the names of the full dataset; only the splits are used below.
    del X
    del y

    models = train_models(X_train, X_val, y_train, y_val, models_to_train, is_classification)

    if models:
        # train_models hands back lazypredict's name -> fitted-model mapping.
        # Iterating the mapping directly yields only the names, so unpack
        # items() to persist the fitted object rather than its name string.
        for model_name, model in models.items():
            model_path = os.path.join(dataset_path, str(model_name) + FILE_EXTENSION)
            dump(model, model_path)

100%|██████████| 2/2 [00:28<00:00, 14.26s/it]

                    Accuracy  Balanced Accuracy  ROC AUC  F1 Score  Time Taken
Model                                                                         
LogisticRegression      0.64               0.64     0.64      0.64       16.96
NearestCentroid         0.59               0.58     0.58      0.59       11.57



