In [16]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from scipy.stats import loguniform
from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from tpot import TPOTClassifier, TPOTRegressor

from functools import partial
from sklearn.model_selection import StratifiedKFold
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

In [17]:
X, y = make_classification(1000, 10)
X.shape, y.shape

((1000, 10), (1000,))

In [18]:
search_space = {
                'lr__penalty' : ['l1', 'l2'],
                'lr__C' : loguniform.rvs(10**(-4),10**2, size=100)
                }

model = Pipeline([('lr', LogisticRegression(random_state=42, 
                            solver='liblinear'))])

## Grid search

In [19]:
reg_grid = GridSearchCV(model,
                        param_grid=search_space,
                        cv=3,
                        scoring='accuracy'
                        )
model_grid = reg_grid.fit(X, y)

In [20]:
print(model_grid.best_score_)
print(model_grid.best_params_)

0.8820047592502682
{'lr__C': 0.11020523020946806, 'lr__penalty': 'l1'}


## Random search

In [21]:
n_iter = 70

reg_rand = RandomizedSearchCV(model,
                        param_distributions=search_space,
                        cv=3,
                        scoring='accuracy'
                        )
model_rand = reg_rand.fit(X, y)

In [22]:
print(reg_rand.best_score_)
print(model_rand.best_params_)

0.8790017562472653
{'lr__penalty': 'l1', 'lr__C': 0.13874008976997212}


## TPOT

In [23]:
search_space = {
                'penalty' : ['l1', 'l2'],
                'C' : loguniform.rvs(10**(-4),10**2, size=100)
                }

In [24]:
tpot_classifier = TPOTClassifier(generations=5, population_size=50,
                                 offspring_size=25, verbosity=2,
                                 config_dict = {'sklearn.linear_model.LogisticRegression': search_space},
                                 cv=3, scoring='accuracy').fit(X, y)

Version 0.12.1 of tpot is outdated. Version 0.12.2 was released Friday February 23, 2024.


                                                                             
Generation 1 - Current best internal CV score: 0.8770057482632333
                                                                             
Generation 2 - Current best internal CV score: 0.8770057482632333
                                                                              
Generation 3 - Current best internal CV score: 0.8770057482632333
                                                                              
Generation 4 - Current best internal CV score: 0.8770057482632333
                                                                              
Generation 5 - Current best internal CV score: 0.8770057482632333
                                                                              
Best pipeline: LogisticRegression(input_matrix, C=0.007387759613031098, penalty=l2)


In [25]:
args = {}
for arg in tpot_classifier._optimized_pipeline:
    if type(arg) != Primitive:
        try:
            if arg.value.split('__')[1].split('=')[0] in ['C', 'penalty']:
                args[arg.value.split('__')[1].split('=')[0]] = (arg.value.split('__')[1].split('=')[1])
            else:
                args[arg.value.split('__')[1].split('=')[0]] = float(arg.value.split('__')[1].split('=')[1])
        except:
            pass
params = args
params

NameError: name 'Primitive' is not defined

In [None]:
arg.value

AttributeError: 'Primitive' object has no attribute 'value'

## Байесовская оптимизация

In [None]:
search_space = {
                'lr__penalty' : hp.choice(label='penalty', 
                          options=['l1', 'l2']),
                'lr__C' : hp.loguniform(label='C', 
                        low=-4*np.log(10), 
                        high=2*np.log(10))
                }

In [None]:
def objective(params, model,  X_train, y_train):
    """
    Кросс-валидация с текущими гиперпараметрами

    :params: гиперпараметры
    :pipeline: модель
    :X_train: матрица признаков
    :y_train: вектор меток объектов
    :return: средняя точность на кросс-валидации
    """ 

    # задаём модели требуемые параметры    
    model.set_params(**params)
    
    # задаём параметры кросс-валидации (стратифицированная 4-фолдовая с перемешиванием)
    skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)

    # проводим кросс-валидацию  
    score = cross_val_score(estimator=model, X=X_train, y=y_train, 
                            scoring='accuracy', cv=skf, n_jobs=-1)

    # возвращаем результаты, которые записываются в Trials()
    return   {'loss': -score.mean(), 'params': params, 'status': STATUS_OK}

In [None]:
trials = Trials()
best = fmin( 
          # функция для оптимизации  
            fn=partial(objective, model=model, X_train=X, y_train=y),
          # пространство поиска гиперпараметров  
            space=search_space,
          # алгоритм поиска
            algo=tpe.suggest,
          # число итераций (можно ещё указать  время поиска) 
            max_evals=40,
          # куда сохранять историю поиска
            trials=trials,
          # random state
            rstate=np.random.RandomState(42),
          # progressbar
            show_progressbar=True
        )

  0%|          | 0/40 [00:00<?, ?trial/s, best loss=?]


AttributeError: 'numpy.random.mtrand.RandomState' object has no attribute 'integers'

In [None]:
print(best)

NameError: name 'best' is not defined