In [2]:
#импорт библиотек
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import hyperopt
from hyperopt import hp, fmin, tpe, Trials
import optuna

In [3]:
# Загружаем данные
data = pd.read_csv('data/_train_sem09 (1).csv')
data.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


Каждая строка представляет молекулу. 

- Первый столбец Activity содержит экспериментальные данные, описывающие фактический биологический ответ [0, 1]; 
- Остальные столбцы D1-D1776 представляют собой молекулярные дескрипторы — это вычисляемые свойства, которые могут фиксировать некоторые характеристики молекулы, например размер, форму или состав элементов.


In [4]:
data['Activity'].value_counts(normalize=True)

1    0.542255
0    0.457745
Name: Activity, dtype: float64

In [5]:
X = data.drop(['Activity'], axis=1)
y = data['Activity']

In [6]:
# Разделяем выборку на обучающую и тестовую
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [7]:
# Обучаем модель логистической регрессии
lr_model = linear_model.LogisticRegression(random_state=42)

lr_model.fit(X_train, y_train)

y_test_predict = lr_model.predict(X_test)

print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_predict)))

f1_score на тестовом наборе: 0.78


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Обучаем модель случайного леса
rf_model = ensemble.RandomForestClassifier(random_state=42)

rf_model.fit(X_train, y_train)

y_test_predict = rf_model.predict(X_test)

print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_predict)))

f1_score на тестовом наборе: 0.80


# Подбор гиперпараметров моделей

## 1. GridSearchCV

### 1.1. Линейная регрессия

In [8]:
# Подберем гиперпараметры для модели логистической регрессии
# зададим пространство поиска гиперпараметров
param_grid = [
    {
        'penalty': ['l2', 'none'] , 
        'solver': ['lbfgs', 'sag'],
        'C': [0.1, 0.25, 0.5, 0.75, 0.9]
    },
    {
        'penalty': ['l1', 'l2'] , 
        'solver': ['liblinear', 'saga'],
        'C': [0.1, 0.25, 0.5, 0.75, 0.9]
    }
]

grid_search_lr = GridSearchCV(
    estimator=linear_model.LogisticRegression(random_state=42, max_iter=2000), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs = -1
)

grid_search_lr.fit(X_train, y_train)

y_test_predict = grid_search_lr.predict(X_test)

print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_predict)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search_lr.best_params_))

f1_score на тестовом наборе: 0.78
Наилучшие значения гиперпараметров: {'C': 0.25, 'penalty': 'l1', 'solver': 'saga'}


В результате подбора гиперпареметров для модели линейной регрессии метрика f1 не улучшилась, проведем еще одну итерацию подбора параметров.

In [9]:
# зададим пространство поиска гиперпараметров
param_grid = [
    {
        'penalty': ['l2', 'none'] , 
        'solver': ['lbfgs', 'sag'],
        'C': [0.2, 0.3]
    },
    {
        'penalty': ['l1', 'l2'] , 
        'solver': ['liblinear', 'saga'],
        'C': [0.2, 0.3]
    }
]

grid_search_lr = GridSearchCV(
    estimator=linear_model.LogisticRegression(random_state=42, max_iter=2000), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs = -1
)

grid_search_lr.fit(X_train, y_train)

y_test_predict = grid_search_lr.predict(X_test)

print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_predict)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search_lr.best_params_))

f1_score на тестовом наборе: 0.78
Наилучшие значения гиперпараметров: {'C': 0.2, 'penalty': 'l1', 'solver': 'saga'}


### 1.2 Случайный лес

In [10]:
# Подберем гиперпараметры для модели случайного леса
# зададим пространство поиска гиперпараметров
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [x for x in range(1, 30)],
    'min_samples_leaf': list(np.linspace(5, 100, 50, dtype=int))    
}

grid_search_rf = GridSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=42), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs = -1
)

grid_search_rf.fit(X_train, y_train)

y_test_predict = grid_search_rf.predict(X_test)

print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_predict)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search_rf.best_params_))

f1_score на тестовом наборе: 0.81
Наилучшие значения гиперпараметров: {'criterion': 'entropy', 'max_depth': 24, 'min_samples_leaf': 5, 'n_estimators': 300}


## 2. RandomizedSearchCV

### 2.1. Линейная регрессия

In [16]:
# Подберем гиперпараметры для модели логистической регрессии
# зададим пространство поиска гиперпараметров
param_grid = [
    {
        'penalty': ['l2', 'none'] , 
        'solver': ['lbfgs', 'sag'],
        'C': list(np.linspace(0.01, 1, 10, dtype=float))
    },
    {
        'penalty': ['l1', 'l2'] , 
        'solver': ['liblinear', 'saga'],
        'C': list(np.linspace(0.01, 1, 10, dtype=float))
    }
]

rand_search_lr = RandomizedSearchCV(
    estimator=linear_model.LogisticRegression(random_state=42, max_iter=2000), 
    param_distributions=param_grid, 
    cv=5, 
    n_iter = 10,
    n_jobs = -1
)

rand_search_lr.fit(X_train, y_train)

y_test_predict = rand_search_lr.predict(X_test)

print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_predict)))
print("Наилучшие значения гиперпараметров: {}".format(rand_search_lr.best_params_))

f1_score на тестовом наборе: 0.78
Наилучшие значения гиперпараметров: {'solver': 'saga', 'penalty': 'l1', 'C': 0.23}


### 2.2 Случайный лес

In [17]:
# Подберем гиперпараметры для модели случайного леса
param_grid = {
    'n_estimators': range(100, 1000, 50),
    'criterion': ['gini', 'entropy'],
    'max_depth': range(2, 30),
    'min_samples_leaf': list(np.linspace(5, 100, 50, dtype=int))    
}

grid_search_rf = RandomizedSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=42), 
    param_distributions=param_grid,
    n_iter = 40,
    cv=5, 
    n_jobs = -1
)

grid_search_rf.fit(X_train, y_train)

y_test_predict = grid_search_rf.predict(X_test)

print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_predict)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search_rf.best_params_))

f1_score на тестовом наборе: 0.80
Наилучшие значения гиперпараметров: {'n_estimators': 650, 'min_samples_leaf': 5, 'max_depth': 24, 'criterion': 'entropy'}


## 3. HYPEROPT

### 3.1. Линейная регрессия

In [12]:
random_state = 42
# Подберем гиперпараметры для модели логистической регрессии
def hyperopt_lr(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    params = {
        'penalty': params['penalty'], 
        'solver': params['solver'], 
        'C': params['C']
    }
  
    model = linear_model.LogisticRegression(**params, random_state=random_state)
        
    # обучим модель с помощью кросс-валидации
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    return -score

In [26]:
# зададим пространство поиска гиперпараметров
space = {
    'penalty': hp.choice('penalty', ['l1', 'l2']),
    'solver' : hp.choice('solver', ['liblinear', 'saga']),
    'C': hp.quniform('C', 0.1, 1, 0.05)
}

space_dict = {
    'penalty': ['l1', 'l2'],
    'solver' : ['liblinear', 'saga']
}

In [14]:
trials = Trials()

best = fmin(
    hyperopt_lr,  
    space=space,
    algo=tpe.suggest, 
    max_evals=40,
    trials=trials,
    rstate=np.random.default_rng(random_state)
)

print("Наилучшие значения гиперпараметров {}".format(best))

100%|███████████████████████████████████████████████| 40/40 [06:58<00:00, 10.47s/trial, best loss: -0.7913324555968615]
Наилучшие значения гиперпараметров {'C': 0.30000000000000004, 'penalty': 0, 'solver': 1}


In [28]:
# рассчитаем точность для тестовой выборки
model_hp_lr = linear_model.LogisticRegression(
    penalty=space_dict['penalty'][best['penalty']],
    solver=space_dict['solver'][best['solver']],
    C=best['C'],
    max_iter=2000,
    random_state=random_state
)

model_hp_lr.fit(X_train, y_train)

y_test_predict = model_hp_lr.predict(X_test)

print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_predict)))

f1_score на тестовом наборе: 0.78


### 3.2 Случайный лес

In [37]:
# Подберем гиперпараметры для модели случайного леса
def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    params = {
        'n_estimators': int(params['n_estimators']), 
        'max_depth': int(params['max_depth']), 
        'min_samples_leaf': int(params['min_samples_leaf'])
    }
  
    model = ensemble.RandomForestClassifier(**params, random_state=random_state)

    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    return -score

In [38]:
# зададим пространство поиска гиперпараметров
space={
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),
    'max_depth' : hp.quniform('max_depth', 2, 26, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 2, 30, 1)
}

In [39]:
trials_rf = Trials()

best_rf = fmin(
    hyperopt_rf,  
    space=space,
    algo=tpe.suggest, 
    max_evals=20,
    trials=trials_rf,
    rstate=np.random.default_rng(random_state)
)

print("Наилучшие значения гиперпараметров {}".format(best_rf))

100%|███████████████████████████████████████████████| 20/20 [02:18<00:00,  6.93s/trial, best loss: -0.8125101634883489]
Наилучшие значения гиперпараметров {'max_depth': 17.0, 'min_samples_leaf': 4.0, 'n_estimators': 250.0}


In [41]:
# рассчитаем точность для тестовой выборки
model_hp_rf = ensemble.RandomForestClassifier(
    random_state=random_state, 
    n_estimators=int(best_rf['n_estimators']),
    max_depth=int(best_rf['max_depth']),
    min_samples_leaf=int(best_rf['min_samples_leaf'])
)

model_hp_rf.fit(X_train, y_train)

y_test_predict = model_hp_rf.predict(X_test)

print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_predict)))

f1_score на тестовом наборе: 0.80


## 4. OPTUNA

### 4.1. Линейная регрессия

In [80]:
# Подберем гиперпараметры для модели логистической регрессии
def optuna_lr(trial):
    # задаем пространства поиска гиперпараметров
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2']),
    solver = trial.suggest_categorical('solver', ['lbfgs', 'saga']),
    C = trial.suggest_float('C', 0.1, 1)

    # создаем модель
    model = linear_model.LogisticRegression(
        penalty=penalty,
        solver=solver,
        C=C,
        max_iter=2000,
        random_state=random_state
    )
    
    model.fit(X_train, y_train)
    score = metrics.f1_score(y_train, model.predict(X_train))

    return score

In [81]:
sampler = optuna.samplers.TPESampler(seed=random_state)
study_lr = optuna.create_study(
    sampler=sampler, 
    study_name='LogisticRegression', 
    direction='maximize'
)

study_lr.optimize(optuna_lr, n_trials=20)

print("f1_score на обучающем наборе: {:.2f}".format(study_lr.best_value))

[32m[I 2022-07-08 16:36:29,102][0m A new study created in memory with name: LogisticRegression[0m
[33m[W 2022-07-08 16:36:29,102][0m Trial 0 failed because of the following error: ValueError("Logistic Regression supports only solvers in ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'], got ('lbfgs',).")[0m
Traceback (most recent call last):
  File "C:\Users\solod\anaconda3\lib\site-packages\optuna\study\_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\solod\AppData\Local\Temp\ipykernel_6372\117166586.py", line 17, in optuna_lr
    model.fit(X_train, y_train)
  File "C:\Users\solod\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\solod\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 48, in _check_solver
    raise ValueError(
ValueError: Logistic Regression supports only solvers in ['liblinear', 'newton-cg

ValueError: Logistic Regression supports only solvers in ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'], got ('lbfgs',).

In [82]:
model_opt_lr = linear_model.LogisticRegression(**study_lr.best_params, random_state=random_state)

model_opt_lr.fit(X_train, y_train)

y_test_predict = model_opt_lr.predict(X_test)

print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_predict)))

ValueError: No trials are completed yet.

### 4.2 Случайный лес

In [64]:
def optuna_rf(trial):
    # задаем пространства поиска гиперпараметров
    n_estimators = trial.suggest_int('n_estimators', 100, 1000, 20)
    max_depth = trial.suggest_int('max_depth', 1, 30, 1)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 30, 1)

    # создаем модель
    model = ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state)
    # обучаем модель
    
    score = cross_val_score(model, X, y, cv=5, scoring="f1", n_jobs=-1).mean()

    return score

In [65]:
sampler = optuna.samplers.TPESampler(seed=random_state)
study_rf = optuna.create_study(
    sampler=sampler, 
    study_name='RandomForestClassifier', 
    direction='maximize'
)

study_rf.optimize(optuna_rf, n_trials=20)

print("f1_score на обучающем наборе: {:.2f}".format(study_rf.best_value))

[32m[I 2022-07-08 12:25:30,051][0m A new study created in memory with name: RandomForestClassifier[0m
[32m[I 2022-07-08 12:25:43,022][0m Trial 0 finished with value: 0.7777842858400605 and parameters: {'n_estimators': 720, 'max_depth': 24, 'min_samples_leaf': 26}. Best is trial 0 with value: 0.7777842858400605.[0m
[32m[I 2022-07-08 12:25:47,357][0m Trial 1 finished with value: 0.7517299773972959 and parameters: {'n_estimators': 360, 'max_depth': 4, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.7777842858400605.[0m
[32m[I 2022-07-08 12:25:50,559][0m Trial 2 finished with value: 0.769350719047415 and parameters: {'n_estimators': 180, 'max_depth': 8, 'min_samples_leaf': 28}. Best is trial 0 with value: 0.7777842858400605.[0m
[32m[I 2022-07-08 12:25:58,967][0m Trial 3 finished with value: 0.7904155559989166 and parameters: {'n_estimators': 400, 'max_depth': 16, 'min_samples_leaf': 15}. Best is trial 3 with value: 0.7904155559989166.[0m
[32m[I 2022-07-08 12:26:16,220

f1_score на обучающем наборе: 0.82


In [68]:
model_opt_rf = ensemble.RandomForestClassifier(**study_rf.best_params, random_state=random_state)

model_opt_rf.fit(X_train, y_train)

y_test_predict = model_opt_rf.predict(X_test)

print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_predict)))

f1_score на тестовом наборе: 0.81
