In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from hyperopt import hp, fmin, tpe, Trials
import optuna

In [2]:
data = pd.read_csv('data/_train_sem09 (1).csv')
data.head(3)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0


In [3]:
X = data.drop('Activity', axis=1)
y = data['Activity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Логистическая регрессия без подбора параметров

In [11]:
log_model = LogisticRegression(max_iter = 1000, random_state=42)
log_model.fit(X_train, y_train)
y_train_pred = log_model.predict(X_train)
y_test_pred = log_model.predict(X_test)
print('f1_score на тренировочной выборке: {:.2f}'.format(f1_score(y_train, y_train_pred)))
print('f1_score на тестовой выборке: {:.2f}'.format(f1_score(y_test, y_test_pred)))


f1_score на тренировочной выборке: 0.89
f1_score на тестовой выборке: 0.78


# Случайный лес без подбора параметров

In [12]:
forest_model = RandomForestClassifier(random_state=42)
forest_model.fit(X_train, y_train)
y_train_pred = forest_model.predict(X_train)
y_test_pred = forest_model.predict(X_test)
print('f1_score на тренировочной выборке: {:.2f}'.format(f1_score(y_train, y_train_pred)))
print('f1_score на тестовой выборке: {:.2f}'.format(f1_score(y_test, y_test_pred)))

f1_score на тренировочной выборке: 1.00
f1_score на тестовой выборке: 0.80


# GridSearchCv

In [18]:
#для логистической регрессии
log_model = LogisticRegression(max_iter = 1000, random_state=42)
params = [{'penalty': ['l2', 'none'],
              'solver': ['lbfgs', 'sag'],
              'C': list(np.linspace(0.01, 1, 10))},
              
              {'penalty': ['l1', 'none'],
              'solver': ['liblinear', 'saga'],
              'C': list(np.linspace(0.01, 1, 10))}
]
grid_search = GridSearchCV(
    log_model,
    param_grid= params,
    cv= 5,
    n_jobs= -1    
)
grid_search.fit(X_train, y_train)

50 fits failed out of a total of 400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\1\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\1\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\1\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py", line 78, in _check_solver
    raise ValueError("penalty='none' is not supported for

In [19]:
y_train_pred = grid_search.predict(X_train)
y_test_pred = grid_search.predict(X_test)
print('f1_score на тренировочной выборке: {:.2f}'.format(f1_score(y_train, y_train_pred)))
print('f1_score на тестовой выборке: {:.2f}'.format(f1_score(y_test, y_test_pred)))


f1_score на тренировочной выборке: 0.83
f1_score на тестовой выборке: 0.78


In [19]:
# для случайного леса
forest_model = RandomForestClassifier(random_state=42)
params = {'n_estimators': list(range(100, 200, 10)),
          'min_samples_leaf': list(range(2, 7, 1)),
          'max_depth': list(range(7, 30, 2))}
grid_search = GridSearchCV(
    forest_model,
    param_grid=params,
    cv=5,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

In [20]:
y_train_pred = grid_search.predict(X_train)
y_test_pred = grid_search.predict(X_test)
print('f1_score на тренировочной выборке: {:.2f}'.format(f1_score(y_train, y_train_pred)))
print('f1_score на тестовой выборке: {:.2f}'.format(f1_score(y_test, y_test_pred)))

f1_score на тренировочной выборке: 0.98
f1_score на тестовой выборке: 0.81


In [21]:
grid_search.best_params_

{'max_depth': 15, 'min_samples_leaf': 2, 'n_estimators': 130}

# RandomSearchCv

In [6]:
# для логистической регрессии
log_model = LogisticRegression(max_iter = 500, random_state=42)
params = [{'penalty': ['l2', 'none'],
              'solver': ['lbfgs', 'sag'],
              'C': list(np.linspace(0.01, 1, 10))},
              
              {'penalty': ['l1', 'none'],
              'solver': ['liblinear', 'saga'],
              'C': list(np.linspace(0.01, 1, 10))}
]
rand_search = RandomizedSearchCV(
    log_model,
    param_distributions=params,
    n_iter= 3,
    cv=5,
    random_state=42
)
rand_search.fit(X_train, y_train)



In [7]:
y_train_pred = rand_search.predict(X_train)
y_test_pred = rand_search.predict(X_test)
print('f1_score на тренировочной выборке: {:.2f}'.format(f1_score(y_train, y_train_pred)))
print('f1_score на тестовой выборке: {:.2f}'.format(f1_score(y_test, y_test_pred)))

f1_score на тренировочной выборке: 0.89
f1_score на тестовой выборке: 0.78


In [10]:
# для случайного леса
forest_model = RandomForestClassifier(random_state=42)
params = {'n_estimators': list(range(100, 200, 10)),
          'min_samples_leaf': list(range(2, 7, 1)),
          'max_depth': list(range(7, 30, 2))}
rand_search = RandomizedSearchCV(
    forest_model,
    param_distributions=params,
    n_iter= 15,
    cv=5,
    n_jobs=-1
)
rand_search.fit(X_train, y_train)

In [11]:
y_train_pred = rand_search.predict(X_train)
y_test_pred = rand_search.predict(X_test)
print('f1_score на тренировочной выборке: {:.2f}'.format(f1_score(y_train, y_train_pred)))
print('f1_score на тестовой выборке: {:.2f}'.format(f1_score(y_test, y_test_pred)))

f1_score на тренировочной выборке: 0.95
f1_score на тестовой выборке: 0.79


# Hyperopt

In [21]:
model = LogisticRegression(max_iter=100, random_state=42)
score = cross_val_score(model, X_train, y_train, cv=5, scoring="f1", n_jobs=-1).mean()
score

0.7726249337736326

In [25]:
# для логистической регрессии
space = {'penalty': hp.choice('penalty', ['l2', 'none']),
              'solver': hp.choice('solver', ['saga', 'lbfgs']),
              'C': hp.choice('C', list(np.linspace(0.01, 1, 10)))}
def min_func(params, cv=5, X= X_train, y=y_train):
    params= {'penalty': params['penalty'],
             'solver': params['solver'],
             'C': params['C']}
    model = LogisticRegression(**params, max_iter=100, random_state=42)
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    return -score
trial = Trials()

best = fmin(
    min_func,
    space, 
    algo=tpe.suggest,
    max_evals= 20,
    trials=trial,
    rstate=np.random.default_rng(42)
)

print("Наилучшие значения гиперпараметров {}".format(best))


100%|██████████| 20/20 [02:38<00:00,  7.93s/trial, best loss: -0.7880145279769869]
Наилучшие значения гиперпараметров {'C': 0, 'penalty': 0, 'solver': 1}


In [33]:
model = LogisticRegression(penalty='l2', solver='lbfgs', C=0.01, max_iter=100, random_state=42)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
print('f1_score на тренировочной выборке: {:.2f}'.format(f1_score(y_train, y_train_pred)))
print('f1_score на тестовой выборке: {:.2f}'.format(f1_score(y_test, y_test_pred)))

f1_score на тренировочной выборке: 0.82
f1_score на тестовой выборке: 0.78


In [35]:
# для случайного леса
space = {'n_estimators': hp.quniform('n_estimators', 100, 200, 1),
       'max_depth' : hp.quniform('max_depth', 15, 26, 1),
       'min_samples_leaf': hp.quniform('min_samples_leaf', 2, 10, 1)
      }
def min_func(params, cv=5, X= X_train, y=y_train):
    params= {'n_estimators': int(params['n_estimators']),
             'max_depth': int(params['max_depth']),
             'min_samples_leaf': int(params['min_samples_leaf'])}
    model = RandomForestClassifier(**params, random_state=42)
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    return -score
trial = Trials()

best = fmin(
    min_func,
    space, 
    algo=tpe.suggest,
    max_evals= 20,
    trials=trial,
    rstate=np.random.default_rng(42)
)

print("Наилучшие значения гиперпараметров {}".format(best))


100%|██████████| 20/20 [01:29<00:00,  4.45s/trial, best loss: -0.8160803811393121]
Наилучшие значения гиперпараметров {'max_depth': 18.0, 'min_samples_leaf': 2.0, 'n_estimators': 103.0}


In [48]:
model = RandomForestClassifier(n_estimators=103, max_depth=18, min_samples_leaf= 2, random_state=42)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
print('f1_score на тренировочной выборке: {:.2f}'.format(f1_score(y_train, y_train_pred)))
print('f1_score на тестовой выборке: {:.2f}'.format(f1_score(y_test, y_test_pred)))

f1_score на тренировочной выборке: 0.99
f1_score на тестовой выборке: 0.81


# Optuna

In [6]:
# для логистической регрессии
def optuna_func(trial):
    penalty = trial.suggest_categorical('penalty', ['l2', 'none'])
    solver = trial.suggest_categorical('solver', ['lbfgs', 'saga'])
    C = trial.suggest_float('C', 0.01, 0.91, step=0.1)
    model = LogisticRegression(
        max_iter=500,
        random_state=42,
        penalty= penalty,
        solver= solver,
        C= C    
    )
    score = cross_val_score(model, X=X_train, y= y_train, scoring='f1', cv=5).mean()
    return score

study = optuna.create_study(study_name="LogisticRegression", direction="maximize")
study.optimize(optuna_func, n_trials=20)

[32m[I 2022-12-11 11:58:28,457][0m A new study created in memory with name: LogisticRegression[0m
[32m[I 2022-12-11 11:58:37,357][0m Trial 0 finished with value: 0.7856190729512456 and parameters: {'penalty': 'l2', 'solver': 'lbfgs', 'C': 0.11}. Best is trial 0 with value: 0.7856190729512456.[0m
[32m[I 2022-12-11 11:58:52,195][0m Trial 1 finished with value: 0.7804231836409031 and parameters: {'penalty': 'l2', 'solver': 'lbfgs', 'C': 0.51}. Best is trial 0 with value: 0.7856190729512456.[0m
[32m[I 2022-12-11 12:05:21,732][0m Trial 2 finished with value: 0.7653993605915081 and parameters: {'penalty': 'none', 'solver': 'saga', 'C': 0.31000000000000005}. Best is trial 0 with value: 0.7856190729512456.[0m
[32m[I 2022-12-11 12:09:21,807][0m Trial 3 finished with value: 0.7653993605915081 and parameters: {'penalty': 'none', 'solver': 'saga', 'C': 0.31000000000000005}. Best is trial 0 with value: 0.7856190729512456.[0m
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the 

In [8]:
model = LogisticRegression(**study.best_params, random_state=42)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
print('f1_score на тренировочной выборке: {:.2f}'.format(f1_score(y_train, y_train_pred)))
print('f1_score на тестовой выборке: {:.2f}'.format(f1_score(y_test, y_test_pred)))

f1_score на тренировочной выборке: 0.82
f1_score на тестовой выборке: 0.78


In [13]:
# для случайного леса
def optuna_func(trial):
    n_estimators = trial.suggest_int('n_estimators', 80, 150, 5)
    max_depth = trial.suggest_int('max_depth', 10, 30, 2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5, 1)
    model = RandomForestClassifier(n_estimators=n_estimators,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_samples_leaf,
                                          random_state=42)
    score = cross_val_score(model, X=X_train, y= y_train, scoring='f1', cv=5).mean()
    return score

study = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
study.optimize(optuna_func, n_trials=20)

[32m[I 2022-12-11 12:45:47,800][0m A new study created in memory with name: RandomForestClassifier[0m
[32m[I 2022-12-11 12:45:58,010][0m Trial 0 finished with value: 0.8036360839605841 and parameters: {'n_estimators': 80, 'max_depth': 10, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.8036360839605841.[0m
[32m[I 2022-12-11 12:46:21,036][0m Trial 1 finished with value: 0.813990302128647 and parameters: {'n_estimators': 145, 'max_depth': 28, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.813990302128647.[0m
[32m[I 2022-12-11 12:46:31,954][0m Trial 2 finished with value: 0.805093397262182 and parameters: {'n_estimators': 105, 'max_depth': 10, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.813990302128647.[0m
[32m[I 2022-12-11 12:46:44,139][0m Trial 3 finished with value: 0.8116752575729373 and parameters: {'n_estimators': 85, 'max_depth': 28, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.813990302128647.[0m
[32m[I 2022-12-11 12:47:04,550][0m T

In [14]:
model = RandomForestClassifier(**study.best_params, random_state=42)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
print('f1_score на тренировочной выборке: {:.2f}'.format(f1_score(y_train, y_train_pred)))
print('f1_score на тестовой выборке: {:.2f}'.format(f1_score(y_test, y_test_pred)))

f1_score на тренировочной выборке: 1.00
f1_score на тестовой выборке: 0.81
