In [1]:
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

df = pd.read_csv('data/molecular_activity.csv')
display(df.head(2))

X, y = df.drop('Activity', axis=1), df['Activity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0


### GridSeachCV (lr)

In [2]:
from sklearn.model_selection import GridSearchCV

param_grid = [{
    'penalty': ['l2', 'none'],
    'solver': ['sag', 'saga'],
    'C': [0.01, 0.3, 0.7, 1]
}]

grid_search = GridSearchCV(
    estimator=LogisticRegression(random_state=1, max_iter=50), 
    param_grid=param_grid, cv=5, n_jobs=-1
)
grid_search.fit(X_train, y_train) 
y_test_pred = grid_search.predict(X_test)
print(f1_score(y_test, y_test_pred))

0.7819790828640387




### RandomizedSearchCV (lr)

In [3]:
from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(
    estimator=LogisticRegression(random_state=1, max_iter=50), 
    param_distributions=param_grid, cv=5, n_iter=10, n_jobs=-1
)
random_search.fit(X_train, y_train)
y_test_pred = random_search.predict(X_test)
print(f1_score(y_test, y_test_pred))

0.7819790828640387




### Hyperopt (rf)

In [4]:
from hyperopt import hp, fmin, Trials
import numpy as np

def hyperopt_rf(params, X=X_train, y=y_train):
    params = {
        'n_estimators': int(params['n_estimators']), 
        'max_depth': int(params['max_depth']), 
        'min_samples_leaf': int(params['min_samples_leaf'])
    }
    rf = RandomForestClassifier(**params, random_state=1)
    rf.fit(X, y)
    return -f1_score(y, rf.predict(X))

trials = Trials()
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 200, 1),
    'max_depth' : hp.quniform('max_depth', 15, 26, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 2, 10, 1)
}
best = fmin(hyperopt_rf, space=space, max_evals=50, trials=trials) #, rstate=np.random.RandomState(1))

model = RandomForestClassifier(
    random_state=1, 
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    min_samples_leaf=int(best['min_samples_leaf'])
)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
print(f1_score(y_test, y_test_pred))

TPE is being used as the default algorithm.


100%|██████████| 50/50 [01:19<00:00,  1.59s/trial, best loss: -0.9932838458819372]
0.8168557536466774


### Optuna (rf)

In [5]:
import optuna

def optuna_rf(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 200, 1)
    max_depth = trial.suggest_int('max_depth', 10, 30, 1)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 10, 1)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=1
    )
    model.fit(X_train, y_train)
    return f1_score(y_train, model.predict(X_train))

study = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
study.optimize(optuna_rf, n_trials=50)

rf = RandomForestClassifier(**study.best_params,random_state=1)
rf.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
print(f1_score(y_test, y_test_pred))

[32m[I 2022-06-05 02:02:57,212][0m A new study created in memory with name: RandomForestClassifier[0m
[32m[I 2022-06-05 02:02:58,325][0m Trial 0 finished with value: 0.9618747813920951 and parameters: {'n_estimators': 102, 'max_depth': 21, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.9618747813920951.[0m
[32m[I 2022-06-05 02:02:59,789][0m Trial 1 finished with value: 0.9925768822905621 and parameters: {'n_estimators': 120, 'max_depth': 28, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.9925768822905621.[0m
[32m[I 2022-06-05 02:03:00,791][0m Trial 2 finished with value: 0.9012905476107429 and parameters: {'n_estimators': 103, 'max_depth': 20, 'min_samples_leaf': 10}. Best is trial 1 with value: 0.9925768822905621.[0m
[32m[I 2022-06-05 02:03:02,081][0m Trial 3 finished with value: 0.9034965034965035 and parameters: {'n_estimators': 117, 'max_depth': 20, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.9925768822905621.[0m
[32m[I 2022-06-05 02:03:03,86

0.8168557536466774
