In [1]:
import numpy as np
import pandas as pd
from utils import ChemFeatureGenerator, split_and_preprocess
from pathlib import Path

In [2]:
base_dir = Path("../../data/molhiv")
X_train, y_train, X_test, y_test, X_eval, y_eval = split_and_preprocess(base_dir)




In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import hamming_loss, f1_score, precision_score, recall_score
import tqdm

### Simple Baseline Models


In [4]:
def fit_basic_models(X_train, y_train, models):
    pipelines = []
    for name, model in tqdm.tqdm(models):
        pipe = Pipeline([
            ('imputer', SimpleImputer()),
            ('scaller', StandardScaler()),
            ('selector', VarianceThreshold(threshold=0.8*0.2)),
            (name, model),
        ])
        pipe.fit(X_train, y_train.values.ravel())
        pipelines.append( (name, pipe) )
    return pipelines

models = [("lr", LogisticRegression(max_iter=1_000)), ("rf",RandomForestClassifier()), ("nb",GaussianNB())]
fit_pipelines = fit_basic_models(X_train, y_train, models)

100%|██████████| 3/3 [00:24<00:00,  8.03s/it]


In [5]:
def evaluate_pipelines(pipelines, metrics, X, y):
    all_res = []
    for model_name, pipe in pipelines:
        y_pred = pipe.predict(X)
        res = {"model": model_name}
        for metric in metrics:
            res[metric.__name__] = metric(y, y_pred)
        all_res.append(res)
    return pd.DataFrame(all_res)


metrics = [f1_score, precision_score, recall_score]
evaluate_pipelines(fit_pipelines, metrics, X_train, y_train)


Unnamed: 0,model,f1_score,precision_score,recall_score
0,lr,0.283454,0.754386,0.174513
1,rf,0.999188,1.0,0.998377
2,nb,0.096942,0.051662,0.784903


In [6]:
evaluate_pipelines(fit_pipelines, metrics, X_eval, y_eval)

Unnamed: 0,model,f1_score,precision_score,recall_score
0,lr,0.298246,0.515152,0.209877
1,rf,0.28,0.736842,0.17284
2,nb,0.049942,0.025773,0.802469


### Models with weighted classes

In [7]:
models = [
    ("lr", LogisticRegression(max_iter=1_000, class_weight="balanced")),
    ("rf",RandomForestClassifier(class_weight="balanced")),
    ("nb",GaussianNB(priors=(31668/32900,1232/32900)))
]
fit_pipelines = fit_basic_models(X_train, y_train, models)

100%|██████████| 3/3 [00:17<00:00,  5.94s/it]


In [8]:
evaluate_pipelines(fit_pipelines, metrics, X_train, y_train)

Unnamed: 0,model,f1_score,precision_score,recall_score
0,lr,0.202688,0.118394,0.703734
1,rf,0.999594,1.0,0.999188
2,nb,0.096942,0.051662,0.784903


In [9]:
evaluate_pipelines(fit_pipelines, metrics, X_eval, y_eval)

Unnamed: 0,model,f1_score,precision_score,recall_score
0,lr,0.097196,0.052578,0.641975
1,rf,0.229167,0.733333,0.135802
2,nb,0.049942,0.025773,0.802469


#### Hyper parameter optimisation

In [10]:
from skopt.space import Real, Categorical, Integer
from skopt import BayesSearchCV

def models_grid_search(models_with_grids, X, y, cv=3, n_iter=10, scoring=None):
    pipelines = []
    for name, model, grid in tqdm.tqdm(models_with_grids):
        pipe = Pipeline([
            ('imputer', SimpleImputer()),
            ('scaller', StandardScaler()),
            ('selector', VarianceThreshold(threshold=0.8*0.2)),
            (name, model),
        ])
        opt = BayesSearchCV(
            pipe,
            grid,
            cv=cv,
            n_iter=n_iter,
            scoring=scoring
        )
        opt.fit(X, y.values.ravel())
        pipelines.append( (name, opt) )
    return pipelines



In [24]:
grid_lr = {
    'lr__C': Real(0.005, 100, prior="log-uniform"),
    'lr__penalty': Categorical(['l1', 'l2'])
}

grid_rf = {
    'rf__max_depth': Categorical([5, 10, 20]),
    'rf__min_samples_split': Integer(2,10),
    'rf__n_estimators': Integer(50, 150),
}
grid_nb = {
    'nb_var_smoothing': Real(1e-10,)
}

models = [
    ("lr", LogisticRegression(max_iter=1_000, class_weight="balanced", solver="saga"), grid_lr),
    ("rf",RandomForestClassifier(class_weight="balanced"), grid_rf),
    ("nb",GaussianNB(), grid_nb)
]

In [25]:
opt_models = models_grid_search(models, X_train, y_train, cv=2, n_iter=10)

100%|██████████| 3/3 [15:14<00:00, 304.92s/it]


In [29]:
evaluate_pipelines(opt_models, metrics, X_train, y_train)

Unnamed: 0,model,f1_score,precision_score,recall_score
0,lr,0.185199,0.108656,0.626623
1,rf,0.947612,0.90176,0.998377
2,nb,0.103928,0.056186,0.691558


In [30]:
evaluate_pipelines(opt_models, metrics, X_eval, y_eval)

Unnamed: 0,model,f1_score,precision_score,recall_score
0,lr,0.105161,0.057082,0.666667
1,rf,0.294118,0.714286,0.185185
2,nb,0.051444,0.026698,0.703704


### Save models

In [31]:
import joblib
for name, model in opt_models:
    joblib.dump(model, base_dir.parent.parent / "models" /(name + "_hiv.pkg"))