In [1]:
from utils import split_and_preprocess, fit_basic_models, models_grid_search, evaluate_pipelines
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
base_dir = Path("../../data/moltox21")
X_train, y_train, X_test, y_test, X_eval, y_eval = split_and_preprocess(base_dir)

y_train = y_train.dropna()
X_train = X_train.loc[y_train.index]

y_eval = y_eval.dropna()
X_eval = X_eval.loc[y_eval.index]



### Baseline models

In [3]:
from sklearn.metrics import hamming_loss, f1_score, precision_score, recall_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

models = [
    ("RandomForestClassifier", MultiOutputClassifier(LogisticRegression(max_iter=5000))),
    ("LogisticRegression", MultiOutputClassifier(RandomForestClassifier()))
]

metrics = [hamming_loss, f1_score, precision_score, recall_score]

In [4]:

simple_models = fit_basic_models(X_train, y_train, models)

100%|██████████| 2/2 [00:10<00:00,  5.11s/it]


In [15]:
from ogb.graphproppred import Evaluator

evaluator = Evaluator(name = "ogbg-moltox21")
print(evaluator.expected_input_format) 
print(evaluator.expected_output_format)  

all_res = []
for name, model in simple_models:
    y_pred = model.predict(X_eval)
    f1_samples = f1_score(y_pred, y_eval, average="samples")
    input_dict = {
        "y_pred": y_pred,
        "y_true": y_eval.values,
    }
    res = {
        "name": name,
        "auc": evaluator.eval(input_dict)["rocauc"],
        "f1_samples": f1_samples,
    }
    all_res.append(res)

==== Expected input format of Evaluator for ogbg-moltox21
{'y_true': y_true, 'y_pred': y_pred}
- y_true: numpy ndarray or torch tensor of shape (num_graphs, num_tasks)
- y_pred: numpy ndarray or torch tensor of shape (num_graphs, num_tasks)
where y_pred stores score values (for computing AUC score),
num_task is 12, and each row corresponds to one graph.
nan values in y_true are ignored during evaluation.

==== Expected output format of Evaluator for ogbg-moltox21
{'rocauc': rocauc}
- rocauc (float): ROC-AUC score averaged across 12 task(s)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
pd.DataFrame(all_res)

Unnamed: 0,name,auc,f1_samples
0,RandomForestClassifier,0.594965,0.035714
1,LogisticRegression,0.568599,0.017494
