# Classifiers for all targets
Let's train classifiers for all targets using different molecular representations and compare the results. The next step will involve optimizing the hyperparameters of each classifier. `mlflow` will be used for experiment management.<br>

To launch the mlflow server run:
```bash
cd mlflow
mlflow ui --backend-store-uri sqlite:///mlflow.db
```

In [None]:
# Suppress warnings.
# NOTE(review): the "ignore" filter is installed without a category, so it
# silences every warning for the rest of the session, including deprecation
# warnings raised by the libraries imported below.
import warnings
warnings.filterwarnings("ignore")

import polars as pl
from pathlib import Path
import random
import pickle
from sklearn.ensemble import RandomForestClassifier
from tqdm.notebook import tqdm
import seaborn as sns
import pandas as pd

# Add the parent of the current working directory to the import path so that
# the `metrics` and `tools` modules imported below can be resolved
# (presumably they live one level above this notebook; confirm).
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))

from metrics import ClassificationMetrics
from tools import expand_array_column, train_classifier

In [None]:
import mlflow

# Use the SQLite tracking store located in the sibling `mlflow` directory;
# this is the same database file the `mlflow ui` command is started against.
mlflow.set_tracking_uri(uri="sqlite:///../mlflow/mlflow.db")

# Every run started from this notebook is recorded under this experiment.
mlflow.set_experiment(experiment_name="mol-reprs-benchmark")

In [None]:
# Parquet file holding the dataset used for training and evaluation.
path_parquet = Path("../data/2023_09_12_papyrus1k_dataset_more_params_STD_MFP_lessColumns.parquet")

# Fail early, and say which path was looked up, instead of a bare AssertionError.
assert path_parquet.exists(), f"Dataset not found: {path_parquet.resolve()}"

Define paths to precomputed fingerprints:

In [None]:
# Directory containing one pickle file per precomputed fingerprint type.
path_fps = Path("../out/fingerprints_dicts")
assert path_fps.exists(), f"Fingerprint directory not found: {path_fps.resolve()}"

# `glob` yields entries in an arbitrary, filesystem-dependent order; sorting
# makes the order in which fingerprints are processed reproducible.
paths_fingerprints = sorted(path_fps.glob("*.pkl"))
def fps_name(fps_path):
    """Derive a fingerprint label from the path of a fingerprint file.

    Takes the file name, keeps the part before the first ``.`` and drops the
    first two characters of that part (presumably a two-character ordering
    prefix such as ``1_``; confirm against the files on disk).

    Args:
        fps_path: Path-like object exposing a ``name`` attribute.

    Returns:
        The label as a string; empty if the part before the first ``.`` has
        two characters or fewer.
    """
    base, _, _ = fps_path.name.partition(".")
    return base[2:]
# One label per fingerprint file, in the same order as `paths_fingerprints`.
fps_names = list(map(fps_name, paths_fingerprints))

Load data and train classifier:

In [None]:
# Read the whole dataset into a polars DataFrame.
df = pl.read_parquet(source=path_parquet)

In [None]:
# The list of targets does not depend on the fingerprint, so compute it once.
target_ids = df["target_id"].unique().to_list()

# The loop variable is deliberately NOT called `fps_name`: that would rebind
# the `fps_name` helper function defined earlier in the notebook to a string
# and break any later call to it. `total` is passed explicitly because `zip`
# has no length, so tqdm could not otherwise show overall progress.
for path_fingerprint, fp_name in tqdm(
    zip(paths_fingerprints, fps_names), total=len(paths_fingerprints)
):
    # Load the fingerprint lookup to test (indexed below by STD_SMILES value).
    # NOTE(security): unpickling can execute arbitrary code; only load
    # fingerprint files that were generated locally / come from a trusted source.
    with open(path_fingerprint, "rb") as f:
        d_fps = pickle.load(f)

    # Number of features, taken from the first stored fingerprint (as before,
    # this assumes every fingerprint in the file has the same length).
    # It is constant for the whole file, so it is computed once per fingerprint
    # type rather than once per target.
    n_features = len(next(iter(d_fps.values())))

    for target_id in tqdm(target_ids):

        # One MLflow run per (fingerprint, target) combination.
        with mlflow.start_run():

            mlflow.set_tag("model_type", "random_forest")
            mlflow.set_tag("dataset", "Papyrus1K")
            mlflow.set_tag("fingerprint", fp_name)

            mlflow.log_param("target_id", target_id)

            # Restrict the data to the current target and drop the columns
            # that are not needed any more.
            df_target = df.filter(pl.col("target_id") == target_id)
            df_target = df_target.drop(["target_id", "STD_SELFIES"])

            # Attach the precomputed fingerprint of every molecule.
            df_target = df_target.with_columns(
                fp = pl.col("STD_SMILES").map_elements(lambda x: d_fps[x])
            )

            # Expand the "fp" column using the known number of features.
            df_target = expand_array_column(df_target, "fp", n_features)

            # Define the model with hyperparameters
            n_estimators = 1000
            random_state = 42
            model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

            mlflow.log_param("n_estimators", n_estimators)
            mlflow.log_param("random_state", random_state)

            # Train the model and log the MCC metrics returned by the helper.
            metrics = train_classifier(model, df_target, target_id)
            mlflow.log_metric("mcc_train", metrics.mcc_train)
            mlflow.log_metric("mcc_val", metrics.mcc_val)
            mlflow.log_metric("mcc_test", metrics.mcc_test)

Extract metrics from `mlflow` and visualize them. This avoids having to create an additional data structure to store the metrics.

In [None]:
# Fetch the runs of the benchmark experiment, best test MCC first.
# The experiment is selected by name rather than by a hard-coded numeric id:
# ids are assigned by the tracking store in creation order, so `[1]` only
# matches "mol-reprs-benchmark" when it happened to be the first experiment
# created in this database.
# To inspect every experiment instead, use:
#   mlflow.search_runs(search_all_experiments=True)
all_runs = mlflow.search_runs(
    experiment_names=["mol-reprs-benchmark"],
    order_by=["metrics.mcc_test DESC"],
)

In [None]:
# Show which columns are available in the runs table.
all_runs.columns

In [None]:
# Preview the first rows of the runs table.
all_runs.head()

In [None]:
# Build df_metrics from the runs table: keep only the columns needed for the
# comparison and give them short names. Keys are the columns of `all_runs`,
# values are the names used from here on.
metric_columns = {
    "params.target_id": "target_id",
    "metrics.mcc_test": "mcc_test",
    "metrics.mcc_train": "mcc_train",
    "metrics.mcc_val": "mcc_val",
    "tags.fingerprint": "fps_name",
}
df_metrics = all_runs[list(metric_columns)].copy().rename(columns=metric_columns)
df_metrics

Final comparison of performance of different fingerprints:

In [None]:
# Distribution of test MCC, one box per fingerprint type.
# The trailing semicolon keeps the notebook from echoing the returned object.
sns.boxplot(data=df_metrics, x="fps_name", y="mcc_test");