# Classifiers for all targets
Let's train classifiers for all targets using different molecular representations and compare the results. The next step will involve optimizing the hyperparameters of each classifier. `mlflow` will be used for experiment management.

In [1]:
# suppress warnings
import warnings
warnings.filterwarnings("ignore")

import polars as pl
from pathlib import Path
import random
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# add parent directory to path
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))

from metrics import ClassificationMetrics
from tools import expand_array_column

In [2]:
# Location of the input dataset (parquet), relative to the notebook's
# working directory. Fail immediately if the file is not where we expect it,
# rather than later when it is read.
path_parquet = Path("../data/2023_09_12_papyrus1k_dataset_more_params_STD_MFP_lessColumns.parquet")
assert path_parquet.exists()

Load precomputed Morgan fingerprints:

In [3]:
# Pickled lookup table of precomputed Morgan fingerprints; it is indexed by
# the `STD_SMILES` value of each compound further down in this notebook.
path_fps = Path("../out/fingerprints_dicts/d_morgan.pkl")
assert path_fps.exists()

# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# were produced by this project.
with path_fps.open("rb") as f:
    d_morgan = pickle.load(f)

Load data and train classifier:

In [4]:
# Read the whole dataset into a polars DataFrame; rows for a single target
# are selected from it later via the `target_id` column.
df = pl.read_parquet(path_parquet)

In [5]:
# Alternative to the fixed target below: draw 5 targets at random
# (reproducibly) and take the first one.
# random.seed(42)
# random_targets = random.sample(df["target_id"].unique().to_list(), 5)
# target_id = random_targets[0]

# Target modelled in this notebook.
target_id = "P41144_WT"

# Keep only the rows of the chosen target, drop the columns that are not
# needed any more, and attach the precomputed Morgan fingerprint of every
# compound by looking up its standardized SMILES in `d_morgan`.
df_target = (
    df.filter(pl.col("target_id") == target_id)
    .drop(["target_id", "STD_SELFIES"])
    .with_columns(
        morgan_fp=pl.col("STD_SMILES").map_elements(lambda smiles: d_morgan[smiles])
    )
)

# Expand the `morgan_fp` array column via the project helper (2048 = array
# length); `train_classifier` below reads the features from columns named
# `f_0001` ... `f_2048`.
df_target = expand_array_column(df_target, "morgan_fp", 2048)

# Baseline classifier with a fixed seed so that runs are reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=42)


def train_classifier(model, df_target, target_id, n_features=2048):
    """
    Fit `model` on the training split of one target and build its metrics.

    Parameters
    ----------
    model: estimator
        Object exposing a scikit-learn style `fit(X, y)` method. It is
        fitted in place on the training rows.
    df_target: pl.DataFrame
        Polars dataframe holding the data of a single target. It must
        contain a `split` column (rows equal to `"train"` are used for
        fitting), a `class_label` column with the labels, and the feature
        columns `f_0001` ... `f_<n_features>` (1-based index, zero-padded
        to at least four digits).
    target_id: str
        Target id, forwarded to `ClassificationMetrics`.
    n_features: int, default 2048
        Number of feature columns to train on.

    Returns
    -------
    ClassificationMetrics
        Metrics object constructed from the fitted model, the complete
        `df_target` (all splits, not only the training rows) and
        `target_id`.

    Raises
    ------
    TypeError
        If `model` has no callable `fit` attribute.
    ValueError
        If a required column is missing from `df_target`, or if
        `df_target` contains no rows with `split == "train"`.
    """
    # Explicit exceptions instead of `assert`: assertions are removed when
    # Python runs with -O, which would silently disable these checks.
    if not callable(getattr(model, "fit", None)):
        raise TypeError("`model` must provide a callable `fit` method")

    feature_cols = [f"f_{idx:04d}" for idx in range(1, n_features + 1)]
    target_col = "class_label"

    # Validate the schema up front so that a wrong dataframe fails with a
    # readable message rather than deep inside polars or the estimator.
    available = set(df_target.columns)
    missing = [
        name
        for name in ("split", target_col, *feature_cols)
        if name not in available
    ]
    if missing:
        shown = ", ".join(missing[:5])
        rest = len(missing) - 5
        suffix = f" (and {rest} more)" if rest > 0 else ""
        raise ValueError(
            f"`df_target` is missing required column(s): {shown}{suffix}"
        )

    df_train = df_target.filter(pl.col("split") == "train")
    if df_train.height == 0:
        raise ValueError(
            f"no rows with split == 'train' for target {target_id!r}"
        )

    X_train, y_train = df_train[feature_cols], df_train[target_col]

    # Train the model
    model.fit(X_train, y_train)

    # The full dataframe is handed over, not just the training rows;
    # presumably ClassificationMetrics evaluates each split itself (the
    # printed summary reports train/val/test MCC) -- confirm in `metrics`.
    return ClassificationMetrics(model, df_target, target_id)

# Fit the classifier on the selected target and print the metrics object
# returned for it.
metrics = train_classifier(model, df_target, target_id)
print(metrics)

Target: P41144_WT
Train MCC: 0.9936787625762681,
Val MCC: 0.5450571046392932,
Test MCC: 0.528680532637681.



Target: P41144_WT<br>
Train MCC: 0.9936787625762681,<br>
Val MCC: 0.5450571046392932,<br>
Test MCC: 0.528680532637681.