<a href="https://colab.research.google.com/github/ArianeMora/enzyme-tk/blob/main/unimol.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and activate

In [1]:
# Install the notebook's dependencies into the active environment.
# NOTE(review): versions are unpinned, so a fresh run may resolve different releases.
! uv pip install enzymetk huggingface_hub

[2mUsing Python 3.12.12 environment at: /usr[0m
[2K[2mResolved [1m34 packages[0m [2min 797ms[0m[0m
[2K[2mPrepared [1m3 packages[0m [2min 404ms[0m[0m
[2K[2mInstalled [1m3 packages[0m [2min 54ms[0m[0m
 [32m+[39m [1mbiopython[0m[2m==1.86[0m
 [32m+[39m [1menzymetk[0m[2m==0.0.6[0m
 [32m+[39m [1msciutil[0m[2m==1.0.3[0m


In [3]:
from __future__ import annotations
import pandas as pd
from sciutil import SciUtil
import timeit
import logging
import subprocess
import os

# Shared SciUtil helper; Step.run uses it (dp / warn_p / err_p) for console messages.
u = SciUtil()
# Module-level logger; INFO and above are passed on to the configured handlers.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class Pipeline():
    """An ordered chain of steps applied one after another to a DataFrame.

    A pipeline is built with ``step_a >> step_b`` and run either with
    ``pipeline.execute(df)`` or with the shorthand ``df << pipeline``.
    Anything exposing ``execute(df) -> df`` can be used as a step.
    """

    def __init__(self, *steps: "Step"):
        # Keep `steps` flat: a plain Pipeline passed as a step contributes its
        # own steps, so `a >> (b >> c)` and `(a >> b) >> c` are stored alike.
        # Subclasses are kept intact because they may override `execute`.
        self.steps = []
        for step in steps:
            if type(step) is Pipeline:
                self.steps.extend(step.steps)
            else:
                self.steps.append(step)

    def __rshift__(self, other: "Step") -> "Pipeline":
        """Return a new pipeline with ``other`` appended; ``self`` is not modified."""
        return Pipeline(*self.steps, other)

    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Run every step in order, feeding each step's output into the next.

        :param df: input DataFrame handed to the first step.
        :return: the DataFrame returned by the last step (``df`` itself when
            the pipeline has no steps).
        """
        for step in self.steps:
            df = step.execute(df)
        return df

    def __rlshift__(self, other: pd.DataFrame) -> pd.DataFrame:
        """Support ``df << pipeline`` as shorthand for ``pipeline.execute(df)``."""
        return self.execute(other)


class Step():
    """Base class for one unit of work in a pipeline.

    Subclasses override ``execute`` and, when they need external tools, set
    ``self.env_name`` and use ``install_venv`` / ``run`` to create an
    environment and launch commands in it.
    """

    def __init__(self):
        # Should only have one of these; `run` gives `venv` priority over `conda`.
        self.venv = None
        self.conda = None
        self.exec = "/bin/bash"

    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply the step to ``df``; the base implementation returns it unchanged."""
        return df

    def install_venv(self, env_args=None):
        """Create a virtual environment named ``self.env_name`` with ``uv`` and make sure it has pip.

        :param env_args: optional list of extra arguments appended to ``uv venv <env_name>``.
        :raises AttributeError: if the instance has no usable ``env_name``.
        :raises subprocess.CalledProcessError: if creating the environment or
            bootstrapping pip fails.
        """
        env_name = getattr(self, 'env_name', None)
        if not env_name:
            raise AttributeError(
                f"{type(self).__name__} must set 'env_name' before calling install_venv()"
            )
        # Clear both so `run` launches the commands below directly instead of
        # prefixing them with an interpreter that does not exist yet.
        self.conda = None
        self.venv = None
        cmd = ['uv', 'venv', env_name]
        if env_args:
            cmd.extend(env_args)
        self.run(cmd)

        python_bin = f'{env_name}/bin/python'
        # Ensure pip is installed and up to date. pip has to be invoked as a
        # module (`-m pip`); without `-m` python looks for a script named "pip".
        try:
            self.run([python_bin, '-m', 'pip', 'install', '--upgrade', 'pip'])
        except (subprocess.CalledProcessError, OSError):
            # Need to have this for jupyter envs: the fresh environment has no
            # pip, so bootstrap it. `-O` overwrites an earlier download rather
            # than saving the new file as get-pip.py.1 and running the stale one.
            self.run(['wget', '-O', 'get-pip.py', 'https://bootstrap.pypa.io/get-pip.py'])
            self.run([python_bin, 'get-pip.py'])

    def install_conda(self):
        """Placeholder: conda installation is not implemented."""
        return

    def run(self, cmd: list):
        """Run a command, capture its output and report how long it took.

        When ``self.venv`` is set it is prepended to the command; otherwise,
        when ``self.conda`` is set, the command is wrapped in
        ``conda run -n <env>``.

        :param cmd: the command and its arguments as a list.
        :return: the ``subprocess.CompletedProcess`` of the finished command.
        :raises subprocess.CalledProcessError: if the command exits non-zero;
            its captured output is logged before the exception propagates.
        """
        start = timeit.default_timer()
        # Prioritize running in a venv if we have it
        if self.venv:
            cmd = [self.venv, *cmd]
            u.warn_p(['Running in venv:', self.venv])
        elif self.conda:
            cmd = ['conda', 'run', '-n', self.conda, *cmd]
        u.dp(['Running command', ' '.join([str(c) for c in cmd])])

        try:
            result = subprocess.run(cmd, capture_output=True,
                                    text=True,
                                    check=True)
        except subprocess.CalledProcessError as failure:
            # The output is captured, so without this the reason for the
            # failure would never reach the console or the log.
            if failure.stdout:
                logger.info(failure.stdout)
            if failure.stderr:
                u.err_p(['Error:', failure.stderr])
                logger.error(failure.stderr)
            raise

        u.warn_p(['Output:'])
        print(result.stdout)
        # NOTE(review): this branch is reached only on a zero exit status, so
        # stderr here may just be progress output reported at error level.
        if result.stderr:
            u.err_p(['Error:', result.stderr])
            logger.error(result.stderr)
        logger.info(result.stdout)
        u.dp(['Time for command to run (min): ', (timeit.default_timer() - start)/60])
        return result

    def __rshift__(self, other: "Step") -> "Pipeline":
        """Chain two steps: ``a >> b`` returns a Pipeline that runs ``a`` then ``b``."""
        return Pipeline(self, other)

    def __rlshift__(self, other: pd.DataFrame) -> pd.DataFrame:
        """
        Support ``df << step`` as shorthand for ``step.execute(df)``.
        """
        return self.execute(other)


In [4]:
import pandas as pd
from tempfile import TemporaryDirectory
import logging
import numpy as np
from multiprocessing.dummy import Pool as ThreadPool

# Re-binds the module logger (same name as in the cell above) so this cell can run on its own.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class UniMol(Step):
    """Pipeline step that embeds SMILES strings with a Uni-Mol model.

    ``execute`` adds a ``unimol_repr`` column holding, for each row, the
    ``cls_repr`` entry returned by ``UniMolRepr.get_repr`` (``None`` when the
    embedding of that SMILES failed).
    """

    def __init__(self, smiles_col: str, unimol_model = 'unimolv2',
                 unimol_size = '164m', num_threads = 1,
                 env_name = 'enzymetk', venv_name = None):
        """
        :param smiles_col: name of the DataFrame column containing SMILES strings.
        :param unimol_model: model name passed to ``UniMolRepr``.
        :param unimol_size: model size passed to ``UniMolRepr``.
        :param num_threads: number of worker threads used by ``execute``.
        :param env_name: name of the environment created by ``install``.
        :param venv_name: interpreter path to use for ``run``; defaults to
            ``<env_name>/bin/python``.
        """
        # The base initialiser resets `venv` and `conda` to None, so it has to
        # run before they are assigned, not after.
        super().__init__()
        self.smiles_col = smiles_col
        self.num_threads = num_threads
        self.conda = env_name
        self.env_name = env_name
        self.venv = venv_name if venv_name else f'{env_name}/bin/python'
        self.unimol_model = unimol_model
        self.unimol_size = unimol_size

    def install(self, env_args=None):
        """Create the environment and install ``unimol_tools`` into it.

        NOTE(review): ``execute`` imports ``unimol_tools`` in the running
        interpreter - confirm that this environment is the one the kernel uses.

        :param env_args: optional extra arguments for ``uv venv``,
            e.g. ``['--python', '3.11']``.
        """
        self.install_venv(env_args)
        # Now the specific package; install it once, falling back to `pip3`
        # when the environment has no `pip` launcher.
        try:
            self.run([f'{self.env_name}/bin/pip', 'install', 'unimol_tools'])
        except Exception:
            self.run([f'{self.env_name}/bin/pip3', 'install', 'unimol_tools'])
        # Now set the venv to be the location:
        self.venv = f'{self.env_name}/bin/python'

    def __execute(self, df: pd.DataFrame) -> pd.DataFrame:
        """Embed every SMILES in ``df`` and store the results in ``unimol_repr``.

        The column is added to ``df`` in place and ``df`` is returned. A SMILES
        that fails to embed is logged and recorded as ``None`` so that one bad
        molecule does not abort the whole batch.
        """
        smiles_list = list(df[self.smiles_col].values)
        reprs = []
        for smile in smiles_list:
            try:
                unimol_repr = self.clf.get_repr([smile], return_atomic_reprs=True)
                reprs.append(unimol_repr['cls_repr'])
            except Exception as e:
                logger.warning(f"Error embedding smile {smile}: {e}")
                reprs.append(None)
        df['unimol_repr']  = reprs
        return df

    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
        """Load the Uni-Mol model and embed the SMILES column of ``df``.

        :param df: DataFrame containing the ``smiles_col`` column.
        :return: ``df`` with an added ``unimol_repr`` column. When the work is
            done in a single chunk the input frame itself is modified and
            returned; otherwise a new frame assembled from the chunks is returned.
        :raises ImportError: if ``unimol_tools`` cannot be imported.
        """
        try:
            from unimol_tools import UniMolRepr
        except ImportError as e:
            raise ImportError(
                "UniMolRepr requires unimol-tools. "
                "Install after initializing class with install()"
            ) from e
        # single smiles unimol representation
        self.clf = UniMolRepr(
            data_type='molecule',
            remove_hs=False,
            model_name=self.unimol_model or 'unimolv2',  # available: unimolv1, unimolv2
            model_size=self.unimol_size or '164m',  # used when model_name is unimolv2. available: 84m, 164m, 310m, 570m, 1.1B.
        )

        # Never start more workers than there are rows, so no chunk is empty.
        n_chunks = min(self.num_threads, len(df))
        if n_chunks <= 1:
            return self.__execute(df)

        # Split by row position: np.array_split on a DataFrame itself is deprecated.
        index_chunks = np.array_split(np.arange(len(df)), n_chunks)
        # Copies, so each worker adds its column to its own independent frame.
        chunks = [df.iloc[idx].copy() for idx in index_chunks]
        # The context manager shuts the pool down once the map has finished.
        with ThreadPool(n_chunks) as pool:
            embedded_chunks = pool.map(self.__execute, chunks)
        return pd.concat(embedded_chunks)


In [5]:
# Toy input: two entries that share the same substrate SMILES.
num_threads = 1
id_col, substrate_col = 'Entry', 'Substrate'
rows = [
    [entry, 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC']
    for entry in ('P0DP23', 'P0DP24')
]
df = pd.DataFrame(data=rows, columns=[id_col, substrate_col])

In [6]:
# Build the embedding step for the substrate column.
um = UniMol(smiles_col=substrate_col, num_threads=num_threads)

In [None]:
# Creates the environment named by um.env_name and installs unimol_tools into it (runs shell commands).
um.install()

Using CPython 3.12.12 interpreter at: /usr/bin/python3
Creating virtual environment at: enzymetk
Activate with: source enzymetk/bin/activate

INFO:__main__:


[94m--------------------------------------------------------------------------------[0m
[94m                       Running command	uv venv enzymetk	                        [0m
[94m--------------------------------------------------------------------------------[0m
[93m--------------------------------------------------------------------------------[0m
[93m                                    Output:	                                    [0m
[93m--------------------------------------------------------------------------------[0m

[91m--------------------------------------------------------------------------------[0m
Using CPython 3.12.12 interpreter at: /usr/bin/python3
Creating virtual environment at: enzymetk
Activate with: source enzymetk/bin/activate
	[0m
[91m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94m             Time for command to run 

ERROR:__main__:--2026-01-22 10:51:40--  https://bootstrap.pypa.io/get-pip.py
Resolving bootstrap.pypa.io (bootstrap.pypa.io)... 151.101.0.175, 151.101.64.175, 151.101.128.175, ...
Connecting to bootstrap.pypa.io (bootstrap.pypa.io)|151.101.0.175|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2182415 (2.1M) [text/x-python]
Saving to: ‘get-pip.py’

     0K .......... .......... .......... .......... ..........  2% 3.46M 1s
    50K .......... .......... .......... .......... ..........  4% 11.4M 0s
   100K .......... .......... .......... .......... ..........  7% 6.58M 0s
   150K .......... .......... .......... .......... ..........  9% 22.7M 0s
   200K .......... .......... .......... .......... .......... 11% 27.4M 0s
   250K .......... .......... .......... .......... .......... 14% 7.66M 0s
   300K .......... .......... .......... .......... .......... 16% 31.5M 0s
   350K .......... .......... .......... .......... .......... 18% 33.3M 0s
   400K ........

[93m--------------------------------------------------------------------------------[0m
[93m                                    Output:	                                    [0m
[93m--------------------------------------------------------------------------------[0m

[91m--------------------------------------------------------------------------------[0m
[91mError:	--2026-01-22 10:51:40--  https://bootstrap.pypa.io/get-pip.py
Resolving bootstrap.pypa.io (bootstrap.pypa.io)... 151.101.0.175, 151.101.64.175, 151.101.128.175, ...
Connecting to bootstrap.pypa.io (bootstrap.pypa.io)|151.101.0.175|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2182415 (2.1M) [text/x-python]
Saving to: ‘get-pip.py’

     0K .......... .......... .......... .......... ..........  2% 3.46M 1s
    50K .......... .......... .......... .......... ..........  4% 11.4M 0s
   100K .......... .......... .......... .......... ..........  7% 6.58M 0s
   150K .......... .......... ........

INFO:__main__:Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 28.6 MB/s  0:00:00
Installing collected packages: pip
Successfully installed pip-25.3



[93m--------------------------------------------------------------------------------[0m
[93m                                    Output:	                                    [0m
[93m--------------------------------------------------------------------------------[0m
Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 28.6 MB/s  0:00:00
Installing collected packages: pip
Successfully installed pip-25.3

[94m--------------------------------------------------------------------------------[0m
[94m              Time for command to run (min): 	0.09616167153333327	              [0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94m             Running command	enzymetk/bin/pip install unimol_tools	             [0m
[94m-------------

In [None]:
%%capture

num_threads = 1
id_col = 'Entry'
substrate_col = 'Substrate'
rows = [['P0DP23', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC'],
        ['P0DP24', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC']]
df = pd.DataFrame(rows, columns=[id_col, substrate_col])
um = UniMol(substrate_col, num_threads=num_threads)
um.execute(df)

In [None]:
# Display the frame, now expected to carry the `unimol_repr` column.
df