## Instructions to setup a demo run

1. Prepare an input file similar to csv files given in ./data/demo/
2. Define the path to the input file in below cell
3. Define the parameter for prediction
4. Run the cell to get output file
5. The last five columns of the output csv file saved will have the prediction results

In [1]:
parameter = 'kcat' # allowed values: ["kcat", "Km", "Ki"] 
parameter = parameter.lower()

use_cpu = 1 # set to 0 if you have GPU enabled

input_file_path = './demo/batch_kcat.csv'

## Navigate to below cell and click "Run->Run Selected Cell" to get prediction

The result will be printed on the right column

In [2]:
parameter = 'kcat' # allowed values: ["kcat", "Km", "Ki"] 
parameter = parameter.lower()

use_cpu = 1 # set to 0 if you have GPU enabled

input_file_path = './demo/batch_kcat.csv'

In [3]:
import os
import pandas as pd
import numpy as np
from IPython.display import Image, display
from rdkit import Chem
from IPython.display import display, Latex, Math

def create_csv_sh(parameter, input_file_path):
    df = pd.read_csv(input_file_path)
    smiles_list = df.SMILES
    seq_list = df.sequence
    smiles_list_new = []
    i=0
    for smi in smiles_list:
        try:
            mol = Chem.MolFromSmiles(smi)
            smi = Chem.MolToSmiles(mol)
        except:
            print(f'Invalid SMILES input in input row {i}')
            print('Correct your input! Exiting..')
            return
        if parameter=='kcat':
            if '.' in smi:
              x = smi.split('.')
              y = sorted(x)
              smi = '.'.join(y)
        smiles_list_new.append(smi)
        i+=1
  
    i=0
    valid_aas = list('ACDEFGHIKLMNPQRSTVWY')
    for seq in seq_list:
      for aa in seq:
        if not aa in valid_aas:
          print(f'Invalid Enzyme sequence input in row {i}!')
          print('Correct your input! Exiting..')
          return
      i+=1

    input_file_new_path = f'{input_file_path[:-4]}_input.csv'
    df['SMILES'] = smiles_list_new
    df.to_csv(input_file_new_path)
    
    f = open(f'predict.sh', 'w')
    f.write(f'''
    TEST_FILE_PREFIX={input_file_new_path[:-4]}
    RECORDS_FILE=${{TEST_FILE_PREFIX}}.json
    CHECKPOINT_DIR=../data/pretrained/production/{parameter}/
    
    python ./scripts/create_pdbrecords.py --data_file ${{TEST_FILE_PREFIX}}.csv --out_file ${{RECORDS_FILE}}
    gzip ${{RECORDS_FILE}}
    python predict.py --test_path ${{TEST_FILE_PREFIX}}.csv --preds_path ${{TEST_FILE_PREFIX}}_output.csv --checkpoint_dir $CHECKPOINT_DIR --uncertainty_method mve --smiles_column SMILES --individual_ensemble_predictions --protein_records_path ${{RECORDS_FILE}}.gz
    ''')
    f.close()
    
    return input_file_new_path[:-4]+'_output.csv'

outfile = create_csv_sh(parameter, input_file_path)

print('Predicting.. This will take a while..\n')

if use_cpu:
    os.system("export PROTEIN_EMBED_USE_CPU=1;./predict.sh")
else:
    os.system("export PROTEIN_EMBED_USE_CPU=0;./predict.sh") #>/dev/null 2>&1

def get_predictions(parameter, outfile):
    df = pd.read_csv(outfile)
    pred_col = []
    pred_logcol = []
    pred_sd_totcol = []
    pred_sd_aleacol = []
    pred_sd_epicol = []

    unit = 'mM'
    if parameter == 'kcat':
        target_col = 'log10kcat_max'
        unit = 's^(-1)'
    elif parameter == 'km':
        target_col = 'log10km_mean'
    else:
        target_col = 'log10ki_mean'

    unc_col = f'{target_col}_mve_uncal_var'

    for _, row in df.iterrows():
        model_cols = [col for col in row.index if col.startswith(target_col) and 'model_' in col]

        tot_var = float(row[unc_col])
        prediction = float(row[target_col])
        prediction_linear = np.power(10, prediction)

        model_outs = np.array([row[col] for col in model_cols], dtype=float)
        epi_var = float(np.var(model_outs))
        alea_var = max(0.0, tot_var - epi_var)  # can go slightly negative due to numeric issues

        pred_col.append(prediction_linear)
        pred_logcol.append(prediction)
        pred_sd_totcol.append(np.sqrt(max(0.0, tot_var)))
        pred_sd_aleacol.append(np.sqrt(alea_var))
        pred_sd_epicol.append(np.sqrt(max(0.0, epi_var)))

    df[f'Prediction_({unit})'] = pred_col
    df['Prediction_log10'] = pred_logcol
    df['SD_total'] = pred_sd_totcol
    df['SD_aleatoric'] = pred_sd_aleacol
    df['SD_epistemic'] = pred_sd_epicol

    return df

output_final = get_predictions(parameter, outfile)
output_final.to_csv(f'{outfile}')
print('Output saved to', outfile)

Predicting.. This will take a while..



  @autocast(enabled = False)
  @autocast(enabled = False)
Traceback (most recent call last):
  File "/home/omerkfir/Kinetics_repo/CatPred/predict.py", line 35, in <module>
    results = main()
  File "/home/omerkfir/Kinetics_repo/CatPred/predict.py", line 30, in main
    results = catpred_predict()
  File "/home/omerkfir/Kinetics_repo/CatPred/catpred/train/make_predictions.py", line 515, in catpred_predict
    make_predictions(args=PredictArgs().parse_args())
  File "/home/omerkfir/miniconda3/envs/catpred/lib/python3.9/site-packages/tap/tap.py", line 481, in parse_args
    self.process_args()
  File "/home/omerkfir/Kinetics_repo/CatPred/catpred/args.py", line 990, in process_args
    super(PredictArgs, self).process_args()
  File "/home/omerkfir/Kinetics_repo/CatPred/catpred/args.py", line 214, in process_args
    self.checkpoint_paths = get_checkpoint_paths(
  File "/home/omerkfir/Kinetics_repo/CatPred/catpred/args.py", line 59, in get_checkpoint_paths
    raise ValueError(f'Failed to

calculating protein embed only on cpu
Output saved to ./demo/batch_kcat_input_output.csv


  loaded_model = torch.load(trained_model_fp, map_location=device)


  embeds = torch.load('./demo/structures_embeds.pt')
