In [16]:
import pandas as pd
import numpy as np
import yaml
import os
import sys
# Hard-coded project root (not the process's current working directory);
# every project-relative path below is built from it.
working_dir = '/arc/project/st-ashapi01-1/git/afraz96/RADD/workflows/part2_version2'
# Make the project's top-level modules importable. The directory is appended,
# so anything already on sys.path with the same module name takes precedence.
sys.path.append(working_dir)
import train
# NOTE(review): CPython ships a standard-library package named `test`; because
# working_dir is appended rather than prepended, this import may resolve to
# that package instead of a project module -- confirm.
import test
# Append the src directory to the Python path
src_dir = os.path.join(working_dir, 'src')
sys.path.append(src_dir)

# Path to the config file
config_path = os.path.join(working_dir, 'config', 'config.yaml')
# Must come after the sys.path additions above if `utils` is a project module
# (presumably it lives in src_dir -- TODO confirm).
import utils

# Parse the YAML configuration into `config`; later cells read
# config['model_names'] from it.
# NOTE(review): yaml.FullLoader can construct more than plain data types;
# consider yaml.safe_load if the config only holds scalars, lists and mappings.
with open(config_path) as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
from sklearn.decomposition import PCA
import logging
import pickle
from sklearn import preprocessing
import math
import time
# Import the ML Models
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb
import xgboost
from tensorflow.keras.models import Sequential, model_from_json
import tensorflow as tf

# Load in the models and check the compounds they predict on

In [6]:
# Read the three input tables from the project's Data folder in one pass.
# Order matters: each CSV is bound to the name at the same position on the left.
high_res_data, x500r_data, bccdc_data = (
    pd.read_csv(os.path.join(working_dir, relative_csv_path))
    for relative_csv_path in (
        'Data/NEW_HIGH_RES_2023_SMILES.csv',
        'Data/NEW_X500R_SMILES.csv',
        'Data/training_data_bccdc_20240708.csv',
    )
)

# Load the ML models trained and the preprocessors

In [11]:
# Root folder that holds the trained artefacts; each dataset has its own
# sub-folder, which is joined onto this root when the models are loaded.
ml_models_dir = '/scratch/st-ashapi01-1/RADD/SMILES_ML_PIPELINE'
# Sub-folder names for the BCCDC and X500R model sets, respectively.
bccdc_output_dir, x500r_output_dir = 'NPS_OUTPUT', 'X500R_output'

In [12]:
def read_ml_models(directory):
    """Load the trained models and the fitted preprocessor stored under ``directory``.

    The ``models`` sub-folder of ``directory`` is walked recursively:

    * ``*.json`` files are rebuilt into Keras models with ``model_from_json``,
    * ``*.pkl`` files are unpickled and used as-is,
    * the last ``*.h5`` file encountered is treated as the neural-network
      weights and loaded into every ``tf.keras.Model`` that was collected.

    Parameters
    ----------
    directory : str
        Folder containing a ``models`` sub-folder and a ``processor.pkl`` file.

    Returns
    -------
    tuple
        ``(all_models, processor)`` where ``all_models`` is the list of loaded
        models in the order the files were visited and ``processor`` is the
        object unpickled from ``processor.pkl``.

    Raises
    ------
    FileNotFoundError
        If the ``models`` sub-folder or ``processor.pkl`` is missing, or if a
        JSON model architecture was found without any ``.h5`` weights file.
    """
    models_dir = os.path.join(directory, 'models')
    if not os.path.isdir(models_dir):
        # Fail with a clear message instead of an unbound-variable error later on.
        raise FileNotFoundError(f"Model directory not found: {models_dir}")

    all_models = []
    architecture_count = 0
    neural_net_weights = None

    # NOTE(review): os.walk yields files in filesystem order, which is not
    # guaranteed to be stable. Callers that pair this list positionally with a
    # list of model names should confirm the two orders really agree.
    for root, _dirs, files in os.walk(models_dir):
        for file in files:
            path = os.path.join(root, file)
            if file.endswith('.json'):
                with open(path, 'r') as json_file:
                    all_models.append(model_from_json(json_file.read()))
                architecture_count += 1
            elif file.endswith('.pkl'):
                # NOTE: pickle.load can execute arbitrary code; only point this
                # function at artefacts produced by a trusted training run.
                with open(path, 'rb') as f:
                    all_models.append(pickle.load(f))
            elif file.endswith('.h5'):
                neural_net_weights = path

    # Attach weights once, after the whole tree has been scanned, so the result
    # does not depend on whether the .h5 file is visited before the .json file.
    if neural_net_weights is not None:
        for model in all_models:
            if isinstance(model, tf.keras.Model):
                model.load_weights(neural_net_weights)
    elif architecture_count:
        raise FileNotFoundError(
            f"Found {architecture_count} JSON model architecture(s) under "
            f"{models_dir} but no .h5 weights file"
        )

    with open(os.path.join(directory, 'processor.pkl'), 'rb') as f:
        processor = pickle.load(f)
    return all_models, processor
        

In [13]:
# Load the model list and fitted preprocessor for each dataset; the first pair
# comes from the BCCDC folder and the second from the X500R folder.
(bccdc_models, bccdc_processor), (x500r_models, x500r_processor) = (
    read_ml_models(os.path.join(ml_models_dir, dataset_output_dir))
    for dataset_output_dir in (bccdc_output_dir, x500r_output_dir)
)

In [14]:
## Column-name configuration
# Column holding the SMILES strings that are vectorised before prediction.
model_X = 'SMILES'
# Per-dataset column names, grouped by dataset as (index column, y column);
# presumably the compound identifier and the retention-time target -- confirm
# against the training code.
bccdc_model_index, bccdc_model_y = 'Name', 'PTC Confirmed RT'
x500r_model_index, x500r_model_y = 'Compound', 'Retention Time (min)'

In [26]:
def create_model_predictions(df, model_df, all_models, processor, database_name=''):
    """Add one prediction column per configured model to ``df``.

    The SMILES strings in ``df[model_X]`` are encoded with a
    ``train.SMILESVectorizer`` fitted on ``model_df[model_X]``, flattened into
    a feature table, normalised with ``processor`` and passed to each model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to score; must contain the ``model_X`` column. It is modified in
        place (prediction columns are added or overwritten) and also returned.
    model_df : pandas.DataFrame
        Frame whose ``model_X`` column is used to fit the vectorizer.
    all_models : list
        Models exposing ``predict``; paired positionally with
        ``config['model_names']``.
    processor : object
        Fitted preprocessor exposing ``transform``.
    database_name : str, optional
        Prefix for the new column names, e.g. ``'bccdc_'``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with a ``<database_name><model name>_prediction`` column for
        every entry of ``config['model_names']``.

    Raises
    ------
    ValueError
        If fewer models are supplied than there are configured model names.
    """
    model_names = config['model_names']
    if len(all_models) < len(model_names):
        # Check up front so the frame is never left with only some of the columns.
        raise ValueError(
            f"Expected at least {len(model_names)} models "
            f"(one per configured model name) but received {len(all_models)}"
        )

    # Instantiate the encoder and fit it on the reference frame.
    # NOTE(review): the vectorizer is re-fitted here instead of being loaded
    # from the training run; this assumes fitting on model_df reproduces the
    # encoding the models were trained with -- TODO confirm.
    vectorizer = train.SMILESVectorizer()
    vectorizer.fit(model_df[model_X].to_list())
    df_vectorized, _ = vectorizer.transform(df[model_X].to_list())

    # Flatten and create feature names
    df_flattened = train.flatten_and_create_feature_names(df_vectorized)

    # Combine with additional features (none are supplied here)
    combined_df = train.combine_with_additional_features(df_flattened, None)

    # Normalise the features with the fitted preprocessor
    norm_X = processor.transform(combined_df)

    # One prediction column per configured model, paired by position
    for name, model in zip(model_names, all_models):
        df[database_name + name + '_prediction'] = model.predict(norm_X)
    return df

In [21]:
# Drop rows that have no SMILES string. Rebinding the name (rather than using
# inplace=True) avoids mutating the loaded frame in place, and the column name
# comes from model_X so it stays consistent with the prediction code.
bccdc_data = bccdc_data.dropna(subset=[model_X])

In [22]:
# create_model_predictions takes (df, model_df, all_models, processor, ...);
# the previous three-argument calls no longer match that signature and raise
# TypeError on a fresh kernel. Each frame is scored against itself, so it is
# passed both as the frame to score and as the frame the vectorizer is fitted on.
bccdc_data = create_model_predictions(bccdc_data, bccdc_data, bccdc_models, bccdc_processor)
x500r_data = create_model_predictions(x500r_data, x500r_data, x500r_models, x500r_processor)

max_smiles_length: [27, 38, 45, 43, 41, 58, 43, 43, 43, 51, 46, 21, 57, 36, 50, 42, 17, 33, 31, 37, 40, 41, 42, 36, 33, 67, 33, 51, 36, 53, 30, 42, 36, 38, 47, 44, 39, 54, 45, 40, 41, 42, 37, 41, 47, 32, 44, 44, 53, 37, 33, 32, 41, 33, 53, 33, 51, 110, 36, 34, 44, 44, 52, 47, 47, 52, 52, 50, 91, 21, 59, 64, 21, 42, 41, 34, 45, 35, 62, 503, 38, 26, 20, 29, 67, 47, 43, 37, 25, 32, 25, 45, 17, 44, 18, 35, 29, 29, 35, 30, 24, 47, 36, 51, 41, 47, 49, 19, 52, 59, 42, 29, 41, 30, 29, 43, 33, 66, 34, 40, 34, 43, 45, 44, 44, 39, 46, 31, 32, 21, 83, 41, 18, 29, 44, 36, 40, 46, 32, 21, 44, 42, 55, 40, 64, 46, 44, 41, 44, 32, 48, 48, 50, 37, 37, 38, 125, 65, 35, 60, 49, 27, 46, 55, 47, 52, 39]
Charset Size: 39
Char to Int Mapping: {'l': 1, 'y': 2, '@': 3, '3': 4, 'H': 5, 'S': 6, 'F': 7, 'C': 8, 'r': 9, 's': 10, '/': 11, 'c': 12, 'o': 13, '+': 14, 'I': 15, '5': 16, '(': 17, '2': 18, ')': 19, '9': 20, 'i': 21, '#': 22, '6': 23, '8': 24, '4': 25, '=': 26, '1': 27, 'O': 28, '[': 29, 'D': 30, 'B': 31, 

In [27]:
# Score the high-resolution compounds with the BCCDC model set; the new
# columns are prefixed with 'bccdc_'.
high_res_data = create_model_predictions(
    high_res_data,
    bccdc_data,
    bccdc_models,
    bccdc_processor,
    database_name='bccdc_',
)
# Score the same compounds with the X500R model set; the new columns are
# prefixed with 'X500R_'.
high_res_data = create_model_predictions(
    high_res_data,
    x500r_data,
    x500r_models,
    x500r_processor,
    database_name='X500R_',
)

max_smiles_length: [27, 38, 45, 43, 41, 58, 43, 43, 43, 51, 46, 21, 57, 36, 50, 42, 17, 33, 31, 37, 40, 41, 42, 36, 33, 67, 33, 51, 36, 53, 30, 42, 36, 38, 47, 44, 39, 54, 45, 40, 41, 42, 37, 41, 47, 32, 44, 44, 53, 37, 33, 32, 41, 33, 53, 33, 51, 110, 36, 34, 44, 44, 52, 47, 47, 52, 52, 50, 91, 21, 59, 64, 21, 42, 41, 34, 45, 35, 62, 503, 38, 26, 20, 29, 67, 47, 43, 37, 25, 32, 25, 45, 17, 44, 18, 35, 29, 29, 35, 30, 24, 47, 36, 51, 41, 47, 49, 19, 52, 59, 42, 29, 41, 30, 29, 43, 33, 66, 34, 40, 34, 43, 45, 44, 44, 39, 46, 31, 32, 21, 83, 41, 18, 29, 44, 36, 40, 46, 32, 21, 44, 42, 55, 40, 64, 46, 44, 41, 44, 32, 48, 48, 50, 37, 37, 38, 125, 65, 35, 60, 49, 27, 46, 55, 47, 52, 39]
Charset Size: 39
Char to Int Mapping: {'l': 1, 'y': 2, '@': 3, '3': 4, 'H': 5, 'S': 6, 'F': 7, 'C': 8, 'r': 9, 's': 10, '/': 11, 'c': 12, 'o': 13, '+': 14, 'I': 15, '5': 16, '(': 17, '2': 18, ')': 19, '9': 20, 'i': 21, '#': 22, '6': 23, '8': 24, '4': 25, '=': 26, '1': 27, 'O': 28, '[': 29, 'D': 30, 'B': 31, 

In [28]:
# Preview the first five rows to check that both sets of prediction columns exist.
high_res_data.head(n=5)

Unnamed: 0,Compound,PMF,Pmass,F1mass,F2mass,F3mass,F4mass,F5mass,F6mass,F1_n,...,bccdc_LGBM_prediction,bccdc_Neural Network_prediction,bccdc_Random Forest_prediction,bccdc_XGBoost_prediction,X500R_CatBoost_prediction,X500R_Lasso_prediction,X500R_LGBM_prediction,X500R_Neural Network_prediction,X500R_Random Forest_prediction,X500R_XGBoost_prediction
0,(Iso)butyryl-F-fentanyl N-benzyl analogue,C22H27FN2O,355.218,,,,,,,,...,2.402229,4.199816,4.623239,4.022328,6.415314,6.476212,7.222208,5.690671,6.358788,6.387391
1,"1-(1,3-Diphenylpropan-2-yl)pyrrolidine",C19H23N,266.1903,91.0542,117.0699,72.0808,,,,1.0,...,4.097124,3.732729,4.105974,4.615909,4.632077,5.395864,3.647399,4.480739,4.204144,4.877635
2,1-(1-Phenylcyclohexyl)azepane,C18H27N,258.2216,,,,,,,,...,3.482074,4.242413,4.859005,4.281548,4.971552,4.402223,2.333991,4.599715,4.44498,4.5787
3,1-(1-Phenylcyclohexyl)azetidine,C15H21N,216.1747,,,,,,,,...,2.765469,4.041549,4.868998,4.238262,4.605505,4.859821,3.201466,4.552013,4.46914,4.49073
4,"1-(2,3,4-Trimethoxybenzyl)piperazine",C14H22N2O3,267.1703,,,,,,,,...,3.501935,3.637778,4.467979,4.468185,4.884131,4.434643,3.825713,3.923516,3.914561,4.294013


In [29]:
# Write the combined predictions into the model output root. The path is built
# from ml_models_dir instead of repeating the absolute path, so the two cannot
# drift apart; the resulting file location is unchanged.
high_res_data.to_csv(
    os.path.join(ml_models_dir, 'high_res_prediction_both_models_092524.csv'),
    index=False,
)
