# **Data Preprocessing** (Extracting fingerprints)


## Packaging
*  Numpy
*  Pandas
*   RDKit
*   Deepchem








In [None]:
import numpy as np
import pandas as pd
import rdkit

In [None]:
from rdkit import rdBase, Chem
from rdkit.Chem import AllChem, Draw
from rdkit.Chem import Descriptors
from rdkit.Chem import Crippen

In [None]:
from rdkit.Chem import rdchem
from rdkit.Chem.rdchem import Mol

In [None]:
# Show the installed RDKit release so results can be tied to a version.
print(f'rdkit version: {rdBase.rdkitVersion}')

rdkit version: 2023.03.3


In [None]:
!pip install --pre deepchem

In [None]:
import deepchem as dc
dc.__version__

## Preparation of Functions

### Quantitative structure property relationships (QSPR)

In [None]:
def compute_QSPR(molecule):
  """
  Build the five-element QSPR descriptor vector of one molecule.

  input:
  molecule: RDKit Mol object

  return:
  QSPR (np.ndarray, shape (5,)) = [
    molecular weight            (Descriptors.MolWt),
    logP, hydrophobicity        (Crippen.MolLogP),
    number of rotatable bonds   (Descriptors.NumRotatableBonds),
    value of Descriptors.TPSA   (this column is labelled "dipole"
                                 elsewhere in the notebook),
    value of Descriptors.FractionCSP3 (labelled "CSP3")
    ]
  """
  # Descriptor functions, listed in output order.
  descriptor_functions = (
    Descriptors.MolWt,
    Crippen.MolLogP,
    Descriptors.NumRotatableBonds,
    Descriptors.TPSA,
    Descriptors.FractionCSP3,
  )

  return np.array([describe(molecule) for describe in descriptor_functions])

In [None]:
# Quick check of compute_QSPR on a single example molecule.
test_molecule = Chem.MolFromSmiles('O=C(C1CC=CCC1c1nc2c(s1)cccc2)NCCCOCc1ccccc1')
test_QSPR = compute_QSPR(test_molecule)

# Descriptor vector first, then its shape, each on its own line.
print(test_QSPR, test_QSPR.shape, sep='\n')

[4.06551000e+02 5.06920000e+00 8.00000000e+00 5.12200000e+01
 3.33333333e-01]
(5,)


### Functional group (CF)

In [None]:

def count_groups(molecule):
  """
  Count selected functional groups (substructures) in the molecule.

  input:
  molecule: Mol object

  return:
  num_group (np.ndarray, shape (7,)) = [
    num_cyano:    matches of "C#N"
    num_amido:    matches of "*C(=O)N"
    num_carboxyl: matches of "*C(=O)-[OH1]"
    num_ester:    matches of "[#6;X3](=O)([#6])([#8])" minus num_carboxyl
    num_keton:    matches of "C(=O)" minus num_amido, num_ester and
                  num_carboxyl
    num_phospho:  matches of "P(=O)"
    num_halogen:  total matches of "F", "Br", "Cl" and "I"
  ]
  """
  def n_matches(smarts):
    # Number of substructure matches of one SMARTS pattern.
    return len(molecule.GetSubstructMatches(Chem.MolFromSmarts(smarts)))

  num_cyano = n_matches("C#N")
  num_amido = n_matches("*C(=O)N")
  num_carboxyl = n_matches("*C(=O)-[OH1]")

  # The broader patterns overlap with the more specific ones, so the
  # specific counts are subtracted to keep the categories disjoint.
  num_ester = n_matches("[#6;X3](=O)([#6])([#8])") - num_carboxyl
  num_keton = n_matches("C(=O)") - num_amido - num_ester - num_carboxyl

  num_phospho = n_matches("P(=O)")
  num_halogen = sum(n_matches(symbol) for symbol in ("F", "Br", "Cl", "I"))

  return np.array([
    num_cyano,
    num_amido,
    num_carboxyl,
    num_ester,
    num_keton,
    num_phospho,
    num_halogen,
  ])

In [None]:
# Smoke test for count_groups: the SMILES contains one P(=O)(O)O group,
# so only the phospho slot (index 5) is expected to be non-zero.
test_molecule = Chem.MolFromSmiles('O[C@@H]1O[C@H](COP(=O)(O)O)[C@@H]([C@@H]([C@H]1O)O)O')
test_num_group = count_groups(test_molecule)

print(test_num_group)

[0 0 0 0 0 1 0]


In [None]:
def count_amino(molecule):
  """
  Count nitrogen environments in the molecule.

  input:
  molecule: Mol object

  return:
  tuple (num_NH0, num_NH1, num_NH2, num_n_inring)
    num_NH0: non-ring N with three bonds to non-hydrogen atoms (NR3)
    num_NH1: non-ring N with two bonds to non-hydrogen atoms (H-NR2)
    num_NH2: non-ring N with one bond to a non-hydrogen atom (H2-NR)
    num_n_inring: ring memberships of N atoms (an N shared by several
                  rings is counted once per ring)

  Only non-ring nitrogens whose hybridization is not SP are classified.
  The hydrogen count is inferred as 3 minus the number of bonds to
  non-hydrogen atoms; bond order is ignored, so a doubly bonded
  neighbour counts as one bond.
  """
  # Nitrogens that are ring members, visited ring by ring.
  num_n_inring = 0
  for ring_atoms in molecule.GetRingInfo().AtomRings():
    for atom_idx in ring_atoms:
      if molecule.GetAtomWithIdx(atom_idx).GetAtomicNum() == 7:
        num_n_inring += 1

  num_NH0 = 0
  num_NH1 = 0
  num_NH2 = 0

  for atom in molecule.GetAtoms():
    if atom.GetAtomicNum() != 7:
      continue
    if str(atom.GetHybridization()) == 'SP' or atom.IsInRing():
      continue

    # Look at the atom on the far side of each bond.  GetEndAtom() would
    # return the nitrogen itself whenever it is stored as the bond's end
    # atom, hiding an explicit hydrogen on the begin side.
    num_heavy_bonds = sum(
      1 for bond in atom.GetBonds()
      if bond.GetOtherAtom(atom).GetAtomicNum() != 1
    )
    num_NH = 3 - num_heavy_bonds

    if num_NH == 0:
      num_NH0 += 1
    elif num_NH == 1:
      num_NH1 += 1
    elif num_NH == 2:
      num_NH2 += 1

  return num_NH0, num_NH1, num_NH2, num_n_inring



In [None]:
def count_oxy(molecule):
  """
  Count oxygen environments in the molecule.

  input:
  molecule: Mol object

  return:
  num_oxy (np.ndarray, shape (3,)) = [
    num_OH0: non-ring O with two single bonds to non-hydrogen atoms (R-O-R)
    num_OH1: non-ring O with one single bond to a non-hydrogen atom (R-OH)
    num_o_inring: ring memberships of O atoms (an O shared by several
                  rings is counted once per ring)
  ]

  An oxygen whose only bond is a double bond (e.g. C=O) is not counted
  in num_OH0 or num_OH1.
  """
  # Oxygens that are ring members, visited ring by ring.
  num_o_inring = 0
  for ring_atoms in molecule.GetRingInfo().AtomRings():
    for atom_idx in ring_atoms:
      if molecule.GetAtomWithIdx(atom_idx).GetAtomicNum() == 8:
        num_o_inring += 1

  num_OH0 = 0
  num_OH1 = 0

  for atom in molecule.GetAtoms():
    if atom.GetAtomicNum() != 8 or atom.IsInRing():
      continue

    # Start from two free valences and remove one per heavy-atom neighbour.
    num_OH = 2
    for bond in atom.GetBonds():
      bond_type = str(bond.GetBondType())

      if bond_type == 'DOUBLE':
        # Doubly bonded oxygen (keto-like): reset so it is not counted.
        num_OH = 2

      # Inspect the neighbour via GetOtherAtom(); GetEndAtom() may be the
      # oxygen itself, which would hide an explicit hydrogen.
      elif bond_type == 'SINGLE' and bond.GetOtherAtom(atom).GetAtomicNum() != 1:
        num_OH -= 1

    if num_OH == 0:
      num_OH0 += 1
    elif num_OH == 1:
      num_OH1 += 1

  num_oxy = np.array([num_OH0, num_OH1, num_o_inring])

  return num_oxy


In [None]:
# Smoke test for count_oxy; the recorded result is [1 0 1]:
# one non-ring R-O-R oxygen, no R-OH, one oxygen in a ring.
test_molecule = Chem.MolFromSmiles('O=C(N1CCC(CC1)OCC1CCCO1)NCCc1nn(c2c1cccc2)C')
test_oxy = count_oxy(test_molecule)

print(test_oxy)

[1 0 1]


In [None]:
def count_sulfo(molecule):
  """
  Count sulfur environments in the molecule.

  input:
  molecule: Mol object

  return:
  num_sulfo (np.ndarray, shape (5,)) = [
    num_SH0: non-ring S with two single bonds to non-hydrogen atoms (R-S-R)
    num_SH1: non-ring S with one single bond to a non-hydrogen atom (R-SH)
    num_SO2: S with two S=O double bonds and no S-O single bond
    num_SO3: S with two S=O double bonds and exactly one S-O single bond
    num_s_inring: ring memberships of S atoms (an S shared by several
                  rings is counted once per ring)
  ]

  Every sulfur lands in at most one of num_SH0 / num_SH1 / num_SO2 /
  num_SO3: an atom counted as SO2 or SO3 is not also counted as a
  thioether or thiol.
  """
  # Sulfurs that are ring members, visited ring by ring.
  num_s_inring = 0
  for ring_atoms in molecule.GetRingInfo().AtomRings():
    for atom_idx in ring_atoms:
      if molecule.GetAtomWithIdx(atom_idx).GetAtomicNum() == 16:
        num_s_inring += 1

  num_SH0 = 0
  num_SH1 = 0
  num_SO2 = 0
  num_SO3 = 0

  for atom in molecule.GetAtoms():
    if atom.GetAtomicNum() != 16:
      continue

    num_S2O = 0           # S=O double bonds on this atom
    num_SO = 0            # S-O single bonds on this atom
    num_heavy_single = 0  # single bonds to any non-hydrogen atom

    for bond in atom.GetBonds():
      bond_type = str(bond.GetBondType())
      # GetOtherAtom() gives the neighbour whichever way the bond is
      # stored; GetEndAtom() returns the sulfur itself for bonds written
      # as O=S, which made such sulfonyl oxygens invisible.
      neighbour = bond.GetOtherAtom(atom).GetAtomicNum()

      if bond_type == 'DOUBLE' and neighbour == 8:
        num_S2O += 1
      if bond_type == 'SINGLE':
        if neighbour == 8:
          num_SO += 1
        if neighbour != 1:
          num_heavy_single += 1

    # Sulfonyl / sulfonate: classify and stop, so the atom is not counted
    # again as R-S-R.  This replaces subtracting the running SO2/SO3
    # totals from num_SH0 once per sulfur atom, which made the result
    # order-dependent and could drive it negative.
    if num_S2O == 2 and num_SO == 0:
      num_SO2 += 1
      continue
    if num_S2O == 2 and num_SO == 1:
      num_SO3 += 1
      continue

    # Thiol / thioether are only counted outside rings.
    if atom.IsInRing():
      continue

    num_SH = 2 - num_heavy_single
    if num_SH == 0:
      num_SH0 += 1
    elif num_SH == 1:
      num_SH1 += 1

  num_sulfo = np.array([num_SH0, num_SH1, num_SO2, num_SO3, num_s_inring])

  return num_sulfo


In [None]:
# Smoke test for count_sulfo; the recorded result is [1 0 0 0 1]:
# one non-ring R-S-R sulfur and one sulfur in a ring.
test_molecule = Chem.MolFromSmiles('Cn1cnnc1SCc1coc(n1)c1cccs1')
test_sulfo = count_sulfo(test_molecule)

print(test_sulfo)



[1 0 0 0 1]


In [None]:
def count_rings(molecule):
  """
  Count aromatic and non-aromatic rings in the molecule.

  input:
  molecule: Mol object

  return:
  num_rings (np.ndarray, shape (2,)) = [
    num_arm_rings: rings in which every bond is flagged aromatic
    num_nonarm_rings: all remaining rings
  ]
  """
  # One tuple of bond indices per ring.
  bond_rings = molecule.GetRingInfo().BondRings()

  # A ring is aromatic only if each of its bonds is aromatic.
  aromatic_flags = [
    all(molecule.GetBondWithIdx(bond_idx).GetIsAromatic() for bond_idx in ring_bonds)
    for ring_bonds in bond_rings
  ]

  num_arm_rings = sum(aromatic_flags)
  num_nonarm_rings = len(bond_rings) - num_arm_rings

  return np.array([num_arm_rings, num_nonarm_rings])

In [None]:
# Smoke test for count_rings; the recorded result is [3 1]
# (three aromatic rings, one non-aromatic ring).
test_molecule = Chem.MolFromSmiles('O=C(C1CC=CCC1c1nc2c(s1)cccc2)NCCCOCc1ccccc1')
test_num_rings = count_rings(test_molecule)

print(test_num_rings)

[3 1]


In [None]:
def count_bonds(molecule):
  """
  Count the double and triple bonds in the molecule.

  input:
  molecule: Mol object

  return:
  num_bond (list of two ints) = [
    num_double_bond: bonds whose GetBondTypeAsDouble() equals 2.0
    num_triple_bond: bonds whose GetBondTypeAsDouble() equals 3.0
  ]

  Bonds with any other order are ignored.  A molecule without bonds
  returns [0, 0].
  """
  num_double_bond = 0
  num_triple_bond = 0

  for bond in molecule.GetBonds():
    bond_order = bond.GetBondTypeAsDouble()
    if bond_order == 2.0:
      num_double_bond += 1
    elif bond_order == 3.0:
      num_triple_bond += 1

  # Built after the loop so the result exists even when there are no bonds.
  num_bond = [num_double_bond, num_triple_bond]

  return num_bond

In [None]:
# Smoke test for count_bonds; the recorded result is [2, 0]
# (two double bonds, no triple bond).
test_molecule = Chem.MolFromSmiles('O=C(C1CC=CCC1c1nc2c(s1)cccc2)NCCCOCc1ccccc1')
test_num_bond = count_bonds(test_molecule)

print(test_num_bond)

[2, 0]


In [None]:
def calc_functional_group(smiles):
  '''
  Assemble all functional-group counts of one molecule into a single vector.

  input:
  smiles: SMILES string of the molecule

  output:
  functional_group (np.ndarray, shape (23,)) = [
    num_group(num_cyano, num_amido, num_carboxyl, num_ester, num_keton, num_phospho, num_halogen),
    num_amino(num_NH0, num_NH1, num_NH2, num_n_inring),
    num_oxy(num_OH0, num_OH1, num_o_inring),
    num_sulfo(num_SH0, num_SH1, num_SO2, num_SO3, num_s_inring),
    num_rings(aroma_ring, non_aroma_ring),
    num_bond(num_double_bond, num_triple_bond)
      ]

  raises:
  ValueError: if the SMILES string cannot be parsed into a molecule
  '''
  molecule = Chem.MolFromSmiles(smiles)
  if molecule is None:
    # Fail here with a clear message rather than with an AttributeError
    # inside one of the counting helpers.
    raise ValueError(f'Could not parse SMILES: {smiles!r}')

  # count_groups is evaluated once; the amide count is read from its result.
  num_group = count_groups(molecule)
  num_amido = num_group[1]

  num_NH0, num_NH1, num_NH2, num_n_inring = count_amino(molecule)

  # amido is also counted as num_NH1 in count_amino, so exclude it here.
  # NOTE(review): the subtraction is unconditional, so num_NH1 can become
  # negative when an amide nitrogen was not classified as NH1.
  num_NH1 = num_NH1 - num_amido
  num_amino = np.array([num_NH0, num_NH1, num_NH2, num_n_inring])

  functional_group = np.concatenate([
    num_group,
    num_amino,
    count_oxy(molecule),
    count_sulfo(molecule),
    count_rings(molecule),
    count_bonds(molecule),
  ])

  return functional_group

In [None]:
# Smoke test for calc_functional_group: prints the assembled count vector
# and its shape (expected (23,)).
test_smiles = 'O=C(NC(C(CC(Cc1ccccc1)NC(=O)C(C(C)C)NC(=O)N(Cc1csc(n1)C(C)C)C)O)Cc1ccccc1)OCc1cncs1'

functional_group_test = calc_functional_group(test_smiles)
# k is an array copy of the result, used only for printing.
k = np.array(functional_group_test)
print(k)
print(functional_group_test.shape)

[0 3 0 0 0 0 0 1 0 0 2 1 1 0 0 0 0 0 2 4 0 3 0]
(23,)


### Mol2vec

In [None]:
!pip install mol2vec

In [None]:
%reload_ext autoreload
%autoreload 2
import mol2vec
from mol2vec import features
from mol2vec import helpers
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec

In [None]:
from gensim.models import word2vec

In [None]:
def sentences2vec(sentences, model, unseen=None):

    """Generate one vector per sentence (list of words) in a list of sentences.

    Each sentence vector is the sum of the vectors of its words.

    Parameters
    ----------
    sentences : list, array
        List with sentences; every sentence is itself an iterable of words.
        A single sentence must be wrapped in a list, otherwise its words
        are treated as sentences and iterated character by character.
    model : word2vec.Word2Vec
        Gensim word2vec model
    unseen : None, str
        Keyword for unseen words. If None, those words are skipped.
        https://stats.stackexchange.com/questions/163005/how-to-set-the-dictionary-for-text-analysis-using-neural-networks/163032#163032

    Returns
    -------
    np.array
        One row per sentence. A sentence that contributes no word vector
        (empty, or only unknown words with ``unseen=None``) yields a zero
        row of length ``model.wv.vector_size``.
    """

    word_vectors = model.wv
    # Vocabulary lookup set, built once for all sentences.
    keys = set(word_vectors.key_to_index)
    unseen_vec = word_vectors.get_vector(unseen) if unseen else None

    vec = []
    for sentence in sentences:
        if unseen:
            # Unknown words are replaced by the vector of the unseen keyword.
            word_vecs = [word_vectors.get_vector(y) if y in keys else unseen_vec
                         for y in sentence]
        else:
            # Unknown words are skipped.
            word_vecs = [word_vectors.get_vector(y) for y in sentence if y in keys]

        if word_vecs:
            vec.append(sum(word_vecs))
        else:
            # sum([]) would give the scalar 0 and make the result ragged.
            vec.append(np.zeros(word_vectors.vector_size))
    return np.array(vec)

In [None]:
# Load the pre-trained mol2vec model.
# Downloaded from https://github.com/samoturk/mol2vec/tree/master/examples/models
# NOTE(review): the quoted path looks like a placeholder; point it at the
# downloaded model file. Only load model files from a source you trust.
model = word2vec.Word2Vec.load('''pathway_to_model_300dim.pkl''')

In [None]:
def extract_mol2vec_fingerprint(smiles):
  '''
  Compute the mol2vec fingerprint of one molecule.

  input:
    smiles(str): SMILES string of the molecule

  output:
    mol2vec_fingerprint(array): sum of the word vectors of the molecule's
      substructure identifiers (radius 1); length 300 for the 300-dim
      pre-trained model

  raises:
    ValueError: if the SMILES string cannot be parsed into a molecule
  '''
  # Get Mol object from SMILES
  Mol_object = Chem.MolFromSmiles(smiles)
  if Mol_object is None:
    raise ValueError(f'Could not parse SMILES: {smiles!r}')

  # One "sentence" of substructure identifiers for this molecule.
  sentence = MolSentence(mol2alt_sentence(Mol_object, radius=1))

  # sentences2vec expects a LIST of sentences.  Passing the bare sentence
  # makes it iterate each identifier string character by character, so no
  # character is found in the vocabulary and every one maps to 'UNK'.
  # Wrap the sentence in a list and take the single resulting row.
  mol2vec_fingerprint = sentences2vec([sentence], model, unseen='UNK')[0]

  return mol2vec_fingerprint

### Evaluation Function

In [8]:
from sklearn.metrics import mean_absolute_error

In [9]:
def evaluate_model(y, y_pred):
  '''
  Score predictions with the mean absolute error.

  input:
    y: array_like, actual docking scores
    y_pred: array_like, docking scores predicted by the regression model

  output:
    MAE: mean absolute error between y and y_pred
  '''
  return mean_absolute_error(y, y_pred)

## Download Dataset

In [10]:
# Training table; later cells read its 'SMILES' column and treat every
# column after the first as a prediction target.
# NOTE(review): the quoted path looks like a placeholder.
data = pd.read_csv('''pathway_to_train.csv''')

In [None]:
# Test table; later cells read its 'SMILES' column.
# NOTE(review): the quoted path looks like a placeholder.
test = pd.read_csv('''pathway_to_test.csv''')

## Extraction of Fingerprint

### QSPR fingerprint

In [None]:
# QSPR descriptors: one 5-element row per training SMILES.
# `data` keeps the default 0..n-1 index from read_csv, so positional
# iteration visits the same rows as label lookup by i.
fingerprint_train_QSPR = np.zeros((len(data), 5))
for i, smiles_i in enumerate(data['SMILES']):
  molecule_i = Chem.MolFromSmiles(smiles_i)
  fingerprint_train_QSPR[i] = compute_QSPR(molecule_i)
  # Progress message every 10000 molecules.
  if i % 10000 == 0:
    print(f'{i}done')

In [None]:
df_fingerprint_train_QSPR = pd.DataFrame(fingerprint_train_QSPR)
# Positional columns 0-4 get the descriptor names; "Moleculer Weight" is
# later used as the first feature column when slicing by label.
data_train_QSPR = df_fingerprint_train_QSPR.rename(
  columns=dict(enumerate(["Moleculer Weight", "logP", "rotate", "dipole", "CSP3"]))
)

### Count of functional groups (CF fingerprint)

In [None]:
# Functional-group counts: one 23-element row per training SMILES.
# `data` keeps the default 0..n-1 index from read_csv, so positional
# iteration visits the same rows as label lookup by i.
fingerprint_train_CFG = np.zeros((len(data), 23))
for i, smiles_i in enumerate(data['SMILES']):
  fingerprint_train_CFG[i] = calc_functional_group(smiles_i)
  # Progress message every 10000 molecules.
  if i % 10000 == 0:
    print(f'{i}done')

In [None]:
df_fingerprint_train_CFG = pd.DataFrame(fingerprint_train_CFG)
# Names for the 23 positional columns, in calc_functional_group order.
# NOTE(review): count_groups returns carboxyl at position 2 and ester at
# position 3, so the "Ester"/"Carboxy" labels look swapped. They are left
# unchanged because the train and test frames must share column names.
data_train_CFG = df_fingerprint_train_CFG.rename(columns=dict(enumerate([
  "Cyano", "Amido", "Ester", "Carboxy", "Keto", "Phospho", "Halogen",
  "NR3", "H-NR2", 'H2-NR', 'N in ring',
  'ROR', "ROH", 'O in ring',
  'RSR', 'RSH', 'SO2', 'SO3', 'S in ring',
  'Aromatic ring', 'Non-aromatic ring',
  'Double bond', 'Triple bond',
])))

### ECFP fingerprint

In [None]:
# DeepChem circular-fingerprint featurizer: 1024 features, radius 1.
featurizer_ECFP = dc.feat.CircularFingerprint(size=1024, radius=1)

In [None]:
# ECFP: one 1024-element row per training SMILES.
# `data` keeps the default 0..n-1 index from read_csv, so positional
# iteration visits the same rows as label lookup by i.
fingerprint_train_ECFP = np.zeros((len(data), 1024))
for i, smiles_i in enumerate(data['SMILES']):
  fingerprint_train_ECFP[i] = featurizer_ECFP.featurize(smiles_i)
  # Progress message every 10000 molecules.
  if i % 10000 == 0:
    print(f'{i}done')

data_train_ECFP = pd.DataFrame(fingerprint_train_ECFP)

### Mol2vec fingerprint

In [None]:
#input SMILES -> fingerprint
%reload_ext autoreload
%autoreload 2
import mol2vec
from mol2vec import features
from mol2vec import helpers
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec
# Mol2Vec: one 300-element row per training SMILES.
# `data` keeps the default 0..n-1 index from read_csv, so positional
# iteration visits the same rows as label lookup by i.
fingerprint_train_Mol2Vec = np.zeros((len(data), 300))
for i, smiles_i in enumerate(data['SMILES']):
  fingerprint_train_Mol2Vec[i] = extract_mol2vec_fingerprint(smiles_i)

  # Progress message every 10000 molecules.
  if i % 10000 == 0:
    print(f'{i}done')

data_train_Mol2Vec = pd.DataFrame(fingerprint_train_Mol2Vec)

### Combination

In [14]:
# Training table with QSPR, functional-group and ECFP features appended column-wise.
train_data_QSPR_CF_ECFP = pd.concat([data, data_train_QSPR, data_train_CFG, data_train_ECFP], axis=1)

In [15]:
# Training table with QSPR, functional-group and Mol2Vec features appended column-wise.
train_data_QSPR_CF_Mol2Vec = pd.concat([data, data_train_QSPR, data_train_CFG, data_train_Mol2Vec], axis=1)

### Test set fingerprint

In [None]:
# Test set QSPR: one 5-element descriptor row per test SMILES.
# `test` keeps the default 0..n-1 index from read_csv, so positional
# iteration visits the same rows as label lookup by i.
fingerprint_test_QSPR = np.zeros((len(test), 5))
for i, smiles_i in enumerate(test['SMILES']):
  molecule_i = Chem.MolFromSmiles(smiles_i)
  fingerprint_test_QSPR[i] = compute_QSPR(molecule_i)
  # Progress message every 10000 molecules.
  if i % 10000 == 0:
    print(f'{i}done')

df_fingerprint_test_QSPR = pd.DataFrame(fingerprint_test_QSPR)
# Same column names as the training QSPR frame.
data_test_QSPR = df_fingerprint_test_QSPR.rename(
  columns=dict(enumerate(["Moleculer Weight", "logP", "rotate", "dipole", "CSP3"]))
)

In [None]:
# Test set CF: one 23-element functional-group row per test SMILES.
# `test` keeps the default 0..n-1 index from read_csv, so positional
# iteration visits the same rows as label lookup by i.
fingerprint_test_CFG = np.zeros((len(test), 23))
for i, smiles_i in enumerate(test['SMILES']):
  fingerprint_test_CFG[i] = calc_functional_group(smiles_i)
  # Progress message every 10000 molecules.
  if i % 10000 == 0:
    print(f'{i}done')

df_fingerprint_test_CFG = pd.DataFrame(fingerprint_test_CFG)
# Names for the 23 positional columns, in calc_functional_group order.
# NOTE(review): count_groups returns carboxyl at position 2 and ester at
# position 3, so the "Ester"/"Carboxy" labels look swapped. They are left
# unchanged because the train and test frames must share column names.
data_test_CFG = df_fingerprint_test_CFG.rename(columns=dict(enumerate([
  "Cyano", "Amido", "Ester", "Carboxy", "Keto", "Phospho", "Halogen",
  "NR3", "H-NR2", 'H2-NR', 'N in ring',
  'ROR', "ROH", 'O in ring',
  'RSR', 'RSH', 'SO2', 'SO3', 'S in ring',
  'Aromatic ring', 'Non-aromatic ring',
  'Double bond', 'Triple bond',
])))

In [None]:
# Test set ECFP: one 1024-element row per test SMILES.
featurizer_ECFP = dc.feat.CircularFingerprint(size=1024, radius=1)
# `test` keeps the default 0..n-1 index from read_csv, so positional
# iteration visits the same rows as label lookup by i.
fingerprint_test_ECFP = np.zeros((len(test), 1024))
for i, smiles_i in enumerate(test['SMILES']):
  fingerprint_test_ECFP[i] = featurizer_ECFP.featurize(smiles_i)
  # Progress message every 10000 molecules.
  if i % 10000 == 0:
    print(f'{i}done')

data_test_ECFP = pd.DataFrame(fingerprint_test_ECFP)

In [None]:
# Test set Mol2Vec: one 300-element row per test SMILES.
fingerprint_test_Mol2Vec = np.zeros((test.shape[0], 300))
for i in range(test.shape[0]):
  # Read from the TEST frame; reading from `data` here would put
  # training-set molecules into the test features.
  smiles_i = test.at[i,'SMILES']
  fingerprint_test_Mol2Vec[i] = extract_mol2vec_fingerprint(smiles_i)

  # Progress message every 10000 molecules.
  if i % 10000 == 0:
    print(f'{i}done')
data_test_Mol2Vec =  pd.DataFrame(fingerprint_test_Mol2Vec)

In [None]:
# Test table: SMILES column followed by QSPR, functional-group and ECFP features.
test_QSPR_CF_ECFP = pd.concat([test['SMILES'], data_test_QSPR, data_test_CFG, data_test_ECFP], axis=1)

In [None]:
# Test table: SMILES column followed by QSPR, functional-group and Mol2Vec features.
test_QSPR_CF_Mol2Vec = pd.concat([test['SMILES'], data_test_QSPR, data_test_CFG, data_test_Mol2Vec], axis=1)

# **Training and Validation**

## Packaging
* ### scikit learn
* ### XGBoost Regressor


In [11]:
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

In [None]:
!pip install xgboost
import xgboost
import xgboost as xgb

# Regression

Data Preparation

In [17]:
# Every column of the training table after the first is a prediction target.
protein_target = data.columns[1:]
print(protein_target)

Index(['3CLPro_pocket1', 'ADRP-ADPR_pocket1', 'ADRP-ADPR_pocket5',
       'ADRP_pocket1', 'ADRP_pocket12', 'ADRP_pocket13', 'COV_pocket1',
       'COV_pocket2', 'COV_pocket8', 'COV_pocket10', 'NSP9_pocket2',
       'NSP9_pocket7', 'NSP15_pocket1', 'ORF7A_pocket2',
       'PLPro_chainA_pocket3', 'PLPro_chainA_pocket23', 'PLPro_pocket6',
       'PLPro_pocket50'],
      dtype='object')


In [None]:
# Preview the test feature matrix: all columns from "Moleculer Weight"
# onwards, i.e. everything after the SMILES column.
test_set = test_QSPR_CF_Mol2Vec
test_set_x = test_set.loc[: , 'Moleculer Weight':]
test_set_x.head()

### Training

In [None]:
# Frame that receives one prediction column per target.
# NOTE(review): the quoted path looks like a placeholder.
test_results = pd.read_csv('''pathway_to_test.csv''')

In [None]:
# Prediction using XGB: train one regressor per target, report its training
# MAE, then store its predictions for the test set.
for i in range(len(protein_target)):
  # Target index 16 uses the Mol2Vec feature set; all others use ECFP.
  if i == 16:
    train_set = train_data_QSPR_CF_Mol2Vec
    test_set = test_QSPR_CF_Mol2Vec
  else:
    train_set = train_data_QSPR_CF_ECFP
    test_set = test_QSPR_CF_ECFP

  # Features start at the first descriptor column in both tables.
  x = train_set.loc[: , 'Moleculer Weight':]
  y = train_set.loc[: , protein_target[i]]
  # Take the test features from the table selected above, so they always
  # match the training feature set.
  test_set_x = test_set.loc[: , 'Moleculer Weight':]

  # Named xgb_model so the module-level `model` (the mol2vec Word2Vec model
  # used by extract_mol2vec_fingerprint) is not overwritten.
  xgb_model = xgb.XGBRegressor( n_estimators = 300, learning_rate = 0.2, min_child_weight = 15, max_depth = 8, colsample_bytree = 0.5)
  # Fit BEFORE predicting, so each target is predicted by its own model.
  xgb_model.fit(x, y)

  y_pred_train = xgb_model.predict(x)
  MAE_train = evaluate_model(y, y_pred_train)
  print(f"MAE:{MAE_train}")

  test_set_y = xgb_model.predict(test_set_x)
  test_results.loc[:, protein_target[i]] = test_set_y

In [None]:
# Write the predictions collected in test_results (not `test.results`,
# which is not an attribute of the test frame).
# NOTE(review): the quoted path looks like a placeholder.
test_results.to_csv('''pathway_to_store_results''', index=False)