In [1]:
import pandas as pd
import numpy as np
import sqlite3
import io
def adapt_array(arr):
    """Serialize a numpy array into a value SQLite can store as a BLOB.

    The array is written in ``.npy`` format (``np.save``) to an in-memory
    buffer and the buffer's bytes are wrapped in ``sqlite3.Binary``.
    """
    buffer = io.BytesIO()
    np.save(buffer, arr)
    # getvalue() returns the whole buffer regardless of the stream position.
    return sqlite3.Binary(buffer.getvalue())

def convert_array(text):
    """Rebuild a numpy array from the ``.npy`` bytes read back from SQLite."""
    # A freshly created BytesIO already starts at offset 0.
    return np.load(io.BytesIO(text))

# Let sqlite3 store numpy arrays transparently: ndarray parameters are
# serialized with adapt_array when they are bound to a statement ...
sqlite3.register_adapter(np.ndarray, adapt_array)
# ... and values of columns declared with type "array" are decoded with
# convert_array when they are read back.
sqlite3.register_converter("array", convert_array)


# PARSE_DECLTYPES makes sqlite3 pick converters by the declared column type,
# which is what activates the "array" converter registered above.
conn = sqlite3.connect("chemdatabase_copy.db", detect_types=sqlite3.PARSE_DECLTYPES)
cursor = conn.cursor()


In [2]:
from test_database import test_base

In [3]:
test_base(cursor, "toxicity_85832.csv")

True

In [28]:
from mordred import Calculator, descriptors
from rdkit import Chem
import numpy as np
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray
def canonize_smile (sm):
    """Return the canonical, non-isomeric SMILES for ``sm``.

    Parameters
    ----------
    sm : str
        SMILES string to normalise.

    Returns
    -------
    str or None
        Canonical SMILES written with ``isomericSmiles=False``, or ``None``
        when the input cannot be parsed or written back out.
    """
    m = Chem.MolFromSmiles(sm)
    # RDKit reports an unparsable SMILES by returning None instead of raising;
    # handle that explicitly rather than relying on the except clause below.
    if m is None:
        return None
    try:
        return Chem.MolToSmiles(m, canonical=True, isomericSmiles=False)
    except Exception:
        # Keep the "None on failure" contract, but no longer swallow
        # KeyboardInterrupt / SystemExit as the previous bare ``except`` did.
        return None

def _canonize_mixture (mix):
    """Canonicalize every '.'-separated component of a mixture SMILES.

    Each component is passed through ``canonize_smile`` and the results are
    re-joined with '.'.

    Returns
    -------
    str or None
        The joined canonical SMILES, or ``None`` if any component could not
        be canonicalized (``canonize_smile`` returned ``None``). Previously
        that case crashed inside ``str.join`` with a ``TypeError``.
    """
    parts = [canonize_smile(sm) for sm in mix.split('.')]
    if any(part is None for part in parts):
        return None
    return '.'.join(parts)

def randomStringwithDigitsAndSymbols(stringLength=10):
    """Generate a random string of ASCII letters and digits.

    Despite the function name, no punctuation or other special characters
    are produced: the alphabet is ``string.ascii_letters + string.digits``.
    The ``random`` module is used, so the result is not suitable for secrets.

    Parameters
    ----------
    stringLength : int, optional
        Number of characters to generate (default 10).

    Returns
    -------
    str
        A string of ``stringLength`` randomly chosen characters.
    """
    # Imported locally: this notebook never imports `string`, and a later cell
    # (`from random import random`) rebinds the global name `random` to a
    # function, which would break `random.choice`.
    import random as _random
    import string as _string

    alphabet = _string.ascii_letters + _string.digits
    return ''.join(_random.choice(alphabet) for _ in range(stringLength))
def func(smiles):
    """Return one random column vector per entry of ``smiles``.

    Only the length of ``smiles`` is used; its contents are ignored.

    Returns
    -------
    list of numpy.ndarray
        ``len(smiles)`` arrays drawn with ``np.random.randn``, each of shape
        ``(k, 1)`` where ``k`` is a random integer in ``1..1825`` inclusive.
    """
    # Imported locally: the notebook never imports the `random` module, and a
    # later cell rebinds the global name `random` to the function
    # `random.random`, so `random.randint` is not reliably available.
    import random as _random

    # NOTE(review): 1825 is a hard-coded upper bound on the vector length;
    # its origin is not visible here — confirm before relying on it.
    return [np.random.randn(_random.randint(1, 1825), 1) for _ in range(len(smiles))]

# Shared mordred calculator built over the `descriptors` package; reused by
# morded() so the descriptor set is only assembled once.
# NOTE(review): ignore_3D=False asks mordred to include 3D descriptors, but
# morded() passes molecules parsed straight from SMILES — confirm that the
# 3D values are meaningful (missing ones are filled with 0 in morded()).
morded_calculator = Calculator(descriptors, ignore_3D=False)

# Fingerprint / conformer-generation settings. In this notebook they are only
# referenced by the commented-out e3fp helper further down.
fprint_params = {'bits': 4096, 'radius_multiplier': 1.5, 'rdkit_invariants': True}
confgen_params = {'max_energy_diff': 20.0, 'first': 10}

def ecfp( mol, r=3, nBits=4096, errors_as_zeros=True):
    """Compute a Morgan bit fingerprint and return it as a float32 vector.

    Parameters
    ----------
    mol : str or rdkit.Chem.rdchem.Mol
        A SMILES string (parsed with ``Chem.MolFromSmiles``) or an RDKit
        molecule, which is used as-is.
    r : int
        Radius passed to ``GetMorganFingerprintAsBitVect``.
    nBits : int
        Number of bits requested from ``GetMorganFingerprintAsBitVect``.
    errors_as_zeros : bool
        What to return when the fingerprint cannot be computed: an all-zero
        ``(nBits,)`` float32 vector if true, ``np.nan`` otherwise.

    Returns
    -------
    numpy.ndarray or float
        The fingerprint as float32, or the failure value described above.
    """
    # `Chem.Mol` is the class `rdkit.Chem.rdchem.Mol`. The bare name `rdkit`
    # is never imported in this notebook, so the previous check raised
    # NameError before any fingerprint was computed.
    if not isinstance(mol, Chem.Mol):
        mol = Chem.MolFromSmiles(mol)
    try:
        # ConvertToNumpyArray fills (and resizes) the destination in place.
        arr = np.zeros((1,))
        ConvertToNumpyArray(GetMorganFingerprintAsBitVect(mol, r, nBits), arr)
        return arr.astype(np.float32)
    except Exception:
        # Covers e.g. an unparsable SMILES (mol is None). Narrowed from a bare
        # ``except`` so KeyboardInterrupt / SystemExit still propagate.
        if errors_as_zeros:
            return np.zeros((nBits,), dtype=np.float32)
        # `np.NaN` was removed in NumPy 2.0; `np.nan` works on every version.
        return np.nan

def ecfp_for_builder(x):
    """Return ``ecfp(x)`` wrapped in a length-1 leading axis, as float32."""
    fingerprint = ecfp(x)
    wrapped = np.array([fingerprint])
    return wrapped.astype(np.float32)
#def e3fp(smiles): return np.vstack([fp.to_vector(sparse=False).astype(np.float32) for fp in fprints_from_smiles(smiles,smiles,confgen_params,fprint_params)])
def morded(smiles):
    """Compute the mordred descriptor values for one SMILES string.

    The SMILES is parsed with RDKit and run through the shared
    ``morded_calculator``; missing descriptor values are replaced by ``0.``.
    The values are returned as a float32 array with a length-1 leading axis.
    """
    molecule = Chem.MolFromSmiles(smiles)
    filled = morded_calculator(molecule).fill_missing(value=0.)
    row = list(filled.values())
    return np.array([row]).astype(np.float32)

def get_avaliable_descriptors():
    """Map each descriptor name to the function that computes it."""
    registry = dict(morded=morded, ecfp=ecfp)
    return registry

In [3]:
import pandas as pd

In [15]:
df_melt = pd.read_csv("melting_prepared_282517.csv")

In [16]:
df_tox = pd.read_csv("toxicity_85832.csv")

In [17]:
df_melt.head()

Unnamed: 0,SMILES,MeltingPoint_None
0,*C(N)C(=O)NC(*)C(=O)NC(*)C(=O)O,280.0
1,B.B.CC(C)(C)SCCSC(C)(C)C,72.5
2,B.C1CCC(PC2CCCCC2)CC1,79.0
3,B.C1COCCN1,97.0
4,B.CC(C)(C)N,98.5


In [18]:
# Select the melting-point SMILES by column name: the first column of df_melt
# is the saved row index ("Unnamed: 0"), so positional `columns[0]` produced
# row numbers instead of SMILES strings.
smiles_melt = list(df_melt["SMILES"])
# For df_tox the first column is the SMILES column, so positional access works.
smiles_tox = list(df_tox[df_tox.columns[0]])

In [20]:
etanol='CCO'

In [21]:
from fill_base import canonize_smile

In [22]:
etanol = canonize_smile(etanol)

In [23]:
etanol

'CCO'

In [24]:
etanol in smiles_tox

True

In [25]:
smiles_tox.index(etanol)

23899

In [10]:
intersection_elems = list(set(smiles_melt).intersection(smiles_tox))

In [11]:
smile=intersection_elems[0]

In [12]:
molecule_id_source = smiles_melt.index(smile)

In [29]:
molecule_id_source = 23899
smile = smiles_tox[molecule_id_source]
morded_smile = np.array(morded(smile))

In [9]:
def columns(cursor, table):
    """Return the column names of ``table`` as reported by ``select *``.

    Parameters
    ----------
    cursor : sqlite3.Cursor
        Cursor used to run the query. It is re-executed, so any result set
        still pending on it is discarded.
    table : str
        Table name. It is concatenated into the SQL text (identifiers cannot
        be bound as ``?`` parameters), so only pass trusted names.

    Returns
    -------
    list of str
        Column names in the order of ``cursor.description``.
    """
    # Was `tabler`, an undefined name, so the function raised NameError
    # instead of using its `table` argument.
    result = cursor.execute("select * from " + table)
    return [column[0] for column in result.description]

In [52]:
def sql_fetch(cursor):
    """Run a query listing the database's table names and return the cursor.

    The rows are read by the caller from the returned cursor (for example
    with ``fetchall()``); each row is a 1-tuple ``(name,)``.
    """
    # SQL string literals take single quotes. Double-quoted "table" is an
    # identifier that SQLite only falls back to treating as a string, and
    # that legacy fallback can be switched off in a given SQLite build.
    cursor.execute("SELECT name from sqlite_master where type = 'table'")
    return cursor
    

In [55]:
table_names = sql_fetch(cursor).fetchall()

In [57]:
names = columns(cursor, "molecules")

In [58]:
print(table_names)
print(names)

[('molecules',), ('sqlite_sequence',), ('tasks',), ('tasks_running',), ('descriptors',), ('descriptors_values',), ('endpoints',), ('experimental_data',)]
['id_molecule', 'inchi_key', 'inchi', 'canonical_smiles']


In [30]:
# Bind the SMILES as a parameter instead of splicing it into the SQL text, so
# quoting characters in the value can neither break nor alter the statement.
cursor = cursor.execute("select * from molecules where canonical_smiles=?", (smile,))

In [32]:
molecule_id = list(cursor)[0][0]

In [33]:
molecule_id

46752

In [34]:
smile

'CCO'

In [69]:
columns(cursor, 'endpoints')

['id_endpoint', 'desc', 'type']

In [5]:
temp =list(cursor.execute("select * from endpoints").fetchall())

In [6]:
temp = [(data[0], data[1]+"_"+data[2]) for data in temp]

In [8]:
names_of_endpoints = [data[1] for data in temp]

In [11]:
names_of_endpoints = ["- "+data for data in names_of_endpoints]

In [13]:
for name in names_of_endpoints:
    print(name)

- o_mus_ipr_LD 
- o_rat_orl_TDLo 
- o_mus_ipr_LDLo 
- o_mus_orl_TDLo 
- o_rat_ipr_TDLo 
- o_mus_ivn_LD 
- o_rat_ipr_LD 
- o_mus_orl_LD 
- o_mus_unr_LD 
- o_rat_unr_LD 
- o_mus_scu_LDLo 
- o_rat_scu_LD 
- o_mus_scu_LD 
- o_rat_ipr_LDLo 
- o_mus_ipr_TDLo 
- o_rbt_skn_LD 
- o_rat_orl_LD 
- o_rat_ivn_TDLo 
- o_rat_orl_LDLo 
- o_rbt_orl_LD 
- o_rbt_ivn_LD 
- o_rat_ivn_LD 
- o_mus_orl_LDLo 
- o_rat_skn_LD 
- o_mam_unr_LD 
- o_gpg_orl_LD 
- o_wmn_orl_TDLo 
- o_man_orl_TDLo 
- o_rat_scu_TDLo 
- MeltingPoint_None


In [40]:
NameOfEndpoint2db_index = {data[1]:data[0] for data in temp}

In [17]:
NameOfEndpoint2db_index

{'o_mus_ipr_LD ': 1,
 'o_rat_orl_TDLo ': 2,
 'o_mus_ipr_LDLo ': 3,
 'o_mus_orl_TDLo ': 4,
 'o_rat_ipr_TDLo ': 5,
 'o_mus_ivn_LD ': 6,
 'o_rat_ipr_LD ': 7,
 'o_mus_orl_LD ': 8,
 'o_mus_unr_LD ': 9,
 'o_rat_unr_LD ': 10,
 'o_mus_scu_LDLo ': 11,
 'o_rat_scu_LD ': 12,
 'o_mus_scu_LD ': 13,
 'o_rat_ipr_LDLo ': 14,
 'o_mus_ipr_TDLo ': 15,
 'o_rbt_skn_LD ': 16,
 'o_rat_orl_LD ': 17,
 'o_rat_ivn_TDLo ': 18,
 'o_rat_orl_LDLo ': 19,
 'o_rbt_orl_LD ': 20,
 'o_rbt_ivn_LD ': 21,
 'o_rat_ivn_LD ': 22,
 'o_mus_orl_LDLo ': 23,
 'o_rat_skn_LD ': 24,
 'o_mam_unr_LD ': 25,
 'o_gpg_orl_LD ': 26,
 'o_wmn_orl_TDLo ': 27,
 'o_man_orl_TDLo ': 28,
 'o_rat_scu_TDLo ': 29,
 'MeltingPoint_None': 30}

In [76]:
# Pass the id as a bound parameter rather than building the SQL by string
# concatenation; int() keeps the binding to a type sqlite3 accepts.
endpoints_data = cursor.execute(
    "select * from experimental_data where id_molecule=?", (int(molecule_id),)
).fetchall()

In [80]:
endpoints_data

[(46752, 1, 1.94),
 (46752, 2, 0.973),
 (46752, 3, 1.06),
 (46752, 4, 0.964),
 (46752, 5, 1.49),
 (46752, 6, 1.37),
 (46752, 7, 4.11),
 (46752, 8, 1.13),
 (46752, 13, 0.745),
 (46752, 15, 1.42),
 (46752, 17, 0.815),
 (46752, 18, 1.73),
 (46752, 19, 0.818),
 (46752, 20, 0.8640000000000001),
 (46752, 21, 1.29),
 (46752, 22, 1.51),
 (46752, 26, 0.918),
 (46752, 27, 1.58),
 (46752, 28, 1.88),
 (46752, 29, 0.7659999999999999),
 (46752, 30, -115.5783125)]

In [77]:
molecule_id

46752

In [21]:
np.array(endpoints_data)

array([[3.49477e+05, 3.00000e+01, 2.80000e+02]])

In [None]:
(molecule_id, )

In [81]:
endpoints_from_table = list(df_tox.loc[molecule_id_source].dropna())

In [82]:
endpoints_from_table[1:]

[1.94,
 0.973,
 1.06,
 0.964,
 1.49,
 1.37,
 4.11,
 1.13,
 0.745,
 1.42,
 0.815,
 1.73,
 0.818,
 0.8640000000000001,
 1.29,
 1.51,
 0.918,
 1.58,
 1.88,
 0.7659999999999999]

In [116]:
df_tox.loc[molecule_id_source].dropna()

SMILES               CCO
o_mus_ipr_LD        1.94
o_rat_orl_TDLo     0.973
o_mus_ipr_LDLo      1.06
o_mus_orl_TDLo     0.964
o_rat_ipr_TDLo      1.49
o_mus_ivn_LD        1.37
o_rat_ipr_LD        4.11
o_mus_orl_LD        1.13
o_mus_scu_LD       0.745
o_mus_ipr_TDLo      1.42
o_rat_orl_LD       0.815
o_rat_ivn_TDLo      1.73
o_rat_orl_LDLo     0.818
o_rbt_orl_LD       0.864
o_rbt_ivn_LD        1.29
o_rat_ivn_LD        1.51
o_gpg_orl_LD       0.918
o_wmn_orl_TDLo      1.58
o_man_orl_TDLo      1.88
o_rat_scu_TDLo     0.766
Name: 23899, dtype: object

In [117]:
from random import random

In [119]:
np.random.randint(0, len(smiles_tox), size =10)

array([27519,  1988, 10224, 34402, 52247, 53343, 81115,   405, 10674,
       54702])

In [122]:
all([True, True, True])

True

In [83]:
endpoints_indexes = list(df_tox.loc[molecule_id_source].dropna().index)

In [84]:
endpoints_indexes[1:]

['o_mus_ipr_LD ',
 'o_rat_orl_TDLo ',
 'o_mus_ipr_LDLo ',
 'o_mus_orl_TDLo ',
 'o_rat_ipr_TDLo ',
 'o_mus_ivn_LD ',
 'o_rat_ipr_LD ',
 'o_mus_orl_LD ',
 'o_mus_scu_LD ',
 'o_mus_ipr_TDLo ',
 'o_rat_orl_LD ',
 'o_rat_ivn_TDLo ',
 'o_rat_orl_LDLo ',
 'o_rbt_orl_LD ',
 'o_rbt_ivn_LD ',
 'o_rat_ivn_LD ',
 'o_gpg_orl_LD ',
 'o_wmn_orl_TDLo ',
 'o_man_orl_TDLo ',
 'o_rat_scu_TDLo ']

In [None]:
endpoints_data

In [86]:
# Compare the rows read from the database with the rows rebuilt from the CSV;
# element 0 of both helper lists is the SMILES entry, so it is skipped.
endpoints_data == [
    (molecule_id, NameOfEndpoint2db_index[endpoint_name], endpoint_value)
    for endpoint_name, endpoint_value in zip(endpoints_indexes[1:], endpoints_from_table[1:])
]

False

In [88]:
# Rebuild (id_molecule, id_endpoint, value) rows from the CSV record;
# element 0 of both helper lists is the SMILES entry, so it is skipped.
temp_data = [
    (molecule_id, NameOfEndpoint2db_index[endpoint_name], endpoint_value)
    for endpoint_name, endpoint_value in zip(endpoints_indexes[1:], endpoints_from_table[1:])
]

In [89]:
all(i in endpoints_data for i in temp_data)

True

In [None]:
# Bound parameter instead of SQL assembled by string concatenation.
cursor.execute(
    "select * from experimental_data where id_molecule=?", (int(molecule_id),)
).fetchall()

In [116]:
smiles_tox.index(smile) 

3013

In [130]:
val = test_base(cursor, "toxicity_85832.csv")

NameError: name 'endpoints_data' is not defined

In [4]:
df_tox.loc[smiles_tox.index(smile)]

NameError: name 'df_tox' is not defined

In [121]:
%%timeit
cursor.execute("select * from descriptors_values where id_molecule=?", (int(molecule_id),)).fetchall()

1.74 s ± 208 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [123]:
# Bound parameter instead of SQL assembled by string concatenation.
temp = cursor.execute(
    "select * from descriptors_values where id_molecule=?", (int(molecule_id),)
).fetchall()

In [131]:
(temp[0][4] == morded(smile)).all()

True

In [86]:
# NOTE(review): this statement fails with "no such column: id_molecule" — the
# `endpoints` table only has the columns id_endpoint, desc and type.
# Per-molecule values are read from `experimental_data` elsewhere in this
# notebook; that is presumably the table intended here — confirm.
cursor = cursor.execute("select * from endpoints where id_molecule="+str(molecule_id))

OperationalError: no such column: id_molecule

In [90]:
molecule_id_source

23899

In [105]:
molecule_id

46752

In [106]:
# Bound parameter instead of SQL assembled by string concatenation.
temp = list(
    cursor.execute("select * from descriptors_values where id_molecule=?", (int(molecule_id),))
)

In [110]:
temp = temp[0]

In [114]:
(np.nan_to_num(np.array(morded(smile))) == np.nan_to_num(np.array(temp[4]))).all()

True

In [1]:
import json

In [10]:
mol = ['CCO']

In [11]:
v= json.dumps(mol)

In [12]:
v

'["CCO"]'

In [7]:
json.loads(v)

['CCO']

In [19]:
temp

[(1,
  1,
  1,
  1,
  array([[13.572928 , 11.191481 ,  0.       , ..., 96.       ,  5.4444447,
           4.138889 ]], dtype=float32))]

In [16]:
temp1= list(cursor.execute("select * from molecules"))

In [17]:
temp1

[(1,
  'InChI=1S/C14H24N2O2/c17-13(15-9-10-15)7-5-3-1-2-4-6-8-14(18)16-11-12-16/h1-12H2',
  'InChI=1S/C14H24N2O2/c17-13(15-9-10-15)7-5-3-1-2-4-6-8-14(18)16-11-12-16/h1-12H2',
  'O=C(CCCCCCCCC(=O)N1CC1)N1CC1'),
 (2,
  'InChI=1S/C6H11NO/c1-4-5(2)8-6(3)7-4/h5-6H,1-3H3',
  'InChI=1S/C6H11NO/c1-4-5(2)8-6(3)7-4/h5-6H,1-3H3',
  'CC1=NC(C)OC1C'),
 (3,
  'InChI=1S/C17H23NO3.ClH/c19-16(17(7-8-17)15-5-2-1-3-6-15)21-12-4-9-18-10-13-20-14-11-18;/h1-3,5-6H,4,7-14H2;1H',
  'InChI=1S/C17H23NO3.ClH/c19-16(17(7-8-17)15-5-2-1-3-6-15)21-12-4-9-18-10-13-20-14-11-18;/h1-3,5-6H,4,7-14H2;1H',
  '[Cl-].O=C(OCCC[NH+]1CCOCC1)C1(c2ccccc2)CC1'),
 (4,
  'InChI=1S/C14H20N2O2/c1-3-16-13(17)11-8-6-4-5-7-10(11)12(9(2)15)14(16)18/h3-8,15H2,1-2H3',
  'InChI=1S/C14H20N2O2/c1-3-16-13(17)11-8-6-4-5-7-10(11)12(9(2)15)14(16)18/h3-8,15H2,1-2H3',
  'CCN1C(=O)C2=C(CCCCC2)C(=C(C)N)C1=O'),
 (5,
  'InChI=1S/C11H10ClN3O/c1-6-13-8-4-7(12)5-9-11(8)15(6)3-2-10(16)14-9/h4-5H,2-3H2,1H3,(H,14,16)',
  'InChI=1S/C11H10ClN3O/c1-6-13-8-4-7(12

In [15]:
len(temp)

0

In [6]:
import pandas as pd

In [9]:
len(pd.read_csv("melting_prepared_282517.csv"))+len(pd.read_csv("toxicity_85832.csv"))

368349

In [None]:
len