In [18]:
from jsonschema import validate
from tinydb import TinyDB, Query
from databases_demo.processors.master_parser import *
from pymongo import MongoClient

In [20]:
# Establish Database
# db = TinyDB('no-sql_db.json')
# db
client = MongoClient()
db = client.list_database_names()

In [3]:
# Get Schema
#Extrct schema from schema file
with open('schema/no-sql_schema.json') as fn:
    schema = json.load(fn)
schema

{'$id': 'https://raw.githubusercontent.com/D3TaLES/databases_demo/main/no-sql/schema/no-sql_schema.json',
 'title': 'no-sql_schema',
 'description': 'Schema for molecule-centric NoSQL demo database',
 'type': 'object',
 'properties': {'_id': {'description': 'IUPAC Name for the molecule for the molecule',
   'type': 'string'},
  'smiles': {'description': 'SMILES representation of a molecule',
   'type': 'string'},
  'synonyms': {'type': 'array',
   'description': 'Synonymous names of the molecule',
   'items': {'type': 'string',
    'description': 'One synonym for the molecule.'}},
  'molecular_formula': {'type': 'string',
   'description': 'Molecular formula for molecule.'},
  'number_of_atoms': {'type': 'integer',
   'description': 'Number of atoms in the molecule.'},
  'molecular_weight': {'type': 'number',
   'description': 'Molecular weight of the molecule in g/mol.'},
  'source': {'description': 'Source of the molecule', 'type': 'string'},
  'date_made': {'description': 'Date the 

In [4]:
# Insert a molecule into the database

mol_smiles = "C1=C(c2ccccc2)CCCC1"
source = 'our_lab'
names = ['biphenyl']
mol_data = GenerateMolInfo(mol_smiles, source=source, names=names).no_sql_data
mol_id = mol_data.get('_id')

# Validate data
validate(instance=mol_data, schema=schema)

# Insert molecule into database
db.insert(mol_data)

{'_id': 'cyclohexen-1-ylbenzene', 'smiles': 'C1=C(c2ccccc2)CCCC1', 'synonyms': ['biphenyl', '1-Phenyl-1-cyclohexene', '771-98-2', '1-Phenylcyclohexene', 'Cyclohexen-1-ylbenzene', "2,3,4,5-tetrahydro-1,1'-biphenyl", '1-Phenylcyclohex-1-ene', 'Benzene, 1-cyclohexen-1-yl-', 'Phenylcyclohexene', 'Cyclohexenylbenzene', 'Benzene, cyclohexenyl-', 'CYCLOHEXENE, 1-PHENYL-', 'trans-1-Phenylcyclohexene', 'UNII-PM437BQ1OF', '31017-40-0', 'cyclohex-1-en-1-ylbenzene', 'PM437BQ1OF', 'MFCD00001542', '2-Phenylcyclohexene', 'cyclohex-1-enylbenzene', 'EINECS 212-242-6', 'NSC 44834', 'NSC 403862', 'BRN 1905772', 'AI3-02304', 'Phenyl cyclohexene', '1-Phenyl-cyclohexene', 'Phenyl-1-cyclohexene', '1-cyclohexenyl-benzene', '1-Cyclohexen-1-ylbenzene', 'Benzene,cyclohexen-1-yl-', '3-cyclohexen-1-yl-Benzene', '1-Cyclohexen-1-ylbenzene #', '(cyclohex-1-en-1-yl)benzene', 'AMBZ0193', '1-Phenyl-1-cyclohexene, 95%', 'DTXSID10870771', 'CHEBI:183287', 'NSC44834', 'ZINC1677082', 'ICCB1_000079', 'ICCB1_000095', 'NSC-4483

2

In [5]:
# Insert Gaussian DFT data
gaussian_data = ProcessDFT('../raw_data/gaus_opt.log', mol_id=mol_id, parsing_class=ParseGausLog).no_sql_data
new_mol_data = {"_id": mol_id, "dft_data": [gaussian_data]}

# Validate data
validate(instance=new_mol_data, schema=schema)

# Insert molecule into database
db.update(new_mol_data)

[1, 2]

In [6]:
Molecule = Query()
# View molecule entry
db.search(Molecule.dft_data.exists())

[{'_id': 'cyclohexen-1-ylbenzene',
  'smiles': 'C1=C(c2ccccc2)CCCC1',
  'synonyms': ['biphenyl',
   '1-Phenyl-1-cyclohexene',
   '771-98-2',
   '1-Phenylcyclohexene',
   'Cyclohexen-1-ylbenzene',
   "2,3,4,5-tetrahydro-1,1'-biphenyl",
   '1-Phenylcyclohex-1-ene',
   'Benzene, 1-cyclohexen-1-yl-',
   'Phenylcyclohexene',
   'Cyclohexenylbenzene',
   'Benzene, cyclohexenyl-',
   'CYCLOHEXENE, 1-PHENYL-',
   'trans-1-Phenylcyclohexene',
   'UNII-PM437BQ1OF',
   '31017-40-0',
   'cyclohex-1-en-1-ylbenzene',
   'PM437BQ1OF',
   'MFCD00001542',
   '2-Phenylcyclohexene',
   'cyclohex-1-enylbenzene',
   'EINECS 212-242-6',
   'NSC 44834',
   'NSC 403862',
   'BRN 1905772',
   'AI3-02304',
   'Phenyl cyclohexene',
   '1-Phenyl-cyclohexene',
   'Phenyl-1-cyclohexene',
   '1-cyclohexenyl-benzene',
   '1-Cyclohexen-1-ylbenzene',
   'Benzene,cyclohexen-1-yl-',
   '3-cyclohexen-1-yl-Benzene',
   '1-Cyclohexen-1-ylbenzene #',
   '(cyclohex-1-en-1-yl)benzene',
   'AMBZ0193',
   '1-Phenyl-1-cyclohexene

In [7]:
# Insert Psi4 DFT data
psi4_data = ProcessDFT('../raw_data/psi4_opt.dat', mol_id=mol_id, parsing_class=ParsePsi4Log).no_sql_data
new_mol_data = {"_id": mol_id, "dft_data": [psi4_data]}

# Validate data
validate(instance=new_mol_data, schema=schema)

# Insert molecule into database
db.update(new_mol_data)

[1, 2]

In [8]:
# View molecule entry
db.search(Molecule.dft_data.exists())

[{'_id': 'cyclohexen-1-ylbenzene',
  'smiles': 'C1=C(c2ccccc2)CCCC1',
  'synonyms': ['biphenyl',
   '1-Phenyl-1-cyclohexene',
   '771-98-2',
   '1-Phenylcyclohexene',
   'Cyclohexen-1-ylbenzene',
   "2,3,4,5-tetrahydro-1,1'-biphenyl",
   '1-Phenylcyclohex-1-ene',
   'Benzene, 1-cyclohexen-1-yl-',
   'Phenylcyclohexene',
   'Cyclohexenylbenzene',
   'Benzene, cyclohexenyl-',
   'CYCLOHEXENE, 1-PHENYL-',
   'trans-1-Phenylcyclohexene',
   'UNII-PM437BQ1OF',
   '31017-40-0',
   'cyclohex-1-en-1-ylbenzene',
   'PM437BQ1OF',
   'MFCD00001542',
   '2-Phenylcyclohexene',
   'cyclohex-1-enylbenzene',
   'EINECS 212-242-6',
   'NSC 44834',
   'NSC 403862',
   'BRN 1905772',
   'AI3-02304',
   'Phenyl cyclohexene',
   '1-Phenyl-cyclohexene',
   'Phenyl-1-cyclohexene',
   '1-cyclohexenyl-benzene',
   '1-Cyclohexen-1-ylbenzene',
   'Benzene,cyclohexen-1-yl-',
   '3-cyclohexen-1-yl-Benzene',
   '1-Cyclohexen-1-ylbenzene #',
   '(cyclohex-1-en-1-yl)benzene',
   'AMBZ0193',
   '1-Phenyl-1-cyclohexene

In [16]:
# Search for molecules with more than 10 atoms
search = db.search(Molecule.number_of_atoms > 10)

# Get the number of molecules with more than 10 atoms by getting the length of the search
len(search)

2