<a href="https://colab.research.google.com/github/D3TaLES/databases_demo/blob/main/notebooks/no_sql_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and Import Needed Code
(This may take a few minutes.)

In [None]:
%%capture
! apt install mongodb > log  # Install the No-SQL database architecture MongoDB
! service mongodb start  # Start MongoDB
! pip install pymatgen  # Install Pymatgen for Gaussian file parsing
! pip install pubchempy  # Install PubChem python API for molecule information
! pip install rdkit-pypi  # Install RDKit for molecule transformations

In [None]:
! rm -r databases_demo/ # Remove database_demo directory if it already exists
! git clone https://github.com/D3TaLES/databases_demo.git # Get Processing code from GitHub

In [None]:
# Import required packages
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from jsonschema import validate
from pymongo import MongoClient

from databases_demo.file_parser import *

# 1. Initialize the database 

In [None]:
# Connect to MongoDB with the client defaults and select the demo database
client = MongoClient()
db = client.get_database('test_db')

# 2. Load Schema and use it to validate example data

In [None]:
# Load the JSON Schema used to validate molecule documents.
# The encoding is given explicitly so parsing does not depend on the platform locale.
with open('databases_demo/schema/no-sql_schema.json', encoding='utf-8') as schema_file:
    schema = json.load(schema_file)


In [None]:
# Build the base molecule document from a SMILES string.
# NOTE(review): this SMILES has a partially saturated ring (C1=C(...)CCCC1), so it
# looks like cyclohexen-1-ylbenzene rather than biphenyl - confirm the intended structure.
bp_smiles = "C1=C(c2ccccc2)CCCC1"
bp_info = GenerateMolInfo(smiles=bp_smiles, source='our_lab', names=['biphenyl'], sql=False)
mol_data = bp_info.data

# Keep the generated primary key; later cells attach DFT / UV-Vis data under it
bp_id = mol_data.get('_id')

# Check the generated document against the schema
validate(instance=mol_data, schema=schema)

In [None]:
# Display the generated molecule document
mol_data

# 3. Insert validated data into the database

In [None]:
# Insert molecule into database.
# An upsert keyed on _id (the same pattern the DFT / UV-Vis cells use) keeps this
# cell re-runnable: a plain insert_one raises DuplicateKeyError once the document
# already exists. _id is left out of $set because it is already fixed by the filter.
mol_fields = {key: value for key, value in mol_data.items() if key != "_id"}
db["molecules"].update_one({"_id": mol_data["_id"]}, {"$set": mol_fields}, upsert=True)

## Insert different molecules into the database

In [None]:
# Insert Benzene, Nitrobenzene, and Anthracene
extra_mols = {'benzene': "C1=CC=CC=C1", 'nitrobenzene': "C1=CC=C(C=C1)[N+](=O)[O-]", 'anthracene': "C1=CC=C2C=C3C=CC=CC3=CC2=C1"}
extra_mol_ids = {}
for name, smiles in extra_mols.items():
  mol_data = GenerateMolInfo(smiles, source='our_lab', names=[name], sql=False).data
  validate(instance=mol_data, schema=schema)

  # Upsert keyed on _id so re-running this cell does not raise DuplicateKeyError
  # (same update_one pattern the DFT / UV-Vis cells use).
  mol_id = mol_data["_id"]
  mol_fields = {key: value for key, value in mol_data.items() if key != "_id"}
  db["molecules"].update_one({"_id": mol_id}, {"$set": mol_fields}, upsert=True)

  # Record molecule id
  extra_mol_ids[name] = mol_id


## Insert different types of data into the database

In [None]:
# Insert Gaussian DFT data
# Note: These cells use automatic file processors (which use the same techniques as shown in the above manual processors)

# Parse the Gaussian log file into the DFT record for this molecule
dft_processor = ProcessDFT('databases_demo/raw_data/tddft_biphenyl.log', mol_id=bp_id, sql=False)
gaussian_data = dft_processor.data

# Validate the fragment that is about to be written
dft_doc = {"_id": bp_id, "dft_data": gaussian_data}
validate(instance=dft_doc, schema=schema)

# Set the dft_data field on the molecule's document (upsert: created if missing)
db["molecules"].update_one(
    {"_id": bp_id},
    {"$set": {"dft_data": gaussian_data}},
    upsert=True,
)

In [None]:
# Insert UV-Vis data
# Parse the UV-Vis CSV file into the record for this molecule
uvvis_processor = ProcessUvVis('databases_demo/raw_data/uvvis_biphenyl.csv', mol_id=bp_id, sql=False)
uvvis_data = uvvis_processor.data

# Validate the fragment that is about to be written
uvvis_doc = {"_id": bp_id, "uvvis_data": uvvis_data}
validate(instance=uvvis_doc, schema=schema)

# Set the uvvis_data field on the molecule's document (upsert: created if missing)
db["molecules"].update_one(
    {"_id": bp_id},
    {"$set": {"uvvis_data": uvvis_data}},
    upsert=True,
)

In [None]:
# Insert DFT and UV-Vis data for the other molecules

for name, mol_id in extra_mol_ids.items():
  # Raw data files are named after the molecule
  dft_log = 'databases_demo/raw_data/tddft_' + name + '.log'
  uvvis_csv = 'databases_demo/raw_data/uvvis_' + name + '.csv'

  # Generate data
  gaussian_data = ProcessDFT(dft_log, mol_id=mol_id, sql=False).data
  uvvis_data = ProcessUvVis(uvvis_csv, mol_id=mol_id, sql=False).data

  # Validate both fragments together
  new_fields = {"dft_data": gaussian_data, "uvvis_data": uvvis_data}
  validate(instance={"_id": mol_id, "dft_data": gaussian_data, "uvvis_data": uvvis_data}, schema=schema)

  # Write both fields onto the molecule's document in one update
  db["molecules"].update_one({"_id": mol_id}, {"$set": new_fields}, upsert=True)

# 4. Query the database

## Basic Queries

In [None]:
# Fetch every document in the molecules collection (empty filter matches all)
all_molecules = list(db["molecules"].find({}))

# Show the query results as a Pandas DataFrame
pd.DataFrame(all_molecules)

In [None]:
# Count the documents in the molecules collection (empty filter counts all)
molecules = db["molecules"]
molecules.count_documents({})

In [None]:
# Get molecules with more than 10 atoms
atom_filter = {"number_of_atoms": {"$gt": 10}}
large_molecules = list(db["molecules"].find(atom_filter))

# Show the query results as a Pandas DataFrame
pd.DataFrame(large_molecules)

In [None]:
# Get molecules with more than 10 atoms, returning only the molecule IDs
atom_filter = {"number_of_atoms": {"$gt": 10}}
id_only = {"_id": 1}
large_molecule_ids = list(db["molecules"].find(atom_filter, id_only))

# Show the query results as a Pandas DataFrame
pd.DataFrame(large_molecule_ids)

In [None]:
# Get the SMILES strings of all molecules whose molecular weight is greater than 100
weight_filter = {"molecular_weight": {"$gt": 100}}
smiles_only = {"smiles": 1}
heavy_molecules = list(db["molecules"].find(weight_filter, smiles_only))

# Show the query results as a Pandas DataFrame
pd.DataFrame(heavy_molecules)

In [None]:
# Search for all singlet excitation values in the database
excitation_only = {"dft_data.first_excitation": 1}
excitations = list(db["molecules"].find({}, excitation_only))
pd.DataFrame(excitations)

## Plotting

In [None]:
# Get the absorption spectrum data for the first molecule inserted above.
# The lookup uses the stored primary key (bp_id) rather than a hard-coded name, so it
# always matches the _id under which that molecule's UV-Vis data was written.
spectrum_doc = db["molecules"].find_one({"_id": bp_id}, {"uvvis_data.absorbance_data": 1})
if spectrum_doc is None:
    raise LookupError("No molecule with _id {!r} found in the database".format(bp_id))

# Convert data to a Pandas DataFrame for plotting
df = pd.DataFrame(spectrum_doc['uvvis_data']['absorbance_data'])

# Plot data with labelled axes
ax = df.plot(x='wavelength', y='absorbance')
ax.set_xlabel('Wavelength (nm)')
ax.set_ylabel('Absorption');

## Comparing computationally-estimated singlet excitation and experimentally-measured optical gap

In [None]:
# Gather only the two quantities being compared, for every molecule
gap_projection = {"dft_data.first_excitation": 1, "uvvis_data.optical_gap": 1}
query = db["molecules"].find({}, gap_projection)

# One scatter point per molecule: optical gap on x, first excitation on y
fig, ax = plt.subplots(figsize=(4,3))
for mol in query:
  optical_gap = mol["uvvis_data"]['optical_gap']
  first_excitation = mol["dft_data"]['first_excitation']
  ax.scatter(optical_gap, first_excitation, label=mol['_id'])

# Add plot labels, then save the figure
ax.legend()
ax.set_xlabel('Optical Gap (eV)')
ax.set_ylabel('Singlet Excitation (eV)')
fig.tight_layout()
fig.savefig('plot1.png', dpi=300)

## Plotting spectrum for only molecules where the singlet excitation is greater than 4 eV 

In [None]:
# List the singlet excitation value stored for every molecule
excitation_only = {"dft_data.first_excitation": 1}
excitations = list(db["molecules"].find({}, excitation_only))
pd.DataFrame(excitations)

In [None]:
# Get the molecules with a singlet excitation greater than 4 eV.
# Only the absorbance data is projected, since it is the only field plotted
# (_id is returned by default).
query = db["molecules"].find({"dft_data.first_excitation": {"$gt": 4}},
                             {"uvvis_data.absorbance_data": 1})

# Plot absorption spectra for the molecules queried
fig, ax = plt.subplots(figsize=(4.2,3))
for mol in query:
  plot_df = pd.DataFrame(mol["uvvis_data"]['absorbance_data'])
  ax.plot(plot_df.wavelength, plot_df.absorbance, label=mol['_id'])

# Add details (the legend is built once, after every spectrum has been drawn)
ax.legend()
ax.set_xlabel('Wavelength (nm)')
ax.set_ylabel('Absorption')
fig.tight_layout()
fig.savefig('abs2.png', dpi=300)

# !!! Reset Database !!!

In [None]:
# Drop the 'test_db' database created earlier in this notebook, removing everything inserted into it
client.drop_database('test_db')