<a href="https://colab.research.google.com/github/D3TaLES/databases_demo/blob/main/notebooks/sql_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and Import Needed Code

In [None]:
%%capture
! pip install pymatgen  # Install Pymatgen for Gaussian file parsing 
! pip install pubchempy  # Install PubChem python API for moleucle information
! pip install rdkit-pypi  # Install RdKit for molecule transformations
! pip install qcfractal # Install QCFractal for SQL schema
! pip install sqlalchemy==1.3.* # Install SQLAlchemy for SQL database 

In [None]:
! rm -r databases_demo/ # Remove database_demo directory if it already exists
! git clone https://github.com/D3TaLES/databases_demo.git # Get Processing code from GitHub

In [None]:
# Import required packages
import sqlite3
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from sqlalchemy import text
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql.schema import MetaData
from databases_demo.file_parser import *
from databases_demo.schema.sql_schema import *

# 1. Initialize the database 

In [None]:
# Point SQLAlchemy at the local SQLite file and start an ORM session
engine = create_engine('sqlite:///sample.db')
Session = sessionmaker(bind=engine)
session = Session()

# Create one table per ORM model class
MetaData(bind=engine).create_all(
    tables=[
        model.__table__
        for model in (Molecules, DftData, Synonyms, UvVisData, AbsorbanceData)
    ]
)

# 2. Generate Example data

In [None]:
# Build the Molecules-table record for the demo molecule named 'biphenyl'
# NOTE(review): the SMILES below contains a cyclohexene ring, and the plotting
# cell queries this molecule as 'cyclohexen-1-ylbenzene' -- confirm the
# 'biphenyl' name is intended.
data_generation = GenerateMolInfo(
    smiles="C1=C(c2ccccc2)CCCC1",
    source='our_lab',
    names=['biphenyl'],
)
mol_data = data_generation.data
# Keep the molecule's primary key; later cells attach DFT and UV-Vis data to it
bp_id = mol_data.get('mol_id')

mol_data # Show data

In [None]:
# Raw synonym rows produced by the same generator as the molecule record
synonym_rawdata = data_generation.synonym_data
# Wrap every raw row in a Synonyms object, ready for insertion
synonym_data = [Synonyms(**row) for row in synonym_rawdata]

synonym_rawdata[:5] # Show (first 5 pieces of) data

# 3. Insert validated data into the database

In [None]:
# add single molecule object to the Molecules table
session.add(Molecules(**mol_data))
# commit the molecule insert as its own transaction
session.commit()

# add multiple synonym objects to the Synonym table
# (synonym_data is the list of Synonyms objects built in the previous cell)
session.bulk_save_objects(synonym_data) 
session.commit()


## Insert different molecules into the database 

In [None]:
# Insert Benzene, Nitrobenzene, and Anthracene
extra_mols = {
    'benzene': "C1=CC=CC=C1",
    'nitrobenzene': "C1=CC=C(C=C1)[N+](=O)[O-]",
    'anthracene': "C1=CC=C2C=C3C=CC=CC3=CC2=C1",
}
extra_mol_ids = {}
for name, smiles in extra_mols.items():
    # Generate the record, insert it, and commit one molecule at a time
    mol_data = GenerateMolInfo(smiles, source='our_lab', names=[name]).data
    session.add(Molecules(**mol_data))
    session.commit()

    # Record the molecule id so later cells can attach data to it
    extra_mol_ids[name] = mol_data.get('mol_id')


## Insert different types of data into the database

In [None]:
# Insert Gaussian DFT data
# Note: These cells use automatic file processors (which use the same techniques as shown in the above manual processors)
gaussian_data = ProcessDFT('databases_demo/raw_data/tddft_biphenyl.log', mol_id=bp_id).data

# Insert DFT data into database
session.add(DftData(**gaussian_data))
session.commit()

In [None]:
# Insert UV-Vis data
# Parse the CSV once; the same processor object provides both the summary
# record (.data) and the absorbance rows (.absorbance_data)
uvvis_processor = ProcessUvVis('databases_demo/raw_data/uvvis_biphenyl.csv', mol_id=bp_id)
uvvis_data = uvvis_processor.data
# Insert UV-Vis data into database
session.add(UvVisData(**uvvis_data))
session.commit()

absorbance_rawdata = uvvis_processor.absorbance_data
# Convert each absorbance table row into a validated object
absorbance_data = [AbsorbanceData(**data) for data in absorbance_rawdata]
# Insert Absorbance data into database
session.bulk_save_objects(absorbance_data)
session.commit()

In [None]:
# Insert DFT and UV-Vis data for other molecules

for name, mol_id in extra_mol_ids.items():
  # Generate data (each raw file is parsed only once)
  gaussian_data = ProcessDFT('databases_demo/raw_data/tddft_'+name+'.log', mol_id=mol_id).data
  uvvis_processor = ProcessUvVis('databases_demo/raw_data/uvvis_'+name+'.csv', mol_id=mol_id)
  uvvis_data = uvvis_processor.data
  absorbance_rawdata = uvvis_processor.absorbance_data
  absorbance_data = [AbsorbanceData(**data) for data in absorbance_rawdata]
  # Add data
  session.add(DftData(**gaussian_data))
  session.add(UvVisData(**uvvis_data))
  session.bulk_save_objects(absorbance_data)
  # Commit insertions (one transaction per molecule)
  session.commit()

# 4. Query the database

## Basic Queries

In [None]:
# View Molecules data table
# Passing the engine (not engine.connect()) lets pandas manage the connection, so none is left open
pd.read_sql("molecules", engine)

In [None]:
# View UV-Vis data table
# Passing the engine (not engine.connect()) lets pandas manage the connection, so none is left open
pd.read_sql("uvvis_data", engine)

In [None]:
# View DFT data table
# Passing the engine (not engine.connect()) lets pandas manage the connection, so none is left open
pd.read_sql("dft_data", engine)

In [None]:
# Count the number of molecules in the database
# Passing the engine (not engine.connect()) lets pandas manage the connection, so none is left open
pd.read_sql("SELECT COUNT(*) FROM molecules;", engine)

In [None]:
# Get molecules with more than 10 atoms
# Passing the engine (not engine.connect()) lets pandas manage the connection, so none is left open
pd.read_sql("SELECT * FROM molecules WHERE number_of_atoms > 10;", engine)

In [None]:
# Get molecules with more than 10 atoms, showing only molecule IDs
# The table alias `mols` is used to qualify the single selected column.
# Passing the engine (not engine.connect()) lets pandas manage the connection.
pd.read_sql("SELECT mols.mol_id FROM molecules AS mols WHERE number_of_atoms > 10;", engine)

In [None]:
# Get all the SMILES strings in the molecules table where the molecular weight is greater than 100
# Passing the engine (not engine.connect()) lets pandas manage the connection, so none is left open
pd.read_sql("SELECT smiles FROM molecules WHERE molecular_weight > 100;", engine)

In [None]:
# Search for all singlet excitation energy values in the database
# The INNER JOIN pairs each dft_data row with its molecules row through the shared mol_id.
# Passing the engine (not engine.connect()) lets pandas manage the connection.
pd.read_sql("SELECT mols.mol_id, mols.smiles, dft.first_excitation FROM dft_data AS dft INNER JOIN molecules AS mols ON (dft.mol_id=mols.mol_id);", engine)

# Technically this query could work too (though it wouldn't give smiles), but it wouldn't demonstrate how to perform a table join 
# pd.read_sql("SELECT mol_id, first_excitation FROM dft_data;", engine)

## Plotting

In [None]:
# Get the absorption spectrum data for cyclohexen-1-ylbenzene
# Passing the engine (not engine.connect()) lets pandas manage the connection, so none is left open
query = pd.read_sql("SELECT wavelength, absorbance FROM absorbance_data WHERE mol_id LIKE 'cyclohexen-1-ylbenzene';", engine)
# Plot data
query.plot(x='wavelength', y='absorbance')

## Comparing computationally-estimated singlet excitation and experimentally-measured optical gap

In [None]:
# Gather data: one row per molecule that has both a DFT record and a UV-Vis record
# Passing the engine (not engine.connect()) lets pandas manage the connection, so none is left open
query = pd.read_sql(""" 
  SELECT uvv.mol_id, uvv.optical_gap, dft.first_excitation 
  FROM dft_data AS dft INNER JOIN uvvis_data AS uvv 
  ON (dft.mol_id=uvv.mol_id);
""", engine)
query

In [None]:
# Scatter plot with one labelled point per row of the joined query
fig, ax = plt.subplots(figsize=(4,3))

for i, mol in query.iterrows():
    ax.scatter(mol.optical_gap, mol.first_excitation, label=mol.mol_id)

# Legend, axis labels, layout, and export to PNG
ax.legend()
ax.set_xlabel('Optical Gap (eV)')
ax.set_ylabel('Singlet Excitation Energy (eV)')
fig.tight_layout()
fig.savefig('plot1.png', dpi=300)

## Plotting spectrum only when the singlet excitation energy is greater than 4 eV

In [None]:
# Search for all singlet excitation values in the database
# Passing the engine (not engine.connect()) lets pandas manage the connection, so none is left open
pd.read_sql("SELECT first_excitation FROM dft_data;", engine)

In [None]:
# Get the molecules with a singlet excitation greater than 4
# Passing the engine (not engine.connect()) lets pandas manage the connection
molecules = pd.read_sql("SELECT mol_id from dft_data WHERE first_excitation > 4;", engine)

# Use a bound parameter instead of an f-string: a molecule id containing a quote
# character (e.g. 1,1'-biphenyl) would otherwise produce invalid SQL
spectrum_sql = text("SELECT wavelength, absorbance FROM absorbance_data WHERE mol_id LIKE :mol_id;")

# Plot absorption spectra for the molecules queried 
fig, ax = plt.subplots(figsize=(4.2,3))
for mol in molecules.mol_id: 
  query = pd.read_sql(spectrum_sql, engine, params={'mol_id': mol})
  ax.plot(query.wavelength, query.absorbance, label=mol)

# Add details 
plt.legend()
plt.xlabel('Wavelength (nm)')
plt.ylabel('Absorption')
plt.tight_layout()
plt.savefig('plot2.png', dpi=300)

# !!! Reset Database !!!

In [None]:
# Drop all five demo tables (this removes every row stored in them)!
MetaData(bind=engine).drop_all(
    tables=[
        model.__table__
        for model in (Molecules, DftData, Synonyms, UvVisData, AbsorbanceData)
    ]
)