In [None]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

In [None]:
from collections import Counter
import json
from pathlib import Path
import string
import sys

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
from IPython.display import clear_output
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
from scipy.spatial import distance_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import tqdm
import torch
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score, recall_score

from multimodal_molecules.core import Ensemble, Estimator, get_data
from multimodal_molecules.plotting import set_defaults, set_grids, density_scatter, remove_axis_spines

In [None]:
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also understands NumPy arrays and NumPy scalars.

    Pass it as ``cls=NumpyEncoder`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, obj):
        """Convert NumPy objects to plain Python; defer everything else.

        Arrays become (nested) lists and NumPy integer/float/bool scalars
        become the matching built-in scalar. Any other type is handed to the
        base class, which raises ``TypeError``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # np.int64, np.float32, np.bool_, ... are not subclasses of the
        # built-in types, so the stock encoder rejects them.
        if isinstance(obj, (np.integer, np.floating, np.bool_)):
            return obj.item()
        return super().default(obj)

In [None]:
# set_defaults is imported from multimodal_molecules.plotting; presumably it
# configures the global matplotlib style used by the figures below -- confirm.
set_defaults()

In [None]:
# Energy grids the FEFF spectra are plotted against, one text file per element.
C_grid, N_grid, O_grid = (
    np.loadtxt(grid_file)
    for grid_file in ("data/c_grid.txt", "data/n_grid.txt", "data/o_grid.txt")
)

# Name-to-SMILES maps

In [None]:
# Experimental carbon compounds: name -> SMILES string.
C_smiles_map = {
    "tyrosine": "N[C@@H](Cc1ccc(O)cc1)C(O)=O",
    "phenylalanine": "c1ccc(cc1)C[C@@H](C(=O)O)N",
    "histidine": "O=C([C@H](CC1=CNC=N1)N)O",
    "alanine": "C[C@@H](C(=O)O)N",
    "lysine": "C(CCN)C[C@@H](C(=O)O)N",
}

# Experimental carbon spectra: name -> 2-column array read from
# "carbon_<name>.csv", with rows ordered by increasing first column.
C_name_to_data_map = {}
for csv_path in Path("data/24-01-02_experiment/ripped/Carbon").glob("*.csv"):
    compound = csv_path.stem.split("carbon_")[1]
    raw = np.loadtxt(csv_path, delimiter=",")
    row_order = raw[:, 0].argsort()
    C_name_to_data_map[compound] = raw[row_order]

# Every compound with a SMILES entry must have a spectrum file, and vice versa.
assert len(C_name_to_data_map) == len(C_smiles_map)
assert C_name_to_data_map.keys() == C_smiles_map.keys()

In [None]:
# Experimental nitrogen compounds: name -> SMILES string.
N_smiles_map = {
    "l-arginine": "C(C[C@@H](C(=O)O)N)CNC(=N)N",
    "dl-aspartic-acid": "C(C(C(=O)O)N)C(=O)O",
    "l-lysine": "C(CCN)C[C@@H](C(=O)O)N",
    "l-alanine": "C[C@@H](C(=O)O)N",
    "l-glutamic-acid": "C(CC(=O)O)[C@@H](C(=O)O)N",
    "l-threonine": "C[C@H]([C@@H](C(=O)O)N)O",
    "l-valine": "CC(C)[C@@H](C(=O)O)N",
    "l-leucine": "CC(C)C[C@@H](C(=O)O)N",
    "l-glutamine": "O=C(N)CCC(N)C(=O)O",
    "l-serine": "C([C@@H](C(=O)O)N)O",
    "l-histidine": "O=C([C@H](CC1=CNC=N1)N)O",
    "l-proline": "C1C[C@H](NC1)C(=O)O",
    "3-indoleacetonitrile": "C1=CC=C2C(=C1)C(=CN2)CC#N",
    "4-hydroxy-2-pyridone": "C1=CC(O)=CC(=O)N1",
    "carbazole": "c1ccc2c(c1)c3ccccc3[nH]2",
    "pyrrole-2-carboxylic-acid": "C1=CNC(=C1)C(=O)O",
    "pyrazinecarboxylic-acid": "C1=CN=C(C=N1)C(=O)O",
    "pyrazinecarboxamide": "C1=CN=C(C=N1)C(=O)N",
    "pyrimidinecarbonitrile": "N#Cc1ncccn1",
    "cytosine": "O=C1Nccc(N)n1",
    "uracil": "O=C1C=CNC(=O)N1",
    "thymine": "O=C1NC(=O)NC=C1C",
}

# Experimental nitrogen spectra: name -> array read from "nitrogen_<name>.csv".
# NOTE(review): unlike the carbon loader, rows are kept in file order (no sort
# on the first column) -- confirm the files are already ordered.
N_name_to_data_map = {
    csv_path.stem.split("nitrogen_")[1]: np.loadtxt(csv_path, delimiter=",")
    for csv_path in Path("data/24-01-02_experiment/ripped/Nitrogen").glob("*.csv")
}

# Every compound with a SMILES entry must have a spectrum file, and vice versa.
assert len(N_name_to_data_map) == len(N_smiles_map)
assert N_name_to_data_map.keys() == N_smiles_map.keys()

In [None]:
# Experimental oxygen compounds: name -> SMILES string.
O_smiles_map = {
    "naphthoic-acid": "C1=CC=C2C(=C1)C=CC=C2C(=O)O",
    "phthalic-anhydride": "O=C1OC(=O)c2ccccc12",
    "anthraldehyde": "C1=CC=C2C(=C1)C=C3C=CC=CC3=C2C=O",
    "methylcellulose": "COCC1C(C(C(C(O1)OC2C(OC(C(C2OC)OC)OC)COC)OC)OC)OC",
    "uracil": "O=C1C=CNC(=O)N1",
    "thymine": "O=C1NC(=O)NC=C1C",
    "cytosine": "O=C1Nccc(N)n1",
    "valine": "CC(C)[C@@H](C(=O)O)N",
}

# Experimental oxygen spectra: name -> array read from "oxygen_<name>.csv".
# NOTE(review): unlike the carbon loader, rows are kept in file order (no sort
# on the first column) -- confirm the files are already ordered.
O_name_to_data_map = {
    csv_path.stem.split("oxygen_")[1]: np.loadtxt(csv_path, delimiter=",")
    for csv_path in Path("data/24-01-02_experiment/ripped/Oxygen").glob("*.csv")
}

# Every compound with a SMILES entry must have a spectrum file, and vice versa.
assert len(O_name_to_data_map) == len(O_smiles_map)
assert O_name_to_data_map.keys() == O_smiles_map.keys()

# Index all SMILES

In [None]:
def fp_from_smiles(smile):
    """Return the RDKit path fingerprint of the molecule given as SMILES.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.

    Returns
    -------
    The fingerprint produced by ``FingerprintMols.FingerprintMol`` with paths
    of length 1-7, 2048 bits, 2 bits per hash and explicit hydrogens.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smile``.
    """
    mol = Chem.MolFromSmiles(smile)
    # MolFromSmiles signals a parse failure by returning None rather than
    # raising; fail here with a readable message instead of inside RDKit.
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smile!r}")
    return FingerprintMols.FingerprintMol(
        mol,
        minPath=1,
        maxPath=7,
        fpSize=2048,
        bitsPerHash=2,
        useHs=True,
        tgtDensity=0.0,
        minSize=128,
    )

In [None]:
def get_fp_and_spectra(element):
    """Load the full dataset for ``element`` and fingerprint every molecule.

    The train, validation and test splits returned by ``get_data`` are merged
    (in that order) so that the result covers the whole database.

    Returns
    -------
    tuple
        ``(fingerprints, spectra, smiles)`` where ``spectra`` is the
        row-wise concatenation of the ``X_*`` arrays and ``fingerprints[i]``
        is computed from ``smiles[i]``.
    """
    dataset = get_data(elements=element)
    spectra = np.concatenate(
        (dataset["X_train"], dataset["X_val"], dataset["X_test"]), axis=0
    )
    # presumably the smiles_* entries are lists, so "+" concatenates -- confirm
    smiles = dataset["smiles_train"] + dataset["smiles_val"] + dataset["smiles_test"]
    fingerprints = list(map(fp_from_smiles, smiles))
    return fingerprints, spectra, smiles

In [None]:
def bulk_similarity(smiles, all_fingerprints):
    """Tanimoto similarity between one SMILES string and many fingerprints.

    Returns one similarity value per entry of ``all_fingerprints``, in the
    same order.
    """
    query_fp = fp_from_smiles(smiles)
    return DataStructs.BulkTanimotoSimilarity(query_fp, all_fingerprints)

In [None]:
# Fingerprints, spectra and SMILES of the FEFF database, one triple per element.
C_FEFF_fps, C_FEFF_X, C_FEFF_smiles = get_fp_and_spectra(element="C")
N_FEFF_fps, N_FEFF_X, N_FEFF_smiles = get_fp_and_spectra(element="N")
O_FEFF_fps, O_FEFF_X, O_FEFF_smiles = get_fp_and_spectra(element="O")

# Get the FEFF maps
For each experimental molecule, take the top 5 FEFF matches ranked by Tanimoto similarity (TCC) and use those as the starting point for the comparison.

In [None]:
# For every experimental carbon compound, keep the 5 FEFF molecules with the
# highest Tanimoto similarity, together with their spectra and scores.
C_FEFF_matches = {}

_map = C_smiles_map
_fps = C_FEFF_fps
_smiles = C_FEFF_smiles
_spectra = C_FEFF_X

for name, smiles in _map.items():
    sim = bulk_similarity(smiles, _fps)
    # Indices of the 5 largest similarities, best match first.
    top = np.argsort(sim)[::-1][:5]
    C_FEFF_matches[name] = {
        "top_smiles": [_smiles[ii] for ii in top],
        "top_spectra": [_spectra[ii] for ii in top],
        "tcc": [sim[ii] for ii in top],
    }

In [None]:
# For every experimental nitrogen compound, keep the 5 FEFF molecules with the
# highest Tanimoto similarity, together with their spectra and scores.
N_FEFF_matches = {}

_map = N_smiles_map
_fps = N_FEFF_fps
_smiles = N_FEFF_smiles
_spectra = N_FEFF_X

for name, smiles in _map.items():
    sim = bulk_similarity(smiles, _fps)
    # Indices of the 5 largest similarities, best match first.
    top = np.argsort(sim)[::-1][:5]
    N_FEFF_matches[name] = {
        "top_smiles": [_smiles[ii] for ii in top],
        "top_spectra": [_spectra[ii] for ii in top],
        "tcc": [sim[ii] for ii in top],
    }

In [None]:
# For every experimental oxygen compound, keep the 5 FEFF molecules with the
# highest Tanimoto similarity, together with their spectra and scores.
O_FEFF_matches = {}

_map = O_smiles_map
_fps = O_FEFF_fps
_smiles = O_FEFF_smiles
_spectra = O_FEFF_X

for name, smiles in _map.items():
    sim = bulk_similarity(smiles, _fps)
    # Indices of the 5 largest similarities, best match first.
    top = np.argsort(sim)[::-1][:5]
    O_FEFF_matches[name] = {
        "top_smiles": [_smiles[ii] for ii in top],
        "top_spectra": [_spectra[ii] for ii in top],
        "tcc": [sim[ii] for ii in top],
    }

# Plot the data

In [None]:
def setup_intensity(y):
    """Rescale an intensity trace so it starts at 0 and ends at 1.

    The first value is subtracted from every point, then the result is
    divided by its (shifted) last value. The input is never modified.

    Parameters
    ----------
    y : array_like
        1-D intensity values. Integer input is promoted to float so the
        in-place division below is valid; float input keeps its dtype.

    Returns
    -------
    numpy.ndarray
        New array with ``out[0] == 0`` and ``out[-1] == 1``. If the first and
        last inputs are equal the division is by zero and NumPy's usual
        inf/nan semantics apply.
    """
    y = np.array(y, copy=True)
    # In-place true division is rejected by NumPy for integer arrays.
    if not np.issubdtype(y.dtype, np.inexact):
        y = y.astype(float)
    y -= y[0]
    y /= y[-1]
    return y

## Carbon

In [None]:
# Carbon: inputs consumed by the molecule-drawing and spectra-plotting cells below.
_smiles_map = C_smiles_map
_exp_spectra_map = C_name_to_data_map
_matches = C_FEFF_matches
_feff_grid = C_grid
# Alternative selection (every carbon compound):
# _keep = sorted(C_smiles_map.keys())
_keep = ["alanine", "histidine", "lysine"]
# Added to the FEFF energy grid when plotting.
_shift = 11
_element = "Carbon"

In [None]:
# RDKit molecules for the selected experimental compounds and, for each of
# them, the top-ranked FEFF match.
smiles = [Chem.MolFromSmiles(_smiles_map[name]) for name in _keep]
feff_smiles_matches = [
    Chem.MolFromSmiles(_matches[name]["top_smiles"][0]) for name in _keep
]

# Experimental molecules, drawn as one SVG grid.
svg = Chem.Draw.MolsToGridImage(smiles, useSVG=True)
with open(f"figures/fig_experiment/{_element}/exp_molecules.svg", "w") as f:
    f.write(svg.data)

# Matching FEFF molecules, same layout.
svg = Chem.Draw.MolsToGridImage(feff_smiles_matches, useSVG=True)
with open(f"figures/fig_experiment/{_element}/feff_molecules.svg", "w") as f:
    f.write(svg.data)

In [None]:
# One stacked panel per kept compound: top-ranked FEFF spectrum (red, energy
# grid offset by _shift) against the rescaled experimental spectrum (black).
fig, axs = plt.subplots(len(_keep), 1, figsize=(2, len(_keep)), sharey=True, sharex=True)
# plt.subplots returns a bare Axes instead of an array for a single panel.
axs = np.atleast_1d(axs)

for ii, name in enumerate(_keep):

    spectrum = _exp_spectra_map[name]

    ax = axs[ii]
    set_grids(ax)

    # Best match only: index 0 is the highest-similarity FEFF entry.
    feff = _matches[name]["top_spectra"][0]
    tcc = _matches[name]["tcc"][0]

    ax.text(0.95, 0.8, f"{name}", ha="right", transform=ax.transAxes, fontsize=8)
    ax.text(0.95, 0.65, f"TCC={tcc:.02f}", ha="right", transform=ax.transAxes)

    exp = setup_intensity(spectrum[:, 1])

    ax.plot(_feff_grid + _shift, feff, "r-", label="FEFF")
    ax.plot(spectrum[:, 0], exp, "k-", label="exp")

    ax.set_xlim(284, 295)
    ax.set_yticks([])

    # A single legend, on the top panel, is enough.
    if ii == 0:
        ax.legend(frameon=False, loc="upper left", fontsize=8)

# x label on the bottom panel, y label on the middle one.
ax.set_xlabel("$E$~(eV)")
# Raw string: "\m" is not a valid escape sequence in a normal string literal.
axs[len(axs) // 2].set_ylabel(r"$\mu(E)$~(a.u.)")
# plt.savefig(f"figures/fig_experiment/{_element}/tmp_{_element}.svg", dpi=300, bbox_inches="tight")
plt.show()

## Nitrogen

In [None]:
# Nitrogen: inputs consumed by the molecule-drawing and spectra-plotting cells below.
_smiles_map = N_smiles_map
_exp_spectra_map = N_name_to_data_map
_matches = N_FEFF_matches
_feff_grid = N_grid
# _keep = sorted([key for key in N_keep if N_FEFF_matches[key]["tcc"][0] > 0.95])
_keep = ["l-alanine", "l-proline", "pyrimidinecarbonitrile"]
_element = "Nitrogen"

In [None]:
# RDKit molecules for the selected experimental compounds and, for each of
# them, the top-ranked FEFF match.
smiles = [Chem.MolFromSmiles(_smiles_map[name]) for name in _keep]
feff_smiles_matches = [
    Chem.MolFromSmiles(_matches[name]["top_smiles"][0]) for name in _keep
]

# Experimental molecules, drawn as one SVG grid.
svg = Chem.Draw.MolsToGridImage(smiles, useSVG=True)
with open(f"figures/fig_experiment/{_element}/exp_molecules.svg", "w") as f:
    f.write(svg.data)

# Matching FEFF molecules, same layout.
svg = Chem.Draw.MolsToGridImage(feff_smiles_matches, useSVG=True)
with open(f"figures/fig_experiment/{_element}/feff_molecules.svg", "w") as f:
    f.write(svg.data)

In [None]:
# One stacked panel per kept compound: peak-normalized experimental spectrum
# (black) with the peak-normalized top FEFF match (red) drawn above it.
fig, axs = plt.subplots(len(_keep), 1, figsize=(2, len(_keep)), sharey=True, sharex=True)
# plt.subplots returns a bare Axes instead of an array for a single panel.
axs = np.atleast_1d(axs)

# Vertical offset that lifts the FEFF curve above the experimental one.
feff_offset = 0.5
# Energy shift added to the experimental axis, keyed by displayed name;
# compounds that are not listed are plotted unshifted.
exp_xshifts = {"pyrimidinecarbonitrile": 1.5}

for ii, name in enumerate(_keep):

    spectrum = _exp_spectra_map[name]

    ax = axs[ii]
    set_grids(ax)

    # Best match only: index 0 is the highest-similarity FEFF entry.
    feff = _matches[name]["top_spectra"][0]
    exp = setup_intensity(spectrum[:, 1])

    # Drop only a leading "l-" for display; the lookup key stays untouched.
    label = name[2:] if name.startswith("l-") else name
    ax.text(0.95, 0.8, f"{label}", ha="right", transform=ax.transAxes, fontsize=8)

    xshift = exp_xshifts.get(label, 0)
    ax.plot(spectrum[:, 0] + xshift, exp / exp.max(), "k-", label="exp")
    ax.plot(_feff_grid, feff / feff.max() + feff_offset, "r-", label="FEFF")

    ax.set_xlim(395, 420)
    ax.set_yticks([])
    ax.set_xticks([400, 408, 416])

    # A single legend, on the top panel, is enough.
    if ii == 0:
        ax.legend(frameon=False, loc="upper left", fontsize=8)

# x label on the bottom panel, y label on the middle one.
ax.set_xlabel("$E$~(eV)")
# Raw string: "\m" is not a valid escape sequence in a normal string literal.
axs[len(axs) // 2].set_ylabel(r"$\mu(E)$~(a.u.)")
plt.savefig(f"figures/fig_experiment/{_element}/tmp_{_element}.svg", dpi=300, bbox_inches="tight")
# plt.show()

### Alternate

In [None]:
# Alternate layout: every kept compound in a single panel, each FEFF/exp pair
# peak-normalized and moved down by its index so the pairs do not overlap.
fig, ax = plt.subplots(1, 1, figsize=(2, 0.5 * len(_keep)), sharey=True, sharex=True)

set_grids(ax)
for ii, name in enumerate(_keep):

    spectrum = _exp_spectra_map[name]

    # Best match only: index 0 is the highest-similarity FEFF entry.
    feff = _matches[name]["top_spectra"][0]

    offset = ii
    # NOTE(review): _shift is not assigned in the Nitrogen section; on a
    # top-to-bottom run it still holds the value from the Carbon
    # configuration cell -- confirm this is the intended energy shift.
    ax.plot(_feff_grid + _shift, feff / np.max(feff) - offset, "r-", label="FEFF")
    exp = setup_intensity(spectrum[:, 1])
    ax.plot(spectrum[:, 0], exp / exp.max() - offset, "k-", label="exp")

# Axis setup does not depend on the compound, so it is done once.
ax.set_xlim(395, 420)
ax.set_yticks([])
ax.set_xticks([400, 408, 416])

ax.set_xlabel("$E$~(eV)")
# Label this figure's own axis; `axs` belongs to the previous figure.
# Raw string: "\m" is not a valid escape sequence in a normal string literal.
ax.set_ylabel(r"$\mu(E)$~(a.u.)")
# plt.savefig(f"figures/fig_experiment/{_element}/tmp_{_element}.svg", dpi=300, bbox_inches="tight")
plt.show()

## Oxygen

In [None]:
# Oxygen: inputs consumed by the molecule-drawing and spectra-plotting cells below.
_smiles_map = O_smiles_map
_exp_spectra_map = O_name_to_data_map
_matches = O_FEFF_matches
_feff_grid = O_grid
# _keep = sorted([key for key in O_keep if O_FEFF_matches[key]["tcc"][0] > 0.95])
_keep = ["uracil", "valine", "cytosine"]
_element = "Oxygen"

In [None]:
# RDKit molecules for the selected experimental compounds and, for each of
# them, the top-ranked FEFF match.
smiles = [Chem.MolFromSmiles(_smiles_map[name]) for name in _keep]
feff_smiles_matches = [
    Chem.MolFromSmiles(_matches[name]["top_smiles"][0]) for name in _keep
]

# Experimental molecules, drawn as one SVG grid.
svg = Chem.Draw.MolsToGridImage(smiles, useSVG=True)
with open(f"figures/fig_experiment/{_element}/exp_molecules.svg", "w") as f:
    f.write(svg.data)

# Matching FEFF molecules, same layout.
svg = Chem.Draw.MolsToGridImage(feff_smiles_matches, useSVG=True)
with open(f"figures/fig_experiment/{_element}/feff_molecules.svg", "w") as f:
    f.write(svg.data)

In [None]:
# One stacked panel per kept compound: peak-normalized experimental spectrum
# (black) with the peak-normalized top FEFF match (red) drawn above it.
fig, axs = plt.subplots(len(_keep), 1, figsize=(2, len(_keep)), sharey=True, sharex=True)
# plt.subplots returns a bare Axes instead of an array for a single panel.
axs = np.atleast_1d(axs)

# Vertical offset that lifts the FEFF curve above the experimental one,
# per compound; compounds that are not listed use default_feff_offset.
feff_offsets = {"uracil": 0.5, "valine": 0.4, "cytosine": 0.5}
default_feff_offset = 0.5

for ii, name in enumerate(_keep):

    spectrum = _exp_spectra_map[name]

    ax = axs[ii]
    set_grids(ax)

    # Best match only: index 0 is the highest-similarity FEFF entry.
    feff = _matches[name]["top_spectra"][0]
    exp = setup_intensity(spectrum[:, 1])

    ax.text(0.95, 0.8, f"{name}", ha="right", transform=ax.transAxes, fontsize=8)

    shift = feff_offsets.get(name, default_feff_offset)
    ax.plot(spectrum[:, 0], exp / exp.max(), "k-", label="exp")
    ax.plot(_feff_grid, feff / feff.max() + shift, "r-", label="FEFF")

    ax.set_xlim(525, 555)
    ax.set_yticks([])

    # A single legend, on the top panel, is enough.
    if ii == 0:
        ax.legend(frameon=False, loc="upper left", fontsize=8)

# x label on the bottom panel, y label on the middle one.
ax.set_xlabel("$E$~(eV)")
# Raw string: "\m" is not a valid escape sequence in a normal string literal.
axs[len(axs) // 2].set_ylabel(r"$\mu(E)$~(a.u.)")
plt.savefig(f"figures/fig_experiment/{_element}/tmp_{_element}.svg", dpi=300, bbox_inches="tight")
# plt.show()