In [None]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

In [None]:
from collections import Counter
import json
from pathlib import Path
import string
import sys

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
from IPython.display import clear_output
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
from scipy.spatial import distance_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import tqdm
import torch
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score, recall_score

from multimodal_molecules.core import Ensemble, Estimator, get_data
from multimodal_molecules.plotting import set_defaults, set_grids, density_scatter, remove_axis_spines

In [None]:
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also serialises NumPy arrays and NumPy scalars.

    ``np.ndarray`` objects become (nested) lists; NumPy integer, floating and
    boolean scalars become the matching built-in ``int``/``float``/``bool``.
    Anything else is passed to the base class, which raises ``TypeError``.

    Use as ``json.dumps(obj, cls=NumpyEncoder)``.
    """

    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Scalars such as np.int64 / np.float32 are not subclasses of the
        # Python built-ins, so the stock encoder rejects them.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        return super().default(obj)

In [None]:
# Project helper imported from multimodal_molecules.plotting.
# NOTE(review): presumably applies global plot styling — confirm in that module.
set_defaults()

# Load the QM9 data

In [None]:
def load_qm9_xyz(path):
    """Read a QM9 ``.xyz`` file and extract one SMILES string from it.

    Parameters
    ----------
    path : str or os.PathLike
        File to read (opened in text mode).

    Returns
    -------
    tuple
        ``(lines, smiles)`` where ``lines`` is the list of all lines in the
        file (line endings kept) and ``smiles`` is the second
        whitespace-separated token on the second-to-last line.

    Raises
    ------
    IndexError
        If the file has fewer than two lines, or the second-to-last line has
        fewer than two tokens.
    """
    with open(path, "r") as handle:
        content = list(handle)
    smiles_tokens = content[-2].strip().split()
    return content, smiles_tokens[1]

In [None]:
# SMILES string -> {"xyz": raw file lines, "fname": file name}; filled by the loop in the next cell.
smiles_to_info = {}

In [None]:
# Directory holding the raw QM9 .xyz files; adjust this for your machine.
QM9_DIR = Path("/Users/mc/Data/QM9")

# glob() yields files in arbitrary, filesystem-dependent order. Sorting makes the
# result reproducible when several files map to the same SMILES key (the last
# file processed wins).
for fname in tqdm.tqdm(sorted(QM9_DIR.glob("*.xyz"))):
    lines, smiles = load_qm9_xyz(fname)
    smiles_to_info[smiles] = {"xyz": lines, "fname": fname.name}

In [None]:
# Per-element x-axis grids; each one is plotted against the matching FEFF
# spectrum in the plotting section below.
# NOTE(review): presumably energy grids for the C, N and O spectra — confirm units.
C_grid = np.loadtxt("data/c_grid.txt")
N_grid = np.loadtxt("data/n_grid.txt")
O_grid = np.loadtxt("data/o_grid.txt")

# Load experimental data

Indices refer to positions in the stacked split order (train, then validation, then test), i.e. `smiles = data["smiles_train"] + data["smiles_val"] + data["smiles_test"]`.

## Carbon

- Glycine: `C(C(=O)O)N` (Closest structure in database: `17351 [NH3+]CC([O-])=O`, actually is glycine)
- Proline: `O=C(O)C1CCCN1` (Closest structure in database: `21828 [O-]C(=O)C1CCC[NH2+]1`, actually is proline)

In [None]:
# Carbon dataset: stack the train/val/test arrays row-wise so that the row
# indices quoted above address a single array.
# NOTE: `d` and `X` are reassigned for N and O further down; the carbon lookups
# that follow must run before those cells.
d = get_data(elements="C")
X = np.concatenate([d[split] for split in ("X_train", "X_val", "X_test")], axis=0)

In [None]:
# Experimental carbon spectrum for glycine; column 0 is used as x and column 1
# as intensity when plotting below.
carbon_glycine = np.loadtxt("data/24-01-02_experiment/carbon_glycine.csv", delimiter=",")
# Simulated (FEFF) counterpart: row 17351 of the stacked array.
# NOTE(review): only valid while X holds the carbon dataset.
carbon_glycine_feff = X[17351, :]

In [None]:
# Experimental carbon spectrum for proline; column 0 is used as x and column 1
# as intensity when plotting below.
carbon_proline = np.loadtxt("data/24-01-02_experiment/carbon_proline.csv", delimiter=",")
# Simulated (FEFF) counterpart: row 21828 of the stacked array.
# NOTE(review): only valid while X holds the carbon dataset.
carbon_proline_feff = X[21828, :]

## Nitrogen

- Glycine: `C(C(=O)O)N` (Closest structure in database: `68438 [NH3+]CC([O-])=O`, actually is glycine)
- Proline: `O=C(O)C1CCCN1` (Closest structure in database: `862 [O-]C(=O)C1CCC[NH2+]1`, actually is proline)

In [None]:
# Nitrogen dataset: stack the train/val/test arrays row-wise so that the row
# indices quoted above address a single array.
# NOTE: this overwrites the carbon `d` and `X`; it is overwritten again for O.
d = get_data(elements="N")
X = np.concatenate([d[split] for split in ("X_train", "X_val", "X_test")], axis=0)

In [None]:
# Experimental nitrogen spectrum for glycine; column 0 is used as x and column 1
# as intensity when plotting below.
nitrogen_glycine = np.loadtxt("data/24-01-02_experiment/nitrogen_glycine.csv", delimiter=",")
# Simulated (FEFF) counterpart: row 68438 of the stacked array.
# NOTE(review): only valid while X holds the nitrogen dataset.
nitrogen_glycine_feff = X[68438, :]

In [None]:
# Experimental nitrogen spectrum for proline; column 0 is used as x and column 1
# as intensity when plotting below.
nitrogen_proline = np.loadtxt("data/24-01-02_experiment/nitrogen_proline.csv", delimiter=",")
# Simulated (FEFF) counterpart: row 862 of the stacked array.
# NOTE(review): only valid while X holds the nitrogen dataset.
nitrogen_proline_feff = X[862, :]

## Oxygen

* Aldehyde is `C1=CC=C2C(=C1)C=C3C=CC=CC3=C2C=O` (9-Anthraldehyde) (Closest structure in database: `74831 O=CC1=CC=CC=C1`)
* Aliphatic hydroxyl is Polyvinyl alcohol, approximated as 2-butanol `CCC(C)O` (Closest structure in database: `63161 CCC(C)O`)

In [None]:
# Oxygen dataset: stack the train/val/test arrays row-wise so that the row
# indices quoted above address a single array.
# NOTE: this overwrites the nitrogen `d` and `X`.
d = get_data(elements="O")
X = np.concatenate([d[split] for split in ("X_train", "X_val", "X_test")], axis=0)

In [None]:
# Experimental oxygen spectrum for the aldehyde sample; column 0 is used as x
# and column 1 as intensity when plotting below.
oxygen_aldehyde = np.loadtxt("data/24-01-02_experiment/oxygen_aldehyde.csv", delimiter=",")
# Simulated (FEFF) stand-in: row 74831 of the stacked array (closest database match).
# NOTE(review): only valid while X holds the oxygen dataset.
oxygen_aldehyde_feff = X[74831, :]

In [None]:
# Experimental oxygen spectrum for the aliphatic-hydroxyl sample; column 0 is
# used as x and column 1 as intensity when plotting below.
oxygen_aliphatic_hydroxyl = np.loadtxt("data/24-01-02_experiment/oxygen_aliphatic_hydroxyl.csv", delimiter=",")
# Simulated (FEFF) stand-in: row 63161 of the stacked array (2-butanol).
# NOTE(review): only valid while X holds the oxygen dataset.
oxygen_aliphatic_hydroxyl_feff = X[63161, :]

# Plot the data

In [None]:
def setup_intensity(y):
    """Shift and rescale a 1-D intensity trace so it starts at 0 and ends at 1.

    The first value is subtracted from every point, then every point is
    divided by the (shifted) last value.

    Parameters
    ----------
    y : array-like
        Intensities. Lists and integer arrays are accepted; integer/boolean
        input is converted to float so the division is well defined. Floating
        dtypes are preserved.

    Returns
    -------
    numpy.ndarray
        A new array; the input is never modified.

    Notes
    -----
    If the first and last values are equal the divisor is zero and the result
    contains ``inf``/``nan`` (NumPy emits a runtime warning).
    """
    y = np.array(y)  # np.array always copies, so the caller's data is untouched
    if not np.issubdtype(y.dtype, np.inexact):
        # In-place true division is not allowed on integer arrays.
        y = y.astype(float)
    y -= y[0]
    y /= y[-1]
    return y

In [None]:
def _plot_exp_vs_feff(ax, exp, feff_grid, feff, compound):
    """Overlay one experimental spectrum and its FEFF counterpart on ``ax``.

    ``exp`` is a 2-column array (x, intensity); its intensity column is passed
    through ``setup_intensity`` before plotting. ``feff`` is plotted as-is
    against ``feff_grid``. ``compound`` is written in the upper-right corner.
    """
    ax.plot(exp[:, 0], setup_intensity(exp[:, 1]), label="EXP")
    ax.plot(feff_grid, feff, label="FEFF")
    ax.text(0.9, 0.9, compound, ha="right", va="top", transform=ax.transAxes)


fig, axs = plt.subplots(2, 3, figsize=(6, 4), sharey=True)

# One column per absorbing element.
axs[0, 0].set_title("C")
axs[0, 1].set_title("N")
axs[0, 2].set_title("O")

# (axes, experimental data, FEFF grid, FEFF spectrum, annotation) for each panel.
_panels = [
    (axs[0, 0], carbon_glycine, C_grid, carbon_glycine_feff, "Glycine"),
    (axs[1, 0], carbon_proline, C_grid, carbon_proline_feff, "Proline"),
    (axs[0, 1], nitrogen_glycine, N_grid, nitrogen_glycine_feff, "Glycine"),
    (axs[1, 1], nitrogen_proline, N_grid, nitrogen_proline_feff, "Proline"),
    # The O panels previously had no annotation, unlike the C and N panels.
    (axs[0, 2], oxygen_aldehyde, O_grid, oxygen_aldehyde_feff, "Aldehyde"),
    (axs[1, 2], oxygen_aliphatic_hydroxyl, O_grid, oxygen_aliphatic_hydroxyl_feff, "Aliphatic hydroxyl"),
]
for ax, _exp, _grid, _feff, _compound in _panels:
    _plot_exp_vs_feff(ax, _exp, _grid, _feff, _compound)

# As in the original figure, only the top-left panel hides its x ticks and
# carries the legend (the line colours are the same in every panel).
axs[0, 0].set_xticks([])
axs[0, 0].legend(frameon=False, fontsize=8, loc="center right")

plt.show()

# Find matching simulated data

The indices found here are the ones hard-coded in the data-loading sections above.

In [None]:
def fp_from_smiles(smile):
    """Build an RDKit fingerprint for a molecule given as a SMILES string.

    The SMILES is parsed with ``Chem.MolFromSmiles`` and the resulting molecule
    is passed to ``FingerprintMols.FingerprintMol`` with the fixed settings
    listed below.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.

    Returns
    -------
    The fingerprint object produced by ``FingerprintMols.FingerprintMol``.
    """
    mol = Chem.MolFromSmiles(smile)
    fp_settings = dict(
        minPath=1,
        maxPath=7,
        fpSize=2048,
        bitsPerHash=2,
        useHs=True,
        tgtDensity=0.0,
        minSize=128,
    )
    return FingerprintMols.FingerprintMol(mol, **fp_settings)

In [None]:
# Dataset to search; change the element to look up indices for C or N instead.
d = get_data(elements="O")

In [None]:
# Train -> val -> test, the same order used to stack the X arrays, so a position
# in this list is also a row index into the stacked X.
# assumes the smiles_* entries are lists (so `+` concatenates) — TODO confirm
all_smiles = d["smiles_train"] + d["smiles_val"] + d["smiles_test"]

In [None]:
# One fingerprint per dataset molecule, in the same order as all_smiles.
all_fingerprints = list(map(fp_from_smiles, all_smiles))

In [None]:
# Tanimoto similarity of the query molecule (2-butanol here) to every dataset
# fingerprint; edit the query SMILES to search for another compound.
sims = DataStructs.BulkTanimotoSimilarity(fp_from_smiles("CCC(C)O"), all_fingerprints)

In [None]:
# Ascending order of similarity, so the best match is the last entry (argsorted[-1]).
argsorted = np.argsort(sims)

In [None]:
# Rank to inspect, counted from the end of the ascending sort: -1 is the most
# similar molecule, -2 the runner-up, and so on.
ii = -1
best_idx = argsorted[ii]
print(best_idx)
print(all_smiles[best_idx])
# Keep the molecule as the cell's last expression so the notebook renders it;
# previously it was built before the prints and silently discarded.
Chem.MolFromSmiles(all_smiles[best_idx])

Glycine is index `17351 [NH3+]CC([O-])=O` in the carbon dataset.

In [None]:
# Crude text search: print every SMILES whose character counts match the
# expected pattern (5 "C", 1 "N", 2 "O", one ring opened and closed with "1",
# one double bond, one branch, no triple bonds).
expected_counts = {"C": 5, "N": 1, "O": 2, "1": 2, "=": 1, "(": 1, ")": 1, "#": 0}
for ii, smile in enumerate(all_smiles):
    cc = Counter(smile)
    if all(cc[char] == count for char, count in expected_counts.items()):
        print(ii, smile)

In [None]:
# Canonical form of the glycine SMILES quoted in the markdown above, for
# comparison against dataset entries.
Chem.CanonSmiles("C(C(=O)O)N")

In [None]:
# Canonical form of zwitterionic proline. The ring nitrogen carries two
# hydrogens ([NH2+]); writing [NH3+] would give it five bonds, which is not a
# valid valence and cannot be parsed.
Chem.CanonSmiles("O=C([O-])C1CCC[NH2+]1")