In [2]:
import umap
# import umap.umap_ as umap
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
from matplotlib import pyplot as plt

# Paths of the two SDF files whose molecules are compared
file1 = '../../data/train_100K.sdf'
file2 = '../../data/D2_7jvr_dop_393b_2comp_final_10M_test_10K.sdf'

# Morgan fingerprint settings; the bit length is passed explicitly so the
# width of the feature matrix can never drift from the fingerprint length.
MORGAN_RADIUS = 2
MORGAN_N_BITS = 2048


def fingerprints_to_array(fps, n_bits=MORGAN_N_BITS):
    """Pack a sequence of RDKit bit-vector fingerprints into a 0/1 matrix.

    Parameters
    ----------
    fps : sequence of bit vectors exposing ``GetOnBits()``
        One fingerprint per molecule, each ``n_bits`` long.
    n_bits : int
        Number of columns of the returned matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(fps), n_bits)`` and dtype ``uint8``; entry
        ``[i, j]`` is 1 when bit ``j`` is set in ``fps[i]``.
    """
    matrix = np.zeros((len(fps), n_bits), dtype=np.uint8)
    for row, fp in enumerate(fps):
        # Only the set bits are visited; calling np.array() on the bit
        # vectors instead walks every bit of every fingerprint in Python
        # and stores each bit in a full-width integer.
        on_bits = np.fromiter(fp.GetOnBits(), dtype=np.intp)
        matrix[row, on_bits] = 1
    return matrix


def morgan_fingerprints(mols, radius=MORGAN_RADIUS, n_bits=MORGAN_N_BITS):
    """Return one Morgan bit-vector fingerprint per molecule in ``mols``."""
    return [
        AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        for mol in mols
    ]


# Read each SDF file into a DataFrame; the molecules live in the 'ROMol' column
mols1 = PandasTools.LoadSDF(file1)
mols2 = PandasTools.LoadSDF(file2)

# Generate the molecular fingerprints for each set of molecules
fps1 = morgan_fingerprints(mols1['ROMol'])
fps2 = morgan_fingerprints(mols2['ROMol'])

# Convert the fingerprints to compact numpy feature matrices
X1 = fingerprints_to_array(fps1)
X2 = fingerprints_to_array(fps2)


In [5]:
# Third molecule set: read the SDF, fingerprint it, and build its feature matrix
file3 = '../results/test_1M_results.sdf'
mol3 = PandasTools.LoadSDF(file3)

# Radius-2 Morgan fingerprints; the bit length is spelled out so it matches
# the column count of the matrix allocated below.
fps3 = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    for mol in mol3['ROMol']
]

# Fill a preallocated uint8 matrix from the set bits only. Calling
# np.array(fps3) instead walks every bit of every fingerprint in Python and
# stores each bit in a full-width integer, which is very costly for a file
# of this size.
X3 = np.zeros((len(fps3), 2048), dtype=np.uint8)
for row, fp in enumerate(fps3):
    X3[row, np.fromiter(fp.GetOnBits(), dtype=np.intp)] = 1



In [6]:
# Perform dimensionality reduction using UMAP.
# NOTE(review): no random_state is set, so the layout differs between runs.
# Pass random_state=<int> to umap.UMAP() for a reproducible figure; the UMAP
# docs describe this as trading speed for reproducibility -- confirm the
# cost is acceptable for a data set of this size.
reducer = umap.UMAP()

# Embed all three sets in one shared space. fit_transform() already returns
# the coordinates of every fitted row, so the per-set embeddings are slices
# of that result; calling transform() again on the same rows would re-embed
# each point a second time.
combined_X = np.vstack((X1, X2, X3))
combined_embedding = reducer.fit_transform(combined_X)

# Rows are stacked in the order X1, X2, X3, so split at the cumulative sizes
first_end = len(X1)
second_end = first_end + len(X2)
embedding1, embedding2, embedding3 = np.split(
    combined_embedding, [first_end, second_end]
)
assert len(embedding3) == len(X3), "embedding rows do not match the input sizes"

# Plot the 2D map with different colors; small translucent markers keep the
# three point clouds from completely hiding one another.
fig, ax = plt.subplots()
for points, color, label in (
    (embedding1, 'red', 'File 1'),
    (embedding2, 'blue', 'File 2'),
    (embedding3, 'green', 'File 3'),
):
    ax.scatter(points[:, 0], points[:, 1], c=color, label=label, s=2, alpha=0.5)
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
ax.set_title('Molecule Graph Projection')
# Enlarge the legend markers so the tiny scatter dots remain legible there
ax.legend(markerscale=5)
plt.show()


KeyboardInterrupt: 