In [1]:
%load_ext autoreload
%autoreload 2
import os

import torch
from rdkit.Chem import AllChem, MACCSkeys, rdMolDescriptors

from src.cdr_bench.io_utils.io import read_features_hdf5_dataframe, read_optimization_results, check_hdf5_file_format

# Prepare features for optimization

In [None]:
# Load the pretrained distance-network model and place it on the best available device.
# NOTE(review): CanonicalAtomFeaturizer, CanonicalBondFeaturizer and
# DistanceNetworkLigthning are not imported anywhere in the visible part of this
# notebook; they must be imported before this cell can run on a fresh kernel.

# Keyword arguments forwarded to the model constructor.
# Presumably these must match the architecture the checkpoint was trained with,
# otherwise load_state_dict() will reject the weights -- confirm against training code.
model_params = {
    'edge_in_feats': 12,
    'embed_size': 16,
    'node_in_feats': 74
}

# Atom (node) and bond (edge) featurizer instances.
NF = CanonicalAtomFeaturizer()
BF = CanonicalBondFeaturizer()

# Checkpoint location. Set the CHEMDIST_MODEL_PATH environment variable to point
# at the file on your machine; the original author's path is kept as the default.
model_path = os.environ.get(
    'CHEMDIST_MODEL_PATH',
    '/home/aorlov/Programs/ChemDist_paper/Model/model_trained.pt',
)

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create an instance of the model
model = DistanceNetworkLigthning(**model_params)

# Load the checkpoint. map_location remaps tensors that were saved from a GPU
# onto `device`, which is required when loading a CUDA checkpoint on a CPU-only host.
state_dict = torch.load(model_path, map_location=device)

# A PyTorch Lightning checkpoint nests the weights under the 'state_dict' key;
# a plain torch.save(model.state_dict()) file is the weight mapping itself.
if 'state_dict' in state_dict:
    model.load_state_dict(state_dict['state_dict'])
else:
    model.load_state_dict(state_dict)

# Ensure the model is in evaluation mode
model.eval()

# Move the model to the selected device (also displays the model as the cell output).
model.to(device)

# Generate features for optimization

# HDF5 File Structure (CHEMBL204.h5)

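This HDF5 file contains chemical compound data along with several molecular features. The file is organized into two main sections: **Dataset and SMILES** and **Features**. `CHEMBL204.h5` is used here only as an example of the layout; the cells below load `../datasets/CHEMBL33.h5`, so the row count and the available feature columns shown in the outputs may differ from this description.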

## 1. Dataset and SMILES (smi)
- **dataset**: Contains the identifier of the dataset each compound belongs to (e.g., "CHEMBL204"); this is a dataset-level label, not a per-compound ID.
- **smi**: Contains SMILES strings representing the chemical structure of compounds.

## 2. Features
The features section contains several key molecular features:
- **embed**: A numerical feature representation of the chemical compounds. These are lists of floating-point numbers.
- **maccs_keys**: A list of MACCS molecular fingerprints. These are binary fingerprints indicating the presence or absence of certain molecular features (0s and 1s).
- **mfp_r2_1024**: A list of Morgan molecular fingerprints (radius 2, 1024 bits). These are used to encode molecular substructures as lists of integers.

## Overview of Data
- The dataset contains **4020 rows**, each representing a distinct chemical compound.
- Each row has the following columns:
  - **dataset**: The identifier of the dataset the compound belongs to.
  - **smi**: The SMILES string of the compound.
  - **embed**: Numerical embeddings for each compound, stored as lists of floating-point numbers.
  - **maccs_keys**: Binary MACCS molecular keys, stored as lists of integers.
  - **mfp_r2_1024**: Morgan molecular fingerprints (1024-bit), stored as lists of integers.

Each of these feature columns provides a different numerical or categorical representation of the molecular structure for machine learning or chemical informatics analysis.


In [11]:
# Read the precomputed feature table for the CHEMBL33 dataset from its HDF5 file.
features_h5 = '../datasets/CHEMBL33.h5'
df_features = read_features_hdf5_dataframe(features_h5)

In [12]:
# Preview the first five rows of the feature table.
df_features.head(n=5)


Unnamed: 0,dataset,smi,RDKit_fp
0,b'CHEMBL33',b'BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br',"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,b'CHEMBL33',b'BrCCCCCCBr',"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,b'CHEMBL33',b'BrCc1cc(Br)c2cc(NBr)c(Br)c(Br)c2c1',"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
3,b'CHEMBL33',b'BrCc1cc(Br)c2cc(NBr)c(Br)cc2c1Br',"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
4,b'CHEMBL33',b'BrCc1cc2cc(Br)c(NBr)cc2c(Br)c1Br',"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [16]:
# Spot-check one element (position 50) of the fingerprint stored under row label 0.
fp_row0 = df_features['mfp_r2_1024'][0]
fp_row0[50]

0

# Read optimization results

## HDF5 File Structure (ambient_dist_and_PCA_results.h5)

This HDF5 file contains datasets related to high-dimensional data and the results of Principal Component Analysis (PCA) performed on this data.

### Datasets
1. **X_HD**: 
   - A high-dimensional dataset with a shape of **(4020, 1024)**.
   - It contains 4020 samples, each with 1024 features.
   - This is the original high-dimensional feature representation of the data.

2. **X_PCA**: 
   - A PCA-transformed dataset with a shape of **(4020, 2)**.
   - It contains the same 4020 samples, but each sample has been reduced to 2 principal components using PCA.
   - This reduced dataset is typically used for visualization or as input for further machine learning analysis.

Storing the high-dimensional data together with its PCA projection in one file makes it possible to compare the original and the PCA-reduced representations directly.


## HDF5 File Structure (mfp_r2_1024.h5)

This HDF5 file contains datasets and groups related to various dimensionality reduction techniques applied to the Morgan Fingerprint (radius 2, 1024 bits).

### Datasets and Groups:

1. **GTM_coordinates**:
   - Coordinates of the data after applying Generative Topographic Mapping (GTM).

2. **GTM_metrics**:
   - Metrics related to neighborhood preservation.

3. **PCA_coordinates**:
   - Coordinates of the data after applying Principal Component Analysis (PCA).

4. **PCA_metrics**:
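   - Neighborhood-preservation metrics for the PCA projection (in the example output at the end of this notebook: `AUC`, `LCMC`, `QNN`, `Qglobal`, `Qlocal`, `cont_ls`, `trust_ls`, `kmax`, `nn_overlap`, `nn_overlap_best`).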

5. **UMAP_coordinates**:
   - Coordinates after applying Uniform Manifold Approximation and Projection (UMAP).

6. **UMAP_metrics**:
   - Metrics related to neighborhood preservation.

7. **dataframe**:
   - A tabular dataset, containing information related to the features or molecules analyzed in the file.

8. **mfp_r2_1024**:
   - The original Morgan Fingerprint dataset (radius 2, 1024 bits).
   - It encodes molecular features in a binary vector format, used for cheminformatics and molecular modeling.

9. **t-SNE_coordinates**:
   - Coordinates after applying t-Distributed Stochastic Neighbor Embedding (t-SNE).

10. **t-SNE_metrics**:
    - Metrics related to neighborhood preservation.

This file organizes molecular data and its corresponding projections in various dimensionality reduction methods, enabling analysis and comparison of the techniques.


In [18]:
# Load the stored dimensionality-reduction results for one descriptor set.
descriptor_set = 'mfp_r2_1024'
methods_to_extract = ['PCA']

# Root directory of the benchmark outputs. Set the CDR_BENCH_OUTPUT_DIR
# environment variable to point at your own output folder; the original
# author's location is kept as the default so existing setups keep working.
output_root = os.environ.get(
    'CDR_BENCH_OUTPUT_DIR',
    r'C:\Users\akash\OneDrive\Desktop\DR\New_DR\cdr_bench\Output2',
)

# Results are laid out as <root>/<dataset>/<descriptor_set>/<descriptor_set>.h5,
# so the path follows `descriptor_set` instead of repeating the name by hand.
file_path = os.path.join(output_root, 'CHEMBL33', descriptor_set, f'{descriptor_set}.h5')

# Unpacks three values: a dataframe, the feature array and the per-method results.
df, fp_array, results = read_optimization_results(file_path, feature_name=descriptor_set, method_names=methods_to_extract)

In [19]:
# Preview the first five rows of the dataframe read from the results file.
df.head(n=5)

Unnamed: 0,dataset,smi
0,CHEMBL33,BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br
1,CHEMBL33,BrCCCCCCBr
2,CHEMBL33,BrCc1cc(Br)c2cc(NBr)c(Br)c(Br)c2c1
3,CHEMBL33,BrCc1cc(Br)c2cc(NBr)c(Br)cc2c1Br
4,CHEMBL33,BrCc1cc2cc(Br)c(NBr)cc2c(Br)c1Br


In [20]:
# Dimensions of the feature array returned by read_optimization_results.
fp_array.shape

(1253, 2048)

In [21]:
# Display the results returned for the requested methods (only 'PCA' was requested).
results

{'PCA': {'metrics': {'AUC': np.float64(0.7490751579907181),
   'LCMC': array([2.00318596e-01, 2.05904551e-01, 2.11224478e-01, ...,
          7.54666755e-04, 3.82723432e-06, 0.00000000e+00]),
   'QNN': array([0.20111732, 0.207502  , 0.21362064, ..., 0.99915722, 0.99920511,
          1.        ]),
   'Qglobal': np.float64(0.821823994058668),
   'Qlocal': np.float64(0.4391937678192743),
   'cont_ls': array([0.97046352, 0.96007936, 0.95046278, 0.93980193, 0.92414277]),
   'kmax': np.int64(237),
   'nn_overlap': array([20.75019952, 22.60175579, 23.74301676, 26.36073424, 34.13407821]),
   'nn_overlap_best': np.float64(26.36073423782921),
   'trust_ls': array([0.86401232, 0.8531832 , 0.84696975, 0.83999683, 0.8397765 ])},
  'coordinates': array([[ -2.12014389,   1.08764101],
         [ -2.26125522,   1.41243534],
         [ -2.42520217,   0.01348121],
         ...,
         [  5.04943598,  -7.9835993 ],
         [  5.10513373, -10.10171932],
         [  5.05167446,  -9.46756859]])}}