In [7]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import h5py
import skdim
import matplotlib.pyplot as plt
from glob import glob
from skdim.id import FisherS
from src.cdr_bench.io_utils.data_preprocessing import get_filename
from src.cdr_bench.io_utils.io import load_fp_array
from tqdm import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Path to data

In [8]:
# Directory to save plots and metrics summaries
h5_file_path = '../datasets/*.h5'
files = glob(h5_file_path)

In [9]:
# Load fingerprint arrays for all files
fp_arrays = {get_filename(file): load_fp_array(file).astype(np.float64) for file in files}

AttributeError: 'NoneType' object has no attribute 'astype'

In [13]:
fp_arrays

{'chembl_random_9269_seed301': array([[0., 1., 0., ..., 0., 0., 0.],
        [0., 2., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.]]),
 'CHEMBL4644': array([[0., 1., 0., ..., 0., 0., 0.],
        [0., 4., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.]]),
 'chembl_random_500_seed976': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 2., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 'chembl_random_1500_seed54': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., 

# Calculating FP

In [22]:
# Initialize the results list
results = []

# Iterate through each key in the dictionary
for key, array in tqdm(fp_arrays.items()):
    # 1. Calculate the length of the vector
    length, desc = array.shape[0], array.shape[1]
    
    # 2. Standardize the vector
    scaler = StandardScaler()
    standardized_array = scaler.fit_transform(array)
    
    # 3. Calculate Fisher information dimension using skdim
    fid = FisherS().fit_transform(standardized_array)
    
    # Store the results
    results.append({
        'Dataset': key,
        '# of data points': length,
        '# non-constant descriptors': desc,
        'Fisher Separability ID': fid
    })

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:29<00:00,  1.29s/it]


In [None]:
# Convert the results to a DataFrame
df = pd.DataFrame(results)

# Save the DataFrame to a file (optional)
df.to_csv('../results/Fisher_ID_stat.csv', index=False)