In [21]:
import glob
import os
import itertools
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.DataStructs import FingerprintSimilarity
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import matplotlib.cm as cm

### MFP_matrix

`AllChem.GetMorganFingerprintAsBitVect(mol, 2)` is used to generate the Morgan fingerprint for each molecule in `mol_list`.
_The Morgan fingerprint_ is a type of circular fingerprint that represents the molecular structure of a compound. 
The 2 in the function call specifies the radius of the circular fingerprint. This means that the fingerprint will capture structural information up to **two bonds** away from each atom.

`itertools.product(fp, repeat=2)` generates all possible ordered pairs of fingerprints from the list `fp`.
Then `FingerprintSimilarity(x, y)` calculates the similarity score between `x` and `y` for every pair generated by `itertools.product`, and the scores are stored in the `mfps` list.

Finally, we convert the `mfps` list into a NumPy array that is reshaped into a square matrix with one row and one column per molecule.


In [22]:
def MFP_matrix(mol_list):
    """Compute Morgan fingerprints and their pairwise similarity matrix.

    Parameters
    ----------
    mol_list : iterable
        Molecules to fingerprint; each one is passed to
        ``AllChem.GetMorganFingerprintAsBitVect`` with radius 2.

    Returns
    -------
    tuple
        ``(fingerprints, similarity)`` where ``fingerprints`` is the list of
        bit-vector fingerprints in input order and ``similarity`` is a square
        NumPy array whose entry ``[i, j]`` is
        ``FingerprintSimilarity(fingerprints[i], fingerprints[j])``.
    """
    fingerprints = []
    for molecule in mol_list:
        fingerprints.append(AllChem.GetMorganFingerprintAsBitVect(molecule, 2))

    # Row-major enumeration of every ordered pair, including self-pairs, so the
    # flat score list can be reshaped directly into an n x n matrix.
    scores = [
        FingerprintSimilarity(row_fp, col_fp)
        for row_fp in fingerprints
        for col_fp in fingerprints
    ]

    n_mols = len(fingerprints)
    similarity = np.array(scores).reshape(n_mols, n_mols)
    return fingerprints, similarity

Gets the target name from the name of the SDF file.
Creates the MFP matrix from the molecules read by the supplier.
Then uses `SDMolSupplier` to iterate through all the molecules in the SDF file and get the name of each of them.

In [29]:
target_data = {}
target_labels = []

# Prefix of the SDF file name to look for in the current working directory.
TARGET_PREFIX = 'KNC'

# The prefix has to be a plain string: interpolating a list such as ['KNC']
# produces the pattern "['KNC']*.sdf", which glob reads as a character class
# and so matches any file starting with ', K, N or C.
# glob.escape keeps special characters in the directory path literal, and
# sorting makes the choice of sdd[0] deterministic when several files match.
sdd = sorted(glob.glob(os.path.join(glob.escape(os.getcwd()), f'{TARGET_PREFIX}*.sdf')))
if not sdd:
    raise FileNotFoundError(
        f"No SDF file matching '{TARGET_PREFIX}*.sdf' found in {os.getcwd()}"
    )
sdf = sdd[0]

# Target name is the file name up to '_ligand'. os.path.basename strips the
# directory regardless of the platform's path separator.
target = os.path.basename(sdf).split('_ligand')[0]

suppl = Chem.SDMolSupplier(sdf)

# Parse the file once and reuse the molecules for both the fingerprints and
# the names, so the two lists are guaranteed to be aligned.
mols = list(suppl)
unreadable = [idx for idx, mol in enumerate(mols) if mol is None]
if unreadable:
    # RDKit's supplier returns None for records it cannot parse; fail early
    # with the offending record indices instead of a cryptic downstream error.
    raise ValueError(f"Unreadable records in {sdf} at indices {unreadable}")

fp_list, fp_mat = MFP_matrix(mols)
name_list = [mol.GetProp('_Name') for mol in mols]

In [32]:
# Store this target's fingerprints and names under its own key rather than as
# flat top-level entries, so data for different targets cannot overwrite each
# other and the dictionary keys are target names.
target_data[target] = {
    'fp_list': fp_list,
    'name_list': name_list,
}

# Show which targets have been registered so far.
target_data.keys()

dict_keys(['KNC'])

In [None]:

# Register the fingerprints and molecule names of the current target.
target_data[target] = {
    'fp_list': fp_list,
    'name_list': name_list,
}

# Remove any labels already recorded for this target before adding them again,
# so re-running the cell does not duplicate entries. The list is updated in
# place to keep existing references to it valid.
target_labels[:] = [label for label in target_labels if label != target]

# One label per molecule name stored above.
target_labels.extend([target] * len(name_list))