In [1]:
import time
import random
import sys
from pathlib import Path
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt 
from matplotlib import pyplot
from rdkit import Chem
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
from rdkit.Chem import Draw
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.Draw import SimilarityMaps

# Show full results (never abbreviate printed arrays with "...")
np.set_printoptions(threshold=sys.maxsize)

# Read the input CSV file. The context manager closes the file even if
# reading fails part-way.
with open("smiles2.csv", "r") as f:
    ligands = f.readlines()
ligands = ligands[1:]  # remove the first line of csv file (header)

# Parallel lists: molecules[k] is the parsed molecule named labels[k]
molecules = []
labels = []

# Insert molecules and labels in arrays from file.
# NOTE: fields are split on plain commas, so quoted fields that contain a
# comma are not supported by this reader.
for line_number, raw_line in enumerate(ligands, start=2):
    # Drop the line terminator so it cannot leak into the last field
    line = raw_line.rstrip("\r\n")
    if not line.strip():
        # Tolerate blank lines (e.g. a trailing empty line at end of file)
        continue
    line = line.split(",")
    if len(line) < 3:
        raise ValueError(
            f"smiles2.csv line {line_number}: expected at least 3 "
            f"comma-separated fields, got {len(line)}"
        )
    mol = Chem.MolFromSmiles(line[2])
    if mol is None:
        # MolFromSmiles signals an unparsable SMILES by returning None;
        # fail here with context instead of later in the fingerprint step.
        raise ValueError(
            f"smiles2.csv line {line_number}: cannot parse SMILES {line[2]!r}"
        )
    molecules.append(mol)
    labels.append(line[1])

# Create fingerprints for all molecules
rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=7)
fgrps = [rdkit_gen.GetFingerprint(mol) for mol in molecules]

# Calculate number of fingerprints
nfgrps = len(fgrps)

# Define a function to calculate similarities among the molecules
def pairwise_similarity(fingerprints_list):
    """Build the pairwise Tanimoto similarity matrix and print row statistics.

    For each fingerprint, the Tanimoto similarity to every fingerprint in
    ``fingerprints_list`` (itself included) is computed once. The
    off-diagonal values are stored symmetrically in an ``(n, n)`` NumPy
    array; the diagonal is left at 0. Three summary lines are printed,
    naming (via the module-level ``labels`` list) the molecule whose
    similarity row has:

    * the largest standard deviation (followed by that row),
    * the largest mean,
    * the largest standard-deviation / mean ratio.

    Every molecule, including the first one, takes part in these statistics.

    Args:
        fingerprints_list: Sequence of fingerprints accepted by
            ``DataStructs.BulkTanimotoSimilarity``, in the same order as
            ``labels``.

    Returns:
        The ``(n, n)`` similarity matrix. The same array is also bound to
        the module-level name ``similarities`` for existing callers. For an
        empty input an empty ``(0, 0)`` matrix is returned and nothing is
        printed.
    """
    global similarities
    count = len(fingerprints_list)
    similarities = np.zeros((count, count))
    if count == 0:
        # Nothing to compare and no label to report.
        return similarities

    best_std, std_index, std_row = -1, 0, None
    best_mean, mean_index = -1, 0
    best_ratio, ratio_index = -1, 0

    for i, fingerprint in enumerate(fingerprints_list):
        # One bulk call per molecule: the full row feeds the statistics and
        # its first i entries are the values for the matrix triangle.
        row = DataStructs.BulkTanimotoSimilarity(fingerprint, fingerprints_list)
        spread = np.std(row)
        mean = np.average(row)

        if spread > best_std:
            best_std, std_index, std_row = spread, i, row
        if mean > best_mean:
            best_mean, mean_index = mean, i
        ratio = spread / mean
        if ratio > best_ratio:
            best_ratio, ratio_index = ratio, i

        # Mirror the values so the matrix is symmetric (empty slices at i == 0).
        similarities[i, :i] = row[:i]
        similarities[:i, i] = row[:i]

    print("max std dev ", best_std, labels[std_index], std_row)
    print("max average ", best_mean, labels[mean_index])
    print("max stdev/average ", best_ratio, labels[ratio_index])
    return similarities

# Calculate similarities of molecules; pairwise_similarity stores the matrix
# in the module-level name `similarities`.
pairwise_similarity(fgrps)

# Write similarities. The context manager closes the file even if printing
# the matrix raises.
with open("similarities.txt", "w") as f:
    print(similarities, file=f)







max std dev  0.21674158133373447 2-hydroxypentanoic acid [0.019337016574585635, 0.01984126984126984, 0.021367521367521368, 0.010416666666666666, 0.016304347826086956, 0.3064516129032258, 0.26548672566371684, 0.2923076923076923, 0.2786885245901639, 0.055350553505535055, 0.2619047619047619, 0.2246376811594203, 0.09361702127659574, 0.2753623188405797, 0.21904761904761905, 0.22727272727272727, 0.31007751937984496, 0.18181818181818182, 0.09180327868852459, 0.20134228187919462, 0.10077519379844961, 0.2222222222222222, 0.2684563758389262, 0.25157232704402516, 0.22916666666666666, 0.07913669064748201, 0.43373493975903615, 0.5373134328358209, 0.29850746268656714, 0.7611940298507462, 0.5204081632653061, 0.5862068965517241, 0.43037974683544306, 1.0, 0.7701149425287356, 0.410958904109589, 0.8375, 0.7701149425287356, 0.273972602739726, 0.2, 0.23469387755102042, 0.2948717948717949, 0.2631578947368421, 0.379746835443038, 0.248, 0.39, 0.5131578947368421, 0.3482142857142857, 0.33620689655172414, 0.2846