In [95]:
# comparee_all_coefficients(smiles) builds a matrix showing the level of
# similarity between all molecules, pair by pair, using data-fused
# similarity coefficients.
# Half of the matrix holds NaN values because every value would otherwise
# be repeated.

# compare_all_coefficients(smiles):
# 1. Prints the values of 4 different similarity measures. A data fusion of
#    them is computed and printed as well.

# 2. Checks whether the different similarity coefficients arrive at the same
#    molecule as the one most similar to the reference molecule. If they do
#    not, a message is printed stating which similarity coefficients
#    disagree.

# 3. Lists every molecule together with the molecule that is most similar
#    to it.

In [1]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem, MACCSkeys
from rdkit.Chem.Fingerprints import FingerprintMols
# Original fingerprint helpers: they do not take into account whether the
# reference molecule itself is present in the SMILES list.
#
# Fingerprint generators keyed by descriptor name. Each value is a callable
# that takes an RDKit molecule and returns its fingerprint; the two Morgan
# entries differ only in the second argument passed to the generator (3 vs 5).
descriptors = {
    'maccs': lambda mol: MACCSkeys.GenMACCSKeys(mol),
    'morgan3': lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 3),
    'morgan5': lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 5),
    'rdkit': lambda mol: FingerprintMols.FingerprintMol(mol),
}

# Similarity functions keyed by metric name. Every entry is the "Bulk"
# variant, i.e. it is called as f(reference_fp, list_of_fps), which is how the
# similarity helpers in this notebook invoke it.
# 'asymmetric' previously pointed at the pairwise AsymmetricSimilarity, which
# cannot take a list of fingerprints as its second argument.
metrics = {
    'asymmetric':    DataStructs.BulkAsymmetricSimilarity,
    'braunblanquet': DataStructs.BulkBraunBlanquetSimilarity,
    'cosine':        DataStructs.BulkCosineSimilarity,
    'dice':          DataStructs.BulkDiceSimilarity,
    'kulczynski':    DataStructs.BulkKulczynskiSimilarity,
    'mcconnaughey':  DataStructs.BulkMcConnaugheySimilarity,
    'rogotgoldberg': DataStructs.BulkRogotGoldbergSimilarity,
    'russel':        DataStructs.BulkRusselSimilarity,
    'sokal':         DataStructs.BulkSokalSimilarity,
    'tanimoto':      DataStructs.BulkTanimotoSimilarity
}

"""
Returns a list of similarity scores for a list of smiles strings compared to a
reference compound. The fingerprints and similarity coefficients can be chosen
from the list of descriptors and metrics (default 'rdkit' and 'tanimoto').
"""
def fpss_sim(ref, smiles, descriptor='rdkit', metric='tanimoto'):
    """Score every entry of ``smiles`` against the reference ``ref``.

    Nothing is filtered out: if ``ref`` itself occurs in ``smiles`` it is
    scored like any other entry.

    Parameters
    ----------
    ref : str
        Reference SMILES.
    smiles : iterable of str
        SMILES strings to compare with the reference.
    descriptor : str
        Fingerprint type, a key of ``descriptors`` (default ``'rdkit'``).
    metric : str
        Similarity coefficient, a key of ``metrics`` (default ``'tanimoto'``).

    Returns
    -------
    list
        One similarity score per entry of ``smiles``, in input order. An
        empty input yields an empty list.

    Raises
    ------
    ValueError
        If ``descriptor`` or ``metric`` is unknown, or if ``ref`` or any
        entry of ``smiles`` cannot be parsed as SMILES.
    """
    if descriptor not in descriptors:
        raise ValueError('Invalid descriptor name ' + descriptor)

    if metric not in metrics:
        raise ValueError('Invalid metric ' + metric)

    # Looked up once: neither depends on the molecule being processed.
    fingerprint = descriptors[descriptor]
    score = metrics[metric]

    # MolFromSmiles returns None instead of raising on unparsable input.
    ref_mol = Chem.MolFromSmiles(ref)
    if ref_mol is None:
        raise ValueError('Invalid reference SMILES ' + repr(ref))
    ref_fp = fingerprint(ref_mol)

    fps = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            raise ValueError('Invalid SMILES ' + repr(smi))
        fps.append(fingerprint(mol))

    # Nothing to compare against: answer directly instead of calling RDKit.
    if not fps:
        return []

    return score(ref_fp, fps)

In [18]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
from copy import copy
# (see also itertools.combinations for enumerating molecule pairs)
def comparee_all_coefficients(smiles, sort_by=None):
    """Print the matrix of data-fused similarities between all molecules.

    Every SMILES string is used in turn as the reference (one row per
    reference). For each reference the cosine, Dice, Sokal and Tanimoto
    scores against all molecules are computed with ``fpss_sim`` on 'rdkit'
    fingerprints and averaged ("data fusion"). The diagonal and everything
    below it are set to NaN so that each pair appears only once.

    Parameters
    ----------
    smiles : sequence of str
        SMILES strings; they label both the rows and the columns.
    sort_by : str, optional
        Column whose values order the printed rows (descending). When
        omitted, the previously hard-coded column 'C1CCCCC1' is used if it
        is part of ``smiles``; otherwise the last column is used, since it
        is the one with the fewest NaN entries.

    Returns
    -------
    None
        The matrix is printed, not returned.
    """
    legacy_sort_column = 'C1CCCCC1'
    smiles = list(smiles)

    if not smiles:
        # No molecules: show the empty matrix rather than failing on the sort.
        print(pd.DataFrame(columns=smiles, index=smiles, dtype=float))
        return

    fused_rows = []
    for ref in smiles:
        per_metric = pd.DataFrame(
            {
                'Cos': fpss_sim(ref, smiles, descriptor='rdkit', metric='cosine'),
                'Dice': fpss_sim(ref, smiles, descriptor='rdkit', metric='dice'),
                'Sokal': fpss_sim(ref, smiles, descriptor='rdkit', metric='sokal'),
                'Tan': fpss_sim(ref, smiles, descriptor='rdkit', metric='tanimoto'),
            },
            index=smiles,
        )
        # Data fusion: arithmetic mean of the four coefficients per molecule.
        fused_rows.append(per_metric.sum(axis=1).values / 4)

    fused = np.array(fused_rows, dtype=float)
    # Blank out the diagonal and the lower triangle (row index >= column index).
    fused[np.tril_indices(len(smiles))] = np.nan

    matrix = pd.DataFrame(fused, columns=smiles, index=smiles, dtype=float)

    if sort_by is None:
        sort_by = legacy_sort_column if legacy_sort_column in smiles else smiles[-1]
    print(matrix.sort_values(sort_by, ascending=False))
    
    
   

        


In [19]:
# Example run: print the data-fused pairwise similarity matrix for six SMILES strings.
comparee_all_coefficients(["CC(=O)OC","CC(=O)OO","CC(=O)OCC", "O=C=O", "CCN(CC)CC", "C1CCCCC1"])

           CC(=O)OC  CC(=O)OO  CC(=O)OCC     O=C=O  CCN(CC)CC  C1CCCCC1
CC(=O)OO        NaN       NaN   0.483232  0.079922   0.220703  0.200719
CC(=O)OC        NaN  0.523807   0.785214  0.144338   0.244964  0.181995
CC(=O)OCC       NaN       NaN        NaN  0.180797   0.246912  0.154282
CCN(CC)CC       NaN       NaN        NaN       NaN        NaN  0.153655
O=C=O           NaN       NaN        NaN       NaN   0.000000  0.000000
C1CCCCC1        NaN       NaN        NaN       NaN        NaN       NaN


In [100]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
from copy import copy

def compare_all_coefficients(smiles):
    """Compare four similarity coefficients for every molecule in ``smiles``.

    Each SMILES string is used in turn as the reference and compared with
    all entries that differ from it. For every reference the function:

    1. Prints the values of the four similarity measures (cosine, Dice,
       Sokal, Tanimoto on 'rdkit' fingerprints) and then their data fusion,
       i.e. the mean of the four values per molecule.
    2. Checks whether the coefficients agree on which molecule is most
       similar to the reference. For every pair of coefficients that do not
       agree, a message naming the two coefficients is printed.
    3. After all references are processed, prints a table listing every
       reference together with its most similar molecule (by fused score)
       and that fused score.

    Parameters
    ----------
    smiles : sequence of str
        SMILES strings of the molecules to compare.

    Raises
    ------
    ValueError
        If a reference has no other molecule to be compared with.
    """
    score_names = sorted(["Tan", "Cos", "Dice", "Sokal"])

    references = []
    best_matches = []
    best_scores = []

    # Walk through all substrates, taking one as the reference each time.
    for ref in smiles:
        print("REFERENSEN ÄR " + ref)

        # Everything except the reference itself; these label the table rows.
        others = [smi for smi in smiles if smi != ref]
        if not others:
            raise ValueError('No molecule to compare with reference ' + repr(ref))

        # fps_sim removes the reference from the list it is handed, so each
        # call gets its own throw-away copy and ``others`` stays intact.
        tan = fps_sim(ref, list(others), descriptor='rdkit', metric='tanimoto')
        dice = fps_sim(ref, list(others), descriptor='rdkit', metric='dice')
        cos = fps_sim(ref, list(others), descriptor='rdkit', metric='cosine')
        sokal = fps_sim(ref, list(others), descriptor='rdkit', metric='sokal')

        # One row per compared molecule, one column per similarity measure.
        df = pd.DataFrame({'Cos': cos, 'Dice': dice, 'Sokal': sokal, 'Tan': tan}, index=others)
        print(df)
        print(" ")

        # Data fusion: mean of the four coefficients for each molecule.
        fused = df.sum(axis=1) / 4
        print("Datafused coefficients:")
        print(fused)
        print(" ")

        references.append(ref)
        best_matches.append(fused.idxmax())
        best_scores.append(np.amax(fused.values))

        # Separator lines kept so the printed layout stays as before.
        for _ in score_names:
            print(" ")

        # Check every unordered pair of coefficients exactly once. Removing
        # names from the list while iterating over it (as done previously)
        # skipped entries, so some pairs were never compared.
        top_hit = {name: df[name].idxmax() for name in score_names}
        for pos, first in enumerate(score_names):
            for second in score_names[pos + 1:]:
                if top_hit[second] != top_hit[first]:
                    print("The similarity coefficients doesn't agree on which molecule is most similar to the query")
                    print(second + " and " + first + " shows different molecules")

    index = pd.MultiIndex.from_arrays(
        [references, best_matches],
        names=('Referens', 'Molekyl mest lik referensen'),
    )
    print(pd.DataFrame(best_scores, index, dtype=float))


In [101]:
# Example run: print the per-reference coefficient tables and the final summary for six SMILES strings.
compare_all_coefficients(["CC(=O)OC","CC(=O)OO","CC(=O)OCC", "O=C=O", "CCN(CC)CC", "C1CCCCC1"])

REFERENSEN ÄR CC(=O)OC
                Cos      Dice     Sokal       Tan
CC(=O)OO   0.650791  0.648649  0.315789  0.480000
CC(=O)OCC  0.877058  0.869565  0.625000  0.769231
O=C=O      0.258199  0.173913  0.050000  0.095238
CCN(CC)CC  0.335410  0.333333  0.111111  0.200000
C1CCCCC1   0.258199  0.250000  0.076923  0.142857
 
Datafused coefficients:
CC(=O)OO     0.523807
CC(=O)OCC    0.785214
O=C=O        0.144338
CCN(CC)CC    0.244964
C1CCCCC1     0.181995
dtype: float64
 
 
 
 
 
REFERENSEN ÄR CC(=O)OO
                Cos      Dice     Sokal       Tan
CC(=O)OC   0.650791  0.648649  0.315789  0.480000
CC(=O)OCC  0.618347  0.604651  0.276596  0.433333
O=C=O      0.140028  0.100000  0.027027  0.052632
CCN(CC)CC  0.303170  0.303030  0.098039  0.178571
C1CCCCC1   0.280056  0.275862  0.086957  0.160000
 
Datafused coefficients:
CC(=O)OC     0.523807
CC(=O)OCC    0.483232
O=C=O        0.079922
CCN(CC)CC    0.220703
C1CCCCC1     0.200719
dtype: float64
 
 
 
 
 
REFERENSEN ÄR CC(=O)OCC
        

In [49]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem, MACCSkeys
from rdkit.Chem.Fingerprints import FingerprintMols

# Fingerprint generators keyed by descriptor name. Each value is a callable
# that takes an RDKit molecule and returns its fingerprint; the two Morgan
# entries differ only in the second argument passed to the generator (3 vs 5).
descriptors = {
    'maccs': lambda mol: MACCSkeys.GenMACCSKeys(mol),
    'morgan3': lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 3),
    'morgan5': lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 5),
    'rdkit': lambda mol: FingerprintMols.FingerprintMol(mol),
}

# Similarity functions keyed by metric name. Every entry is the "Bulk"
# variant, i.e. it is called as f(reference_fp, list_of_fps), which is how the
# similarity helpers in this notebook invoke it.
# 'asymmetric' previously pointed at the pairwise AsymmetricSimilarity, which
# cannot take a list of fingerprints as its second argument.
metrics = {
    'asymmetric':    DataStructs.BulkAsymmetricSimilarity,
    'braunblanquet': DataStructs.BulkBraunBlanquetSimilarity,
    'cosine':        DataStructs.BulkCosineSimilarity,
    'dice':          DataStructs.BulkDiceSimilarity,
    'kulczynski':    DataStructs.BulkKulczynskiSimilarity,
    'mcconnaughey':  DataStructs.BulkMcConnaugheySimilarity,
    'rogotgoldberg': DataStructs.BulkRogotGoldbergSimilarity,
    'russel':        DataStructs.BulkRusselSimilarity,
    'sokal':         DataStructs.BulkSokalSimilarity,
    'tanimoto':      DataStructs.BulkTanimotoSimilarity
}

"""
Returns a list of similarity scores for a list of smiles strings compared to a
reference compound. The fingerprints and similarity coefficients can be chosen
from the list of descriptors and metrics (default 'rdkit' and 'tanimoto').
"""
def fps_sim(ref, smiles, descriptor='rdkit', metric='tanimoto'):
    """Score the entries of ``smiles`` against ``ref``, skipping ``ref`` itself.

    Entries whose string equals ``ref`` are excluded from the comparison.
    The match is a plain string comparison, so a differently written SMILES
    of the same molecule is not recognised as the reference.

    Side effect: every occurrence of ``ref`` is removed from ``smiles`` in
    place, so that after the call the list lines up one-to-one with the
    returned scores. Existing callers may use the pruned list as row labels.
    Pass a copy if the original list must be preserved.

    Parameters
    ----------
    ref : str
        Reference SMILES.
    smiles : list of str
        SMILES strings to compare with the reference (mutated, see above).
    descriptor : str
        Fingerprint type, a key of ``descriptors`` (default ``'rdkit'``).
    metric : str
        Similarity coefficient, a key of ``metrics`` (default ``'tanimoto'``).

    Returns
    -------
    list
        One similarity score per remaining entry of ``smiles``, in order.
        Empty when nothing but the reference was supplied.

    Raises
    ------
    ValueError
        If ``descriptor`` or ``metric`` is unknown, or if ``ref`` or any
        compared entry cannot be parsed as SMILES.
    """
    if descriptor not in descriptors:
        raise ValueError('Invalid descriptor name ' + descriptor)

    if metric not in metrics:
        raise ValueError('Invalid metric ' + metric)

    # Looked up once: neither depends on the molecule being processed.
    fingerprint = descriptors[descriptor]
    score = metrics[metric]

    # MolFromSmiles returns None instead of raising on unparsable input.
    ref_mol = Chem.MolFromSmiles(ref)
    if ref_mol is None:
        raise ValueError('Invalid reference SMILES ' + repr(ref))

    others = [smi for smi in smiles if smi != ref]

    fps = []
    for smi in others:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            raise ValueError('Invalid SMILES ' + repr(smi))
        fps.append(fingerprint(mol))

    # Drop the reference from the caller's list in a single step. Removing
    # items while looping over the same list skips the element that follows
    # each removal, which left duplicates of the reference behind.
    if len(others) != len(smiles):
        smiles[:] = others

    # Nothing left to compare against: answer directly.
    if not fps:
        return []

    return score(fingerprint(ref_mol), fps)