In [None]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
from rdkit.Chem import Draw
from rdkit.Chem import rdFingerprintGenerator
import networkx as nx
import pandas as pd

In [4]:
# Build the similarity network from the pairwise-match table: one node per
# scan id (kept as a string), one edge per row, with the dot_product column
# stored on the edge as `cosine_score`.
G = nx.Graph()
with open('net_0_7.tsv') as f:  # file format is shown below
    f.readline()  # skip the header row
    for line in f:
        fields = line.split('\t')
        if len(fields) < 5:
            # Blank or truncated row (e.g. a trailing newline at end of file).
            continue
        # Columns used: 0 = scan_1, 2 = scan_2, 4 = dot_product.
        G.add_edge(fields[0], fields[2], cosine_score=float(fields[4]))

In [2]:
# Format of net_0_7.tsv: tab-separated with a header row (generated by https://github.com/mohimanilab/MASSTplus)
# scan_1	mz_1	scan_2	mz_2	dot_product	dot_product_shared	dot_product_shifted	scan	refscan
# 211345	148.133	3855717	85.0842	0.776385	0.726864	0.0495215	211345	3855717
# 211345	148.133	1077302	210.16	0.771759	0.583175	0.188584	211345	1077302
# 211345	148.133	2302324	231.084	0.764259	0.581443	0.182816	211345	2302324
# 211345	148.133	219395	149.06	0.759846	0.759846	0	211345	219395
# 211345	148.133	2241169	227.138	0.75851	0.541649	0.216861	211345	2241169
# 211345	148.133	793036	192.062	0.752499	0.0411562	0.711343	211345	793036
# 211345	148.133	1185457	218.984	0.735534	0.517326	0.218208	211345	1185457
# 211345	148.133	790913	192.061	0.726921	0.0396675	0.687254	211345	790913
# 211345	148.133	252462	151.035	0.719489	0.245673	0.473816	211345	252462

In [5]:
# Map the 0-based line number of the annotation table to the SMILES string
# found in its second column.
smiles_dict = {}
with open('all_distances_smiles.tsv') as f:  # file format is shown below
    for line_no, line in enumerate(f):
        fields = line.strip().split('\t')
        # A row without a second column carries no structure; record it as
        # 'N/A' so later line numbers keep their position in the mapping.
        smiles_dict[line_no] = fields[1] if len(fields) > 1 else 'N/A'
# Number of lines read (same value the manual counter used to end with).
cnt = len(smiles_dict)

In [6]:
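# Format of all_distances_smiles.tsv: tab-separated, no header row; the second column is the SMILES string ('N/A' or '0' when no structure is available)
# 0.899029	C([C@H]([C@H]([C@H](C=O)O)O)O)OP(=O)([O-])[O-]	D-ribose 5-phosphate dianion - 40.0 eV Unknown	584312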
# 0.741545	N/A	Suspect related to Spectral Match to 9(S)-HpOTrE from NIST14 (predicted molecular formula SIRIUS: C18H34N3O4 / BUDDY: C20H36O5) with delta m/z 46.042 (putative explanation: unspecified|unspecified; atomic difference: 2C,6H,1O|4H,3N) [M-H2O+H]+ [M-H2O+H]+	198983
# 0.960905	O=C(O)C=1C=CC=CC1O	Salicyclic acid [M+H]+	246876
# -1	0	0	0
# 0.794993	O=C(OC1C(OC(OCCC2=CC=C(OC)C(O)=C2)C(O)C1OC3OC(C)C(O)C(O)C3O)COC4OCC(O)C(O)C4O)C=CC5=CC=C(O)C(OC)=C5	Angoroside C [M+H]+	356865
# -1	0	0	0
# 0.880612	OC(=O)C1=CN=CC=C1	NICOTINATE - 20.0 eV M+H	8799
# 0.716229	C(CCCC(=O)O)CCCC(=O)O	azelaic_acid M-H	97505
# -1	0	0	0
# 0.996778	Cc1cc(O)cc(O)c1	Orcinol CollisionEnergy:205060 M+H	122434


In [7]:
# Report how many connected components the network contains.
component_count = nx.number_connected_components(G)
print("number of components:", component_count)

# Order the components by node count, largest first, and keep a subgraph
# view of the biggest one for the downstream analysis.
Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
largest_nodes = Gcc[0]
Gc = G.subgraph(largest_nodes)
print("largest component: ", Gc.number_of_nodes())

number of components: 98881
largest component:  3334511


In [8]:
# Shared RDKit fingerprint generator used for every structure comparison.
fp_max_path = 7  # value passed as the generator's `maxPath` setting
rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=fp_max_path)

In [None]:
# For every edge of the largest component whose two scans both have a usable
# structure, record the Tanimoto similarity of the structures alongside the
# edge's spectral cosine score.

# Placeholder values in the annotation table meaning "no structure available".
missing_smiles = {'N/A', '0'}

# SMILES -> fingerprint (None when the SMILES cannot be fingerprinted). Keyed
# by SMILES string so each distinct structure is parsed once instead of once
# per edge; memory grows with the number of distinct structures.
fp_cache = {}


def smiles_fingerprint(smiles):
    """Return the fingerprint of `smiles` from `rdkit_gen`, or None on failure.

    Chem.MolFromSmiles returns None for SMILES it cannot parse; that case and
    any exception raised while fingerprinting are both reported as None.
    Results (including failures) are memoised in `fp_cache`.
    """
    if smiles not in fp_cache:
        fingerprint = None
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is not None:
                fingerprint = rdkit_gen.GetFingerprint(mol)
        except Exception:
            # Best effort: a structure that cannot be processed is skipped
            # by the caller. Unlike a bare `except:`, Ctrl-C still works.
            fingerprint = None
        fp_cache[smiles] = fingerprint
    return fp_cache[smiles]


tanimoto_res = []
tanimoto_res_cos = []
cnt = 0  # stays 0 if the component has no edges
for cnt, (i, j, d) in enumerate(Gc.edges(data=True), start=1):
    if cnt % 1000 == 0:
        print(cnt)
    # Node ids are scan numbers stored as strings; smiles_dict is keyed by int.
    sm1 = smiles_dict[int(i)]
    sm2 = smiles_dict[int(j)]
    if sm1 in missing_smiles or sm2 in missing_smiles:
        continue
    fp1 = smiles_fingerprint(sm1)
    fp2 = smiles_fingerprint(sm2)
    if fp1 is None or fp2 is None:
        print("error", sm1, sm2)
        continue
    # Append to both lists together so they stay index-aligned.
    tanimoto_res.append(DataStructs.TanimotoSimilarity(fp1, fp2))
    tanimoto_res_cos.append(d['cosine_score'])

In [None]:
# Collect the paired scores into one table and write it to disk.
# Building the frame from a dict of columns also works when no edge produced
# a result; assigning column names to an empty, column-less frame would raise.
df = pd.DataFrame({'tanimoto': tanimoto_res, 'cosine': tanimoto_res_cos})
df.to_csv('tanimoto_matches_cosine_07.csv')