diff --git a/Project.toml b/Project.toml index 691e5af..447b859 100644 --- a/Project.toml +++ b/Project.toml @@ -8,7 +8,6 @@ ChemicalIdentifiers = "fa4ea961-1416-484e-bda2-883ee1634ba5" Clapeyron = "7c7805af-46cc-48c9-995b-ed0ed2dc909a" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" MolecularGraph = "6c89ec66-9cd8-5372-9f91-fabc50dd27fd" -RDKitMinimalLib = "44044271-7623-48dc-8250-42433c44e4b7" [weakdeps] ChemicalIdentifiers = "fa4ea961-1416-484e-bda2-883ee1634ba5" @@ -23,12 +22,11 @@ ChemicalIdentifiers = "0.1" Clapeyron = "0.4,0.5" Combinatorics = "1" MolecularGraph = "0.14,0.15,0.16" -RDKitMinimalLib = "1" julia = "1.6" [extras] -Clapeyron = "7c7805af-46cc-48c9-995b-ed0ed2dc909a" ChemicalIdentifiers = "fa4ea961-1416-484e-bda2-883ee1634ba5" +Clapeyron = "7c7805af-46cc-48c9-995b-ed0ed2dc909a" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] diff --git a/ext/GCIdentifierChemicalIdentifiersExt.jl b/ext/GCIdentifierChemicalIdentifiersExt.jl index c274321..831e7a8 100644 --- a/ext/GCIdentifierChemicalIdentifiersExt.jl +++ b/ext/GCIdentifierChemicalIdentifiersExt.jl @@ -10,10 +10,10 @@ function GC.get_groups_from_name(component::String,groups;connectivity = false) return GC.get_groups_from_name(component,groups,connectivity=connectivity) end -function GC.get_groups_from_name(component::String,groups::Vector{GC.GCPair},lib = GC.DEFAULTLIB;connectivity=false,check = true) +function GC.get_groups_from_name(component::String,groups::Vector{GC.GCPair};connectivity=false,check = true) res = search_chemical(component) smiles = res.smiles - gcpairs = get_groups_from_smiles(smiles,groups,lib;connectivity=connectivity,check = check) + gcpairs = get_groups_from_smiles(smiles,groups;connectivity=connectivity,check = check) if connectivity == true (smiles,groups_found,connectivity) = gcpairs return (component,groups_found,connectivity) diff --git a/src/GCIdentifier.jl b/src/GCIdentifier.jl index 35f2ec2..f52b0b6 100644 --- a/src/GCIdentifier.jl +++ b/src/GCIdentifier.jl @@ -1,5 +1,4 @@ module GCIdentifier -import RDKitMinimalLib using Combinatorics @static if !isdefined(Base,:eachsplit) diff --git a/src/group_search.jl b/src/group_search.jl index 7d4368a..4639219 100644 --- a/src/group_search.jl +++ b/src/group_search.jl @@ -87,16 +87,12 @@ function get_grouplist end get_grouplist(x::Vector{GCPair}) = x """ - get_groups_from_smiles(smiles::String,groups,lib = DEFAULTLIB;connectivity = false) + get_groups_from_smiles(smiles::String,groups;connectivity = false) Given a SMILES string and a group list (`groups::Vector{GCPair}`), returns a list of groups and their corresponding amount. If `connectivity` is true, then it will additionally return a vector containing the amount of bonds between each pair. -`lib` allows the selection of a molecular library to perform the substructure matching. the two available options are: -- `RDKitLib()` : uses `RDKit` (via the `RDKitMinimalLib.jl` package) to perform the substructure matching. Default in Linux and Mac. -- `MolecularGraphJL()` : uses `MolecularGraph.jl`to perform the substructure matching. Default in Windows (due to a bug in RDKit in this particular Operating System). - ## Examples ```julia @@ -112,13 +108,13 @@ function get_groups_from_smiles(smiles::String,groups;connectivity = false) return get_groups_from_smiles(smiles,groups;connectivity = connectivity) end -function get_groups_from_smiles(smiles::String,groups::Vector{GCPair},lib =DEFAULTLIB;connectivity=false,check = true) - mol = get_mol(lib,smiles) - atoms = get_atoms(lib,mol) +function get_groups_from_smiles(smiles::String,groups::Vector{GCPair};connectivity=false,check = true) + mol = get_mol(smiles) + atoms = get_atoms(mol) natoms = length(atoms) - __bonds = __getbondlist(lib,mol) + __bonds = __getbondlist(mol) - smatches_idx_expanded, bond_mat = find_covered_atoms(mol, groups, lib, atoms, __bonds, check) + smatches_idx_expanded, bond_mat = find_covered_atoms(mol, groups, atoms, __bonds, check) # Find all atoms that are in more than one group overlap = findall(sum(bond_mat, dims=1)[:] .> 1) # non_overlap = findall(sum(bond_mat, dims=1)[:] .== 1) @@ -172,21 +168,21 @@ function get_groups_from_smiles(smiles::String,groups::Vector{GCPair},lib =DEFAU end if connectivity - return (smiles,gcpairs,get_connectivity(mol,group_id,groups,lib)) + return (smiles,gcpairs,get_connectivity(mol,group_id,groups)) else return (smiles,gcpairs) end end -function find_covered_atoms(mol, groups, lib, atoms, __bonds, check) +function find_covered_atoms(mol, groups, atoms, __bonds, check) smatches = [] smatches_idx = Int[] #step 0.a, find all groups that could get a match for i in 1:length(groups) - query_i = get_qmol(lib,smarts(groups[i])) - if has_substruct_match(lib,mol,query_i) - push!(smatches,get_substruct_matches(lib,mol,query_i,__bonds)) + query_i = get_qmol(smarts(groups[i])) + if has_substruct_match(mol,query_i) + push!(smatches,get_substruct_matches(mol,query_i,__bonds)) push!(smatches_idx,i) end end @@ -218,7 +214,7 @@ function find_covered_atoms(mol, groups, lib, atoms, __bonds, check) return smatches_idx_expanded, bond_mat end -function get_connectivity(mol,group_id,groups,lib = DEFAULTLIB) +function get_connectivity(mol,group_id,groups) ngroups = length(group_id) A = zeros(ngroups,ngroups) @@ -227,10 +223,10 @@ function get_connectivity(mol,group_id,groups,lib = DEFAULTLIB) gci = groups[group_id[i]] smart1 = smarts(gci) smart2 = smarts(gci) - querie = get_qmol(lib,smart1*smart2) - smatch = get_substruct_matches(lib,mol,querie) + querie = get_qmol(smart1*smart2) + smatch = get_substruct_matches(mol,querie) name_i = name(gci) - A[i,i] = length(smatch) + A[i,i] = length(unique(smatch)) if A[i,i]!=0 append!(connectivity,[(name_i,name_i)=>Int(A[i,i])]) end @@ -238,8 +234,8 @@ function get_connectivity(mol,group_id,groups,lib = DEFAULTLIB) for j in i+1:ngroups gcj = groups[group_id[j]] smart2 = smarts(gcj) - querie = get_qmol(lib,smart1*smart2) - smatch = get_substruct_matches(lib,mol,querie) + querie = get_qmol(smart1*smart2) + smatch = get_substruct_matches(mol,querie) A[i,j] = length(smatch) name_j = name(gcj) if A[i,j]!=0 diff --git a/src/missing_groups.jl b/src/missing_groups.jl index 3874727..5deb803 100644 --- a/src/missing_groups.jl +++ b/src/missing_groups.jl @@ -1,5 +1,5 @@ """ - find_missing_groups_from_smiles(smiles::String, groups, lib = DEFAULTLIB;max_group_size = nothing, environment=false, reduced=false) + find_missing_groups_from_smiles(smiles::String, groups;max_group_size = nothing, environment=false, reduced=false) Given a SMILES string and a group list (`groups::Vector{GCPair}`), returns a list of potential groups (`new_groups::Vector{GCPair}`) which could cover those atoms not covered within `groups`. If no `groups` vector is provided, it will simply generate all possible groups for the molecule. @@ -27,15 +27,15 @@ julia> find_missing_groups_from_smiles("CC(=O)O") GCIdentifier.GCPair("[CX3;H0;!R](=[OX1;H0;!R])([OX2;H1;!R])", "C=O=OH") ``` """ -function find_missing_groups_from_smiles(smiles, groups=nothing, lib=MolecularGraphJL(); max_group_size=nothing, environment=false, reduced=false) - mol = get_mol(lib, smiles) - __bonds = __getbondlist(lib,mol) - atoms = get_atoms(lib,mol) +function find_missing_groups_from_smiles(smiles, groups=nothing; max_group_size=nothing, environment=false, reduced=false) + mol = get_mol(smiles) + __bonds = __getbondlist(mol) + atoms = get_atoms(mol) if isnothing(groups) missing_atoms = ones(Bool, length(atoms)) else - smatches_idx_expanded, atom_coverage = find_covered_atoms(mol, groups, lib, atoms, __bonds, false) + smatches_idx_expanded, atom_coverage = find_covered_atoms(mol, groups, atoms, __bonds, false) missing_atoms = (sum(atom_coverage, dims=1) .== 0)[:] end @@ -188,9 +188,9 @@ function find_missing_groups_from_smiles(smiles, groups=nothing, lib=MolecularGr occurrence = zeros(Int, length(unique_smarts)) for i in 1:length(unique_smarts) push!(unique_names, names[findall(x->x==unique_smarts[i], smarts)[1]]) - query_i = get_qmol(lib,unique_smarts[i]) + query_i = get_qmol(unique_smarts[i]) - occurrence[i] = length(get_substruct_matches(lib,mol,query_i,__bonds)) + occurrence[i] = length(get_substruct_matches(mol,query_i,__bonds)) # println(unique_smarts[i], " ", unique_names[i], " ", occurrence[i]) end diff --git a/src/prelude.jl b/src/prelude.jl index dcede74..7d8732f 100644 --- a/src/prelude.jl +++ b/src/prelude.jl @@ -1,64 +1,24 @@ #we use MolecularGraph for windows, but we need the same api. -""" - RDKitLib - -Struct used to select the RDKit library (via `RDKitMinimalLib.jl` package) to perform the group search. Default in Linux and Mac. -""" -struct RDKitLib end - -""" - MolecularGraphJL - -Struct used to select the `MolecularGraph.jl` library to perform the group search. Default in Windows. -""" -struct MolecularGraphJL end - -#get useful structure from SMILES -function get_mol(::RDKitLib,smiles) - mol = RDKitMinimalLib.get_mol(smiles) - return mol -end - -function get_mol(::MolecularGraphJL,smiles) +function get_mol(smiles) mol = MolecularGraph.smilestomol(smiles) return mol end -#get useful structure from SMARTS -function get_qmol(::RDKitLib,smarts) - return RDKitMinimalLib.get_qmol(smarts) -end - -function get_qmol(::MolecularGraphJL,smarts) +function get_qmol(smarts) return MolecularGraph.smartstomol(smarts) end - -#used in the check function -function get_atoms(::RDKitLib,mol) - 0:(length(RDKitMinimalLib.get_substruct_matches(mol,mol)[1]["atoms"]) - 1) -end - -function get_atoms(::MolecularGraphJL,mol) +function get_atoms(mol) 0:(length(mol.graph.fadjlist) - 1) end - -#check if query has an substruct match -function has_substruct_match(::RDKitLib,mol,query) - !isempty(RDKitMinimalLib.get_substruct_match(mol,query)) -end - -function has_substruct_match(::MolecularGraphJL,mol,query) +function has_substruct_match(mol,query) MolecularGraph.has_substruct_match(mol,query) end -#function used for MolecularGraph.jl. the RDKitLib version returns nothing -__getbondlist(::RDKitLib,mol) = nothing - -function __getbondlist(::MolecularGraphJL,mol) +function __getbondlist(mol) res = Set{NTuple{2,Int}}() adjlist = mol.graph.fadjlist for (i,xi) in pairs(adjlist) @@ -68,21 +28,8 @@ function __getbondlist(::MolecularGraphJL,mol) end rvec = sort!(collect(res)) end -#parse substruct matches, use RDKitLib format. -function get_substruct_matches(::RDKitLib,mol,query,__bonds = nothing) - matches = RDKitMinimalLib.get_substruct_matches(mol,query) - #stabilize type of result - res = Dict{String,Vector{Int}}[] - for match in matches - dictᵢ = Dict{String,Vector{Int}}() - dictᵢ["atoms"] = convert(Vector{Int},match["atoms"]) - dictᵢ["bonds"] = convert(Vector{Int},match["bonds"]) - push!(res,dictᵢ) - end - return res -end -function get_substruct_matches(lib::MolecularGraphJL,mol,query,bonds = __getbondlist(lib,mol)) +function get_substruct_matches(mol,query,bonds = __getbondlist(mol)) matches = MolecularGraph.substruct_matches(mol,query) res = Dict{String,Vector{Int}}[] #convert to RDKitLib expected struct. @@ -118,12 +65,3 @@ function __getbonds_mg(list,atoms) res .-= 1 return res end -#select default molecule engine - -@static if Sys.iswindows() - const DEFAULTLIB = MolecularGraphJL() -else - const DEFAULTLIB = RDKitLib() -end - -export RDKitLib, MolecularGraphJL \ No newline at end of file diff --git a/test/test_group_search.jl b/test/test_group_search.jl index 3ae9e30..913cfa5 100644 --- a/test/test_group_search.jl +++ b/test/test_group_search.jl @@ -1,9 +1,7 @@ function test_gcmatch(groups,smiles,result) - evaluated_rdkit = Set(get_groups_from_smiles(smiles,groups,RDKitLib(),check = false)[2]) - evaluated_molgraphJL = Set(get_groups_from_smiles(smiles,groups,MolecularGraphJL(),check = false)[2]) + evaluated = Set(get_groups_from_smiles(smiles,groups,check = false)[2]) reference = Set(result) - @test isequal(evaluated_rdkit,reference) - @test isequal(evaluated_molgraphJL,reference) + @test isequal(evaluated,reference) end test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result) @@ -11,7 +9,7 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result) @testset "UNIFAC examples" begin #http://www.aim.env.uea.ac.uk/aim/info/UNIFACgroups.html unifac = test_gcmatch(UNIFACGroups) - + #alkane group unifac("CC",gcstring"CH3:2") #ethane unifac("CCCC",gcstring"CH3:2;CH2:2") #n-butane @@ -43,7 +41,7 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result) #water unifac("O",gcstring"H2O:1") #water - + #aromatic carbon-alcohol unifac("Oc1ccccc1",gcstring"ACH:5;ACOH:1") #phenol @@ -57,7 +55,7 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result) #acetate group unifac("CCCCOC(=O)C",gcstring"CH3:1;CH2:3;CH3COO:1") #Butyl acetate #fails at the matching stage unifac("O=C(OC)CC",gcstring"CH3:2;CH2COO:1") #methyl propionate #fails at the matching stage - + #formate group unifac("O=COCC",gcstring"CH3:1;CH2:1;HCOO:1") #ethyl formate @@ -66,7 +64,7 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result) unifac("CCOCC",gcstring"CH3:2;CH2:1;CH2O:1") #diethyl ether unifac("O(C(C)C)C(C)C",gcstring"CH3:4;CH:1;CHO:1") #diisopropyl ether unifac("C1CCOC1",gcstring"CY-CH2:2;THF:1") #tetrahydrofuran #check? - + #primary amine unifac("CN",gcstring"CH3NH2:1") #methylamine unifac("CCN",gcstring"CH3:1;CH2NH2:1") #ethylamine @@ -80,10 +78,10 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result) #tertiary amine unifac("CN(C)C",gcstring"CH3:2;CH3N:1") #trimethylamine unifac("CCN(CC)CC",gcstring"CH3:3;CH2:2;CH2N:1") #triethylamine - + #aromatic amine unifac("c1ccc(cc1)N",gcstring"ACH:5;ACNH2:1") #aniline - + #furfural unifac("c1cc(oc1)C=O",gcstring"FURFURAL:1") #furfural @@ -98,6 +96,7 @@ end # Test a highly branched hydrocarbon smiles = "c1ccccc1C(CCCC)(CCCC(C))C" (component, groups, connectivity) = get_groups_from_smiles("c1ccccc1C(CC)(CC(C))C", gcPPCSAFTGroups; connectivity=true) - @test isequal([ "CH3" => 3,"CH2" => 3,"C" => 1,"aCH" => 5,"aC" => 1],groups) - @test isequal([("CH3", "CH2") => 2,("CH3", "C") => 1,("CH2", "CH2") => 1,("CH2", "C") => 2,("C", "aC") => 1,("aCH", "aCH") => 4,("aCH", "aC") => 2], connectivity) + + @test isequal([ "CH3" => 3,"CH2" => 3,"C" => 1,"aCH" => 5,"aC" => 1] |> Set,Set(groups)) + @test isequal([("CH3", "CH2") => 2,("CH3", "C") => 1,("CH2", "CH2") => 1,("CH2", "C") => 2,("C", "aC") => 1,("aCH", "aCH") => 4,("aCH", "aC") => 2] |> Set,Set(connectivity)) end \ No newline at end of file