Merge pull request #3 from ClapeyronThermo/simple_search

Updated Group Search Method
ClapeyronThermo · Feb 14, 2024 · 425603c · 425603c
2 parents 1328fec + de21d60
commit 425603c
Show file tree

Hide file tree

Showing 5 changed files with 69 additions and 112 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,11 +1,11 @@
 name = "GCIdentifier"
 uuid = "b7ea765e-cbac-4e4a-9b0d-5427cc302506"
-authors = ["Hon Wa Yew <yewhonwa@gmail.com>", "Pierre Walker <pwalker@mit.edu>", "Andrés Riedemann <andres.riedemann@gmail.com>"]
+authors = ["Pierre Walker <pjwalker@caltech.edu>", "Andrés Riedemann <andres.riedemann@gmail.com>"]
 version = "0.1.0"
 
 [deps]
 ChemicalIdentifiers = "fa4ea961-1416-484e-bda2-883ee1634ba5"
-Clapeyron = "7c7805af-46cc-48c9-995b-ed0ed2dc909a"
+Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
 MolecularGraph = "6c89ec66-9cd8-5372-9f91-fabc50dd27fd"
 RDKitMinimalLib = "44044271-7623-48dc-8250-42433c44e4b7"
 
@@ -20,6 +20,7 @@ GCIdentifierClapeyronExt = "Clapeyron"
 [compat]
 ChemicalIdentifiers = "0.1"
 Clapeyron = "0.4,0.5"
+Combinatorics = "1"
 MolecularGraph = "0.14,0.15,0.16"
 RDKitMinimalLib = "1"
 

diff --git a/src/GCIdentifier.jl b/src/GCIdentifier.jl
@@ -1,5 +1,6 @@
 module GCIdentifier
 import RDKitMinimalLib
+using Combinatorics
 
 @static if !isdefined(Base,:eachsplit)
     eachsplit(str::AbstractString, dlm; limit::Integer=0, keepempty::Bool=true) = split(str,dlm;limit,keepempty)

diff --git a/src/database/UNIFAC.jl b/src/database/UNIFAC.jl
@@ -6,24 +6,24 @@ GCPair(raw"[CX3;H2]=[CX3;H1]","CH2=CH"),
 GCPair(raw"[CX3;H1]=[CX3;H1]","CH=CH"),
 GCPair(raw"[CX3;H2]=[CX3;H0]","CH2=C"),
 GCPair(raw"[CX3;H1]=[CX3;H0]","CH=C"),
-GCPair(raw"[cX3;H1]","ACH"),
-GCPair(raw"[cX3;H0]","AC"),
-GCPair(raw"[cX3;H0][CX4;H3]","ACCH3"),
-GCPair(raw"[cX3;H0][CX4;H2]","ACCH2"),
-GCPair(raw"[cX3;H0][CX4;H1]","ACCH"),
 GCPair(raw"[OH1;$([OH1][CX4H2])]","OH(P)"),
 GCPair(raw"[CX4;H3][OX2;H1]","CH3OH"),
 GCPair(raw"[OH2]","H2O"),
 GCPair(raw"[cX3;H0;R][OX2;H1]","ACOH"),
-GCPair(raw"[CX4;H3][CX3](=O)","CH3CO"),
-GCPair(raw"[CX4;H2][CX3](=O)","CH2CO"),
+GCPair(raw"[CX4;H3][CX3;!H1](=O)","CH3CO"),
+GCPair(raw"[CX4;H2][CX3;!H1](=O)","CH2CO"),
 GCPair(raw"[CX3H1](=O)","CHO"),
 GCPair(raw"[CH3][CX3;H0](=[O])[O]","CH3COO"),
 GCPair(raw"[CX4;H2][CX3](=[OX1])[OX2]","CH2COO"),
 GCPair(raw"[CX3;H1](=[OX1])[OX2]","HCOO"),
 GCPair(raw"[CH3;!R][OH0;!R]","CH3O"),
 GCPair(raw"[CH2;!R][OH0;!R]","CH2O"),
 GCPair(raw"[C;H1;!R][OH0;!R]","CHO"),
+GCPair(raw"[cX3;H1]","ACH"),
+GCPair(raw"[cX3;H0]","AC"),
+GCPair(raw"[cX3;H0][CX4;H3]","ACCH3"),
+GCPair(raw"[cX3;H0][CX4;H2]","ACCH2"),
+GCPair(raw"[cX3;H0][CX4;H1]","ACCH"),
 GCPair(raw"[CX4;H2;R][OX2;R][CX4;H2;R]","THF"),
 GCPair(raw"[CX4;H3][NX3;H2]","CH3NH2"),
 GCPair(raw"[CX4;H2][NX3;H2]","CH2NH2"),

diff --git a/src/group_search.jl b/src/group_search.jl
@@ -85,7 +85,7 @@ function get_grouplist end
 get_grouplist(x::Vector{GCPair}) = x
 
 """
-    get_groups_from_smiles(smiles::String,groups,lib = GCIdentifier.DEFAULTLIB;connectivity = false)
+    get_groups_from_smiles(smiles::String,groups,lib = DEFAULTLIB;connectivity = false)
 
 Given a SMILES string and a group list (`groups::Vector{GCPair}`), returns a list of groups and their corresponding amount.
 
@@ -112,13 +112,8 @@ end
 
 function get_groups_from_smiles(smiles::String,groups::Vector{GCPair},lib =DEFAULTLIB;connectivity=false,check = true)
     mol = get_mol(lib,smiles)
+    atoms = get_atoms(lib,mol)
     __bonds = __getbondlist(lib,mol)
-    group_list = Int[]
-    group_id = Int[]
-    group_occ_list = Int[]
-    atoms_list = Int[]
-    coverage_atoms = Vector{Int}[]
-    coverage_bonds = Vector{Int}[]
 
     smatches = []
     smatches_idx = Int[]
@@ -136,106 +131,68 @@ function get_groups_from_smiles(smiles::String,groups::Vector{GCPair},lib =DEFAU
     perm = sortperm(smatches,lt = _isless_smatch,rev = true)
     smatches = smatches[perm]
     smatches_idx = smatches_idx[perm]
-    #TODO: identify overlapped groups here?
-
-    for (idx,smatch) in pairs(smatches)
-        i = smatches_idx[idx]
-        for j in 1:length(smatch)
-            #add first match.
-            if isempty(atoms_list)
-                push!(group_list,i)
-                push!(group_id,i)
-                push!(group_occ_list,1)
-                append!(atoms_list,smatch[j]["atoms"])
-                push!(coverage_atoms,copy(smatch[j]["atoms"]))
-                push!(coverage_bonds,copy(smatch[j]["bonds"]))
-                continue #go to next iteration
-            end
 
-            # If no atoms covered by this group are already covered by other groups
-            atoms_covered_j = count(in(atoms_list),smatch[j]["atoms"])
-            if atoms_covered_j == 0
-                append!(atoms_list,smatch[j]["atoms"])
-                if !(i in group_list)
-                    push!(group_list,i)
-                    push!(group_id,i)
-                    push!(group_occ_list,1)
-                    push!(coverage_atoms,copy(smatch[j]["atoms"]))
-                    push!(coverage_bonds,copy(smatch[j]["bonds"]))
-                else
-                    group_occ_list[end] += 1
-                    append!(coverage_atoms[end],smatch[j]["atoms"])
-                    append!(coverage_bonds[end],smatch[j]["bonds"])
-                end
-                continue
-            end
+    # Expand smatches so that groups are listed
+    smatches_expanded = [smatches[i][j] for i in 1:length(smatches) for j in 1:length(smatches[i])]
+    smatches_idx_expanded = [smatches_idx[i] for i in 1:length(smatches) for j in 1:length(smatches[i])]
+
+    ngroups = length(smatches_expanded)
+    natoms = length(atoms)
+
+    # Create a matrix with the atoms that are in each group
+    bond_mat = zeros(Int64, ngroups, natoms)
+    for i in 1:ngroups
+        for j in 1:length(smatches_expanded[i]["atoms"])
+            bond_mat[i, smatches_expanded[i]["atoms"][j]+1] = 1
+        end
+    end
 
-            # Check which groups group i has an overlap with
-            id = 0
-            ng_rm = 0
-            for k in 1:length(group_id)
-                id += 1
-
-                # Does group 1 cover any atoms of group id
-                in_coverage_j = count(in(coverage_atoms[id]), smatch[j]["atoms"])
-                in_coverage_j == 0 && continue
-
-                # We only care if group i covers _more_ atoms than group k
-                length(smatch[j]["atoms"]) < length(coverage_atoms[id]) && continue
-
-                if ((length(smatch[j]["atoms"])>length(coverage_atoms[id])) &
-                    # Also make sure that group i covers all the atoms of group k
-                    (in_coverage_j == length(coverage_atoms[id]))) |
-                    (length(smatch[j]["bonds"])>length(coverage_bonds[id]))
-                    # find out which atoms are covered
-                    overlap_atoms = coverage_atoms[id][coverage_atoms[id] .∈ [smatch[j]["atoms"]]]
-                    id_rm = group_id[id]
-                    name_rm = group_list[id]
-                    #bond_rm =  coverage_bonds[id][coverage_bonds[id] .∈ [smatch[j]["bonds"]]]
-                    filter!(e -> e ∉ overlap_atoms,atoms_list)
-                    filter!(e -> e ∉ overlap_atoms,coverage_atoms[id])
-                    group_occ_list[id] -= 1
-
-                    # If group k no longer covers any atoms, remove it
-                    if group_occ_list[id] == 0
-                        filter!(e-> e ≠ 0,group_occ_list)
-                        filter!(e-> !isempty(e),coverage_atoms)
-                        deleteat!(coverage_bonds,id)
-                        filter!(e-> e ≠ id_rm,group_id)
-                        filter!(e-> e ≠ name_rm,group_list)
-                        id -= 1
-                    end
-                    ng_rm += 1
+    # Find all atoms that are in more than one group
+    overlap = findall(sum(bond_mat, dims=1)[:] .> 1)
+
+    # Split the groups in two sets: those that overlap and those that don't
+    overlap_groups = findall(sum(bond_mat[:, overlap], dims=2)[:] .> 0)
+    non_overlap_groups = findall(sum(bond_mat[:, overlap], dims=2)[:] .== 0)
+
+    if !isempty(overlap)
+        # Reduce the bond_mat to only the overlapping atoms
+        bond_mat_overlap = bond_mat[overlap_groups, overlap]
+
+        # Generate all possible combinations of groups which cover all atoms
+        candidate = []
+        for i in 1:size(bond_mat_overlap, 1)
+            combs = combinations(1:size(bond_mat_overlap, 1), i)
+            for comb in combs
+                # Test if the combination of groups covers all atoms
+                if sum(bond_mat_overlap[comb, :], dims=1) == ones(Int64, 1, size(bond_mat_overlap, 2))
+                    push!(candidate, comb)
                 end
             end
-
-            ng_rm == 0 && continue
-
-            if !(i in group_list)
-                push!(group_list,i)
-                push!(group_id,i)
-                push!(group_occ_list,1)
-                append!(coverage_atoms,[smatch[j]["atoms"]])
-                append!(coverage_bonds,[smatch[j]["bonds"]])
-                append!(atoms_list,smatch[j]["atoms"])
-            else
-                group_occ_list[end] += 1
-                append!(atoms_list,smatch[j]["atoms"])
-                append!(coverage_atoms[end],smatch[j]["atoms"])
-                append!(coverage_bonds[end],smatch[j]["bonds"])
+            # For the first combination that covers all atoms, stop (since it will use the fewest groups)
+            if length(candidate) > 0
+                break
             end
         end
+
+        # Select the groups that cover the most atoms
+        best_comb = overlap_groups[candidate[1]]
+        push!(non_overlap_groups, best_comb...)
     end
 
+    bond_mat_minimum = bond_mat[non_overlap_groups, :]
+    group_id_expanded = smatches_idx_expanded[non_overlap_groups]
+
+    group_id = unique(group_id_expanded)
+    group_occ_list = [sum(group_id_expanded .== i) for i in group_id]
+
+    gcpairs = [name(groups[group_id[i]]) => group_occ_list[i] for i in 1:length(group_id)]
+
     if check
-        atoms = get_atoms(lib,mol)
-        if !(count(in(atoms),atoms_list)==length(atoms))
+        if sum(bond_mat_minimum) != natoms
             error("Could not find all groups for "*smiles)
         end
     end
 
-    gcpairs = [name(groups[group_id[i]]) => group_occ_list[i] for i in 1:length(group_id)]
-    #unique_groups!(gcpairs) FIXME
     if connectivity
         return (smiles,gcpairs,get_connectivity(mol,group_id,groups,lib))
     else

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -56,11 +56,11 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result)
     unifac("O=C(CC)CC",gcstring"CH3:2;CH2:1;CH2CO:1") #pentanone-3
 
     #aldehyde
-    #unifac("CCC=O",gcstring"CH3:1;CH2:1;HCO:1") #propionaldehyde , fails at the matching stage
+    unifac("CCC=O",gcstring"CH3:1;CH2:1;CHO:1") #propionaldehyde , fails at the matching stage
 
     #acetate group
-    #unifac("CCCCOC(=O)C",gcstring"CH3:1;CH2:3;CH3COO:1") #Butyl acetate #fails at the matching stage
-    #unifac("O=C(OC)CC",gcstring"CH3:2;CH2COO:1") #methyl propionate #fails at the matching stage
+    unifac("CCCCOC(=O)C",gcstring"CH3:1;CH2:3;CH3COO:1") #Butyl acetate #fails at the matching stage
+    unifac("O=C(OC)CC",gcstring"CH3:2;CH2COO:1") #methyl propionate #fails at the matching stage
 
     #formate group
     unifac("O=COCC",gcstring"CH3:1;CH2:1;HCOO:1") #ethyl formate
@@ -69,7 +69,7 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result)
     unifac("COC",gcstring"CH3:1;CH3O:1") #dimethyl ether
     unifac("CCOCC",gcstring"CH3:2;CH2:1;CH2O:1") #diethyl ether
     unifac("O(C(C)C)C(C)C",gcstring"CH3:4;CH:1;CHO:1") #diisopropyl ether
-    #unifac("C1CCOC1",gcstring"CH2:3;THF:1") #tetrahydrofuran #check?
+    unifac("C1CCOC1",gcstring"CY-CH2:2;THF:1") #tetrahydrofuran #check?
 
     #primary amine
     unifac("CN",gcstring"CH3NH2:1") #methylamine
@@ -88,12 +88,10 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result)
     #aromatic amine
     unifac("c1ccc(cc1)N",gcstring"ACH:5;ACNH2:1") #aniline
 
-    #piridine
-    #unifac("c1ccncc1",gcstring"C5H5N:1")
-
-
-
     #furfural
     unifac("c1cc(oc1)C=O",gcstring"FURFURAL:1") #furfural
 
+    #non-unique group assignment
+    unifac("c1ccccc1COCCOCC",gcstring"ACH:5;ACCH2:1;CH2O:2;CH2:1;CH3:1") #phenol + diethylene glycol
+
 end