Skip to content

Commit

Permalink
Merge pull request #3 from ClapeyronThermo/simple_search
Browse files Browse the repository at this point in the history
Updated Group Search Method
  • Loading branch information
pw0908 committed Feb 14, 2024
2 parents 1328fec + de21d60 commit 425603c
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 112 deletions.
5 changes: 3 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
name = "GCIdentifier"
uuid = "b7ea765e-cbac-4e4a-9b0d-5427cc302506"
authors = ["Hon Wa Yew <yewhonwa@gmail.com>", "Pierre Walker <pwalker@mit.edu>", "Andrés Riedemann <andres.riedemann@gmail.com>"]
authors = ["Pierre Walker <pjwalker@caltech.edu>", "Andrés Riedemann <andres.riedemann@gmail.com>"]
version = "0.1.0"

[deps]
ChemicalIdentifiers = "fa4ea961-1416-484e-bda2-883ee1634ba5"
Clapeyron = "7c7805af-46cc-48c9-995b-ed0ed2dc909a"
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
MolecularGraph = "6c89ec66-9cd8-5372-9f91-fabc50dd27fd"
RDKitMinimalLib = "44044271-7623-48dc-8250-42433c44e4b7"

Expand All @@ -20,6 +20,7 @@ GCIdentifierClapeyronExt = "Clapeyron"
[compat]
ChemicalIdentifiers = "0.1"
Clapeyron = "0.4,0.5"
Combinatorics = "1"
MolecularGraph = "0.14,0.15,0.16"
RDKitMinimalLib = "1"

Expand Down
1 change: 1 addition & 0 deletions src/GCIdentifier.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
module GCIdentifier
import RDKitMinimalLib
using Combinatorics

@static if !isdefined(Base,:eachsplit)
eachsplit(str::AbstractString, dlm; limit::Integer=0, keepempty::Bool=true) = split(str,dlm;limit,keepempty)
Expand Down
14 changes: 7 additions & 7 deletions src/database/UNIFAC.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,24 @@ GCPair(raw"[CX3;H2]=[CX3;H1]","CH2=CH"),
GCPair(raw"[CX3;H1]=[CX3;H1]","CH=CH"),
GCPair(raw"[CX3;H2]=[CX3;H0]","CH2=C"),
GCPair(raw"[CX3;H1]=[CX3;H0]","CH=C"),
GCPair(raw"[cX3;H1]","ACH"),
GCPair(raw"[cX3;H0]","AC"),
GCPair(raw"[cX3;H0][CX4;H3]","ACCH3"),
GCPair(raw"[cX3;H0][CX4;H2]","ACCH2"),
GCPair(raw"[cX3;H0][CX4;H1]","ACCH"),
GCPair(raw"[OH1;$([OH1][CX4H2])]","OH(P)"),
GCPair(raw"[CX4;H3][OX2;H1]","CH3OH"),
GCPair(raw"[OH2]","H2O"),
GCPair(raw"[cX3;H0;R][OX2;H1]","ACOH"),
GCPair(raw"[CX4;H3][CX3](=O)","CH3CO"),
GCPair(raw"[CX4;H2][CX3](=O)","CH2CO"),
GCPair(raw"[CX4;H3][CX3;!H1](=O)","CH3CO"),
GCPair(raw"[CX4;H2][CX3;!H1](=O)","CH2CO"),
GCPair(raw"[CX3H1](=O)","CHO"),
GCPair(raw"[CH3][CX3;H0](=[O])[O]","CH3COO"),
GCPair(raw"[CX4;H2][CX3](=[OX1])[OX2]","CH2COO"),
GCPair(raw"[CX3;H1](=[OX1])[OX2]","HCOO"),
GCPair(raw"[CH3;!R][OH0;!R]","CH3O"),
GCPair(raw"[CH2;!R][OH0;!R]","CH2O"),
GCPair(raw"[C;H1;!R][OH0;!R]","CHO"),
GCPair(raw"[cX3;H1]","ACH"),
GCPair(raw"[cX3;H0]","AC"),
GCPair(raw"[cX3;H0][CX4;H3]","ACCH3"),
GCPair(raw"[cX3;H0][CX4;H2]","ACCH2"),
GCPair(raw"[cX3;H0][CX4;H1]","ACCH"),
GCPair(raw"[CX4;H2;R][OX2;R][CX4;H2;R]","THF"),
GCPair(raw"[CX4;H3][NX3;H2]","CH3NH2"),
GCPair(raw"[CX4;H2][NX3;H2]","CH2NH2"),
Expand Down
145 changes: 51 additions & 94 deletions src/group_search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ function get_grouplist end
get_grouplist(x::Vector{GCPair}) = x

"""
get_groups_from_smiles(smiles::String,groups,lib = GCIdentifier.DEFAULTLIB;connectivity = false)
get_groups_from_smiles(smiles::String,groups,lib = DEFAULTLIB;connectivity = false)
Given a SMILES string and a group list (`groups::Vector{GCPair}`), returns a list of groups and their corresponding amount.
Expand All @@ -112,13 +112,8 @@ end

function get_groups_from_smiles(smiles::String,groups::Vector{GCPair},lib =DEFAULTLIB;connectivity=false,check = true)
mol = get_mol(lib,smiles)
atoms = get_atoms(lib,mol)
__bonds = __getbondlist(lib,mol)
group_list = Int[]
group_id = Int[]
group_occ_list = Int[]
atoms_list = Int[]
coverage_atoms = Vector{Int}[]
coverage_bonds = Vector{Int}[]

smatches = []
smatches_idx = Int[]
Expand All @@ -136,106 +131,68 @@ function get_groups_from_smiles(smiles::String,groups::Vector{GCPair},lib =DEFAU
perm = sortperm(smatches,lt = _isless_smatch,rev = true)
smatches = smatches[perm]
smatches_idx = smatches_idx[perm]
#TODO: identify overlapped groups here?

for (idx,smatch) in pairs(smatches)
i = smatches_idx[idx]
for j in 1:length(smatch)
#add first match.
if isempty(atoms_list)
push!(group_list,i)
push!(group_id,i)
push!(group_occ_list,1)
append!(atoms_list,smatch[j]["atoms"])
push!(coverage_atoms,copy(smatch[j]["atoms"]))
push!(coverage_bonds,copy(smatch[j]["bonds"]))
continue #go to next iteration
end

# If no atoms covered by this group are already covered by other groups
atoms_covered_j = count(in(atoms_list),smatch[j]["atoms"])
if atoms_covered_j == 0
append!(atoms_list,smatch[j]["atoms"])
if !(i in group_list)
push!(group_list,i)
push!(group_id,i)
push!(group_occ_list,1)
push!(coverage_atoms,copy(smatch[j]["atoms"]))
push!(coverage_bonds,copy(smatch[j]["bonds"]))
else
group_occ_list[end] += 1
append!(coverage_atoms[end],smatch[j]["atoms"])
append!(coverage_bonds[end],smatch[j]["bonds"])
end
continue
end
# Expand smatches so that groups are listed
smatches_expanded = [smatches[i][j] for i in 1:length(smatches) for j in 1:length(smatches[i])]
smatches_idx_expanded = [smatches_idx[i] for i in 1:length(smatches) for j in 1:length(smatches[i])]

ngroups = length(smatches_expanded)
natoms = length(atoms)

# Create a matrix with the atoms that are in each group
bond_mat = zeros(Int64, ngroups, natoms)
for i in 1:ngroups
for j in 1:length(smatches_expanded[i]["atoms"])
bond_mat[i, smatches_expanded[i]["atoms"][j]+1] = 1
end
end

# Check which groups group i has an overlap with
id = 0
ng_rm = 0
for k in 1:length(group_id)
id += 1

# Does group 1 cover any atoms of group id
in_coverage_j = count(in(coverage_atoms[id]), smatch[j]["atoms"])
in_coverage_j == 0 && continue

# We only care if group i covers _more_ atoms than group k
length(smatch[j]["atoms"]) < length(coverage_atoms[id]) && continue

if ((length(smatch[j]["atoms"])>length(coverage_atoms[id])) &
# Also make sure that group i covers all the atoms of group k
(in_coverage_j == length(coverage_atoms[id]))) |
(length(smatch[j]["bonds"])>length(coverage_bonds[id]))
# find out which atoms are covered
overlap_atoms = coverage_atoms[id][coverage_atoms[id] .∈ [smatch[j]["atoms"]]]
id_rm = group_id[id]
name_rm = group_list[id]
#bond_rm = coverage_bonds[id][coverage_bonds[id] .∈ [smatch[j]["bonds"]]]
filter!(e -> e overlap_atoms,atoms_list)
filter!(e -> e overlap_atoms,coverage_atoms[id])
group_occ_list[id] -= 1

# If group k no longer covers any atoms, remove it
if group_occ_list[id] == 0
filter!(e-> e 0,group_occ_list)
filter!(e-> !isempty(e),coverage_atoms)
deleteat!(coverage_bonds,id)
filter!(e-> e id_rm,group_id)
filter!(e-> e name_rm,group_list)
id -= 1
end
ng_rm += 1
# Find all atoms that are in more than one group
overlap = findall(sum(bond_mat, dims=1)[:] .> 1)

# Split the groups in two sets: those that overlap and those that don't
overlap_groups = findall(sum(bond_mat[:, overlap], dims=2)[:] .> 0)
non_overlap_groups = findall(sum(bond_mat[:, overlap], dims=2)[:] .== 0)

if !isempty(overlap)
# Reduce the bond_mat to only the overlapping atoms
bond_mat_overlap = bond_mat[overlap_groups, overlap]

# Generate all possible combinations of groups which cover all atoms
candidate = []
for i in 1:size(bond_mat_overlap, 1)
combs = combinations(1:size(bond_mat_overlap, 1), i)
for comb in combs
# Test if the combination of groups covers all atoms
if sum(bond_mat_overlap[comb, :], dims=1) == ones(Int64, 1, size(bond_mat_overlap, 2))
push!(candidate, comb)
end
end

ng_rm == 0 && continue

if !(i in group_list)
push!(group_list,i)
push!(group_id,i)
push!(group_occ_list,1)
append!(coverage_atoms,[smatch[j]["atoms"]])
append!(coverage_bonds,[smatch[j]["bonds"]])
append!(atoms_list,smatch[j]["atoms"])
else
group_occ_list[end] += 1
append!(atoms_list,smatch[j]["atoms"])
append!(coverage_atoms[end],smatch[j]["atoms"])
append!(coverage_bonds[end],smatch[j]["bonds"])
# For the first combination that covers all atoms, stop (since it will use the fewest groups)
if length(candidate) > 0
break
end
end

# Select the groups that cover the most atoms
best_comb = overlap_groups[candidate[1]]
push!(non_overlap_groups, best_comb...)
end

bond_mat_minimum = bond_mat[non_overlap_groups, :]
group_id_expanded = smatches_idx_expanded[non_overlap_groups]

group_id = unique(group_id_expanded)
group_occ_list = [sum(group_id_expanded .== i) for i in group_id]

gcpairs = [name(groups[group_id[i]]) => group_occ_list[i] for i in 1:length(group_id)]

if check
atoms = get_atoms(lib,mol)
if !(count(in(atoms),atoms_list)==length(atoms))
if sum(bond_mat_minimum) != natoms
error("Could not find all groups for "*smiles)
end
end

gcpairs = [name(groups[group_id[i]]) => group_occ_list[i] for i in 1:length(group_id)]
#unique_groups!(gcpairs) FIXME
if connectivity
return (smiles,gcpairs,get_connectivity(mol,group_id,groups,lib))
else
Expand Down
16 changes: 7 additions & 9 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,11 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result)
unifac("O=C(CC)CC",gcstring"CH3:2;CH2:1;CH2CO:1") #pentanone-3

#aldehyde
#unifac("CCC=O",gcstring"CH3:1;CH2:1;HCO:1") #propionaldehyde , fails at the matching stage
unifac("CCC=O",gcstring"CH3:1;CH2:1;CHO:1") #propionaldehyde , fails at the matching stage

#acetate group
#unifac("CCCCOC(=O)C",gcstring"CH3:1;CH2:3;CH3COO:1") #Butyl acetate #fails at the matching stage
#unifac("O=C(OC)CC",gcstring"CH3:2;CH2COO:1") #methyl propionate #fails at the matching stage
unifac("CCCCOC(=O)C",gcstring"CH3:1;CH2:3;CH3COO:1") #Butyl acetate #fails at the matching stage
unifac("O=C(OC)CC",gcstring"CH3:2;CH2COO:1") #methyl propionate #fails at the matching stage

#formate group
unifac("O=COCC",gcstring"CH3:1;CH2:1;HCOO:1") #ethyl formate
Expand All @@ -69,7 +69,7 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result)
unifac("COC",gcstring"CH3:1;CH3O:1") #dimethyl ether
unifac("CCOCC",gcstring"CH3:2;CH2:1;CH2O:1") #diethyl ether
unifac("O(C(C)C)C(C)C",gcstring"CH3:4;CH:1;CHO:1") #diisopropyl ether
#unifac("C1CCOC1",gcstring"CH2:3;THF:1") #tetrahydrofuran #check?
unifac("C1CCOC1",gcstring"CY-CH2:2;THF:1") #tetrahydrofuran #check?

#primary amine
unifac("CN",gcstring"CH3NH2:1") #methylamine
Expand All @@ -88,12 +88,10 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result)
#aromatic amine
unifac("c1ccc(cc1)N",gcstring"ACH:5;ACNH2:1") #aniline

#piridine
#unifac("c1ccncc1",gcstring"C5H5N:1")



#furfural
unifac("c1cc(oc1)C=O",gcstring"FURFURAL:1") #furfural

#non-unique group assignment
unifac("c1ccccc1COCCOCC",gcstring"ACH:5;ACCH2:1;CH2O:2;CH2:1;CH3:1") #phenol + diethylene glycol

end

0 comments on commit 425603c

Please sign in to comment.