Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated Group Search Method #3

Merged
merged 5 commits into from
Feb 14, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
name = "GCIdentifier"
uuid = "b7ea765e-cbac-4e4a-9b0d-5427cc302506"
authors = ["Hon Wa Yew <yewhonwa@gmail.com>", "Pierre Walker <pwalker@mit.edu>", "Andrés Riedemann <andres.riedemann@gmail.com>"]
authors = ["Pierre Walker <pjwalker@caltech.edu>", "Andrés Riedemann <andres.riedemann@gmail.com>"]
version = "0.1.0"

[deps]
ChemicalIdentifiers = "fa4ea961-1416-484e-bda2-883ee1634ba5"
Clapeyron = "7c7805af-46cc-48c9-995b-ed0ed2dc909a"
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
MolecularGraph = "6c89ec66-9cd8-5372-9f91-fabc50dd27fd"
RDKitMinimalLib = "44044271-7623-48dc-8250-42433c44e4b7"

Expand All @@ -20,6 +20,7 @@ GCIdentifierClapeyronExt = "Clapeyron"
[compat]
ChemicalIdentifiers = "0.1"
Clapeyron = "0.4,0.5"
Combinatorics = "1"
MolecularGraph = "0.14,0.15,0.16"
RDKitMinimalLib = "1"

Expand Down
1 change: 1 addition & 0 deletions src/GCIdentifier.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
module GCIdentifier
import RDKitMinimalLib
using Combinatorics

@static if !isdefined(Base,:eachsplit)
eachsplit(str::AbstractString, dlm; limit::Integer=0, keepempty::Bool=true) = split(str,dlm;limit,keepempty)
Expand Down
14 changes: 7 additions & 7 deletions src/database/UNIFAC.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,24 @@ GCPair(raw"[CX3;H2]=[CX3;H1]","CH2=CH"),
GCPair(raw"[CX3;H1]=[CX3;H1]","CH=CH"),
GCPair(raw"[CX3;H2]=[CX3;H0]","CH2=C"),
GCPair(raw"[CX3;H1]=[CX3;H0]","CH=C"),
GCPair(raw"[cX3;H1]","ACH"),
GCPair(raw"[cX3;H0]","AC"),
GCPair(raw"[cX3;H0][CX4;H3]","ACCH3"),
GCPair(raw"[cX3;H0][CX4;H2]","ACCH2"),
GCPair(raw"[cX3;H0][CX4;H1]","ACCH"),
GCPair(raw"[OH1;$([OH1][CX4H2])]","OH(P)"),
GCPair(raw"[CX4;H3][OX2;H1]","CH3OH"),
GCPair(raw"[OH2]","H2O"),
GCPair(raw"[cX3;H0;R][OX2;H1]","ACOH"),
GCPair(raw"[CX4;H3][CX3](=O)","CH3CO"),
GCPair(raw"[CX4;H2][CX3](=O)","CH2CO"),
GCPair(raw"[CX4;H3][CX3;!H1](=O)","CH3CO"),
GCPair(raw"[CX4;H2][CX3;!H1](=O)","CH2CO"),
GCPair(raw"[CX3H1](=O)","CHO"),
GCPair(raw"[CH3][CX3;H0](=[O])[O]","CH3COO"),
GCPair(raw"[CX4;H2][CX3](=[OX1])[OX2]","CH2COO"),
GCPair(raw"[CX3;H1](=[OX1])[OX2]","HCOO"),
GCPair(raw"[CH3;!R][OH0;!R]","CH3O"),
GCPair(raw"[CH2;!R][OH0;!R]","CH2O"),
GCPair(raw"[C;H1;!R][OH0;!R]","CHO"),
GCPair(raw"[cX3;H1]","ACH"),
GCPair(raw"[cX3;H0]","AC"),
GCPair(raw"[cX3;H0][CX4;H3]","ACCH3"),
GCPair(raw"[cX3;H0][CX4;H2]","ACCH2"),
GCPair(raw"[cX3;H0][CX4;H1]","ACCH"),
GCPair(raw"[CX4;H2;R][OX2;R][CX4;H2;R]","THF"),
GCPair(raw"[CX4;H3][NX3;H2]","CH3NH2"),
GCPair(raw"[CX4;H2][NX3;H2]","CH2NH2"),
Expand Down
145 changes: 51 additions & 94 deletions src/group_search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ function get_grouplist end
get_grouplist(x::Vector{GCPair}) = x

"""
get_groups_from_smiles(smiles::String,groups,lib = GCIdentifier.DEFAULTLIB;connectivity = false)
get_groups_from_smiles(smiles::String,groups,lib = DEFAULTLIB;connectivity = false)

Given a SMILES string and a group list (`groups::Vector{GCPair}`), returns a list of groups and their corresponding amount.

Expand All @@ -112,13 +112,8 @@ end

function get_groups_from_smiles(smiles::String,groups::Vector{GCPair},lib =DEFAULTLIB;connectivity=false,check = true)
mol = get_mol(lib,smiles)
atoms = get_atoms(lib,mol)
__bonds = __getbondlist(lib,mol)
group_list = Int[]
group_id = Int[]
group_occ_list = Int[]
atoms_list = Int[]
coverage_atoms = Vector{Int}[]
coverage_bonds = Vector{Int}[]

smatches = []
smatches_idx = Int[]
Expand All @@ -136,106 +131,68 @@ function get_groups_from_smiles(smiles::String,groups::Vector{GCPair},lib =DEFAU
perm = sortperm(smatches,lt = _isless_smatch,rev = true)
smatches = smatches[perm]
smatches_idx = smatches_idx[perm]
#TODO: identify overlapped groups here?

for (idx,smatch) in pairs(smatches)
i = smatches_idx[idx]
for j in 1:length(smatch)
#add first match.
if isempty(atoms_list)
push!(group_list,i)
push!(group_id,i)
push!(group_occ_list,1)
append!(atoms_list,smatch[j]["atoms"])
push!(coverage_atoms,copy(smatch[j]["atoms"]))
push!(coverage_bonds,copy(smatch[j]["bonds"]))
continue #go to next iteration
end

# If no atoms covered by this group are already covered by other groups
atoms_covered_j = count(in(atoms_list),smatch[j]["atoms"])
if atoms_covered_j == 0
append!(atoms_list,smatch[j]["atoms"])
if !(i in group_list)
push!(group_list,i)
push!(group_id,i)
push!(group_occ_list,1)
push!(coverage_atoms,copy(smatch[j]["atoms"]))
push!(coverage_bonds,copy(smatch[j]["bonds"]))
else
group_occ_list[end] += 1
append!(coverage_atoms[end],smatch[j]["atoms"])
append!(coverage_bonds[end],smatch[j]["bonds"])
end
continue
end
# Expand smatches so that groups are listed
smatches_expanded = [smatches[i][j] for i in 1:length(smatches) for j in 1:length(smatches[i])]
smatches_idx_expanded = [smatches_idx[i] for i in 1:length(smatches) for j in 1:length(smatches[i])]

ngroups = length(smatches_expanded)
natoms = length(atoms)

# Create a matrix with the atoms that are in each group
bond_mat = zeros(Int64, ngroups, natoms)
for i in 1:ngroups
for j in 1:length(smatches_expanded[i]["atoms"])
bond_mat[i, smatches_expanded[i]["atoms"][j]+1] = 1
end
end

# Check which groups group i has an overlap with
id = 0
ng_rm = 0
for k in 1:length(group_id)
id += 1

# Does group 1 cover any atoms of group id
in_coverage_j = count(in(coverage_atoms[id]), smatch[j]["atoms"])
in_coverage_j == 0 && continue

# We only care if group i covers _more_ atoms than group k
length(smatch[j]["atoms"]) < length(coverage_atoms[id]) && continue

if ((length(smatch[j]["atoms"])>length(coverage_atoms[id])) &
# Also make sure that group i covers all the atoms of group k
(in_coverage_j == length(coverage_atoms[id]))) |
(length(smatch[j]["bonds"])>length(coverage_bonds[id]))
# find out which atoms are covered
overlap_atoms = coverage_atoms[id][coverage_atoms[id] .∈ [smatch[j]["atoms"]]]
id_rm = group_id[id]
name_rm = group_list[id]
#bond_rm = coverage_bonds[id][coverage_bonds[id] .∈ [smatch[j]["bonds"]]]
filter!(e -> e ∉ overlap_atoms,atoms_list)
filter!(e -> e ∉ overlap_atoms,coverage_atoms[id])
group_occ_list[id] -= 1

# If group k no longer covers any atoms, remove it
if group_occ_list[id] == 0
filter!(e-> e ≠ 0,group_occ_list)
filter!(e-> !isempty(e),coverage_atoms)
deleteat!(coverage_bonds,id)
filter!(e-> e ≠ id_rm,group_id)
filter!(e-> e ≠ name_rm,group_list)
id -= 1
end
ng_rm += 1
# Find all atoms that are in more than one group
overlap = findall(sum(bond_mat, dims=1)[:] .> 1)

# Split the groups in two sets: those that overlap and those that don't
overlap_groups = findall(sum(bond_mat[:, overlap], dims=2)[:] .> 0)
non_overlap_groups = findall(sum(bond_mat[:, overlap], dims=2)[:] .== 0)

if !isempty(overlap)
# Reduce the bond_mat to only the overlapping atoms
bond_mat_overlap = bond_mat[overlap_groups, overlap]

# Generate all possible combinations of groups which cover all atoms
candidate = []
for i in 1:size(bond_mat_overlap, 1)
combs = combinations(1:size(bond_mat_overlap, 1), i)
for comb in combs
# Test if the combination of groups covers all atoms
if sum(bond_mat_overlap[comb, :], dims=1) == ones(Int64, 1, size(bond_mat_overlap, 2))
push!(candidate, comb)
end
end

ng_rm == 0 && continue

if !(i in group_list)
push!(group_list,i)
push!(group_id,i)
push!(group_occ_list,1)
append!(coverage_atoms,[smatch[j]["atoms"]])
append!(coverage_bonds,[smatch[j]["bonds"]])
append!(atoms_list,smatch[j]["atoms"])
else
group_occ_list[end] += 1
append!(atoms_list,smatch[j]["atoms"])
append!(coverage_atoms[end],smatch[j]["atoms"])
append!(coverage_bonds[end],smatch[j]["bonds"])
# For the first combination that covers all atoms, stop (since it will use the fewest groups)
if length(candidate) > 0
break
end
end

# Select the groups that cover the most atoms
best_comb = overlap_groups[candidate[1]]
push!(non_overlap_groups, best_comb...)
end

bond_mat_minimum = bond_mat[non_overlap_groups, :]
group_id_expanded = smatches_idx_expanded[non_overlap_groups]

group_id = unique(group_id_expanded)
group_occ_list = [sum(group_id_expanded .== i) for i in group_id]

gcpairs = [name(groups[group_id[i]]) => group_occ_list[i] for i in 1:length(group_id)]

if check
atoms = get_atoms(lib,mol)
if !(count(in(atoms),atoms_list)==length(atoms))
if sum(bond_mat_minimum) != natoms
error("Could not find all groups for "*smiles)
end
end

gcpairs = [name(groups[group_id[i]]) => group_occ_list[i] for i in 1:length(group_id)]
#unique_groups!(gcpairs) FIXME
if connectivity
return (smiles,gcpairs,get_connectivity(mol,group_id,groups,lib))
else
Expand Down
13 changes: 4 additions & 9 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,11 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result)
unifac("O=C(CC)CC",gcstring"CH3:2;CH2:1;CH2CO:1") #pentanone-3

#aldehyde
#unifac("CCC=O",gcstring"CH3:1;CH2:1;HCO:1") #propionaldehyde , fails at the matching stage
unifac("CCC=O",gcstring"CH3:1;CH2:1;CHO:1") #propionaldehyde , fails at the matching stage

#acetate group
#unifac("CCCCOC(=O)C",gcstring"CH3:1;CH2:3;CH3COO:1") #Butyl acetate #fails at the matching stage
#unifac("O=C(OC)CC",gcstring"CH3:2;CH2COO:1") #methyl propionate #fails at the matching stage
unifac("CCCCOC(=O)C",gcstring"CH3:1;CH2:3;CH3COO:1") #Butyl acetate #fails at the matching stage
unifac("O=C(OC)CC",gcstring"CH3:2;CH2COO:1") #methyl propionate #fails at the matching stage

#formate group
unifac("O=COCC",gcstring"CH3:1;CH2:1;HCOO:1") #ethyl formate
Expand All @@ -69,7 +69,7 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result)
unifac("COC",gcstring"CH3:1;CH3O:1") #dimethyl ether
unifac("CCOCC",gcstring"CH3:2;CH2:1;CH2O:1") #diethyl ether
unifac("O(C(C)C)C(C)C",gcstring"CH3:4;CH:1;CHO:1") #diisopropyl ether
#unifac("C1CCOC1",gcstring"CH2:3;THF:1") #tetrahydrofuran #check?
unifac("C1CCOC1",gcstring"CY-CH2:2;THF:1") #tetrahydrofuran #check?

#primary amine
unifac("CN",gcstring"CH3NH2:1") #methylamine
Expand All @@ -88,11 +88,6 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result)
#aromatic amine
unifac("c1ccc(cc1)N",gcstring"ACH:5;ACNH2:1") #aniline

#piridine
#unifac("c1ccncc1",gcstring"C5H5N:1")



#furfural
unifac("c1cc(oc1)C=O",gcstring"FURFURAL:1") #furfural

Expand Down
Loading