Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove RDKit from dependencies #6

Merged
merged 5 commits into from
Feb 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ ChemicalIdentifiers = "fa4ea961-1416-484e-bda2-883ee1634ba5"
Clapeyron = "7c7805af-46cc-48c9-995b-ed0ed2dc909a"
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
MolecularGraph = "6c89ec66-9cd8-5372-9f91-fabc50dd27fd"
RDKitMinimalLib = "44044271-7623-48dc-8250-42433c44e4b7"

[weakdeps]
ChemicalIdentifiers = "fa4ea961-1416-484e-bda2-883ee1634ba5"
Expand All @@ -23,12 +22,11 @@ ChemicalIdentifiers = "0.1"
Clapeyron = "0.4,0.5"
Combinatorics = "1"
MolecularGraph = "0.14,0.15,0.16"
RDKitMinimalLib = "1"
julia = "1.6"

[extras]
Clapeyron = "7c7805af-46cc-48c9-995b-ed0ed2dc909a"
ChemicalIdentifiers = "fa4ea961-1416-484e-bda2-883ee1634ba5"
Clapeyron = "7c7805af-46cc-48c9-995b-ed0ed2dc909a"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
Expand Down
4 changes: 2 additions & 2 deletions ext/GCIdentifierChemicalIdentifiersExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ function GC.get_groups_from_name(component::String,groups;connectivity = false)
return GC.get_groups_from_name(component,groups,connectivity=connectivity)
end

function GC.get_groups_from_name(component::String,groups::Vector{GC.GCPair},lib = GC.DEFAULTLIB;connectivity=false,check = true)
function GC.get_groups_from_name(component::String,groups::Vector{GC.GCPair};connectivity=false,check = true)
res = search_chemical(component)
smiles = res.smiles
gcpairs = get_groups_from_smiles(smiles,groups,lib;connectivity=connectivity,check = check)
gcpairs = get_groups_from_smiles(smiles,groups;connectivity=connectivity,check = check)
if connectivity == true
(smiles,groups_found,connectivity) = gcpairs
return (component,groups_found,connectivity)
Expand Down
1 change: 0 additions & 1 deletion src/GCIdentifier.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
module GCIdentifier
import RDKitMinimalLib
using Combinatorics

@static if !isdefined(Base,:eachsplit)
Expand Down
38 changes: 17 additions & 21 deletions src/group_search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -87,16 +87,12 @@ function get_grouplist end
get_grouplist(x::Vector{GCPair}) = x

"""
get_groups_from_smiles(smiles::String,groups,lib = DEFAULTLIB;connectivity = false)
get_groups_from_smiles(smiles::String,groups;connectivity = false)

Given a SMILES string and a group list (`groups::Vector{GCPair}`), returns a list of groups and their corresponding amount.

If `connectivity` is true, then it will additionally return a vector containing the amount of bonds between each pair.

`lib` allows the selection of a molecular library to perform the substructure matching. the two available options are:
- `RDKitLib()` : uses `RDKit` (via the `RDKitMinimalLib.jl` package) to perform the substructure matching. Default in Linux and Mac.
- `MolecularGraphJL()` : uses `MolecularGraph.jl`to perform the substructure matching. Default in Windows (due to a bug in RDKit in this particular Operating System).

## Examples

```julia
Expand All @@ -112,13 +108,13 @@ function get_groups_from_smiles(smiles::String,groups;connectivity = false)
return get_groups_from_smiles(smiles,groups;connectivity = connectivity)
end

function get_groups_from_smiles(smiles::String,groups::Vector{GCPair},lib =DEFAULTLIB;connectivity=false,check = true)
mol = get_mol(lib,smiles)
atoms = get_atoms(lib,mol)
function get_groups_from_smiles(smiles::String,groups::Vector{GCPair};connectivity=false,check = true)
mol = get_mol(smiles)
atoms = get_atoms(mol)
natoms = length(atoms)
__bonds = __getbondlist(lib,mol)
__bonds = __getbondlist(mol)

smatches_idx_expanded, bond_mat = find_covered_atoms(mol, groups, lib, atoms, __bonds, check)
smatches_idx_expanded, bond_mat = find_covered_atoms(mol, groups, atoms, __bonds, check)
# Find all atoms that are in more than one group
overlap = findall(sum(bond_mat, dims=1)[:] .> 1)
# non_overlap = findall(sum(bond_mat, dims=1)[:] .== 1)
Expand Down Expand Up @@ -172,21 +168,21 @@ function get_groups_from_smiles(smiles::String,groups::Vector{GCPair},lib =DEFAU
end

if connectivity
return (smiles,gcpairs,get_connectivity(mol,group_id,groups,lib))
return (smiles,gcpairs,get_connectivity(mol,group_id,groups))
else
return (smiles,gcpairs)
end
end

function find_covered_atoms(mol, groups, lib, atoms, __bonds, check)
function find_covered_atoms(mol, groups, atoms, __bonds, check)
smatches = []
smatches_idx = Int[]

#step 0.a, find all groups that could get a match
for i in 1:length(groups)
query_i = get_qmol(lib,smarts(groups[i]))
if has_substruct_match(lib,mol,query_i)
push!(smatches,get_substruct_matches(lib,mol,query_i,__bonds))
query_i = get_qmol(smarts(groups[i]))
if has_substruct_match(mol,query_i)
push!(smatches,get_substruct_matches(mol,query_i,__bonds))
push!(smatches_idx,i)
end
end
Expand Down Expand Up @@ -218,7 +214,7 @@ function find_covered_atoms(mol, groups, lib, atoms, __bonds, check)
return smatches_idx_expanded, bond_mat
end

function get_connectivity(mol,group_id,groups,lib = DEFAULTLIB)
function get_connectivity(mol,group_id,groups)

ngroups = length(group_id)
A = zeros(ngroups,ngroups)
Expand All @@ -227,19 +223,19 @@ function get_connectivity(mol,group_id,groups,lib = DEFAULTLIB)
gci = groups[group_id[i]]
smart1 = smarts(gci)
smart2 = smarts(gci)
querie = get_qmol(lib,smart1*smart2)
smatch = get_substruct_matches(lib,mol,querie)
querie = get_qmol(smart1*smart2)
smatch = get_substruct_matches(mol,querie)
name_i = name(gci)
A[i,i] = length(smatch)
A[i,i] = length(unique(smatch))
if A[i,i]!=0
append!(connectivity,[(name_i,name_i)=>Int(A[i,i])])
end

for j in i+1:ngroups
gcj = groups[group_id[j]]
smart2 = smarts(gcj)
querie = get_qmol(lib,smart1*smart2)
smatch = get_substruct_matches(lib,mol,querie)
querie = get_qmol(smart1*smart2)
smatch = get_substruct_matches(mol,querie)
A[i,j] = length(smatch)
name_j = name(gcj)
if A[i,j]!=0
Expand Down
16 changes: 8 additions & 8 deletions src/missing_groups.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
find_missing_groups_from_smiles(smiles::String, groups, lib = DEFAULTLIB;max_group_size = nothing, environment=false, reduced=false)
find_missing_groups_from_smiles(smiles::String, groups;max_group_size = nothing, environment=false, reduced=false)

Given a SMILES string and a group list (`groups::Vector{GCPair}`), returns a list of potential groups (`new_groups::Vector{GCPair}`) which could cover those atoms not covered within `groups`. If no `groups` vector is provided, it will simply generate all possible groups for the molecule.

Expand Down Expand Up @@ -27,15 +27,15 @@ julia> find_missing_groups_from_smiles("CC(=O)O")
GCIdentifier.GCPair("[CX3;H0;!R](=[OX1;H0;!R])([OX2;H1;!R])", "C=O=OH")
```
"""
function find_missing_groups_from_smiles(smiles, groups=nothing, lib=MolecularGraphJL(); max_group_size=nothing, environment=false, reduced=false)
mol = get_mol(lib, smiles)
__bonds = __getbondlist(lib,mol)
atoms = get_atoms(lib,mol)
function find_missing_groups_from_smiles(smiles, groups=nothing; max_group_size=nothing, environment=false, reduced=false)
mol = get_mol(smiles)
__bonds = __getbondlist(mol)
atoms = get_atoms(mol)

if isnothing(groups)
missing_atoms = ones(Bool, length(atoms))
else
smatches_idx_expanded, atom_coverage = find_covered_atoms(mol, groups, lib, atoms, __bonds, false)
smatches_idx_expanded, atom_coverage = find_covered_atoms(mol, groups, atoms, __bonds, false)
missing_atoms = (sum(atom_coverage, dims=1) .== 0)[:]
end

Expand Down Expand Up @@ -188,9 +188,9 @@ function find_missing_groups_from_smiles(smiles, groups=nothing, lib=MolecularGr
occurrence = zeros(Int, length(unique_smarts))
for i in 1:length(unique_smarts)
push!(unique_names, names[findall(x->x==unique_smarts[i], smarts)[1]])
query_i = get_qmol(lib,unique_smarts[i])
query_i = get_qmol(unique_smarts[i])

occurrence[i] = length(get_substruct_matches(lib,mol,query_i,__bonds))
occurrence[i] = length(get_substruct_matches(mol,query_i,__bonds))

# println(unique_smarts[i], " ", unique_names[i], " ", occurrence[i])
end
Expand Down
74 changes: 6 additions & 68 deletions src/prelude.jl
Original file line number Diff line number Diff line change
@@ -1,64 +1,24 @@
#we use MolecularGraph for windows, but we need the same api.

"""
RDKitLib

Struct used to select the RDKit library (via `RDKitMinimalLib.jl` package) to perform the group search. Default in Linux and Mac.
"""
struct RDKitLib end

"""
MolecularGraphJL

Struct used to select the `MolecularGraph.jl` library to perform the group search. Default in Windows.
"""
struct MolecularGraphJL end

#get useful structure from SMILES
function get_mol(::RDKitLib,smiles)
mol = RDKitMinimalLib.get_mol(smiles)
return mol
end

function get_mol(::MolecularGraphJL,smiles)
function get_mol(smiles)
mol = MolecularGraph.smilestomol(smiles)
return mol
end

#get useful structure from SMARTS
function get_qmol(::RDKitLib,smarts)
return RDKitMinimalLib.get_qmol(smarts)
end

function get_qmol(::MolecularGraphJL,smarts)
function get_qmol(smarts)
return MolecularGraph.smartstomol(smarts)
end


#used in the check function
function get_atoms(::RDKitLib,mol)
0:(length(RDKitMinimalLib.get_substruct_matches(mol,mol)[1]["atoms"]) - 1)
end

function get_atoms(::MolecularGraphJL,mol)
function get_atoms(mol)
0:(length(mol.graph.fadjlist) - 1)
end


#check if query has an substruct match
function has_substruct_match(::RDKitLib,mol,query)
!isempty(RDKitMinimalLib.get_substruct_match(mol,query))
end

function has_substruct_match(::MolecularGraphJL,mol,query)
function has_substruct_match(mol,query)
MolecularGraph.has_substruct_match(mol,query)
end

#function used for MolecularGraph.jl. the RDKitLib version returns nothing

__getbondlist(::RDKitLib,mol) = nothing

function __getbondlist(::MolecularGraphJL,mol)
function __getbondlist(mol)
res = Set{NTuple{2,Int}}()
adjlist = mol.graph.fadjlist
for (i,xi) in pairs(adjlist)
Expand All @@ -68,21 +28,8 @@ function __getbondlist(::MolecularGraphJL,mol)
end
rvec = sort!(collect(res))
end
#parse substruct matches, use RDKitLib format.
function get_substruct_matches(::RDKitLib,mol,query,__bonds = nothing)
matches = RDKitMinimalLib.get_substruct_matches(mol,query)
#stabilize type of result
res = Dict{String,Vector{Int}}[]
for match in matches
dictᵢ = Dict{String,Vector{Int}}()
dictᵢ["atoms"] = convert(Vector{Int},match["atoms"])
dictᵢ["bonds"] = convert(Vector{Int},match["bonds"])
push!(res,dictᵢ)
end
return res
end

function get_substruct_matches(lib::MolecularGraphJL,mol,query,bonds = __getbondlist(lib,mol))
function get_substruct_matches(mol,query,bonds = __getbondlist(mol))
matches = MolecularGraph.substruct_matches(mol,query)
res = Dict{String,Vector{Int}}[]
#convert to RDKitLib expected struct.
Expand Down Expand Up @@ -118,12 +65,3 @@ function __getbonds_mg(list,atoms)
res .-= 1
return res
end
#select default molecule engine

@static if Sys.iswindows()
const DEFAULTLIB = MolecularGraphJL()
else
const DEFAULTLIB = RDKitLib()
end

export RDKitLib, MolecularGraphJL
23 changes: 11 additions & 12 deletions test/test_group_search.jl
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
function test_gcmatch(groups,smiles,result)
evaluated_rdkit = Set(get_groups_from_smiles(smiles,groups,RDKitLib(),check = false)[2])
evaluated_molgraphJL = Set(get_groups_from_smiles(smiles,groups,MolecularGraphJL(),check = false)[2])
evaluated = Set(get_groups_from_smiles(smiles,groups,check = false)[2])
reference = Set(result)
@test isequal(evaluated_rdkit,reference)
@test isequal(evaluated_molgraphJL,reference)
@test isequal(evaluated,reference)
end

test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result)

@testset "UNIFAC examples" begin
#http://www.aim.env.uea.ac.uk/aim/info/UNIFACgroups.html
unifac = test_gcmatch(UNIFACGroups)

#alkane group
unifac("CC",gcstring"CH3:2") #ethane
unifac("CCCC",gcstring"CH3:2;CH2:2") #n-butane
Expand Down Expand Up @@ -43,7 +41,7 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result)

#water
unifac("O",gcstring"H2O:1") #water

#aromatic carbon-alcohol
unifac("Oc1ccccc1",gcstring"ACH:5;ACOH:1") #phenol

Expand All @@ -57,7 +55,7 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result)
#acetate group
unifac("CCCCOC(=O)C",gcstring"CH3:1;CH2:3;CH3COO:1") #Butyl acetate #fails at the matching stage
unifac("O=C(OC)CC",gcstring"CH3:2;CH2COO:1") #methyl propionate #fails at the matching stage

#formate group
unifac("O=COCC",gcstring"CH3:1;CH2:1;HCOO:1") #ethyl formate

Expand All @@ -66,7 +64,7 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result)
unifac("CCOCC",gcstring"CH3:2;CH2:1;CH2O:1") #diethyl ether
unifac("O(C(C)C)C(C)C",gcstring"CH3:4;CH:1;CHO:1") #diisopropyl ether
unifac("C1CCOC1",gcstring"CY-CH2:2;THF:1") #tetrahydrofuran #check?

#primary amine
unifac("CN",gcstring"CH3NH2:1") #methylamine
unifac("CCN",gcstring"CH3:1;CH2NH2:1") #ethylamine
Expand All @@ -80,10 +78,10 @@ test_gcmatch(groups) = (smiles,result) -> test_gcmatch(groups,smiles,result)
#tertiary amine
unifac("CN(C)C",gcstring"CH3:2;CH3N:1") #trimethylamine
unifac("CCN(CC)CC",gcstring"CH3:3;CH2:2;CH2N:1") #triethylamine

#aromatic amine
unifac("c1ccc(cc1)N",gcstring"ACH:5;ACNH2:1") #aniline

#furfural
unifac("c1cc(oc1)C=O",gcstring"FURFURAL:1") #furfural

Expand All @@ -98,6 +96,7 @@ end
# Test a highly branched hydrocarbon
smiles = "c1ccccc1C(CCCC)(CCCC(C))C"
(component, groups, connectivity) = get_groups_from_smiles("c1ccccc1C(CC)(CC(C))C", gcPPCSAFTGroups; connectivity=true)
@test isequal([ "CH3" => 3,"CH2" => 3,"C" => 1,"aCH" => 5,"aC" => 1],groups)
@test isequal([("CH3", "CH2") => 2,("CH3", "C") => 1,("CH2", "CH2") => 1,("CH2", "C") => 2,("C", "aC") => 1,("aCH", "aCH") => 4,("aCH", "aC") => 2], connectivity)

@test isequal([ "CH3" => 3,"CH2" => 3,"C" => 1,"aCH" => 5,"aC" => 1] |> Set,Set(groups))
@test isequal([("CH3", "CH2") => 2,("CH3", "C") => 1,("CH2", "CH2") => 1,("CH2", "C") => 2,("C", "aC") => 1,("aCH", "aCH") => 4,("aCH", "aC") => 2] |> Set,Set(connectivity))
end
Loading