Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Indexing #87

Merged
merged 3 commits into from
Oct 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/src/samples_features.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ and contains some number of `AbstractFeature`s, such as taxa or gene functions.

At its most basic, an `AbstractSample` simply encodes a `name`
(which should be a unique identifier), and a place to hold metadata.
The concrete type `MicrobiomeSample` is implemented with these two fields,
The concrete type [`MicrobiomeSample`](@ref) is implemented with these two fields,
the latter of which is a `Dictionary` from [`Dictionaries.jl`](https://github.com/andyferris/Dictionaries.jl).

```@docs
Expand All @@ -19,7 +19,7 @@ MicrobiomeSample
## Feature Types

`AbstractFeature` types also have a `name`, but other fields are optional.
`Microbiome.jl` defines two concrete `AbstractFeature` types, `Taxon` and `GeneFunction`.
`Microbiome.jl` defines two concrete `AbstractFeature` types, [`Taxon`](@ref) and [`GeneFunction`](@ref).

```@docs
Taxon
Expand Down
4 changes: 2 additions & 2 deletions src/comm_joins.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,13 @@ function commjoin(c1::CommunityProfile, comms::CommunityProfile...)
length(intersect(samplenames(c1), samplenames.(comms)...)) == 0 || error("Duplicate sample names detected: $(intersect(samplenames(c1), samplenames.(comms)...))")

all_samples = vcat(samples(c1), samples.(comms)...)
sample_dict = dictionary(zip(name.(all_samples), eachindex(all_samples)))
sample_dict = dictionary(zip(all_samples, eachindex(all_samples)))
all_features = unique(vcat(features(c1), features.(comms)...))
feature_dict = dictionary(zip(all_features, eachindex(all_features)))

mat = spzeros(length(all_features), length(all_samples))
for comm in (c1, comms...)
for sample in samplenames(comm)
for sample in samples(comm)
for feature in features(comm)
mat[feature_dict[feature], sample_dict[sample]] = comm[feature, sample]
end
Expand Down
42 changes: 33 additions & 9 deletions src/profiles.jl
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,12 @@ samples(at::AbstractAbundanceTable) = axes(at.aa, 2) |> keys

Returns sample in `at` with name `name`.
"""
samples(at::AbstractAbundanceTable, name::AbstractString) = samples(at)[axes(at.aa, 2)[name]]
function samples(at::AbstractAbundanceTable, name::AbstractString)
idx = findall(==(name), samplenames(at))
length(idx) == 0 && throw(IndexError("No samples called $name"))
length(idx) > 1 && throw(IndexError("More than one sample matches name $name"))
return samples(at)[axes(at.aa, 2)][first(idx)]
end

profiletype(at::AbstractAbundanceTable) = eltype(features(at))
clades(at::AbstractAbundanceTable) = clade.(features(at))
Expand Down Expand Up @@ -132,27 +137,46 @@ function _index_profile(at, idx, inds)
end
end

function _toinds(arr, inds::AbstractVector{Regex})
return findall(a-> any(ind-> contains(a, ind), inds), arr)
end

function _toinds(arr, inds::AbstractVector{<: Union{AbstractSample, AbstractFeature, AbstractString}})
return findall(a-> any(==(a), inds), arr)
end

# fall back ↑
_toinds(arr, ind::Union{AbstractSample, AbstractFeature, AbstractString, Regex}) = _toinds(arr, [ind])

# if inds are integers, just return them
_toinds(_, ind::Int) = ind
_toinds(_, inds::AbstractVector{Int}) = inds

function Base.getindex(at::CommunityProfile, inds...)
idx = at.aa[inds...]

_index_profile(at, idx, inds)
end

## Special lookup for gene function comm profiles
function Base.getindex(at::CommunityProfile{<:Real, <:GeneFunction, <:AbstractSample}, rowind::AbstractString, colind)
rows = findall(==(rowind), featurenames(at))
function Base.getindex(at::CommunityProfile, rowind::Union{T, AbstractVector{<:T}} where T<:Union{AbstractString,Regex}, colind)
rows = _toinds(featurenames(at), rowind)
idx = at.aa[rows, colind]

_index_profile(at, idx, (rowind, colind))
_index_profile(at, idx, (rows, colind))
end

function Base.getindex(at::CommunityProfile{<:Real, <:GeneFunction, <:AbstractSample}, rowind::AbstractVector{<:AbstractString}, colind)
rows = findall(fn-> any(==(fn), rowind), featurenames(at))
idx = at.aa[rows, colind]
function Base.getindex(at::CommunityProfile, rowind, colind::Union{T, AbstractVector{<:T}} where T<:Union{AbstractString,Regex})
cols = _toinds(samplenames(at), colind)
idx = at.aa[rowind, cols]

_index_profile(at, idx, (rowind, colind))
_index_profile(at, idx, (rowind, cols))
end

function Base.getindex(at::CommunityProfile, rowind::Union{T, AbstractVector{<:T}} where T<:Union{AbstractString,Regex},
colind::Union{S, AbstractVector{<:S}} where S<:Union{AbstractString,Regex})
rows = _toinds(featurenames(at), rowind)
at[rows, colind]
end

## -- EcoBase Translations -- ##
# see src/ecobase.jl for Microbiome function names
Expand Down
18 changes: 3 additions & 15 deletions src/samples_features.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,9 @@ metadata(as::AbstractSample) = as.metadata

name(as::AbstractFeature) = as.name

# Allows Strings to be used to index (since `Thing("name", missing)` will be == `Thing("name", something)`)
Base.String(as::AbstractSample) = name(as)
Base.String(af::AbstractFeature) = name(af)

Base.:(==)(s1::T, s2::T) where {T <: Union{AbstractSample, AbstractFeature}} = name(s1) == name(s2)

## getters and setters for metadata ##
function Base.getindex(as::AbstractSample, prop::Symbol)
prop in _restricted_fields(as) && error("Do not use getindex to access $prop of $(typeof(as)). Use accessor function or getfield instead.")
Expand Down Expand Up @@ -326,6 +323,9 @@ Taxon(n::AbstractString, clade::Int) = 0 <= clade <= 9 ?
error("Invalid clade $clade, must be one of $_clades")
Taxon(n::AbstractString) = Taxon(n, missing)

Base.String(t::Taxon) = hasclade(t) ? string(first(string(clade(t))), "__", name(t)) : name(t)


"""
clade(t::Union{Taxon, missing})

Expand Down Expand Up @@ -386,15 +386,3 @@ clade(gf::GeneFunction) = clade(taxon(gf))
Pretty self-explanatory.
"""
hasclade(gf::GeneFunction) = hastaxon(gf) && !ismissing(clade(gf))


# override equality for GeneFunction
function Base.:(==)(gf1::GeneFunction, gf2::GeneFunction)
if all(!hastaxon, (gf1, gf2)) # if gf doesn't have taxon, check that names match
return name(gf1) == name(gf2)
elseif !all(hastaxon, (gf1, gf2)) # if one gf has taxon and other doesn't, return false
return false
else # if both have taxa, check that names of gf and taxa match
return name(gf1) == name(gf2) && taxon(gf1) == taxon(gf2)
end
end
10 changes: 6 additions & 4 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ import Microbiome.MultivariateStats: MDS
@test clade(tx) == c
@test tx === Taxon("taxon", i-1)
@test tx !== txm
@test tx == txm
end

@test_throws ErrorException Taxon("taxon", :invalid)
Expand All @@ -70,14 +69,14 @@ import Microbiome.MultivariateStats: MDS
gf2 = GeneFunction("gene", Taxon("sp1"))
@test name(gf1) == "gene"

@test gf1 == gf2
@test gf1 != gfm
@test gf1 === GeneFunction("gene", Taxon("sp1", :species))
@test gf2 === GeneFunction("gene", Taxon("sp1"))
@test gf1 !== gf2
@test hastaxon(gf1)
@test !ismissing(taxon(gf1))
@test taxon(gf1) == taxon(gf2)
@test taxon(gf1) == Taxon("sp1", :species)
@test taxon(gf1) != taxon(gf2)
@test hasclade(gf1)
@test clade(gf1) == :species
end
Expand Down Expand Up @@ -126,7 +125,7 @@ end
@test_throws ErrorException cladefilter(comm, 10)
@test_throws ErrorException cladefilter(cladefilter(comm, :species), :genus) # will be empty

@test filter(f-> hasclade(f) && clade(f) == :species, comm) == cladefilter(comm, :species)
@test featurenames(filter(f-> hasclade(f) && clade(f) == :species, comm)) == featurenames(cladefilter(comm, :species))

@test present(0.1)
@test !present(0.1, 0.2)
Expand Down Expand Up @@ -256,6 +255,9 @@ end
@test abundances(comm["taxon$i", :]) == mat[[i], :]
end

@test abundances(comm[r"taxon1", :]) == abundances(comm[["taxon1", "taxon10"], :]) == abundances(comm[[1,10], :])
@test abundances(comm[:, r"sample[13]"]) == abundances(comm[:,["sample1", "sample3"]]) == abundances(comm[:, [1,3]])

for (i, col) in enumerate(Tables.columns(comm))
if i == 1
@test col == name.(txs)
Expand Down