Skip to content

Commit

Permalink
Add Container and Block for Text (#207)
Browse files Browse the repository at this point in the history
* Add basic Text module and sample recipe.

Co-authored-by: lorenzoh <lorenz.ohly@gmail.com>
Co-authored-by: Brian Chen <ToucheSir@users.noreply.github.com>
  • Loading branch information
3 people committed May 12, 2022
1 parent f0fe1a2 commit 2f227aa
Show file tree
Hide file tree
Showing 10 changed files with 160 additions and 9 deletions.
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ version = "0.4.2"

[deps]
Animations = "27a7e980-b3e6-11e9-2bcd-0b925532e340"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
ColorVectorSpace = "c3611d14-8923-5661-9e6a-0046d554d3a4"
Expand Down
17 changes: 8 additions & 9 deletions src/FastAI.jl
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ export Vision
include("Tabular/Tabular.jl")
@reexport using .Tabular

include("Textual/Textual.jl")
@reexport using .Textual

include("deprecations.jl")
export
Expand All @@ -127,16 +129,16 @@ export

include("interpretation/makie/stub.jl")
function __init__()
@require Makie="ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" begin
@require Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" begin
import .Makie as M
include("interpretation/makie/showmakie.jl")
include("interpretation/makie/lrfind.jl")
end
end

module Models
using ..FastAI.Tabular: TabularModel
using ..FastAI.Vision.Models: xresnet18, xresnet50, UNetDynamic
using ..FastAI.Tabular: TabularModel
using ..FastAI.Vision.Models: xresnet18, xresnet50, UNetDynamic
end


Expand Down Expand Up @@ -173,6 +175,7 @@ export
TableRow,
Continuous,
Image,
Paragraph,

# encodings
encode,
Expand All @@ -182,9 +185,7 @@ export
Only,
Named,
augs_projection, augs_lighting,
TabularPreprocessing,

SupervisedTask,
TabularPreprocessing, SupervisedTask,
BlockTask,
describetask,
checkblock,
Expand Down Expand Up @@ -222,9 +223,7 @@ export
lrfind,
savetaskmodel,
loadtaskmodel,
accuracy_thresh,

gpu,
accuracy_thresh, gpu,
plot


Expand Down
31 changes: 31 additions & 0 deletions src/Textual/Textual.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Text support for FastAI: the `Paragraph` block, dataset recipes for text
# datasets and basic text transforms. Each part lives in its own included file.
module Textual


using ..FastAI
using ..FastAI:
# blocks
Block, WrapperBlock, AbstractBlock, OneHotTensor, OneHotTensorMulti, Label,
LabelMulti, wrapped, Continuous, getencodings, getblocks, encodetarget, encodeinput,
# encodings
Encoding, StatefulEncoding, OneHot,
# visualization
ShowText,
# other
Context, Training, Validation, FASTAI_METHOD_REGISTRY, registerlearningtask!

# NOTE(review): `@require` is not used by any of the files included below;
# confirm whether this import is still needed.
import Requires: @require

# NOTE(review): presumably supplies the `@testset`/`@test` macros used by the
# inline tests in recipes.jl and transform.jl; confirm.
using InlineTest
# `randstring` is used by `mockblock` in blocks/text.jl.
using Random

# recipes.jl mentions `Paragraph` only inside method bodies, so it can be
# included before blocks/text.jl, where `Paragraph` is defined.
include("recipes.jl")
include("blocks/text.jl")
include("transform.jl")

# Module initialisation hook: registers the recipes listed in recipes.jl.
function __init__()
_registerrecipes()
end

export Paragraph
end

29 changes: 29 additions & 0 deletions src/Textual/blocks/text.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""
    Paragraph() <: Block

[`Block`](#) for a text paragraph containing one or more
sentences (basically, a single observation in the textual dataset).
`data` is valid for `Paragraph` if it is of type `String`.

Example valid Paragraphs:

```julia
@test checkblock(Paragraph(), "Hello world!")
@test checkblock(Paragraph(), "Hello world!, How are you?")
```

You can create a random observation using [`mockblock`](#):

{cell=main}
```julia
using FastAI
FastAI.mockblock(Paragraph())
```
"""
struct Paragraph <: Block end

# Any `String` is a valid observation for a `Paragraph` block.
function FastAI.checkblock(::Paragraph, ::String)
    return true
end
# Create a random `Paragraph` observation: a string of 10 to 40 characters
# drawn from upper/lower-case ASCII letters, digits, whitespace and basic
# punctuation. The character set previously contained typos (duplicated and
# missing letters); it now lists each alphabet exactly once.
function FastAI.mockblock(::Paragraph)
    charset = string(
        " ABCDEFGHIJKLMNOPQRSTUVWXYZ",
        " 1234567890",
        " abcdefghijklmnopqrstuvwxyz",
        "\n\t.,",
    )
    return randstring(charset, rand(10:40))
end
1 change: 1 addition & 0 deletions src/Textual/makie.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# No Makie recipes yet; plain-text display is probably the better fit for text data.
51 changes: 51 additions & 0 deletions src/Textual/recipes.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
    TextFolders(; labelfn = parentname, split = false, filefilterfn = _ -> true)

Recipe for loading a single-label text classification dataset
stored in hierarchical folder format.

## Keyword arguments

- `labelfn = parentname`: function passed to `loadfolderdata` alongside
  `loadfile` to obtain the label of each file.
- `split = false`: if `true`, files are additionally grouped into splits
  using `grandparentname`.
- `filefilterfn = _ -> true`: extra predicate on each candidate file; a file
  is loaded only if it is a text file (`istextfile`) and this returns `true`.
"""
Base.@kwdef struct TextFolders <: Datasets.DatasetRecipe
labelfn = parentname
split::Bool = false
filefilterfn = _ -> true
end

# Block types a `TextFolders` recipe yields: a text input and a class label.
function Datasets.recipeblocks(::Type{TextFolders})
    return Tuple{Paragraph,Label}
end

# Load the text files found under the directory `path` according to `recipe`.
#
# Returns `(data, blocks)` where `blocks` is `(Paragraph(), Label(classes))`
# and `classes` are the unique labels found in the data.
#
# Raises an error (via `error`) if `path` is not a directory, if no text file
# passes the filters, or if fewer than two distinct labels are found.
function Datasets.loadrecipe(recipe::TextFolders, path)
    isdir(path) || error("$path is not a directory")
    data = loadfolderdata(
        path,
        filterfn=f -> istextfile(f) && recipe.filefilterfn(f),
        loadfn=(loadfile, recipe.labelfn),
        splitfn=recipe.split ? grandparentname : nothing)

    # With `split = true` the result is indexed by split, so emptiness is
    # checked on the collection of splits rather than on the observations.
    (recipe.split ? length(data) > 0 : nobs(data) > 0) ||
        error("No text files found in $path")

    # NOTE(review): with `split = true` the classes are collected from the
    # first split only; confirm every split contains all labels.
    labels = recipe.split ? first(values(data))[2] : data[2]
    blocks = (Paragraph(), Label(unique(eachobs(labels))))
    # The message previously ended with a stray ")" after the interpolation.
    length(blocks[2].classes) > 1 ||
        error("Expected multiple different labels, got: $(blocks[2].classes)")
    return data, blocks
end

# Registering recipes

# Dataset name => recipes that can load it. For "imdb", any file whose path
# contains "tmp_clas", "tmp_lm" or "unsup" is excluded.
const RECIPES = Dict{String,Vector{Datasets.DatasetRecipe}}(
    "imdb" => [
        TextFolders(
            filefilterfn=file -> !occursin(r"tmp_clas|tmp_lm|unsup", file),
        ),
    ],
)

# Register every recipe listed in `RECIPES` under its dataset name in the
# FastAI data registry.
function _registerrecipes()
    for (datasetname, datasetrecipes) in RECIPES
        for datasetrecipe in datasetrecipes
            Datasets.registerrecipe!(
                Datasets.FASTAI_DATA_REGISTRY, datasetname, datasetrecipe)
        end
    end
end


## Tests


@testset "TextFolders [Recipe]" begin
    # The "imdb" dataset must be discoverable once the recipes are registered.
    found = finddatasets(name="imdb")
    @test length(found) >= 1
end
34 changes: 34 additions & 0 deletions src/Textual/transform.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
    replace_all_caps(t)

Replace every token of `t` written in ALL CAPS by its lowercase version,
preceded by the marker `xxup`.

A token is a whitespace-delimited run of characters. It counts as ALL CAPS
when it starts with an ASCII upper-case letter and contains no ASCII
lower-case letter, so trailing digits or punctuation are kept as part of
the token (`"WORLD!"` becomes `"xxup world!"`). Tokens that merely contain
upper-case letters (`"helloWORLD"`, `"MiXED"`) are left untouched.
"""
function replace_all_caps(t)
    # `(?<!\S)` anchors the match at the start of a token, so a run of capitals
    # in the middle of a word is not tagged. Marker insertion and lowercasing
    # happen in one pass, so only the tagged tokens are lowercased.
    return replace(
        t,
        r"(?<!\S)[A-Z]+[^a-z\s]*(?=\s|$)" => token -> string("xxup ", lowercase(token)),
    )
end

"""
    replace_sentence_case(t)

Replace tokens of `t` written in Sentence Case by their lowercase versions,
each preceded by the marker `xxmaj`.

A token matches when it is not adjacent to other word characters, starts with
an ASCII upper-case letter and ends with at least one ASCII lower-case letter
or digit.
"""
function replace_sentence_case(t)
    sentence_case = r"(?<!\w)([A-Z][A-Z0-9]*[a-z0-9]+)(?!\w)"
    # Tag and lowercase each match in a single pass.
    tag(word) = string("xxmaj ", lowercase(word))
    return replace(t, sentence_case => tag)
end

# Lowercase `t` and prepend the marker "xxbos ".
function convert_lowercase(t)
    lowered = lowercase(t)
    return "xxbos " * lowered
end


## Tests


@testset "Text Transforms" begin
    # One sample mixing Sentence Case and ALL CAPS tokens exercises all three
    # transforms.
    sample = "Hello WORLD CAPITAL Sentence Case"

    @test replace_all_caps(sample) == "Hello xxup world xxup capital Sentence Case"
    @test replace_sentence_case(sample) == "xxmaj hello WORLD CAPITAL xxmaj sentence xxmaj case"
    @test convert_lowercase(sample) == "xxbos hello world capital sentence case"
end
1 change: 1 addition & 0 deletions src/datasets/Datasets.jl
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ export

# utilities
isimagefile,
istextfile,
matches,
loadfile,
loadmask,
Expand Down
2 changes: 2 additions & 0 deletions src/datasets/containers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ function loadfile(file::String)
return FileIO.load(file)
elseif endswith(file, ".csv")
return DataFrame(CSV.File(file))
elseif endswith(file, ".txt")
return read(file, String)
else
return FileIO.load(file)
end
Expand Down
2 changes: 2 additions & 0 deletions src/datasets/load.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ matches(re::Regex) = f -> matches(re, f)
# `true` when the regular expression `re` matches `f`.
function matches(re::Regex, f)
    return match(re, f) !== nothing
end

# Case-insensitive file-extension pattern for image files.
const RE_IMAGEFILE = r".*\.(gif|jpe?g|tiff?|png|webp|bmp)$"i
function isimagefile(f)
    return matches(RE_IMAGEFILE, f)
end

# Case-insensitive file-extension pattern for text files.
const RE_TEXTFILE = r".*\.(txt|csv|json|md|html?|xml|yaml|toml)$"i
function istextfile(f)
    return matches(RE_TEXTFILE, f)
end


maskfromimage(a::AbstractArray{<:Gray{T}}, classes) where T = maskfromimage(reinterpret(T, a), classes)
Expand Down

0 comments on commit 2f227aa

Please sign in to comment.