-
-
Notifications
You must be signed in to change notification settings - Fork 52
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Container and Block for Text (#207)
* Add basic Text module and sample recipe. Co-authored-by: lorenzoh <lorenz.ohly@gmail.com> Co-authored-by: Brian Chen <ToucheSir@users.noreply.github.com>
- Loading branch information
1 parent
f0fe1a2
commit 2f227aa
Showing
10 changed files
with
160 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Text support for FastAI. Includes `recipes.jl`, `blocks/text.jl` and
# `transform.jl`, and calls `_registerrecipes()` when the module is initialized.
module Textual

export Paragraph

using ..FastAI
using ..FastAI:
    # blocks
    Block, WrapperBlock, AbstractBlock,
    OneHotTensor, OneHotTensorMulti,
    Label, LabelMulti, Continuous,
    wrapped, getencodings, getblocks, encodetarget, encodeinput,
    # encodings
    Encoding, StatefulEncoding, OneHot,
    # visualization
    ShowText,
    # other
    Context, Training, Validation,
    FASTAI_METHOD_REGISTRY, registerlearningtask!

import Requires: @require

using InlineTest
using Random

include("recipes.jl")
include("blocks/text.jl")
include("transform.jl")

# Module initializer: registers the dataset recipes defined in this module.
__init__() = _registerrecipes()

end
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
""" | ||
Paragraph() <: Block | ||
[`Block`](#) for a text paragraph containing one or more | ||
sentences (basically, a single observation in the textual dataset). | ||
`data` is valid for `Paragraph` if it is of type string. | ||
Example valid Paragraphs: | ||
```julia | ||
@test checkblock(Paragraph(), "Hello world!") | ||
@test checkblock(Paragraph(), "Hello world!, How are you?") | ||
``` | ||
You can create a random observation using [`mockblock`](#): | ||
{cell=main} | ||
```julia | ||
using FastAI | ||
FastAI.mockblock(Paragraph()) | ||
``` | ||
""" | ||
|
||
struct Paragraph <: Block end | ||
|
||
# Any `String` is accepted as a valid observation for a `Paragraph` block.
function FastAI.checkblock(::Paragraph, ::String)
    return true
end
# Create a mock observation for a `Paragraph` block: a random string of
# 10 to 40 characters drawn from ASCII letters, digits, whitespace and
# basic punctuation.
function FastAI.mockblock(::Paragraph)
    # The full alphabet is spelled out so that every letter can be sampled;
    # spaces separate the groups and are themselves part of the sample set.
    chars = string(
        " ABCDEFGHIJKLMNOPQRSTUVWXYZ",
        " 1234567890",
        " abcdefghijklmnopqrstuvwxyz",
        "\n\t.,",
    )
    return randstring(chars, rand(10:40))
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# # No Makie recipes yet, text is better I guess |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
""" | ||
TextFolders(textfile; labelfn = parentname, split = false) | ||
Recipe for loading a single-label text classification dataset | ||
stored in hierarchical folder format. | ||
""" | ||
Base.@kwdef struct TextFolders <: Datasets.DatasetRecipe | ||
labelfn = parentname | ||
split::Bool = false | ||
filefilterfn = _ -> true | ||
end | ||
|
||
# Block types of the data loaded by a `TextFolders` recipe: a `Paragraph`
# observation paired with its `Label`.
function Datasets.recipeblocks(::Type{TextFolders})
    return Tuple{Paragraph,Label}
end
|
||
"""
    Datasets.loadrecipe(recipe::TextFolders, path)

Load the text classification dataset stored in the directory `path` and
return a tuple `(data, blocks)`, where `blocks` is
`(Paragraph(), Label(classes))`.

Throws an error if `path` is not a directory, if no text files are found,
or if fewer than two distinct labels are found.
"""
function Datasets.loadrecipe(recipe::TextFolders, path)
    isdir(path) || error("$path is not a directory")

    data = loadfolderdata(
        path;
        filterfn=f -> istextfile(f) && recipe.filefilterfn(f),
        loadfn=(loadfile, recipe.labelfn),
        splitfn=recipe.split ? grandparentname : nothing,
    )

    # With `split`, `data` is a keyed collection of splits rather than a
    # single data container, so emptiness is checked with `length`.
    nfound = recipe.split ? length(data) : nobs(data)
    nfound > 0 || error("No text files found in $path")

    # With `split`, the classes are read from the labels of the first split.
    labels = recipe.split ? first(values(data))[2] : data[2]
    blocks = (Paragraph(), Label(unique(eachobs(labels))))
    length(blocks[2].classes) > 1 ||
        error("Expected multiple different labels, got: $(blocks[2].classes)")
    return data, blocks
end
|
||
# Registering recipes | ||
|
||
# Maps a dataset name to the recipes registered for it. For "imdb", files
# whose path contains `tmp_clas`, `tmp_lm` or `unsup` are filtered out.
const RECIPES = Dict{String,Vector{Datasets.DatasetRecipe}}(
    "imdb" => [
        TextFolders(;
            filefilterfn=path -> !occursin(r"tmp_clas|tmp_lm|unsup", path),
        ),
    ],
)
|
||
# Register every recipe in `RECIPES` under its dataset name in
# `Datasets.FASTAI_DATA_REGISTRY`.
function _registerrecipes()
    for (name, recipes) in RECIPES
        for recipe in recipes
            Datasets.registerrecipe!(Datasets.FASTAI_DATA_REGISTRY, name, recipe)
        end
    end
end
|
||
|
||
## Tests | ||
|
||
|
||
@testset "TextFolders [Recipe]" begin
    # At least one entry must be found when searching for the name "imdb".
    found = finddatasets(; name="imdb")
    @test length(found) >= 1
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
""" | ||
replace_all_caps(String) | ||
Replace tokens in ALL CAPS by their lower version and add xxup before. | ||
""" | ||
|
||
function replace_all_caps(t) | ||
t = replace(t, r"([A-Z]+[^a-z\s]*)(?=(\s|$))" => s"xxup \1") | ||
return replace(t, r"([A-Z]*[^a-z\s]+)(?=(\s|$))" => lowercase) | ||
end | ||
|
||
""" | ||
replace_sentence_case(String) | ||
Replace tokens in Sentence Case by their lower verions and add xxmaj before. | ||
""" | ||
function replace_sentence_case(t) | ||
t = replace(t, r"(?<!\w)([A-Z][A-Z0-9]*[a-z0-9]+)(?!\w)" => s"xxmaj \1") | ||
return replace(t, r"(?<!\w)([A-Z][A-Z0-9]*[a-z0-9]+)(?!\w)" => lowercase) | ||
end | ||
|
||
# Lowercase `t` and prepend the `xxbos` marker followed by a space.
function convert_lowercase(t)
    lowered = lowercase(t)
    return string("xxbos ", lowered)
end
|
||
|
||
## Tests | ||
|
||
|
||
@testset "Text Transforms" begin
    # One sample mixing Sentence Case and ALL CAPS tokens exercises all
    # three transforms.
    sample = "Hello WORLD CAPITAL Sentence Case"

    @test replace_all_caps(sample) == "Hello xxup world xxup capital Sentence Case"
    @test replace_sentence_case(sample) ==
        "xxmaj hello WORLD CAPITAL xxmaj sentence xxmaj case"
    @test convert_lowercase(sample) == "xxbos hello world capital sentence case"
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -66,6 +66,7 @@ export | |
|
||
# utilities | ||
isimagefile, | ||
istextfile, | ||
matches, | ||
loadfile, | ||
loadmask, | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters