In [6]:
using TextAnalysis, CorpusLoaders, MultiResolutionIterators, LinearAlgebra

In [7]:
# Load the "test" split of the CoNLL corpus, then flatten its `:document` level and
# consolidate the result so it can be iterated sentence by sentence below.
test_set = CorpusLoaders.load(CoNLL(), "test")
test_dataset = full_consolidate(flatten_levels(test_set, lvls(CoNLL, :document)))

"""
    obtain_X_Y(dataset)

Split `dataset` (an iterable of sentences) into parallel inputs and targets.

Returns `(X, Y)` where `X[i]` holds `CorpusLoaders.word` applied to every token of the
`i`-th sentence and `Y[i]` holds that sentence's `CorpusLoaders.named_entity` values
passed through `TextAnalysis.remove_ner_label_prefix`.
"""
function obtain_X_Y(dataset)
    words_of(sentence) = CorpusLoaders.word.(sentence)
    tags_of(sentence) =
        TextAnalysis.remove_ner_label_prefix.(CorpusLoaders.named_entity.(sentence))

    inputs = [words_of(sentence) for sentence in dataset]
    targets = [tags_of(sentence) for sentence in dataset]
    return inputs, targets
end

# X: per-sentence token vectors; Y: the matching per-sentence NER tags from `obtain_X_Y`.
X, Y = obtain_X_Y(test_dataset)

(Array{String,1}[["SOCCER", "-", "JAPAN", "GET", "LUCKY", "WIN", ",", "CHINA", "IN", "SURPRISE", "DEFEAT", "."], ["Nadim", "Ladki"], ["AL-AIN", ",", "United", "Arab", "Emirates", "1996-12-06"], ["Japan", "began", "the", "defence", "of", "their", "Asian", "Cup", "title", "with"  …  "Syria", "in", "a", "Group", "C", "championship", "match", "on", "Friday", "."], ["But", "China", "saw", "their", "luck", "desert", "them", "in", "the", "second"  …  "crashing", "to", "a", "surprise", "2-0", "defeat", "to", "newcomers", "Uzbekistan", "."], ["China", "controlled", "most", "of", "the", "match", "and", "saw", "several", "chances"  …  "the", "advancing", "Chinese", "keeper", "and", "into", "an", "empty", "net", "."], ["Oleg", "Shatskiku", "made", "sure", "of", "the", "win", "in", "injury", "time"  …  "unstoppable", "left", "foot", "shot", "from", "just", "outside", "the", "area", "."], ["The", "former", "Soviet", "republic", "was", "playing", "in", "an", "Asian", "Cup", "finals", "tie", "for", "t

In [8]:
# Tagger under evaluation. NOTE: `try_outs` also reads `ner.model.labels` from this global.
ner = NERTagger()

"""
    eval_ner_tagger(ner_m, x_seq)

Adapter with the `(model, tokens)` signature that `try_outs` expects: call `ner_m`
directly on the token sequence `x_seq` and return whatever it returns.
"""
eval_ner_tagger(ner_m, x_seq) = ner_m(x_seq)


eval_ner_tagger (generic function with 1 method)

In [9]:
"""
    try_outs(ner_m, x_in, y_in, eval_func; labels = unique(ner.model.labels))

Score a token-level NER tagger against gold tags and print per-tag precision, recall
and F1, followed by two aggregate F1 figures. Returns `nothing`.

# Arguments
- `ner_m`: the model object; it is only ever passed on to `eval_func`.
- `x_in`: iterable of token sequences (one per sentence).
- `y_in`: iterable of gold tag sequences, parallel to `x_in`.
- `eval_func`: callable `(ner_m, x_seq) -> predictions`, one predicted tag per token.

# Keywords
- `labels`: the tag inventory used to index the confusion matrix. Defaults to the labels
  of the global `ner` tagger, which is what this function always used before the keyword
  existed. Pass it explicitly to evaluate without that global.

# Notes
- A sentence is skipped when the number of predictions differs from the number of gold
  tags.
- Tokens whose gold tag is `"MISC"` or whose prediction is `"INVALID"` are not counted,
  and `"MISC"` is left out of the report.
- A predicted or gold tag that is missing from `labels` raises a `KeyError`.
"""
function try_outs(ner_m, x_in, y_in, eval_func; labels = unique(ner.model.labels))
    label_set = unique(labels)
    # One dictionary lookup per token instead of a linear `findfirst` scan.
    label_index = Dict(label => i for (i, label) in enumerate(label_set))
    num_labels = length(label_set)
    # Rows are predicted tags, columns are gold tags.
    confusion_matrix = zeros(Int, num_labels, num_labels)

    for (x_seq, y_seq) in zip(x_in, y_in)
        preds = eval_func(ner_m, x_seq)
        # Predictions that do not line up one-to-one with the gold tags cannot be scored.
        length(preds) != length(y_seq) && continue

        for (pred, gold) in zip(preds, y_seq)
            (gold == "MISC" || pred == "INVALID") && continue
            confusion_matrix[label_index[pred], label_index[gold]] += 1
        end
    end

    # Don't count MISC: keep every label position except MISC's, wherever it sits.
    kept = findall(label -> label != "MISC", label_set)
    reported_labels = label_set[kept]
    predicted_totals = vec(sum(confusion_matrix, dims=2))[kept]  # precision denominators
    gold_totals = vec(sum(confusion_matrix, dims=1))[kept]       # recall denominators
    correct = diag(confusion_matrix)[kept]

    f1s = Float64[]

    for (p, r, d, tag) in zip(predicted_totals, gold_totals, correct, reported_labels)
        println("For tag `$tag`")
        prec = d / p
        recall = d / r
        f1 = (2 * prec * recall) / (prec + recall)
        println("The precision is $prec")
        println("The recall is $recall")
        println("f1 is $f1")
        println()
        push!(f1s, f1)
    end

    # NOTE(review): the figure printed as "Micro f1" is the harmonic mean of the
    # macro-averaged precision and recall, not a micro-average over pooled counts.
    # The computation and message are kept as they were so results stay comparable
    # with previously recorded runs.
    a = sum(correct ./ predicted_totals) / length(reported_labels)
    b = sum(correct ./ gold_totals) / length(reported_labels)
    println("Overall Micro f1 for NER (excluding MISC) on CoNLL 2003 is ", (2 * a * b) / (a + b))
    println("Overall Macro f1 for NER (excluding MISC) on CoNLL 2003 is ", sum(f1s) / length(f1s))
    return nothing
end

try_outs (generic function with 1 method)

In [10]:
# Score the `ner` tagger on the test sentences.
try_outs(ner, X, Y, eval_ner_tagger)

For tag `ORG`
The precision is 0.8897058823529411
The recall is 0.8241185897435898
f1 is 0.855657237936772

For tag `O`
The precision is 0.9901429018462501
The recall is 0.9907888213344467
f1 is 0.9904657562833405

For tag `PER`
The precision is 0.9708029197080292
The recall is 0.9592499098449333
f1 is 0.9649918374750589

For tag `LOC`
The precision is 0.9051504334523203
The recall is 0.922077922077922
f1 is 0.9135357694287184

Overall Micro f1 for NER (excluding MISC) on CoNLL 2003 is 0.9314451550144375
Overall Macro f1 for NER (excluding MISC) on CoNLL 2003 is 0.9311626502809724


In [11]:
using PyCall, WordTokenizers
# Import the Python `spacy` module via PyCall and load its "en_core_web_sm" pipeline.
spacy = pyimport("spacy")
nlp = spacy.load("en_core_web_sm")

PyObject <spacy.lang.en.English object at 0x7f7b74043ac8>

In [20]:
"""
    eval_spacy_tagger(ner_m, x_seq)

Produce one tag per token of `x_seq` from the entities that `ner_m` reports.

`ner_m` is called on the tokens joined by single spaces and the `.ents` of its result
are walked in order. When the current token equals the first `tokenize`d piece of the
next pending entity, that entity's label is emitted once per tokenized piece and the
token cursor jumps past it. Otherwise `"O"` is emitted for the token.

Label mapping: `"PERSON"` becomes `"PER"`, `"LOC"` and `"GPE"` become `"LOC"`, `"ORG"`
stays `"ORG"`, and anything else becomes `"INVALID"`.

NOTE(review): an entity whose first piece never matches a token is never skipped, so
later entities in that sentence are not emitted either. The result can also be longer
than `x_seq` when an entity tokenizes into more pieces than there are tokens left.
"""
function eval_spacy_tagger(ner_m, x_seq)
    doc_ents = ner_m(join(x_seq, " ")).ents
    tags = String[]

    ent_no = 1   # next entity waiting to be aligned
    pos = 1      # current position in x_seq
    while pos <= length(x_seq)
        ent_pieces = ent_no <= length(doc_ents) ? tokenize(doc_ents[ent_no].text) : nothing

        if ent_pieces !== nothing && x_seq[pos] == ent_pieces[1]
            span = length(ent_pieces)
            raw_label = doc_ents[ent_no].label_
            tag = if raw_label == "PERSON"
                "PER"
            elseif raw_label == "LOC" || raw_label == "GPE"
                "LOC"
            elseif raw_label == "ORG"
                "ORG"
            else
                "INVALID"
            end

            append!(tags, fill(tag, span))
            pos += span
            ent_no += 1
        else
            push!(tags, "O")
            pos += 1
        end
    end

    return tags
end


eval_spacy_tagger (generic function with 1 method)

In [21]:
# Score the spaCy pipeline on the same sentences; labels still come from the global `ner`.
try_outs(nlp, X, Y, eval_spacy_tagger)

For tag `ORG`
The precision is 0.5077850326469111
The recall is 0.4298469387755102
f1 is 0.4655767902371633

For tag `O`
The precision is 0.9367631609664994
The recall is 0.9720800654089574
f1 is 0.9540949012497958

For tag `PER`
The precision is 0.744390243902439
The recall is 0.5626843657817109
f1 is 0.6409071818563629

For tag `LOC`
The precision is 0.706060606060606
The recall is 0.6223290598290598
f1 is 0.6615559341283361

Overall Micro f1 for NER (excluding MISC) on CoNLL 2003 is 0.6830785077457464
Overall Macro f1 for NER (excluding MISC) on CoNLL 2003 is 0.6805337018679145


In [14]:
# Import the Python `nltk` module via PyCall and load its multiclass NE chunker.
nltk = pyimport("nltk")
nltk_chunker = nltk.load(nltk.chunk._MULTICLASS_NE_CHUNKER)

"""
    nltk_ner(x)

POS-tag the token sequence `x` with `nltk.pos_tag` and pass the result to the tagger
held by `nltk_chunker`, returning that tagger's output.
"""
function nltk_ner(x)
    pos_tagged = nltk.pos_tag(x)
    return nltk_chunker._tagger.tag(pos_tagged)
end

nltk_ner (generic function with 1 method)

In [15]:
"""
    eval_nltk_tagger(ner_m, x_seq)

Run `ner_m` on `x_seq` and convert its output to the tag set used by `try_outs`.

Each element returned by `ner_m` is indexed at position 2 to obtain its tag string.
`"O"` is kept as is; for any other tag the first two characters are dropped before
mapping: `"PERSON"` becomes `"PER"`, `"ORGANIZATION"` becomes `"ORG"`, `"LOCATION"`
and `"GPE"` become `"LOC"`, and everything else becomes `"INVALID"`.
"""
function eval_nltk_tagger(ner_m, x_seq)
    # Drop the two-character prefix in front of the entity type; "O" has none.
    bare_tag(item) = item[2] == "O" ? "O" : item[2][3:end]

    function to_conll(tag)
        tag == "PERSON" && return "PER"
        tag == "ORGANIZATION" && return "ORG"
        (tag == "LOCATION" || tag == "GPE") && return "LOC"
        return "INVALID"
    end

    preds = bare_tag.(ner_m(x_seq))
    for i in eachindex(preds)
        if preds[i] != "O"
            preds[i] = to_conll(preds[i])
        end
    end
    return preds
end


eval_nltk_tagger (generic function with 1 method)

In [16]:
# Score the NLTK chunker on the same sentences; labels still come from the global `ner`.
try_outs(nltk_ner, X, Y, eval_nltk_tagger)

For tag `ORG`
The precision is 0.5396301188903567
The recall is 0.3294354838709677
f1 is 0.4091136705057587

For tag `O`
The precision is 0.9622844001533154
The recall is 0.9827761684803883
f1 is 0.9724223410024013

For tag `PER`
The precision is 0.7286072323666308
The recall is 0.7349223546406645
f1 is 0.7317511686443725

For tag `LOC`
The precision is 0.6288187372708758
The recall is 0.6719260065288357
f1 is 0.6496580746975277

Overall Micro f1 for NER (excluding MISC) on CoNLL 2003 is 0.696859107260689
Overall Macro f1 for NER (excluding MISC) on CoNLL 2003 is 0.6907363137125151
