In [1]:
using TextAnalysis, CorpusLoaders, MultiResolutionIterators, LinearAlgebra

┌ Info: Recompiling stale cache file /home/ayushk4/.julia/compiled/v1.0/TextAnalysis/5Mwet.ji for TextAnalysis [a2db99b7-8b79-58f8-94bf-bbc811eef33d]
└ @ Base loading.jl:1190
│ - If you have TextAnalysis checked out for development and have
│   added Libdl as a dependency but haven't updated your primary
│   environment's manifest file, try `Pkg.resolve()`.
│ - Otherwise you may need to report an issue with TextAnalysis


In [3]:
test_set = CorpusLoaders.load(CoNLL(), "test") # test set
test_dataset = flatten_levels(test_set, lvls(CoNLL, :document)) |> full_consolidate

function obtain_X_Y(dataset)
    X = [CorpusLoaders.word.(sent) for sent in dataset]
    Y = [TextAnalysis.remove_ner_label_prefix.(CorpusLoaders.named_entity.(sent)) for sent in dataset]
    return X, Y
end

X, Y = obtain_X_Y(test_dataset)

CorpusLoaders.Document{Array{Array{CorpusLoaders.NERTaggedWord,1},1},String}("test.txt", Array{CorpusLoaders.NERTaggedWord,1}[[NERTaggedWord("O", "B-NP", "NN", "SOCCER"), NERTaggedWord("O", "O", ":", "-"), NERTaggedWord("B-LOC", "B-NP", "NNP", "JAPAN"), NERTaggedWord("O", "B-VP", "VB", "GET"), NERTaggedWord("O", "B-NP", "NNP", "LUCKY"), NERTaggedWord("O", "I-NP", "NNP", "WIN"), NERTaggedWord("O", "O", ",", ","), NERTaggedWord("B-PER", "B-NP", "NNP", "CHINA"), NERTaggedWord("O", "B-PP", "IN", "IN"), NERTaggedWord("O", "B-NP", "DT", "SURPRISE"), NERTaggedWord("O", "I-NP", "NN", "DEFEAT"), NERTaggedWord("O", "O", ".", ".")], [NERTaggedWord("B-PER", "B-NP", "NNP", "Nadim"), NERTaggedWord("I-PER", "I-NP", "NNP", "Ladki")], [NERTaggedWord("B-LOC", "B-NP", "NNP", "AL-AIN"), NERTaggedWord("O", "O", ",", ","), NERTaggedWord("B-LOC", "B-NP", "NNP", "United"), NERTaggedWord("I-LOC", "I-NP", "NNP", "Arab"), NERTaggedWord("I-LOC", "I-NP", "NNPS", "Emirates"), NERTaggedWord("O", "I-NP", "CD", "1996-

In [110]:
ner = NERTagger()

function eval_ner_tagger(ner_m, x_seq) 
    ner_m(x_seq)
end


eval_ner_tagger (generic function with 2 methods)

In [111]:

function try_outs(ner_m, x_in, y_in, eval_func)
    unique_labels = unique(ner.model.labels)
    num_labels = length(unique_labels)
    confusion_matrix = zeros(Int, (num_labels, num_labels))

    for (x_seq, y_seq) in zip(x_in, y_in)

        preds = eval_func(ner_m, x_seq)
        length(preds) != length(y_seq) && continue

        for (pred, logit) in zip(preds, y_seq)
            (logit == "MISC" || pred == "INVALID") && continue
            confusion_matrix[findfirst(x -> x==pred, unique_labels), findfirst(x -> x==logit, unique_labels)] += 1
        end
    end

    s1 = sum(confusion_matrix, dims=2)
    s2 = sum(confusion_matrix, dims=1)'
    dg = diag(confusion_matrix)
    s1 = [s1[1:2]..., s1[4:5]...]
    s2 = [s2[1:2]..., s2[4:5]...]
    dg = [dg[1:2]..., dg[4:5]...]

    unique_labels = unique(ner.model.labels)
    deleteat!(unique_labels, findfirst(x -> x=="MISC", unique_labels))
    # Don't count MISC
    
    f1s = []

    for (p, r, d, tag) in zip(s1, s2, dg, unique_labels)
        println("For tag `$tag`")
        prec = d/p
        recall = d/r
        f1 = (2 * prec * recall) /(prec + recall)
        println("The precision is $prec")
        println("The recall is $recall")
        println("f1 is $f1")
        println()
        push!(f1s, f1)
    end

    a = sum(dg ./ s1) / length(unique_labels)
    b = sum(dg ./ s2) / length(unique_labels)
    println("Overall Micro f1 for NER (excluding MISC) on CoNLL 2003 is ", (2 * a * b)/ (a + b))
    println("Overall Macro f1 for NER (excluding MISC) on CoNLL 2003 is ", sum(f1s)/ length(f1s))
end

try_outs (generic function with 1 method)

In [7]:
try_outs(ner, X, Y, eval_ner_tagger)

For tag `ORG`
The precision is 0.8897058823529411
The recall is 0.8241185897435898
f1 is 0.855657237936772

For tag `O`
The precision is 0.9901429018462501
The recall is 0.9907888213344467
f1 is 0.9904657562833405

For tag `PER`
The precision is 0.9708029197080292
The recall is 0.9592499098449333
f1 is 0.9649918374750589

For tag `LOC`
The precision is 0.9051504334523203
The recall is 0.922077922077922
f1 is 0.9135357694287184

Overall Micro f1 for NER (excluding MISC) on CoNLL 2003 is 0.9314451550144375
Overall Macro f1 for NER (excluding MISC) on CoNLL 2003 is 0.9311626502809724


In [8]:
using PyCall, WordTokenizers
spacy = pyimport("spacy")
nlp = spacy.load("en_core_web_sm")

PyObject <spacy.lang.en.English object at 0x7fbe2d7d29e8>

In [163]:
function eval_spacy_tagger(ner_m, x_seq)
    preds = String[]
    ents = ner_m(join(x_seq, " ")).ents

    idx = 1
    i = 1
    while i <= length(x_seq)
        if idx <= length(ents) && x_seq[i] == tokenize(ents[idx].text)[1]
            l = length(tokenize(ents[idx].text))

            for k in 1:l
                pred = ents[idx].label_
                if (pred == "PERSON")
                    push!(preds, "PER")
                elseif ( pred == "LOC")
                    push!(pred == "GPE" || preds, "LOC")
                elseif (pred == "ORG")
                    push!(preds, "ORG")
                else
                    push!(preds, "INVALID")
                end
            end
            i = i + l - 1
            idx += 1
        else
            push!(preds, "O")
        end
        i += 1
    end

    return preds
end


eval_spacy_tagger (generic function with 2 methods)

In [164]:
try_outs(nlp, X, Y, eval_spacy_tagger)

For tag `ORG`
The precision is 0.5077850326469111
The recall is 0.504995004995005
f1 is 0.5063861758076634

For tag `O`
The precision is 0.9367631609664994
The recall is 0.9736131752145825
f1 is 0.9548327607489699

For tag `PER`
The precision is 0.744390243902439
The recall is 0.5715355805243446
f1 is 0.6466101694915254

For tag `LOC`
The precision is 0.5232558139534884
The recall is 0.0598404255319149
f1 is 0.10739856801909307

Overall Micro f1 for NER (excluding MISC) on CoNLL 2003 is 0.5933715492462358
Overall Macro f1 for NER (excluding MISC) on CoNLL 2003 is 0.5538069185168129


In [149]:
nltk = pyimport("nltk")
nltk_chunker = nltk.load(nltk.chunk._MULTICLASS_NE_CHUNKER)
nltk_ner(x) = nltk_chunker._tagger.tag(nltk.pos_tag((x)))

nltk_ner (generic function with 1 method)

In [162]:
function eval_nltk_tagger(ner_m, x_seq) 
    obtain_ner(x) = (x[2]) == "O" ? "O" : (x[2])[3:end]
    preds = obtain_ner.(ner_m(x_seq))

    for i in eachindex(preds)
        preds[i] == "O" && continue

        if preds[i] == "PERSON"
            preds[i] = "PER"
        elseif preds[i] == "ORGANIZATION"
            preds[i] = "ORG"
        elseif preds[i] ∈ ("LOCATION", "GPE")
            preds[i] = "LOC"
        else
            preds[i] = "INVALID"
        end
    end
    return preds
end


eval_nltk_tagger (generic function with 2 methods)

In [151]:
try_outs(nltk_ner, X, Y, eval_nltk_tagger)

For tag `ORG`
The precision is 0.5396301188903567
The recall is 0.3294354838709677
f1 is 0.4091136705057587

For tag `O`
The precision is 0.9622844001533154
The recall is 0.9827761684803883
f1 is 0.9724223410024013

For tag `PER`
The precision is 0.7286072323666308
The recall is 0.7349223546406645
f1 is 0.7317511686443725

For tag `LOC`
The precision is 0.6288187372708758
The recall is 0.6719260065288357
f1 is 0.6496580746975277

Overall Micro f1 for NER (excluding MISC) on CoNLL 2003 is 0.696859107260689
Overall Macro f1 for NER (excluding MISC) on CoNLL 2003 is 0.6907363137125151
