### Preprocessing:

In [12]:
import nltk
from nltk.chunk import ChunkParserI, conlltags2tree, tree2conlltags
from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk.util import tree2conlltags, conlltags2tree
from nltk.corpus import conll2000
from nltk.metrics import precision, recall, f_measure
from collections import defaultdict

# 1. Load the dataset (English data from CoNLL 2000 shared task)
nltk.download('conll2000')
train_sents = list(conll2000.iob_sents('train.txt'))
test_sents = list(conll2000.iob_sents('test.txt'))

# 2. We are going to define a custom NER chunker.
# The main idea behind this chunker is to utilize POS tags as the main feature for recognizing named entities.
class NEChunkParser(ChunkParserI):
    def __init__(self, train_sents):
        # We are using a combination of unigram and bigram taggers for better accuracy.
        # First, let's convert the IOB tree format data to a simple tag sequence.
        train_data = [[(t, c) for w, t, c in sent] for sent in train_sents]
        self.tagger = BigramTagger(train_data, backoff=UnigramTagger(train_data))

    def parse(self, sentence):
        # Extract POS tags from the sentence.
        pos_tags = [pos for (word, pos, chunktag) in sentence]
        # Predict chunk tags (B-ORG, I-ORG, O, etc.)
        tagged_pos_tags = self.tagger.tag(pos_tags)
        # Combine word, pos tag, and chunk tag into a format suitable for the tree representation.
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for (word, pos, chunktag) in sentence]
        # Convert to tree
        return conlltags2tree(conlltags)

# 3. Train the NER chunker.
chunker = NEChunkParser(train_sents)

# 4. Test the chunker on unseen data and convert results to conll format.
gold = [tree2conlltags(conlltags2tree([(word, pos, ne_tag) for word, pos, ne_tag in sent])) for sent in test_sents]
test = [tree2conlltags(chunker.parse(sent)) for sent in test_sents]

# 5. Compute performance metrics.
gold_chunks = set(gold[0] + gold[1])
test_chunks = set(test[0] + test[1])

precision_score = precision(gold_chunks, test_chunks)
recall_score = recall(gold_chunks, test_chunks)
f_measure_score = f_measure(gold_chunks, test_chunks)

print('Precision:', precision_score)
print('Recall:', recall_score)
print('F-measure:', f_measure_score)

# 6. Display sample predictions
for sent in test_sents[:5]:
    print(chunker.parse(sent))

[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\matth\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!


Precision: 1.0
Recall: 1.0
F-measure: 1.0
(S
  (NP Rockwell/NNP International/NNP Corp./NNP)
  (NP 's/POS Tulsa/NNP unit/NN)
  (VP said/VBD)
  (NP it/PRP)
  (VP signed/VBD)
  (NP a/DT tentative/JJ agreement/NN)
  (VP extending/VBG)
  (NP its/PRP$ contract/NN)
  (PP with/IN)
  (NP Boeing/NNP Co./NNP)
  (VP to/TO provide/VB)
  (NP structural/JJ parts/NNS)
  (PP for/IN)
  (NP Boeing/NNP)
  (NP 's/POS 747/CD jetliners/NNS)
  ./.)
(S
  (NP Rockwell/NNP)
  (VP said/VBD)
  (NP the/DT agreement/NN)
  (VP calls/VBZ)
  (SBAR for/IN)
  (NP it/PRP)
  (VP to/TO supply/VB)
  (NP 200/CD additional/JJ so-called/JJ shipsets/NNS)
  (PP for/IN)
  (NP the/DT planes/NNS)
  ./.)
(S
  (NP These/DT)
  (VP include/VBP)
  ,/,
  (PP among/IN)
  (NP other/JJ parts/NNS)
  ,/,
  (NP each/DT jetliner/NN)
  (NP 's/POS two/CD major/JJ bulkheads/NNS)
  ,/,
  (NP a/DT pressure/NN floor/NN)
  ,/,
  (NP torque/NN box/NN)
  ,/,
  (NP fixed/VBN leading/VBG edges/NNS)
  (PP for/IN)
  (NP the/DT wings/NNS)
  and/CC
  (NP an/D