In [9]:
# remember to run `make install-spacy` to install the spaCy model

import os
from pathlib import Path
import spacy

# Get the project root directory and data path
project_root = Path.cwd().parent  # Go up one level from notebooks directory
data_dir = project_root / "data" / "actions"

# Get the first text file
text_files = [f for f in os.listdir(data_dir) if f.endswith('.txt')]
first_file = os.path.join(data_dir, text_files[0])

# Read the contents
with open(first_file, 'r') as f:
    text = f.read()

#Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Process the text
doc = nlp(text)

In [11]:
# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

the Sustainable Farming Incentive (SFI WORK_OF_ART
2024 DATE
SFI ORG
5 years DATE
215 MONEY
the year DATE
each year DATE
each year DATE
5-year DATE
summer 2024 DATE
winter DATE
summer DATE
section 6 ‘Eligible LAW
National Insurance ORG


In [13]:
from transformers import pipeline

nlp = pipeline("ner", model="dslim/bert-base-NER")
# sentence = "Apple is looking at buying U.K. startup for $1 billion"
ner_results = nlp(text)

for entity in ner_results:
    print(entity['word'], entity['entity'])

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Sustainable B-MISC
Farm I-MISC
##ing I-MISC
Inc I-MISC
##ent I-MISC
##ive I-MISC
SF B-MISC
SF B-MISC


In [14]:
ner_results

[{'entity': 'B-MISC',
  'score': np.float32(0.6266002),
  'index': 7,
  'word': 'Sustainable',
  'start': 25,
  'end': 36},
 {'entity': 'I-MISC',
  'score': np.float32(0.64785767),
  'index': 8,
  'word': 'Farm',
  'start': 37,
  'end': 41},
 {'entity': 'I-MISC',
  'score': np.float32(0.56808525),
  'index': 9,
  'word': '##ing',
  'start': 41,
  'end': 44},
 {'entity': 'I-MISC',
  'score': np.float32(0.7891627),
  'index': 10,
  'word': 'Inc',
  'start': 45,
  'end': 48},
 {'entity': 'I-MISC',
  'score': np.float32(0.79436713),
  'index': 11,
  'word': '##ent',
  'start': 48,
  'end': 51},
 {'entity': 'I-MISC',
  'score': np.float32(0.80697817),
  'index': 12,
  'word': '##ive',
  'start': 51,
  'end': 54},
 {'entity': 'B-MISC',
  'score': np.float32(0.8021642),
  'index': 14,
  'word': 'SF',
  'start': 56,
  'end': 58},
 {'entity': 'B-MISC',
  'score': np.float32(0.76270396),
  'index': 29,
  'word': 'SF',
  'start': 112,
  'end': 114}]

In [21]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

nltk.download('all')


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/adamfletcher/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/adamfletcher/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/adamfletcher/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/adamfletcher/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/adamfletcher/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru

(S
  This/DT
  is/VBZ
  an/DT
  action/NN
  in/IN
  the/DT
  (ORGANIZATION Sustainable/JJ Farming/NNP Incentive/NNP)
  (/(
  (ORGANIZATION SFI/NNP)
  )/)
  scheme/NN
  :/:
  expanded/VBN
  offer/NN
  for/IN
  2024/CD
  ./.
  You/PRP
  must/MD
  read/VB
  the/DT
  (ORGANIZATION SFI/NNP)
  scheme/NN
  information/NN
  to/TO
  understand/VB
  the/DT
  scheme/NN
  rules/NNS
  and/CC
  how/WRB
  to/TO
  apply/VB
  ./.
  5/CD
  years/NNS
  £215/JJ
  per/IN
  hectare/NN
  (/(
  ha/NN
  )/)
  per/IN
  year/NN
  This/DT
  action/NN
  ’/VBZ
  s/JJ
  aim/NN
  is/VBZ
  that/IN
  there/EX
  ’/NNP
  s/VBD
  a/DT
  well-managed/JJ
  ,/,
  intact/JJ
  grass/NN
  sward/NN
  growing/VBG
  over/IN
  the/DT
  historic/JJ
  or/CC
  archaeological/JJ
  feature/NN
  throughout/IN
  the/DT
  year/NN
  ,/,
  with/IN
  minimal/JJ
  scrub/NN
  cover/NN
  and/CC
  bare/JJ
  ground/NN
  ./.
  The/DT
  purpose/NN
  of/IN
  this/DT
  is/VBZ
  to/TO
  :/:
  You/PRP
  can/MD
  do/VB
  this/DT
  action/NN
  on/IN
  lan

In [22]:

# sentence = "Apple is looking at buying U.K. startup for $1 billion"
tokens = word_tokenize(text)
tags = pos_tag(tokens)
entities = ne_chunk(tags)

print(entities)

(S
  This/DT
  is/VBZ
  an/DT
  action/NN
  in/IN
  the/DT
  (ORGANIZATION Sustainable/JJ Farming/NNP Incentive/NNP)
  (/(
  (ORGANIZATION SFI/NNP)
  )/)
  scheme/NN
  :/:
  expanded/VBN
  offer/NN
  for/IN
  2024/CD
  ./.
  You/PRP
  must/MD
  read/VB
  the/DT
  (ORGANIZATION SFI/NNP)
  scheme/NN
  information/NN
  to/TO
  understand/VB
  the/DT
  scheme/NN
  rules/NNS
  and/CC
  how/WRB
  to/TO
  apply/VB
  ./.
  5/CD
  years/NNS
  £215/JJ
  per/IN
  hectare/NN
  (/(
  ha/NN
  )/)
  per/IN
  year/NN
  This/DT
  action/NN
  ’/VBZ
  s/JJ
  aim/NN
  is/VBZ
  that/IN
  there/EX
  ’/NNP
  s/VBD
  a/DT
  well-managed/JJ
  ,/,
  intact/JJ
  grass/NN
  sward/NN
  growing/VBG
  over/IN
  the/DT
  historic/JJ
  or/CC
  archaeological/JJ
  feature/NN
  throughout/IN
  the/DT
  year/NN
  ,/,
  with/IN
  minimal/JJ
  scrub/NN
  cover/NN
  and/CC
  bare/JJ
  ground/NN
  ./.
  The/DT
  purpose/NN
  of/IN
  this/DT
  is/VBZ
  to/TO
  :/:
  You/PRP
  can/MD
  do/VB
  this/DT
  action/NN
  on/IN
  lan