In [3]:
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.data import Dictionary
from flair.datasets import ColumnCorpus
from flair.embeddings import WordEmbeddings, StackedEmbeddings, FlairEmbeddings, TransformerWordEmbeddings
from flair.trainers import ModelTrainer
#from datasets import load_dataset

# Load the conll2003 dataset with custom code enabled
#raw_datasets = load_dataset("conll2003", trust_remote_code=True, cache_dir="./cache")
# 1. Load the CoNLL-03 corpus
# Specify the folder where your dataset is located and column structure
data_folder = 'training'
columns = {0: 'text', 1: 'ner'}

# Load the custom dataset
corpus = ColumnCorpus(data_folder, columns, train_file='train.txt', test_file='test_corrected.txt', dev_file='dev.txt')

# 2. Define the tag type to predict
tag_type = 'ner'

# 3. Create the tag dictionary from the corpus
# tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
tag_dictionary = Dictionary(add_unk=True)
tag_dictionary.add_item('O')
tag_dictionary.add_item('B-PER')
tag_dictionary.add_item('I-PER')
tag_dictionary.add_item('B-ORG')
tag_dictionary.add_item('I-ORG')
tag_dictionary.add_item('B-LOC')
tag_dictionary.add_item('I-LOC')

# 4. Initialize embeddings
embedding_types = [
    WordEmbeddings('glove'), 
    TransformerWordEmbeddings('roberta-base')
]
embeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. Initialize the sequence tagger
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type
)

# 6. Initialize the trainer
trainer = ModelTrainer(tagger, corpus)

# 7. Train the model
trainer.train(
    'resources/taggers/ner-english',
    train_with_dev=True,
    max_epochs=15
)

2024-11-14 15:06:35,241 Reading data from training
2024-11-14 15:06:35,242 Train: training\train.txt
2024-11-14 15:06:35,243 Dev: training\dev.txt
2024-11-14 15:06:35,243 Test: training\test_corrected.txt


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


2024-11-14 15:07:24,889 SequenceTagger predicts: Dictionary with 8 tags: <unk>, O, B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC
2024-11-14 15:07:24,900 ----------------------------------------------------------------------------------------------------
2024-11-14 15:07:24,901 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): TransformerWordEmbeddings(
      (model): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50266, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (

  scaler = torch.cuda.amp.GradScaler(enabled=use_amp and flair.device.type != "cpu")


2024-11-14 15:07:33,299 epoch 1 - iter 1/15 - loss 1.52928654 - time (sec): 8.38 - samples/sec: 84.56 - lr: 0.100000 - momentum: 0.000000
2024-11-14 15:07:42,443 epoch 1 - iter 2/15 - loss 1.08233734 - time (sec): 17.53 - samples/sec: 76.56 - lr: 0.100000 - momentum: 0.000000
2024-11-14 15:07:50,357 epoch 1 - iter 3/15 - loss 0.97037187 - time (sec): 25.44 - samples/sec: 80.02 - lr: 0.100000 - momentum: 0.000000
2024-11-14 15:07:58,643 epoch 1 - iter 4/15 - loss 0.88852212 - time (sec): 33.73 - samples/sec: 82.90 - lr: 0.100000 - momentum: 0.000000
2024-11-14 15:08:05,809 epoch 1 - iter 5/15 - loss 0.80554789 - time (sec): 40.89 - samples/sec: 85.10 - lr: 0.100000 - momentum: 0.000000
2024-11-14 15:08:14,302 epoch 1 - iter 6/15 - loss 0.75727072 - time (sec): 49.39 - samples/sec: 83.34 - lr: 0.100000 - momentum: 0.000000
2024-11-14 15:08:23,102 epoch 1 - iter 7/15 - loss 0.70130977 - time (sec): 58.19 - samples/sec: 82.97 - lr: 0.100000 - momentum: 0.000000
2024-11-14 15:08:34,470 epoc

100%|██████████| 3/3 [00:10<00:00,  3.65s/it]

2024-11-14 15:38:24,600 
Results:
- F-score (micro) 0.1788
- F-score (macro) 0.1571
- Accuracy 0.1046

By class:
              precision    recall  f1-score   support

         ORG     0.2000    0.0303    0.0526        66
         PER     0.4800    0.4444    0.4615        27
         LOC     1.0000    0.0606    0.1143        33
         nk>     0.0000    0.0000    0.0000         0

   micro avg     0.3019    0.1270    0.1788       126
   macro avg     0.4200    0.1338    0.1571       126
weighted avg     0.4695    0.1270    0.1564       126

2024-11-14 15:38:24,600 ----------------------------------------------------------------------------------------------------





{'test_score': 0.1787709497206704}

In [None]:
import string
from urllib.request import urlopen
from bs4 import BeautifulSoup

# url = "https://www.foxnews.com/politics/17-retired-military-officials-raise-alarm-bidens-electric-vehicle-push"
# html = urlopen(url).read()
with open('fox1.html', 'r', encoding='utf-8') as file:
    html = file.read()
soup = BeautifulSoup(html, features="html.parser")
main_content = soup.find('div', class_='article-body')# called 'article-body' in Fox News
paragraphs = main_content.find_all('p')
with open("txt_output.txt", "w", encoding="utf-8") as file:
    # Iterate through paragraphs and write each to the file
    for p in paragraphs:
        text = p.get_text()
        # Split the text into sentences
        sentences = text.split('. ')
        # Check and write sentences that are not in all caps
        for sentence in sentences:
            # If the sentence is not in all uppercase, write it to the file
            if not sentence.isupper():
                file.write(sentence + ".\n")

#######################################################################################################################################################################################################################################################################################################################
# load tagger
tagger = SequenceTagger.load("resources/taggers/ner-english/final-model.pt")
# make example sentence
# sentence = 'One week after challenging the Biden administrations sweeping new emissions standards for cars and light-duty vehicles, a coalition of agricultural, manufacturing and energy production concerns launched lawsuits to block impending rules on trucks and buses. The new standards apply to "heavy-duty vocational vehicles," which also include garbage trucks and bobtails. While the Biden administration and its agencies have strenuously denied the new rules represent a forthcoming "ban" on internal combustion engines, plaintiffs were not convinced as they filed petitions in Washington, D.C., federal court this week. The American Petroleum Institute (API) led one suit, which included the American Farm Bureau Federation, Corn Growers Association and Owner-Operator Independent Drivers Association. EPA SUED BY CONSUMER, MANUFACTURING, AGRICULTURAL COALITIONS OVER BIDENS NEW VEHICLE EMISSIONS RULE  The groups named the Environmental Protection Agency (EPA) and Biden-appointed administrator Michael Regan as defendants, while a group headlined by the American Fuel'
# Read in a single txt file line by line and save as string
file_path = 'txt_output.txt'
with open(file_path, encoding = "utf-8") as file:
    file_content = ''
    line = file.readline()
     
    while line:
        file_content += line
        line = file.readline()

# characters_to_keep = "-'&\".:()"
# translator = str.maketrans('', '', string.punctuation.translate(str.maketrans('', '', characters_to_keep)))
# Use translate to remove punctuation
# cleaned_sentence = file_content.translate(translator)

sent = Sentence(file_content)
print(sent.get_spans())
# predict NER tags
tagger.predict(sent)

# print sentence
for entity in sent.get_spans('ner'):
    print(entity)

# print('The following NER tags are found:')

# output_file_path = 'ner_results_tested.txt'

# with open(output_file_path, 'w', encoding="utf-8") as out_file:
    # out_file.write('The following NER tags are found:\n')
    # for entity in sent.get_spans('ner'):
        # out_file.write(f'{entity}\n')  # write each entity to the file

# for entity in sent.get_spans('ner'):
#    output_file.write(entity)
#    print(entity)
# output_file.close()

In [None]:
output_file_path = 'ner_results.txt'

with open(output_file_path, 'w', encoding="utf-8") as out_file:
    out_file.write('The following NER tags are found:\n')
    for entity in sent.get_spans('ner'):
        out_file.write(f'{entity}\n')  # write each entity to the file