In [None]:
!pip install spacy
!pip install tqdm

In [None]:
import spacy
import json
import os
from spacy.tokens import Doc
from spacy.tokens import DocBin
from spacy.util import filter_spans
from spacy.training.example import Example
from spacy.scorer import Scorer
from collections import defaultdict
from sklearn.metrics import classification_report

# Root folder for every input and output file used by this notebook.
# Later cells build file names by plain string concatenation
# (e.g. path + "train"), so the trailing "/" is required.
# NOTE(review): looks like a mounted Google Drive location, but no mount step
# is visible in this notebook — confirm the drive is mounted before running.
path = "/content/drive/MyDrive/NER_Spacy/"

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
# Multiple Text and Ann files of Train and Test into one single JSON file

def convert_ann_to_json(ann_file_path):
    """Extract the contiguous text-bound ("T") annotations from one .ann file.

    Each kept line is split on whitespace and interpreted as
    ``<id> <tag> <start> <end> <text...>`` (presumably brat standoff format).

    Lines are skipped when they:
      * do not start with "T" (relations, notes, ...),
      * have fewer than four whitespace-separated fields (malformed),
      * describe a discontinuous span, i.e. the end-offset field contains ";".

    Args:
        ann_file_path: Path of the annotation file to read.

    Returns:
        A list of dicts with the string-valued keys "id", "tag", "start",
        "end" and "text". Offsets are returned as strings, exactly as read;
        the text has its internal whitespace collapsed to single spaces.
    """
    annotations = []
    with open(ann_file_path, "r") as file:
        for line in file:
            if not line.startswith("T"):
                continue

            fields = line.split()
            # id, tag, start and end are mandatory; anything shorter cannot be
            # interpreted and would otherwise raise IndexError below.
            if len(fields) < 4:
                continue

            # A discontinuous span is written "start end;start end ...", so
            # after whitespace splitting its first ";" always lands in the
            # end-offset field. Testing only that field (instead of the whole
            # line) keeps contiguous entities whose *text* contains ";".
            if ";" in fields[3]:
                continue

            annotations.append(
                {
                    "id": fields[0],
                    "tag": fields[1],
                    "start": fields[2],
                    "end": fields[3],
                    "text": " ".join(fields[4:]),
                }
            )

    return annotations


def convert_multiple_files_to_json(ann_folder_path, output_json_file):
    """Merge every .ann file in a folder, with its companion .txt, into one JSON file.

    For each ``<name>.ann`` in ``ann_folder_path`` the annotations are parsed
    with ``convert_ann_to_json`` and paired with the contents of
    ``<name>.txt`` from the same folder. When the text file is missing, a
    placeholder sentence is stored as the content instead.

    Args:
        ann_folder_path: Folder containing the .ann (and .txt) files.
        output_json_file: Path of the JSON file to write. It receives a list
            of objects with the keys "file_name", "content", "annotations"
            and "file_start" (always 0).
    """
    all_data = []
    # os.listdir() returns entries in arbitrary order. Sorting makes the JSON
    # reproducible, and with it any positional slice taken from it later.
    for filename in sorted(os.listdir(ann_folder_path)):
        if not filename.endswith(".ann"):
            continue

        filename_without_extension = os.path.splitext(filename)[0]
        ann_file_path = os.path.join(ann_folder_path, filename)
        annotations = convert_ann_to_json(ann_file_path)

        # Derive the companion name from the stem: str.replace(".ann", ".txt")
        # would also rewrite a ".ann" that appears earlier in the file name.
        txt_file_path = os.path.join(
            ann_folder_path, filename_without_extension + ".txt"
        )
        if os.path.exists(txt_file_path):
            with open(txt_file_path, "r") as txt_file:
                text_content = txt_file.read()
        else:
            text_content = "No text content found for this file."

        all_data.append(
            {
                "file_name": filename_without_extension,
                "content": text_content,
                "annotations": annotations,
                "file_start": 0,
            }
        )

    with open(output_json_file, "w") as json_file:
        json.dump(all_data, json_file, indent=4)


# Source folders with the paired .ann/.txt files, and the merged JSON file
# each folder is converted into (all located under `path`).
train_ann_folder_path = f"{path}train"
train_output_json_file = f"{path}train_txt_ann.json"
test_ann_folder_path = f"{path}test"
test_output_json_file = f"{path}test_txt_ann.json"

# Convert the training split first, then the test split.
for ann_folder, json_target in (
    (train_ann_folder_path, train_output_json_file),
    (test_ann_folder_path, test_output_json_file),
):
    convert_multiple_files_to_json(ann_folder, json_target)

In [None]:
# Build Data

with open(path + 'train_txt_ann.json', 'r') as f:
    full_data = json.load(f)

# One record per document: the raw text plus (start, end, LABEL) triples with
# integer character offsets and upper-cased tags. Only the first 150
# documents of the merged JSON are used.
training_data = [
    {
        "text": record["content"],
        "entities": [
            (int(ann["start"]), int(ann["end"]), ann["tag"].upper())
            for ann in record["annotations"]
        ],
    }
    for record in full_data[:150]
]

print(len(training_data))

150


In [None]:
# Load a Blank Model
# A blank English pipeline is enough here: it is only used to turn raw text
# into Doc objects (nlp.make_doc) when the training corpus is built.
# doc_bin collects those Docs so they can be saved as a .spacy file.

nlp = spacy.blank("en")
doc_bin = DocBin()

In [None]:
# Build Train Data

# Start from an empty container every time this cell runs. DocBin.add()
# appends, so reusing an instance created in another cell would store every
# document again (duplicating the corpus) whenever this cell is re-executed.
doc_bin = DocBin()

for training_example in training_data:
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        # "contract" shrinks the character span to the tokens lying fully
        # inside it; char_span returns None when no token fits, and such
        # annotations cannot be represented as entities.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    # doc.ents must not contain overlapping spans, so overlaps are removed
    # before assignment.
    doc.ents = filter_spans(ents)
    doc_bin.add(doc)

doc_bin.to_disk(path + "train.spacy")

In [None]:
# Build config file

!python -m spacy init fill-config /content/drive/MyDrive/NER_Spacy/base_config.cfg /content/drive/MyDrive/NER_Spacy/config.cfg
# Before training, edit config.cfg to set the required number of epochs (max_epochs in the [training] section)

In [None]:
%%time
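# Train
# Note: --paths.dev points at train.spacy, so the ENTS_F / ENTS_P / ENTS_R values
# reported during training are measured on the training data, not on a held-out set.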

!python -m spacy train /content/drive/MyDrive/NER_Spacy/config.cfg --output /content/drive/MyDrive/NER_Spacy/ --paths.train /content/drive/MyDrive/NER_Spacy/train.spacy --paths.dev /content/drive/MyDrive/NER_Spacy/train.spacy

2023-12-07 04:09:51.621453: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-07 04:09:51.621523: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-07 04:09:51.621569: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[38;5;4mℹ Saving to output directory: /content/drive/MyDrive/NER_Spacy[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  -

In [None]:
# All Trained models
# Names of the saved pipelines; they are appended to `path` when loaded.
# NOTE(review): the suffix presumably encodes
# <training files>f-<epochs>e-<learning rate>lr — confirm.
Model_10f_100e = "model-best-10f-100e-0.001lr"
Model_25f_100e = "model-best-25f-100e-0.001lr"
Model_50f_100e = "model-best-50f-100e-0.001lr"
Model_100f_100e = "model-best-100f-100e-0.001lr"
Model_150f_100e = "model-best-150f-100e-0.001lr"

In [None]:
# Load Best Model
# Loads the saved pipeline found at `path` + Model_150f_100e.
nlp_ner = spacy.load(path + Model_150f_100e)

Named Entity Recognition on Test file

In [None]:
# Run the trained pipeline on one test document and highlight its entities.
with open(path + 'test/105446.txt', 'r') as f:
    text = f.read()

doc = nlp_ner(text)

# Highlight colour for each entity label, handed to displaCy via `options`.
colors = {
    "DRUG": "#1c7ea0",
    "STRENGTH": "#97cdca",
    "REASON": "#20d133",
    "FORM": "#0c8d8e",
    "FREQUENCY": "#b47359",
    "ADE": "#ef3921",
    "DOSAGE": "#8ec0e9",
    "DURATION": "#820bf0",
    "ROUTE": "#9a989f",
}
options = dict(colors=colors)

spacy.displacy.render(doc, style="ent", options=options, jupyter=True)

Scores of All the Test files

In [None]:
%%time

# Evaluate the trained pipeline on every test document.
#
# A predicted entity counts as correct when its (text, LABEL) pair also occurs
# among the document's annotations; character offsets are not compared.
# Per document this writes an "accuracy" (matched predictions / annotated
# entities, in percent) and per-tag precision / recall / F1. Per-tag scores
# are then averaged over the documents in which the tag occurred.

with open(path + 'test_txt_ann.json', 'r') as file:
    list_of_json_data = json.load(file)

accuracy_results = []

# Only needed when the optional displaCy rendering below is switched on.
colors = {"DRUG": "#1c7ea0", "STRENGTH": "#97cdca", "REASON":"#20d133","FORM":"#0c8d8e","FREQUENCY":"#b47359","ADE":"#ef3921","DOSAGE":"#8ec0e9","DURATION":"#820bf0","ROUTE":"#9a989f"}
options = {"colors": colors}

# Load the trained pipeline once. It does not change between documents, so
# re-loading it from disk for every test file only added run time.
nlp = spacy.load(path + Model_150f_100e)

# tag -> list of per-document scores (one entry per document containing the tag)
precision_scores = defaultdict(list)
recall_scores = defaultdict(list)
f1_scores = defaultdict(list)

with open(path + "Accuracy_details.txt", "w") as accuracy_file, \
        open(path + "Entity_details.txt", "w") as entity_file, \
        open(path + "Score_details.txt", "w") as score_file, \
        open(path + "All_Scores_details.txt", "w") as all_scores_file:

    # [:5] for first 5 records
    for json_data in list_of_json_data:
        file_name = json_data["file_name"]
        test_text = json_data["content"]

        doc = nlp(test_text)

        spacy_entities = [(ent.text, ent.label_) for ent in doc.ents]
        annotated_entities = [
            (annotation["text"], annotation["tag"].upper())
            for annotation in json_data["annotations"]
        ]

        # Sets give constant-time membership tests. The lists are still the
        # ones iterated, so repeated entities keep being counted individually.
        predicted_lookup = set(spacy_entities)
        annotated_lookup = set(annotated_entities)

        # spacy.displacy.render(doc, style="ent", options= options, jupyter=True)

        correct_count = sum(
            1 for spacy_entity in spacy_entities if spacy_entity in annotated_lookup
        )

        # Calculating Accuracy
        # Guard on the list that is actually the denominator: a document with
        # predictions but no annotations would otherwise divide by zero.
        accuracy = (correct_count / len(annotated_entities)) * 100 if annotated_entities else 0
        accuracy_results.append({
            "file_name": file_name,
            "accuracy": accuracy
        })

        accuracy_file.write(f"Entities for file: {file_name}\n")
        accuracy_file.write(f"Accuracy: {accuracy}\n\n")

        entity_file.write(f"Entities for file: {file_name}\n")
        entity_file.write(f"Accuracy: {accuracy}\n\n")
        entity_file.write(f"No of Entities discovered by SpaCy: {len(spacy_entities)}\n\n")
        entity_file.write(f"No of Actual Entities: {len(annotated_entities)}\n\n")
        for entity, label in spacy_entities:
            entity_file.write(f"{entity} --- {label}\n")
        entity_file.write("\n\n")

        # Calculate F1 score
        true_positive = defaultdict(int)
        false_positive = defaultdict(int)
        false_negative = defaultdict(int)

        for annotated_entity in annotated_entities:
            if annotated_entity in predicted_lookup:
                true_positive[annotated_entity[1]] += 1
            else:
                false_negative[annotated_entity[1]] += 1

        for spacy_entity in spacy_entities:
            if spacy_entity not in annotated_lookup:
                false_positive[spacy_entity[1]] += 1

        # Sorted so the report order is the same on every run (set iteration
        # order of strings is not stable between interpreter sessions).
        tags = sorted(set(true_positive) | set(false_positive) | set(false_negative))

        score_file.write(f"Score for file: {file_name}\n")

        for tag in tags:
            predicted_total = true_positive[tag] + false_positive[tag]
            annotated_total = true_positive[tag] + false_negative[tag]

            precision = true_positive[tag] / predicted_total if predicted_total > 0 else 0
            recall = true_positive[tag] / annotated_total if annotated_total > 0 else 0
            f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

            precision_scores[tag].append(precision)
            recall_scores[tag].append(recall)
            f1_scores[tag].append(f1_score)

            score_file.write(f"Tag: {tag}\n")
            score_file.write(f"Precision: {precision}\n")
            score_file.write(f"Recall: {recall}\n")
            score_file.write(f"F1-Score: {f1_score}\n")
            score_file.write("\n")
        score_file.write("---------------\n")

    # Calculate mean of precision, recall and f1 score
    all_scores_file.write("Average Scores:\n")
    all_scores_file.write("---------------------------\n")

    mean_precision_scores = {}
    mean_recall_scores = {}
    mean_f1_scores = {}

    # The three sections share one layout: title, one line per tag, separator.
    for section_title, per_document_scores, mean_scores in (
        ("Mean Precision Scores:\n", precision_scores, mean_precision_scores),
        ("Mean Recall Scores:\n", recall_scores, mean_recall_scores),
        ("Mean F1 Scores:\n", f1_scores, mean_f1_scores),
    ):
        all_scores_file.write(section_title)
        for tag, values in per_document_scores.items():
            # Every list has at least one entry: it is created by its first append.
            mean_scores[tag] = sum(values) / len(values)
            all_scores_file.write(f"Tag {tag}: {mean_scores[tag]}\n")
        all_scores_file.write("---------------------------\n")

    # An empty test set yields 0 instead of a ZeroDivisionError.
    mean_accuracy = (
        sum(result["accuracy"] for result in accuracy_results) / len(accuracy_results)
        if accuracy_results
        else 0
    )
    all_scores_file.write(f"Mean Accuracy of test result: {mean_accuracy}\n")

print(f"Final Mean Accuracy: {mean_accuracy}")

Final Mean Accuracy: 81.87999063024598
CPU times: user 8min 39s, sys: 1min 52s, total: 10min 32s
Wall time: 12min 13s
