## training and testing dataset generation

In [1]:
import random
from dictionaries import generate_time_patterns, situations, symptoms, injury_causes, ambulance_equipment, location_of_occurrence, vital_signs, drugs, dosages, quantities, routes, sentence_templates
from sentence_templates import generate_sentence 

label_counter = {
    "QUANTITY": 0,
    "DOSAGE": 0,
    "DRUG_NAME": 0,
    "ROUTE": 0,
    "TIME_OF_ACTION": 0,
    "SITUATION": 0,
    "SYMPTOM": 0,
    "INJURY": 0,
    "EQUIPMENT": 0,
    "LOCATION": 0,
    "VITAL_SIGN": 0
}
TRAIN_DATA = []
for _ in range(20): 
    # Randomly choose elements from the dictionaries
    situation = random.choice(situations)
    drug = random.choice(drugs)
    dosage = random.choice(dosages)
    quantity = random.choice(quantities)
    route = random.choice(routes)
    time_of_action = generate_time_patterns(1)
    symptom = random.choice(symptoms)
    injury_cause = random.choice(injury_causes)
    equipment = random.choice(ambulance_equipment)
    location = random.choice(location_of_occurrence)
    vital_sign = random.choice(vital_signs)

    sentence = generate_sentence(situation, drug, dosage, quantity, route, time_of_action, symptom, injury_cause, equipment, location, vital_sign)

    #print(sentence)
    entities = []
    used_spans = set()
    
    def add_entity(label, value):
        start = sentence.find(value)
        end = start + len(value)
        if start != -1 and (start, end) not in used_spans:
            entities.append((start, end, label))
            used_spans.add((start, end))
    
    add_entity("QUANTITY", quantity)
    add_entity("DOSAGE", dosage)
    add_entity("DRUG_NAME", drug)
    add_entity("ROUTE", route)
    add_entity("TIME_OF_ACTION", time_of_action)
    add_entity("SITUATION", situation)
    add_entity("SYMPTOM", symptom)
    add_entity("INJURY", injury_cause)
    add_entity("EQUIPMENT", equipment)
    add_entity("LOCATION", location)
    add_entity("VITAL_SIGN", vital_sign)



    for _, _, label in entities:
        if label in label_counter:
            label_counter[label] += 1
    
    TRAIN_DATA.append((sentence, {"entities": entities}))

# Print the generated dataset
#for entry in TRAIN_DATA:
    #print(entry)
    #print(entry[0])  

with open("train_data.txt", "w") as f:
    for entry in TRAIN_DATA:
        sentence = entry[0]  # The generated sentence
        entities = entry[1]["entities"]  # The entities in the format of (start, end, label)
        
        # Write sentence and entities directly without additional labels
        f.write(f"{sentence}\n")
        f.write(f"{entities}\n")

In [2]:
from sklearn.model_selection import train_test_split

# Split the data into training (80%) and test (20%)
train_data, test_data = train_test_split(TRAIN_DATA, test_size=0.2, random_state=42)

print(f"Training data: {len(train_data)}")
print(f"Test data: {len(test_data)}")

Training data: 16
Test data: 4


## testing before retraining

In [5]:
import spacy
from spacy.training.example import Example
import random

# Initialize the blank English model
nlp = spacy.blank("en")



for text, annotations in test_data:
    doc = nlp(text)
    
    # Print the entities detected by the model
    print(f"Text: {text}")
    print("\n--- Predicted Entities ---")
    for ent in doc.ents:
        print(f"{ent.text:30} → {ent.label_}")
    
    print("\n--- Ground Truth ---")
    for start, end, label in annotations['entities']:
        print(f"{text[start:end]:30} → {label}")
    print("\n")


Text: After The patient was found shivering and unresponsive due to hypothermia., Hematemesis was detected. The Blood pressure monitor in Public administrative building helped stabilize the patient while Heart rhythm was checked regularly.

--- Predicted Entities ---

--- Ground Truth ---
The patient was found shivering and unresponsive due to hypothermia. → SITUATION
Hematemesis                    → SYMPTOM
Blood pressure monitor         → EQUIPMENT
Public administrative building → LOCATION
Heart rhythm                   → VITAL_SIGN


Text: While at the Recreation area, the EMS team delivered 5 sprays of Calcium Chloride / Lactate / Potassium Chloride / Sodium Chloride 10mg subcutaneous during appointment. A patient experiencing severe chest pain was assessed for possible myocardial infarction or aortic dissection.

--- Predicted Entities ---

--- Ground Truth ---
5 sprays                       → QUANTITY
10mg                           → DOSAGE
Calcium Chloride / Lactate / Potassium 

## retraining

In [7]:
import spacy
from spacy.training import offsets_to_biluo_tags
from spacy.tokens import DocBin

# Load your trained or base model (use 'en_core_web_sm' if unsure)
nlp = spacy.blank("en")  # or spacy.load("en_core_web_sm")


def check_entity_alignment(train_data):
    aligned_data = []
    misaligned_samples = []

    for text, ann in train_data:
        doc = nlp.make_doc(text)
        try:
            tags = offsets_to_biluo_tags(doc, ann["entities"])
            if "-" in tags:
                print(f"[MISALIGNED] -> '{text}'")
                print(f"Entities: {ann['entities']}\n")
                misaligned_samples.append((text, ann))
            else:
                aligned_data.append((text, ann))
        except Exception as e:
            print(f"[ERROR] -> '{text}' caused: {e}")
            misaligned_samples.append((text, ann))

    print(f"\n✅ Aligned: {len(aligned_data)} | ❌ Misaligned: {len(misaligned_samples)}")
    return aligned_data, misaligned_samples

# Run alignment check
aligned, misaligned = check_entity_alignment(train_data)



✅ Aligned: 16 | ❌ Misaligned: 0


In [68]:
for text, annot in train_data:
    doc = nlp.make_doc(text)
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label)
        if span is None:
            print(f"Misaligned: {text[start:end]} [{start}:{end}]")


In [9]:
ner = nlp.add_pipe("ner", last=True)

for _, annotations in train_data:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])
optimizer = nlp.begin_training()

# Training loop
for epoch in range(20):  # Adjust number of epochs based on your needs
    random.shuffle(train_data)
    losses = {}
    
    # Iterate through each example in the training data
    for text, annotations in train_data:
        # Create an Example object for each training sample
        example = Example.from_dict(nlp.make_doc(text), annotations)
        nlp.update([example], drop=0.5, losses=losses)  # Drop used for regularization
    
    print(f"Epoch {epoch}, Losses: {losses}")
# Save the trained model to disk
nlp.to_disk("custom_ner_model")


Epoch 0, Losses: {'ner': 514.7792704105377}
Epoch 1, Losses: {'ner': 223.17344353627107}
Epoch 2, Losses: {'ner': 158.75673583450344}
Epoch 3, Losses: {'ner': 180.2761879007569}
Epoch 4, Losses: {'ner': 157.48548239551974}
Epoch 5, Losses: {'ner': 173.57948865865762}
Epoch 6, Losses: {'ner': 165.79927477186308}
Epoch 7, Losses: {'ner': 192.41176317237745}
Epoch 8, Losses: {'ner': 186.50564323110444}
Epoch 9, Losses: {'ner': 188.79866420181634}
Epoch 10, Losses: {'ner': 198.95356079624486}
Epoch 11, Losses: {'ner': 146.048731084184}
Epoch 12, Losses: {'ner': 159.13642855974297}
Epoch 13, Losses: {'ner': 154.32410323300402}
Epoch 14, Losses: {'ner': 138.7464992616022}
Epoch 15, Losses: {'ner': 148.75618713417288}
Epoch 16, Losses: {'ner': 132.73636298270995}
Epoch 17, Losses: {'ner': 127.21273677925308}
Epoch 18, Losses: {'ner': 129.774627314753}
Epoch 19, Losses: {'ner': 121.0059259135441}


## testing after retraining

In [11]:
from spacy import scorer
nlp = spacy.load("custom_ner_model")

sc = scorer.Scorer()

examples = []  

for text, annotations in test_data:
    doc = nlp.make_doc(text)  
    example = Example.from_dict(doc, annotations)
    examples.append(example)

scores = sc.score(examples)


print("Evaluation Results:")
print(sc)

# Print the results
from pprint import pprint
print("Evaluation Results:")
pprint(scores)

for text, annotations in test_data:
    doc = nlp(text)
    
    # Print the entities detected by the model
    print(f"Text: {text}")
    print("\n--- Predicted Entities ---")
    for ent in doc.ents:
        print(f"{ent.text:30} → {ent.label_}")
    
    print("\n--- Ground Truth ---")
    for start, end, label in annotations['entities']:
        print(f"{text[start:end]:30} → {label}")
    print("\n")

Evaluation Results:
<spacy.scorer.Scorer object at 0x0000011F4C57E660>
Evaluation Results:
{'cats_auc_per_type': {},
 'cats_f_per_type': {},
 'cats_macro_auc': 0.0,
 'cats_macro_f': 0.0,
 'cats_macro_p': 0.0,
 'cats_macro_r': 0.0,
 'cats_micro_f': 0.0,
 'cats_micro_p': 0.0,
 'cats_micro_r': 0.0,
 'cats_score': 0.0,
 'cats_score_desc': 'macro F',
 'dep_las': None,
 'dep_las_per_type': None,
 'dep_uas': None,
 'ents_f': 0.0,
 'ents_p': 0.0,
 'ents_per_type': {'DOSAGE': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                   'DRUG_NAME': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                   'EQUIPMENT': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                   'LOCATION': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                   'QUANTITY': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                   'ROUTE': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                   'SITUATION': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                   'SYMPTOM': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                   'TIME_OF_ACTION': {'f': 0.0, 'p': 0.0