Mounted Gdrive


In [9]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# NER Model training

In [11]:
!pip install spacy-lookups-data
import pandas as pd
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import random

# Load the scibert model
nlp = spacy.load("en_core_web_sm")

# Load the CSV data and drop any rows with missing values in the text, start, and end columns
data = pd.read_csv('/content/gdrive/MyDrive/social_history.csv').dropna(subset=['text', 'start', 'end'])

# Convert the CSV data into training examples
def get_examples(nlp):
    examples = []
    for _, row in data.iterrows():
        start = int(row['start'])
        end_value = row['end']
        if pd.isna(end_value) or end_value == '':
            continue
        try:
            end = int(end_value)
        except ValueError:
            continue
        entity_type = row['sbdh']
        text = row['text']
        example_dict = Example.from_dict(
            nlp.make_doc(text), 
            {"entities": [(start, end, entity_type)]}
        ).to_dict()
        if 'entities' in example_dict:
            examples.append(Example.from_dict(nlp.make_doc(text), example_dict))
    return examples

# Add the entity labels
entity_labels = set([example.to_dict()['entities'][0][2] for example in get_examples(nlp)])

for label in entity_labels:
    nlp.entity.add_label(label)

# Initialize the model
n_iter = 100
batch_size = 4
dropout = 0.5

# Train the model
optimizer = nlp.begin_training()
examples = get_examples(nlp)

for i in range(n_iter):
    losses = {}
    random.shuffle(examples)
    batches = minibatch(examples, size=compounding(batch_size, batch_size*2, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, drop=dropout, losses=losses)

# Save the model
nlp.to_disk('op1')

# Use the model to process input text and generate NER tags
nlp = spacy.load('op1')
doc = nlp("the patient is suffering from brain cancer")
for ent in doc.ents:
    print(ent.text, ent.label_)


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/



MEDIC..." with entities "[(534, 540, 'behavior_tobacco')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
..." with entities "[(2234, 2240, 'behavior_tobacco')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
retired electrical engineer with 7..." with entities "[(1793, 1800, 'behavior_tobacco')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Quit tobacco 30 years ago
Rarely d..." with entities "[(655, 661, 'behavior_tobacco')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Lives alone, has pets
Single, neve..." with entities "[(3170, 3177, 'behavior_

the ORG
patient ORG
is ORG
suffering ORG
from ORG
brain ORG
cancer ORG


  matches = self.matcher(doc, allow_missing=True, as_spans=False)


###Copy to Gdrive

In [13]:
!cp -r "/content/op1" "/content/gdrive/MyDrive"

##Evaluation Metrics for trained model

In [73]:
import spacy
from spacy.tokens import Doc
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
import time

# Load the trained model
nlp = spacy.load('/content/op1')

# Define the evaluation function
def evaluate_model(docs):
    start_time = time.time()
    true_labels = []
    pred_labels = []
    for doc in docs:
        true_labels_binary = [int(ent.label_ in doc.text[ent.start_char:ent.end_char]) for ent in doc.ents]
        doc = nlp(doc.text)
        pred_labels_binary = [int(ent.label_ in [ent.label_ for ent in doc.ents]) for ent in doc.ents]
        true_labels.extend(true_labels_binary)
        pred_labels.extend(pred_labels_binary)
    # Calculate the evaluation metrics
    strict_metrics = precision_recall_fscore_support(true_labels, pred_labels, average='binary')
    partial_metrics = precision_recall_fscore_support(true_labels, pred_labels, average='weighted')
    # Calculate the processing time
    processing_time = (time.time() - start_time) / len(docs)
    return strict_metrics, partial_metrics, processing_time

# Load the test datasocial_history
test_data = pd.read_csv('/test.csv')

# Convert the test data into Doc objects
#docs = [nlp(row['text']) for _, row in test_data.iterrows()]
docs = [nlp(str(row['text'])) for _, row in test_data.iterrows()]

# Evaluate the model
strict_metrics, partial_metrics, processing_time = evaluate_model(docs)

# Print the evaluation results
print('Strict matching:\nPrecision: {}\nRecall: {}\nF1 score: {}'.format(strict_metrics[0], strict_metrics[1], strict_metrics[2]))
print('Partial matching:\nPrecision: {}\nRecall: {}\nF1 score: {}'.format(partial_metrics[0], partial_metrics[1], partial_metrics[2]))
print('Processing time per document: {} seconds'.format(processing_time))


  matches = self.matcher(doc, allow_missing=True, as_spans=False)


Strict matching:
Precision: 5.522512522297145e-06
Recall: 1.0
F1 score: 1.104496404864202e-05
Partial matching:
Precision: 3.049814455892877e-11
Recall: 5.522512522297145e-06
F1 score: 6.099595226694733e-11
Processing time per document: 0.9195466117858887 seconds


  _warn_prf(average, modifier, msg_start, len(result))


In [64]:
import spacy
from spacy.tokens import Doc, Span
from spacy.training.example import Example
import csv

# load the trained NER model
nlp = spacy.load("/content/op1")

def read_test_data(file_path):
    examples = []
    with open(file_path, "r", encoding="utf-8") as f:
        reader = csv.reader(f)
        for row in reader:
            if len(row) == 4:
                text = row[0]
                start = int(row[1]) if row[1] else None
                end = int(row[2]) if row[2] else None
                label = row[3]
                entities = [(start, end, label)]
                example = Example.from_dict(nlp.make_doc(text), {"entities": entities})
                examples.append(example)
    return examples



# read in the test data
test_data = read_test_data("/test.csv")

import random
random.shuffle(test_data)

# evaluate the trained model on the test data
scores = nlp.evaluate(test_data)

# print the evaluation scores
print(scores)


..." with entities "[(2234, 2240, 'behavior_tobacco')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
retired electrical engineer with 7..." with entities "[(1793, 1800, 'behavior_tobacco')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Quit tobacco 30 years ago
Rarely d..." with entities "[(655, 661, 'behavior_tobacco')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Lives alone, has pets
Single, neve..." with entities "[(3170, 3177, 'behavior_alcohol')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

PHYSICAL ..." with entities "[(1979, 1986, 'sdo

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'tag_acc': None, 'sents_p': None, 'sents_r': None, 'sents_f': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'pos_acc': None, 'morph_acc': None, 'morph_micro_p': None, 'morph_micro_r': None, 'morph_micro_f': None, 'morph_per_feat': None, 'lemma_acc': None, 'ents_p': 0.0, 'ents_r': 0.0, 'ents_f': 0.0, 'ents_per_type': {'ORG': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'sdoh_environment': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'behavior_tobacco': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'behavior_alcohol': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'sdoh_education': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'sdoh_economics': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'sdoh_community': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'behavior_drug': {'p': 0.0, 'r': 0.0, 'f': 0.0}}, 'speed': 8259.039281952346}


In [67]:
import random
random.shuffle(test_data)

# evaluate the trained model on the test data
scores = nlp.evaluate(test_data)

# print the evaluation scores
print(scores)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'tag_acc': None, 'sents_p': None, 'sents_r': None, 'sents_f': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'pos_acc': None, 'morph_acc': None, 'morph_micro_p': None, 'morph_micro_r': None, 'morph_micro_f': None, 'morph_per_feat': None, 'lemma_acc': None, 'ents_p': 0.0, 'ents_r': 0.0, 'ents_f': 0.0, 'ents_per_type': {'ORG': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'behavior_tobacco': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'sdoh_community': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'sdoh_environment': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'behavior_alcohol': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'behavior_drug': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'sdoh_education': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'sdoh_economics': {'p': 0.0, 'r': 0.0, 'f': 0.0}}, 'speed': 8493.372564218998}
