In [None]:
!pip install spacy



In [None]:
# Download the medium English spaCy pipeline; it is loaded further down
# with spacy.load("en_core_web_md").
!python -m spacy download en_core_web_md



Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.6.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
!pip install transformers


Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m73.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m70.0 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
!pip install spacy-lookups-data


Collecting spacy-lookups-data
  Downloading spacy_lookups_data-1.0.5-py2.py3-none-any.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy-lookups-data
Successfully installed spacy-lookups-data-1.0.5


In [None]:
import spacy
import torch
from transformers import AutoModel, AutoTokenizer
import json
from spacy.training.example import Example

# Load the spaCy pipeline used for linguistic preprocessing
nlp = spacy.load("en_core_web_md")

# Load the pretrained BERT encoder together with its matching tokenizer
bert_model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModel.from_pretrained(bert_model_name)

# Load the annotated JSON dataset. The encoding is spelled out because open()
# otherwise falls back to a platform-dependent default, which can corrupt or
# reject non-ASCII text on some systems.
with open('training_data.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Helper that runs a text through the BERT model and returns its hidden states
def get_bert_embeddings(text):
    """Encode ``text`` with the module-level ``tokenizer`` and ``bert_model``.

    Args:
        text: Input handed straight to ``tokenizer`` with padding and
            truncation enabled and PyTorch tensors requested.

    Returns:
        The ``last_hidden_state`` attribute of the model output, computed
        with gradient tracking disabled.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    return hidden_states

# Turn every dataset item into a spaCy Example that also carries BERT output
train_data = []

for record in data:
    raw_text = record["text"]
    raw_annotations = record["annotations"]

    # Tokenize only (no pipeline components are run) and embed the same text.
    doc = nlp.make_doc(raw_text)
    bert_embeddings = get_bert_embeddings(raw_text)

    # (start, end, label) triples for Example.from_dict; presumably
    # character offsets into the text -- verify against the dataset.
    entities = [
        (ann["start"], ann["end"], ann["label"])
        for ann in raw_annotations
    ]

    example = Example.from_dict(doc, {"entities": entities})
    # Keep the embeddings alongside the gold-standard Doc via its user_data.
    example.reference.user_data["bert_embeddings"] = bert_embeddings

    train_data.append(example)

# Define a blank spaCy NER model
nlp_ner = spacy.blank("en")

# Add the built-in named entity recognizer with its default configuration.
# The component already uses the BILUO transition system internally, so the
# former `from spacy.pipeline.ner import BiluoPushDown` import (which spaCy v3
# does not expose) and the "transition_scheme" setting (which the "ner"
# factory does not accept) are dropped.
ner = nlp_ner.add_pipe("ner")

# Register every entity label that occurs in the training examples.
# Sorting keeps the registration order deterministic between runs.
entity_labels = {
    ent.label_
    for example in train_data
    for ent in example.reference.ents
}
for label in sorted(entity_labels):
    ner.add_label(label)

# Initialize the model weights. initialize() replaces the deprecated
# begin_training() and returns the optimizer needed for the updates below.
# Without any examples it is called with no callback, as begin_training() was.
get_examples = (lambda: train_data) if train_data else None
optimizer = nlp_ner.initialize(get_examples)

# Train the NER model: one pass over the data, one example per step.
# The update goes through the pipeline with an explicit optimizer; calling
# ner.update() without one only accumulates gradients and never applies
# them, so the saved model would stay untrained.
losses = {}
for example in train_data:
    nlp_ner.update([example], sgd=optimizer, losses=losses)

# Save the trained NER model to a file
nlp_ner.to_disk('custom_ner_model')


ImportError: ignored