# Custom NER (ML NER)

## #1. Set up development environment

### Update & import Python modules

In [None]:
# install and download spaCy related modules
!pip install --upgrade spacy
!python -m spacy download en_core_web_lg

# spaCy
import spacy
from spacy.tokens import DocBin

# Google Drive
from google.colab import drive

# Firebase/Firestore
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

# general Python modules
import json
import datetime
import requests
import csv
import random
import warnings
from collections import Counter
from pprint import pprint

### Get access to Firebase and Drive

In [None]:
# mount Google Drive; force_remount=True always forces a fresh mount
drive.mount("/content/gdrive/", force_remount = True)
print("Stablished access to Google Drive")

# initialize Drive path
DRIVE_PATH = "/content/gdrive/My Drive"

# load the Firebase credentials JSON stored on Drive and wrap it in a Certificate
with open(DRIVE_PATH + "/information_extraction/credentials/firebase_credentials.json") as f:
  credential = json.load(f)
credential = credentials.Certificate(credential)

# create Firestore database instance
# initialize_app() raises ValueError when the default app already exists, which
# happens whenever this cell is re-run in a live kernel; reuse the app in that case
try:
  firebase_admin.get_app()
except ValueError:
  firebase_admin.initialize_app(credential)
db = firestore.client()
print("Stablished access to Firestore")

Mounted at /content/gdrive/
Stablished access to Google Drive
Stablished access to Firestore


## #2. Train Custom NER model

In [None]:
# load the annotated NER corpus from Drive
# convert() iterates it as (text, annotations) pairs and reads annotations["entities"]
# explicit UTF-8 so decoding does not depend on the runtime's locale
with open(DRIVE_PATH + "/ie_course/output/joe_biden_ner_corpus.json", encoding="utf-8") as f:
  TRAIN_DATA = json.load(f)

def convert(TRAIN_DATA, output_name):
  """Serialize annotated examples into a spaCy DocBin file on Drive.

  Args:
    TRAIN_DATA: iterable of (text, annotations) pairs, where
      annotations["entities"] is an iterable of (start, end, label)
      character offsets into text.
    output_name: file name written under
      DRIVE_PATH + "/ie_course/output/ml_custom_ner/".

  Entities for which doc.char_span() yields no usable span are left out of
  the doc and reported through warnings.warn().
  """
  tokenizer_pipeline = spacy.blank("en")  # blank English pipeline, used only to build docs
  corpus = DocBin()
  for text, annot in TRAIN_DATA:
    doc = tokenizer_pipeline.make_doc(text)
    aligned_spans = []
    for start, end, label in annot["entities"]:
      span = doc.char_span(start, end, label=label)
      if span:
        aligned_spans.append(span)
        continue
      # no usable span for these offsets: report and skip this entity
      msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
      warnings.warn(msg)
    doc.ents = aligned_spans
    corpus.add(doc)
  corpus.to_disk(DRIVE_PATH + f"/ie_course/output/ml_custom_ner/{output_name}")

# serialize the annotated corpus to "train.spacy" in the ml_custom_ner output folder
convert(TRAIN_DATA, "train.spacy")
# NOTE(review): the message says "trained", but convert() only writes the
# annotated docs to a DocBin file; no model is trained in this cell
print("Saved trained NER component in pipeline")
# disabled: would serialize the same TRAIN_DATA a second time as "valid.spacy"
# NOTE(review): a validation set would presumably need held-out examples — confirm
# convert(TRAIN_DATA, "valid.spacy")
# print("Saved trained (validated) NER component in pipeline")

Saved trained NER component in pipeline
