### Creating the NER model for use in the auto admin app (personal details removed)
Run on Colab

In [1]:
# Mount Google Drive so the data files stored there can be accessed.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#import & installs

!pip install -U pip setuptools wheel
!pip install -U spacy
!pip install spacy-transformers
!spacy download en_core_web_trf

import spacy
import pandas as pd

In [3]:
# Load spaCy's transformer-based English pipeline.
# (The small English pipeline, 'en_core_web_sm', is the lighter alternative and is left disabled.)
nlp = spacy.load('en_core_web_trf')

# Keep a handle on the pipeline's named-entity-recognition component.
ner = nlp.get_pipe("ner")

In [None]:
# Load the emails dataset - used to pick example emails for annotation/testing.
df = pd.read_csv('full_emails.csv')

# Keep only the booking emails and drop the 'Unnamed: 0' column
# (presumably an index column written by an earlier to_csv - TODO confirm).
# Chaining returns a new frame instead of mutating a filtered slice in place.
df = df.loc[df['Booking'].eq(True)].drop(columns='Unnamed: 0')

# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that only adds a stray "None" to the output).
df.info()

# Sample data for creating the spaCy training examples.
pd.set_option('display.max_colwidth', 800)  # so the full email can be read
tail = df.tail(2000)  # only the most recent format, as that is what is mostly received now
sample = tail.sample(50, random_state=42)  # fixed seed: the same 50 emails are drawn on every run

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33858 entries, 168 to 153196
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Date     33858 non-null  object
 1   Subject  33858 non-null  object
 2   Body     33858 non-null  object
 3   From:    33858 non-null  object
 4   To:      33858 non-null  object
 5   Booking  33858 non-null  bool  
 6   Cleaned  33858 non-null  object
dtypes: bool(1), object(6)
memory usage: 1.8+ MB
None


TRAIN_DATA and TEST_DATA were then created. They have been removed to preserve privacy, but an anonymised example of the format they were created in is shown below:

TRAIN_DATA = [  # each record: (email text, {"entities": [(start_char, end_char, label), ...]})
    ("[ new order: #0000 you’ve received the following order from XXXXX XXXXXXX: [order #0000] (01 june, 2022) product quantity price transfer from XXXXXXXX aéroport to XXXXXXX XXXXXXXX camp site by shuttle (private) on 02 june, 2022 at 15:30 people: 8 extras: 6 x 2. checked luggage, 6 x 1. hand luggage (#transfers_transfer_000000) 1 80,00€ subtotal: 80,00€ payment method: check availability total: 80,00€ deposit amount 40,00€ second payment amount 40,00€ billing address XXXXX XXXXXXX 3 fr0000", {"entities": [(14, 19, "ORDER"), (61, 74, "PERSON"), (143, 160, "FROM"), (164, 190, "TO"), (215, 228, "DATE"), (232, 236, "TIME"), (138, 147, "PAX"), (148, 299, "EXTRAS"), (389, 403, "TOTAL"), (404, 425, "DEPOSIT"), (471, 484, "PERSON"), (487, 493, "DETAILS")]})]

In [5]:
# Register the custom entity labels on the NER component.
new_labels = ["ORDER", "FROM", "TO", "TOTAL", "DEPOSIT", "PAX", "EXTRAS", "DETAILS", "ADDRESS", "EMAIL", "PHONE"]
for label in new_labels:
    ner.add_label(label)

# Display the component's labels to confirm the new ones were added.
nlp.get_pipe("ner").labels

('ADDRESS',
 'CARDINAL',
 'DATE',
 'DEPOSIT',
 'DETAILS',
 'EMAIL',
 'EVENT',
 'EXTRAS',
 'FAC',
 'FROM',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDER',
 'ORDINAL',
 'ORG',
 'PAX',
 'PERCENT',
 'PERSON',
 'PHONE',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'TO',
 'TOTAL',
 'WORK_OF_ART')

In [None]:
# Converting the training set to the spaCy 3 binary (.spacy / DocBin) format.
import pandas as pd
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans

nlp = spacy.blank("en")  # fresh blank English pipeline, used to build the docs
db = DocBin()  # collects the annotated docs

# Build one annotated Doc per (text, annotations) training record.
for text, annot in tqdm(TRAIN_DATA):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # Turn the character offsets into a token span; char_span gives None
        # when no valid span can be made from the offsets.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            ents.append(span)
        else:
            print("Skipping entity")
    # filter_spans resolves overlapping spans before they are set as entities.
    doc.ents = filter_spans(ents)
    db.add(doc)

# Save the DocBin to disk for training.
db.to_disk("NER model/train.spacy")

In [None]:
# Converting the test set to the spaCy 3 binary (.spacy / DocBin) format.
nlp = spacy.blank("en")  # fresh blank English pipeline, used to build the docs
db = DocBin()  # collects the annotated docs

# Build one annotated Doc per (text, annotations) test record.
for text, annot in tqdm(TEST_DATA):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # Turn the character offsets into a token span; None means no valid span.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
            continue
        ents.append(span)
    # filter_spans resolves overlapping spans before they are set as entities.
    doc.ents = filter_spans(ents)
    db.add(doc)

# Save the DocBin to disk for evaluation.
db.to_disk("NER model/test.spacy")

In [10]:
# Workaround for the issue where the locale was reported as ANSI_X3.4 rather than UTF-8.
import locale


def getpreferredencoding(do_setlocale = True):
    """Replacement for locale.getpreferredencoding that always reports UTF-8.

    The do_setlocale argument is accepted for signature compatibility and ignored.
    """
    encoding = "UTF-8"
    return encoding


print(locale.getpreferredencoding())  # encoding reported before the patch
locale.getpreferredencoding = getpreferredencoding
print(locale.getpreferredencoding())  # encoding reported after the patch

ANSI_X3.4-1968
UTF-8


The config file was created at https://spacy.io/usage/training#config with the following settings:
English; ner, GPU, accuracy
The resulting base_config.cfg file was downloaded and is filled in (completed) below


In [11]:
# Auto-fill the downloaded base config ("NER model/base_config.cfg") with all remaining
# values and save the complete training config as "NER model/config.cfg".
!python -m spacy init fill-config "NER model/base_config.cfg" "NER model/config.cfg"

2023-03-13 18:10:05.830782: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2023-03-13 18:10:05.832402: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/drive/MyDrive/E1 - Final project/NER model UPDATED 2/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [12]:
#run the following function before training to stop colab disconnecting
%%javascript
function ClickConnect(){
console.log("Working");
document.querySelector("colab-toolbar-button#connect").click()
}setInterval(ClickConnect,60000)


<IPython.core.display.Javascript object>

In [13]:
#training with the filled config on train.spacy and evaluating on test.spacy (passed as the dev set);
#the trained pipelines are written to "NER model/output" and GPU 0 is used
!python -m spacy train "NER model/config.cfg" --output "NER model/output" --paths.train "NER model/train.spacy" --paths.dev "NER model/test.spacy" --gpu-id 0

2023-03-13 18:11:05.328544: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2023-03-13 18:11:05.328686: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
[38;5;2m✔ Created output directory: /content/drive/MyDrive/E1 - Final
project/NER model UPDATED 2/output[0m
[38;5;4mℹ Saving to output directory: /content/drive/MyDrive/E1 - Final
project/NER model UPDATED 2/output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2023-03-13 18:11:17,999] [INFO] Set up nlp object from config
[2023-03-13 18:11:18,032] [INFO] Pipeline: ['transformer', 'ner']
[2023-03-13 18:11:18,045] [INFO] Created vocabulary


In [14]:
# Load the "model-best" pipeline from the training output directory.
nlp = spacy.load("NER model/output/model-best")


In [None]:
# The trained model's results for DEPOSIT were not good, so an entity rule is added for it.
# The pipeline is loaded fresh here, so the ruler is always added to an unmodified model.
nlp = spacy.load(r"NER model/output/model-best")

# Create the ruler and add it to the pipeline after the statistical NER;
# overwrite_ents lets rule matches replace overlapping model predictions.
cfg = {"overwrite_ents": True}
ruler = nlp.add_pipe("entity_ruler", after='ner', config=cfg)

# List of entities and patterns (source: https://spacy.io/usage/rule-based-matching).
# Each dict in a token pattern describes exactly ONE token, so the phrase
# "deposit amount" has to be written as two token specs; a single
# {"LOWER": "deposit amount"} can never match because no token contains a space.
patterns = [
    {
        "label": "DEPOSIT",
        "pattern": [
            {"LOWER": "deposit"},
            {"LOWER": "amount"},
            {},  # the token that follows, i.e. the amount itself
            {"TEXT": "€", "OP": "?"},  # currency sign, if it is tokenised separately from the amount
        ],
    },
]

# Add the patterns to the ruler.
ruler.add_patterns(patterns)

# Check that the pipeline has the expected components.
print(nlp.pipe_names)
print(nlp.pipeline)

['transformer', 'ner', 'entity_ruler']
[('transformer', <spacy_transformers.pipeline_component.Transformer object at 0x7fcb421ba640>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7fcb4231ac80>), ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler object at 0x7fcb421aca40>)]


To see the model applied to a test email, run the code below:

In [None]:
# Testing the model on a sample email.
# Replace the placeholder below with the email text to analyse.
sample_text = "put sample sentence here"
doc = nlp(sample_text)


# Render the recognised entities inline in the notebook.
spacy.displacy.render(doc, style="ent", jupyter=True)