In [1]:
import spacy
from spacy import displacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
train = [    
("Shares of SVB Financial Group plummeted following the announcement of a major banking crisis.",{"entities": [(10, 13, "BLK_ORG")]}),
("The SVB crisis has sent shockwaves through the banking industry, with many other financial institutions bracing for impact.", {"entities": [(4, 7, "BLK_ORG")]}),
("SVB Financial Group, the parent company of Silicon Valley Bank, has been hit hard by the recent banking crisis.", {"entities": [(0,3, "BLK_ORG")]}),
("Customers of SVB have been advised to monitor their accounts closely following the banking crisis.", {"entities": [(13, 16, "BLK_ORG")]}),
("Many analysts are predicting that SVB Financial Group will need to take drastic action to survive the banking crisis.", {"entities": [(34, 37, "BLK_ORG")]}),
("Credit Suisse has been hit with billions of dollars in losses due to the Archegos Capital Management scandal.", {"entities": [(0, 14, "BLK_ORG"),]}),
("The Credit Suisse scandal has caused concern among investors and regulators alike.", {"entities": [(4, 18, "BLK_ORG")]}),
("Credit Suisse is facing major legal and financial repercussions as a result of the Greensill Capital collapse.", {"entities": [(0, 14, "BLK_ORG")]}),
("The future of Credit Suisse is uncertain in the wake of multiple high-profile scandals.", {"entities": [(14, 27, "BLK_ORG")]}),
("Credit Suisse CEO Thomas Gottstein has announced plans to step down in the wake of the bank's scandals.", {"entities": [(0, 14, "BLK_ORG")]}),
("The Federal Reserve (FED) announced an increase in repo rates.", {"entities": [(51, 61, "BLK_INT_RATE"), (4, 19, "BLK_COUNTRY"), (21, 24, "BLK_COUNTRY")]}),
("The Bank of England has decided to keep interest rates unchanged.", {"entities": [(40, 55, "BLK_INT_RATE"), (12, 19, "BLK_COUNTRY")]}),
("The European Central Bank lowered interest rates in an effort to stimulate the economy.", {"entities": [(34, 47, "BLK_INT_RATE"), (4, 12, "BLK_COUNTRY")]}),
("Many experts predict that interest rates will continue to rise in USA the coming months.", {"entities": [(26, 40, "BLK_INT_RATE"), (66, 69, "BLK_COUNTRY")]}),
("The Reserve Bank of Australia raised interest rates for the first time in over a year.", {"entities": [(37, 50, "BLK_INT_RATE"), (20, 29, "BLK_COUNTRY")]}),
("The Bank of Japan has kept interest rates at record lows for several years.", {"entities": [(27, 41, "BLK_INT_RATE"), (12, 17, "BLK_COUNTRY")]})
]

In [4]:
import pandas as pd
import os
from tqdm import tqdm
from spacy.tokens import DocBin

In [5]:
db = DocBin() # create a DocBin object

for text, annot in tqdm(train): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)

db.to_disk("./train.spacy") # save the docbin object

100%|████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 1332.64it/s]


In [8]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m[+] Auto-filled config with all values[0m
[38;5;2m[+] Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [9]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy

[38;5;4m[i] Saving to output directory: output[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     46.14    0.00    0.00    0.00    0.00
 66     200         57.10   1070.27  100.00  100.00  100.00    1.00
163     400          0.00      0.00  100.00  100.00  100.00    1.00
263     600          0.00      0.00  100.00  100.00  100.00    1.00
363     800          0.00      0.00  100.00  100.00  100.00    1.00
561    1000          0.00      0.00  100.00  100.00  100.00    1.00
761    1200          0.00      0.00  100.00  100.00  100.00    1.00
961    1400          0.00      0.00  100.00  100.00  100.00    1.00
1161    1600          0.00      0.00  100.00  100.00  100.00    1.00
1361    1800          0.00      0.0

[2023-04-15 18:30:47,199] [INFO] Set up nlp object from config
[2023-04-15 18:30:47,207] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-04-15 18:30:47,209] [INFO] Created vocabulary
[2023-04-15 18:30:47,210] [INFO] Finished initializing nlp object
[2023-04-15 18:30:47,288] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


In [6]:
nlp1 = spacy.load(r"./output/model-best") #load the best model
# Shares of SVB Financial Group plummeted following the announcement of a major banking crisis.
# Testing if our model is able to capture interest rate and repo rate
doc = nlp1("Shares of SVB Financial Group plummeted following the announcement of a major banking crisis.") # input sample text
displacy.render(doc,style="ent",jupyter=True)

In [24]:
doc.ents[0]

SVB

In [25]:
def get_entity(doc):
    entity_label = {}
    tup = doc.ents
    for i in range(len(tup)):
        entity_label[tup[i]] = tup[i].label_
    return entity_label

In [26]:
get_entity(doc)

{SVB: 'BLK_ORG'}