#Call Center calls queries NER Modeling for Finance/Banking Companies

> In this notebook I have tried to create a simple custom NER model with 13 entries and 100 iters



In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')
#Default avl pipelines in NLP
nlp.pipe_names

['tagger', 'parser', 'ner']

In [3]:
doc = nlp("Australia wants to to force Facebook and Google to pay media comnpanies for news")

In [4]:
for ent in doc.ents:
  print(ent.text,ent.start_char, ent.end_char , ent.label_)

Australia 0 9 GPE
Facebook and Google 28 47 ORG


In [5]:
doc_2 = nlp("I do not have money to pay fot my credit card account")

In [6]:
for ent in doc_2.ents:
  print(ent.text,ent.start_char, ent.end_char , ent.label_)

In [7]:
doc = nlp("What is the process to open a new savings account")


For Training Spacy NER models require a min of 200 training examples but here we are training a batch of 13.

In [8]:
for ent in doc.ents:
  print(ent.text,ent.start_char, ent.end_char, ent.label_)

In [9]:
# The type of queries coming through
train = [
         ("Money transfer from my checking account is not working",{"entities":[(6,13,"ACTIVITY"),(23,39,"PRODUCT")]}),
         ("I want to check balance in my savings account",{"entities":[(16,23,"ACTIVITY"),(30,45,"PRODUCT")]}),
         ("I suspect a fraud in my credit card account",{"entities":[(12,17,"ACTIVITY"),(24,35,"PRODUCT")]}),
         ("I am here for opening a new savings account",{"entities":[(14,21,"ACTIVITY"),(28,43,"PRODUCT")]}),
         ("Your mortgage is in delinquent status",{"entities":[(20,30,"ACTIVITY"),(5,13,"PRODUCT")]}),
         ("My loan account is still not approved and funded",{"entities":[(25,37,"ACTIVITY"),(3,15,"PRODUCT"),(42,48,"PRODUCT")]}),
         ("Your credit card is in past due status",{"entities":[(23,31,"ACTIVITY"),(5,16,"PRODUCT")]}),
         ("How do I open a new loan account",{"entities":[(9,13,"ACTIVITY"),(20,32,"PRODUCT")]}),
         ("What are the charges on Investment account",{"entities":[(13,20,"ACTIVITY"),(24,42,"PRODUCT")]}),
         ("Can you explain late charges on my credit card",{"entities":[(21,28,"ACTIVITY"),(35,46,"PRODUCT")]}),
         ("I want to open a new loan account",{"entities":[(10,14,"ACTIVITY"),(21,33,"PRODUCT")]}),
         ("Can you help updating payment on my credit card",{"entities":[(22,29,"ACTIVITY"),(36,47,"PRODUCT")]}),
         ("When is the payment due date on my card",{"entities":[(12,19,"ACTIVITY"),(35,39,"PRODUCT")]})
          ]

In [10]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [14]:
#use only ner for pipelines
ner = nlp.get_pipe("ner")

In [15]:
for _,annotations in train:
  for ent in annotations.get("entities"):
    ner.add_label(ent[2])

In [16]:
#As we do not need tagger and parser component 
disable_pipes =[pipe for pipe in nlp.pipe_names if pipe != 'ner']

In [23]:
import random 
from spacy.util import minibatch,compounding
from pathlib import Path

with nlp.disable_pipes(*disable_pipes):
  optimizer =nlp.resume_training()

  for iteration in range(100):

#To make sure data does not repeat so we shuffle the train
    random.shuffle(train)
    losses = {}

    batches = minibatch(train,size=compounding(1.0,4.0,1.001))
    for batch in batches:
      text,annotation = zip(*batch)
      nlp.update(
          text,
          annotation,
          drop=0.5,
          losses = losses,
          sgd = optimizer
      )
      print("Losses",losses)


Losses {'ner': 9.061678750970785}
Losses {'ner': 16.78925304456032}
Losses {'ner': 22.73115491180097}
Losses {'ner': 33.847403506863614}
Losses {'ner': 39.8866674832848}
Losses {'ner': 47.37228221915798}
Losses {'ner': 51.69672068182168}
Losses {'ner': 63.26859943680662}
Losses {'ner': 76.84848164133925}
Losses {'ner': 83.90768381333051}
Losses {'ner': 93.6566311688721}
Losses {'ner': 102.99161310583351}
Losses {'ner': 108.19538562627805}
Losses {'ner': 8.045475088489184}
Losses {'ner': 12.551957287704116}
Losses {'ner': 20.334529085865487}
Losses {'ner': 29.05719798884309}
Losses {'ner': 35.89879009988579}
Losses {'ner': 48.757305213477935}
Losses {'ner': 59.285787158262224}
Losses {'ner': 70.0248584380264}
Losses {'ner': 78.06569043219508}
Losses {'ner': 84.98247084346087}
Losses {'ner': 91.80274326741215}
Losses {'ner': 98.85689719233648}
Losses {'ner': 108.79169187547956}
Losses {'ner': 9.228723882042686}
Losses {'ner': 15.567602562340653}
Losses {'ner': 18.83070635418244}
Losses {

In [26]:
for text, _ in train:

  doc =nlp(text)
  print('Entities',[(ent.text,ent.label_) for ent in doc.ents])

Entities [('credit card', 'PRODUCT')]
Entities [('credit card', 'PRODUCT')]
Entities [('open', 'ACTIVITY'), ('loan account', 'PRODUCT')]
Entities [('credit card', 'PRODUCT')]
Entities [('checking account', 'PRODUCT')]
Entities []
Entities [('opening', 'ACTIVITY'), ('savings account', 'PRODUCT')]
Entities [('credit card', 'PRODUCT')]
Entities [('Investment account', 'PRODUCT')]
Entities [('balance', 'ACTIVITY'), ('savings account', 'PRODUCT')]
Entities [('open', 'ACTIVITY'), ('loan account', 'PRODUCT')]
Entities [('loan account', 'PRODUCT'), ('funded', 'PRODUCT')]
Entities [('mortgage', 'PRODUCT')]


#Now lets try on new data

```
# This is formatted as code
```



In [29]:
from spacy import displacy

doc = nlp("what is the process to open a new savings account?")
for ent in doc.ents:
  print(ent.text,ent.start_char, ent.end_char , ent.label_)
displacy.render(nlp(doc.text),style='ent',jupyter=True)

open 23 27 ACTIVITY
savings account 34 49 PRODUCT


In [31]:
from spacy import displacy

doc = nlp("My credit card payment will be delayed")
for ent in doc.ents:
  print(ent.text,ent.start_char, ent.end_char , ent.label_)
displacy.render(nlp(doc.text),style='ent',jupyter=True)

credit card 3 14 PRODUCT


here delayed is not correctly labelled as activity

In [33]:
from spacy import displacy

doc = nlp("I lost my investment account password and cannot open my account now ")
for ent in doc.ents:
  print(ent.text,ent.start_char, ent.end_char , ent.label_)
displacy.render(nlp(doc.text),style='ent',jupyter=True)

investment account 10 28 PRODUCT
account now 57 68 PRODUCT


In [34]:
#note that open is not correctly labbeled as activity

In [32]:
from spacy import displacy

doc = nlp("What are the charges on credit card late payment in Bank of America ")
for ent in doc.ents:
  print(ent.text,ent.start_char, ent.end_char , ent.label_)
displacy.render(nlp(doc.text),style='ent',jupyter=True)

credit card 24 35 PRODUCT
payment 41 48 ACTIVITY


We can see the model is generalizing poorly on this because 

1.  less data 13 entries .
2.  We need a minimum of 800 to do well.
3.  Also we see that the problem called catastrophic forgetting



