In [1]:
import os
import random
import string

import spacy
from spacy.training import Example

In [2]:
MODEL_DIR = "../../bank_dets_model"
if not os.path.isdir(MODEL_DIR):
    os.mkdir(MODEL_DIR)

In [3]:
ENTITIES = [
        "InvoiceNum",
        "PO",
        "InvoiceDate",
        "DueDate",
        "ABN"
    ]

BANK_DETAILS_ENTITIES = ["ABN", "AccountName", "AccountNum", "BankName", "BSB", "SwiftCode"]

In [4]:
def create_test_train_split(folder):

    files = os.listdir(folder)
    # files = [os.path.join(folder, fil) for fil in files]
    files = sorted(list(set([fil[:-4] for fil in files])))
    random.shuffle(files)

    train_folder = os.path.join(folder, "train")
    test_folder = os.path.join(folder, "test")
    if not os.path.isdir(train_folder):
        os.mkdir(train_folder)
    if not os.path.isdir(test_folder):
        os.mkdir(test_folder)

    split_point = int(len(files)*0.8)
    print("Training files: ", split_point)
    train_files = files[:split_point]
    test_files = files[split_point:]

    for fil in files:
        if fil in train_files:
            os.rename(os.path.join(folder, fil+".txt"), os.path.join(train_folder, fil+".txt"))
            os.rename(os.path.join(folder, fil+".ann"), os.path.join(train_folder, fil+".ann"))
        else:
            os.rename(os.path.join(folder, fil+".txt"), os.path.join(test_folder, fil+".txt"))
            os.rename(os.path.join(folder, fil+".ann"), os.path.join(test_folder, fil+".ann"))

In [5]:
def prepare_train_data(folder):
    files1 = os.listdir(folder)
    # files = [tf for tf in files if tf!="annotation.conf"]
    files1 = list(set([tf[:-4] for tf in files1]))

    data = []
    for tf in sorted(files1):
        print(tf)
        entities_dict = {"entities": []}
        try:
            with open(os.path.join(folder, tf+".txt"), "r") as f:
                text = f.read()
        except UnicodeDecodeError as e:
            with open(os.path.join(folder, tf+".txt"), "r", encoding="mbcs") as f:
                text = f.read()
#         print("Text: ", text)
        with open(os.path.join(folder, tf+".ann"), "r") as f:
            annotation = f.readlines()
        for line in annotation:
            print("Line: ", line)
            line_parts = line.split("\t")
            if len(line_parts) == 3:
                relevant_bit = line_parts[1]
                entity_text = line_parts[2]
                entity_text = entity_text.strip()
                ent, start, end = relevant_bit.split(" ")
                start = text.find(entity_text)
                end = start + len(entity_text)
                print(start, end, ent)
                entities_dict["entities"].append((int(start), int(end), ent))
        entities_dict["entities"] = list(set(entities_dict["entities"]))
        data.append((text, entities_dict))
    random.shuffle(data)
#     data = trim_entity_spans(data)
    return data

In [8]:
def train(train_data, num_epochs=25):
    model_name = "ner_model"
    nlp = spacy.load("en_core_web_sm")

    # Getting the ner component
    ner = nlp.get_pipe('ner')

    for ent in BANK_DETAILS_ENTITIES:
        ner.add_label(ent)

    # Resume training
    optimizer = nlp.resume_training()
    move_names = list(ner.move_names)

    # List of pipes you want to train
    pipe_exceptions = ["ner"]

    # List of pipes which should remain unaffected in training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    with nlp.disable_pipes(*other_pipes):
        for itn in range(num_epochs):
            for batch in spacy.util.minibatch(train_data, size=8):
                for text, annotations in batch:
                    #             print(text[:10])
                    #             print(annotations)
                    losses = {}
                    # create Example
                    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
                    text = text.translate(translator)
                    doc = nlp.make_doc(text)
#                     print(type(doc))
#                     print(type(annotations))
#                     print("Annot: ", annotations)
#                     dlks
                    example = Example.from_dict(doc, annotations)
                    #             print(example)
                    # Update the model
                    nlp.update([example], losses=losses, drop=0.3)
                    print("Losses", losses)

    nlp.to_disk(os.path.join(MODEL_DIR, model_name))
    return nlp

In [6]:
TRAIN_DATA = prepare_train_data("../../data/train")
TEST_DATA = prepare_train_data("../../data/test")

00028104
Line:  T1	BSB 1387 1394	062-198

1279 1286 BSB
Line:  T2	AccountNum 1409 1418	1058-3208

1301 1310 AccountNum
Line:  T3	AccountName 1433 1458	About Face Salon Supplies

64 89 AccountName
Line:  T4	ABN 52 66	16 123 351 422

48 62 ABN
02081934
Line:  T1	BSB 1189 1196	064 173

1100 1107 BSB
Line:  T2	AccountNum 1207 1216	0012 5328

1118 1127 AccountNum
Line:  T3	SwiftCode 1230 1238	CTBAAU2S

1141 1149 SwiftCode
Line:  T4	AccountName 2084 2098	Data#3 Limited

1009 1023 AccountName
Line:  T5	BSB 2115 2122	064 173

1100 1107 BSB
Line:  T6	AccountNum 2133 2142	0012 5328

1118 1127 AccountNum
Line:  T7	SwiftCode 2156 2164	CTBAAU2S

1141 1149 SwiftCode
Line:  T8	ABN 1438 1452	31 010 545 267

1344 1358 ABN
082172_Adjustment Credit Note_CCE04694 (ID 5951179)
Line:  T1	ABN 77 91	63 000 341 819

74 88 ABN
Line:  T2	AccountNum 3480 3489	775394025

3365 3374 AccountNum
Line:  T3	BSB 3378 3385	014-002

3266 3273 BSB
082172_Tax Invoice_CIY76735 (ID 5952822)
Line:  T1	ABN 77 91	63 000 341 819



Line:  T1	ABN 172 183	38113072755

161 172 ABN
Line:  T2	ABN 360 374	87 166 205 287

337 351 ABN
Line:  T3	BSB 733 739	124010

688 694 BSB
Line:  T4	AccountNum 746 755	022451345

700 709 AccountNum
Line:  T5	AccountName 711 726	Thesoco Pty Ltd

31 46 AccountName
Invoice INV-4600
Line:  T1	ABN 690 704	61 610 143 863

255 269 ABN
Line:  T2	AccountName 628 648	MTP Services Pty Ltd

39 59 AccountName
Line:  T3	AccountNum 605 616	90-751-8182

566 577 AccountNum
Line:  T4	BSB 584 591	082-902

546 553 BSB
Line:  T5	ABN 274 288	61 610 143 863

255 269 ABN
Invoice PSINV009249
Line:  T1	ABN 206 220	67 111 307 361

190 204 ABN
Line:  T2	AccountNum 919 927	15073065

856 864 AccountNum
Line:  T3	BankName 897 900	CBA

836 839 BankName
Line:  T4	BSB 935 942	062-000

870 877 BSB
Invoice Sightline-8138
Line:  T1	ABN 91 105	86 143 688 546

86 100 ABN
Line:  T2	BankName 988 1005	Commonwealth Bank

938 955 BankName
Line:  T3	BSB 1012 1019	062-192

961 968 BSB
Line:  T4	AccountNum 1027 1036	1044 0166

975 

In [53]:
TRAIN_DATA[12]#[1]['entities']

("TIKTOK AUSTRALIA PTY LTD\nAddress: Level 10, 68 Pitt Street, Sydney NSW 2000\nTikTok\nABN: 15 637 464 638\nTAX INVOICE\nBill To\nClient Name:\nINSURANCE AUSTRALIA LIMITED\nInvoice #:\nTTAU2021081279\nClient ID:\n6993126001425253121\nInvoice Date:\n05, December, 2021\nBilling Contact:\nDue Date:\n04, January, 2022\nBilling Email:\nContract #:\nCON6508843\nBilling Tel:\nABN #:\n11 000 016 722\nBilling Address:\nDarling Park, Tower Two, 201 Sussex St, Sydney\nNSW, SYDNEY, NSW 2000\nBilling Period\n30, November, 2021 ~ 30, November, 2021\nAdvertiser Name\nDescription\nNote\nAmount in AUD\nROLLiN' | Auction\nAdvertising Fees\n6,923.21\nRemarks:\nSubtotal\n6,923.21\nGST@10%\n692.32\nTotal incl. GST\n7,615.53\nPayment method:\nBank Name: Citibank, N.A. (Sydney Branch)\nAccount Number: 248037100763\nAccount Name: TIKTOK AUSTRALIA PTY LTD\nSWIFT Code: CITIAU2X\nBank Address: 2 Park Street, SYDNEY NSW 2000\nBSB Code: 248037\nFor domestic AUD transfers, please use the last 6 digits of the accou

In [9]:
trained_model = train(TRAIN_DATA, num_epochs=50)

Losses {'ner': 85.29296957739122}
Losses {'ner': 64.03043944574893}
Losses {'ner': 56.49548473006598}
Losses {'ner': 19.693392547642816}
Losses {'ner': 18.20112617250193}
Losses {'ner': 11.726825103465671}
Losses {'ner': 19.149596950528917}


Icon Agency
IP Australia
INVOICE NUMBE..." with entities "[(671, 677, 'BSB'), (640, 665, 'AccountName'), (15...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


Losses {'ner': 10.763713662491543}
Losses {'ner': 9.727037744610602}
Losses {'ner': 12.011854553900081}
Losses {'ner': 8.237378467402348}


Copy of Tax Invoice
NTT Australia Pty Ltd
Tran..." with entities "[(1114, 1132, 'AccountName'), (1153, 1162, 'Accoun...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


Losses {'ner': 6.661388276567594}


GROUP
TAX INVOICE
Invoice Date
Kondinin I..." with entities "[(1409, 1420, 'AccountNum'), (352, 366, 'ABN'), (1...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


Losses {'ner': 9.990165076615535}
Losses {'ner': 6.767715774055091}
Losses {'ner': 8.106728238714282}
Losses {'ner': 6.915874581239542}
Losses {'ner': 67.83521580696106}
Losses {'ner': 7.769754349036134}
Losses {'ner': 8.20802305532925}
Losses {'ner': 7.6275565052501975}
Losses {'ner': 10.535300221152283}
Losses {'ner': 8.080054103291332}
Losses {'ner': 6.039320661202407}
Losses {'ner': 9.322375862328263}
Losses {'ner': 8.194266718674115}
Losses {'ner': 7.60308397659287}
Losses {'ner': 6.977751027463341}
Losses {'ner': 9.829424573451423}
Losses {'ner': 7.373369495766951}
Losses {'ner': 2.3728595822904026}
Losses {'ner': 11.177932791665853}
Losses {'ner': 3.9652208419938386}
Losses {'ner': 5.547767163619696}
Losses {'ner': 5.455694787233746}
Losses {'ner': 11.682963295813881}
Losses {'ner': 9.944055961691019}
Losses {'ner': 9.404856976704195}
Losses {'ner': 11.895720165246018}
Losses {'ner': 9.458188146724751}
Losses {'ner': 9.54341474181578}
Losses {'ner': 6.903888657034258}
Losses {'n

TAX INVOICE
Invoice Date
GoLogic Group Pt..." with entities "[(761, 769, 'SwiftCode'), (715, 718, 'BankName'), ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Address  Level 10  68 Pit..." with entities "[(751, 757, 'AccountNum'), (809, 817, 'SwiftCode')...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


Losses {'ner': 7.125831763492897}


Tax Invoice
CANON BUSINESS
SERVICES ANZ
Cano..." with entities "[(741, 750, 'AccountNum'), (-1, 5, 'BSB'), (699, 7...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


Losses {'ner': 8.075151186454129}
Losses {'ner': 11.19765164402017}
Losses {'ner': 4.810797416543664}
Losses {'ner': 5.732436612645873}
Losses {'ner': 6.4351475873477995}
Losses {'ner': 6.955632027588329}
Losses {'ner': 7.276399400923053}
Losses {'ner': 8.444710301542695}
Losses {'ner': 5.635435600537058}
Losses {'ner': 10.832310450425963}
Losses {'ner': 4.682520892570387}
Losses {'ner': 7.23650774103726}
Losses {'ner': 12.879228573758725}
Losses {'ner': 6.611304068920639}
Losses {'ner': 5.423648982083994}
Losses {'ner': 76.48776428690012}
Losses {'ner': 3.5298105564515816}
Losses {'ner': 6.848746678557433}
Losses {'ner': 8.14148069677995}
Losses {'ner': 8.494690454994936}
Losses {'ner': 5.037252211117524}
Losses {'ner': 58.16483703681577}
Losses {'ner': 108.16311550140381}
Losses {'ner': 66.20893088824397}
Losses {'ner': 91.6432667744558}
Losses {'ner': 25.483910057844263}
Losses {'ner': 38.10747445611388}
Losses {'ner': 25.80756020325717}
Losses {'ner': 10.749580182928952}
Losses {'n

Losses {'ner': 6.902483119475525}
Losses {'ner': 9.09167194093391}
Losses {'ner': 4.4333213205873685}
Losses {'ner': 2.920250238629344}
Losses {'ner': 2.500428250005201}
Losses {'ner': 0.9731884776460948}
Losses {'ner': 5.164013718549855}
Losses {'ner': 5.38069510496593}
Losses {'ner': 8.916005540506024}
Losses {'ner': 3.850140190020282}
Losses {'ner': 3.9814276424340855}
Losses {'ner': 9.435218986179228}
Losses {'ner': 0.8720929119484382}
Losses {'ner': 3.143008401560336}
Losses {'ner': 2.3347730233191553}
Losses {'ner': 5.295968350609291}
Losses {'ner': 3.4258081009310133}
Losses {'ner': 3.6110845667231177}
Losses {'ner': 3.9427159317565836}
Losses {'ner': 2.9542966075961603}
Losses {'ner': 3.3691988254017056}
Losses {'ner': 4.8370079359288205}
Losses {'ner': 5.6347413767924355}
Losses {'ner': 4.7369440673842265}
Losses {'ner': 6.549174352723602}
Losses {'ner': 6.106842987439297}
Losses {'ner': 0.7989923978191966}
Losses {'ner': 1.3906050463487596}
Losses {'ner': 1.9054756229789758}


Losses {'ner': 0.1610579392390316}
Losses {'ner': 2.767745180911941}
Losses {'ner': 4.439549338534145}
Losses {'ner': 0.23438384718614402}
Losses {'ner': 1.9931908211025506}
Losses {'ner': 5.126209279488239}
Losses {'ner': 0.10528844222395722}
Losses {'ner': 0.8876062432440439}
Losses {'ner': 2.033712089570674}
Losses {'ner': 1.9238128605895188}
Losses {'ner': 4.302890121845112}
Losses {'ner': 1.2076152991607119}
Losses {'ner': 3.956185780704649}
Losses {'ner': 8.775627645831301}
Losses {'ner': 3.3741657039394166}
Losses {'ner': 2.0936779590689527}
Losses {'ner': 1.7275588765892091}
Losses {'ner': 0.38719105141858395}
Losses {'ner': 1.4944648102946623}
Losses {'ner': 1.1137252363772259}
Losses {'ner': 1.0877491311188732}
Losses {'ner': 1.7365451155871434}
Losses {'ner': 2.3115913688880942}
Losses {'ner': 12.51548936331329}
Losses {'ner': 0.0021006143527564167}
Losses {'ner': 1.3219619926504378}
Losses {'ner': 0.5308391485539714}
Losses {'ner': 2.414030401408225}
Losses {'ner': 1.134407

Losses {'ner': 1.9161097135584613}
Losses {'ner': 2.6733781844662534}
Losses {'ner': 1.6271123851213833}
Losses {'ner': 0.0005984766866655426}
Losses {'ner': 0.2714520848855914}
Losses {'ner': 0.0004493236590970358}
Losses {'ner': 2.070110467987987}
Losses {'ner': 0.004983938533281152}
Losses {'ner': 7.114163856136431}
Losses {'ner': 0.00040938114740998467}
Losses {'ner': 3.6678325122106714}
Losses {'ner': 0.0020464684778488994}
Losses {'ner': 1.9637106649667648}
Losses {'ner': 2.531629906592412}
Losses {'ner': 0.7790754378428091}
Losses {'ner': 0.002639553140423524}
Losses {'ner': 5.667819480379316}
Losses {'ner': 3.756842237043388}
Losses {'ner': 0.019637498228232467}
Losses {'ner': 1.9179316960976525}
Losses {'ner': 3.8632123192275456}
Losses {'ner': 0.004514307418357275}
Losses {'ner': 0.0001777653355748693}
Losses {'ner': 0.3950254381289582}
Losses {'ner': 0.45989062296064964}
Losses {'ner': 2.063692785804741}
Losses {'ner': 0.5365418843964098}
Losses {'ner': 1.915553144274197}
Lo

Losses {'ner': 2.851608351770353e-05}
Losses {'ner': 3.847551033943126}
Losses {'ner': 1.2275072293614513}
Losses {'ner': 0.004167896668882434}
Losses {'ner': 2.5191217429370143}
Losses {'ner': 0.0008398804400551521}
Losses {'ner': 10.527365578534713}
Losses {'ner': 0.007168938778545533}
Losses {'ner': 0.14246044156806964}
Losses {'ner': 0.006765387456239792}
Losses {'ner': 1.7807579291027646}
Losses {'ner': 0.06065606590530183}
Losses {'ner': 2.507144022094028}
Losses {'ner': 0.3840241287388053}
Losses {'ner': 0.2969992502106825}
Losses {'ner': 2.0856564820008763}
Losses {'ner': 0.010114881150989064}
Losses {'ner': 0.04152309541288842}
Losses {'ner': 1.7614614126781216}
Losses {'ner': 3.981520294422732}
Losses {'ner': 1.0062326345753443}
Losses {'ner': 1.6317984889129014}
Losses {'ner': 0.5265547177199645}
Losses {'ner': 5.573727121508429e-07}
Losses {'ner': 0.05273481318787372}
Losses {'ner': 0.006026382172200294}
Losses {'ner': 2.60853550473823}
Losses {'ner': 0.0005092214188538062}

Losses {'ner': 6.498851920640455}
Losses {'ner': 0.15843541570934486}
Losses {'ner': 1.4638106159611548}
Losses {'ner': 2.019811293408533}
Losses {'ner': 1.0495057692956407}
Losses {'ner': 0.026830042871986225}
Losses {'ner': 1.2377835980111138}
Losses {'ner': 4.481862321480886}
Losses {'ner': 0.023313952604254607}
Losses {'ner': 2.147166379786253}
Losses {'ner': 1.761354507383655}
Losses {'ner': 0.002053428553104244}
Losses {'ner': 4.492699634296202e-06}
Losses {'ner': 0.05750738310953597}
Losses {'ner': 0.2452288968903065}
Losses {'ner': 1.341949724160874}
Losses {'ner': 0.18491604435732353}
Losses {'ner': 0.7834125532376139}
Losses {'ner': 2.985755504293001}
Losses {'ner': 1.182075582875291}
Losses {'ner': 2.7713741436150134}
Losses {'ner': 0.0026542916303848437}
Losses {'ner': 5.0120106271870753e-05}
Losses {'ner': 1.0452420078054159}
Losses {'ner': 4.8205636583217465e-05}
Losses {'ner': 0.05337350993309994}
Losses {'ner': 0.003123605226149649}
Losses {'ner': 0.05233228108344117}
L

Losses {'ner': 0.0021944629826435302}
Losses {'ner': 8.3300901823749}
Losses {'ner': 6.250336970213712e-10}
Losses {'ner': 2.811514429172937e-05}
Losses {'ner': 3.305460045682427e-06}
Losses {'ner': 5.342238026993614}
Losses {'ner': 0.0009181767003867188}
Losses {'ner': 1.3397301106783495}
Losses {'ner': 1.3570909214066385}
Losses {'ner': 0.04105285287743975}
Losses {'ner': 0.22638934425459567}
Losses {'ner': 1.599605281392649}
Losses {'ner': 1.9985685141372136}
Losses {'ner': 0.0015336501180853563}
Losses {'ner': 1.5934942192602757}
Losses {'ner': 9.906217744558084}
Losses {'ner': 6.255381218902115e-07}
Losses {'ner': 0.029736490908441965}
Losses {'ner': 3.6396970610893644e-06}
Losses {'ner': 0.06581467093211384}
Losses {'ner': 7.255109026614427e-05}
Losses {'ner': 7.252019221698526}
Losses {'ner': 0.1376052255119252}
Losses {'ner': 2.1590460611955993}
Losses {'ner': 7.430937259406158e-05}
Losses {'ner': 2.3408164371470623}
Losses {'ner': 0.0001571230580922275}
Losses {'ner': 0.000154

Losses {'ner': 1.9611269725551326}
Losses {'ner': 1.5549753915400253}
Losses {'ner': 0.01490889272605783}
Losses {'ner': 2.21468494833122e-06}
Losses {'ner': 1.0949705012393625}
Losses {'ner': 1.599681665926377}
Losses {'ner': 9.801100404957611e-05}
Losses {'ner': 1.3028260349772411}
Losses {'ner': 3.5748924569095113}
Losses {'ner': 0.0018872922433604046}
Losses {'ner': 6.645579191731754e-06}
Losses {'ner': 0.006912887013947806}
Losses {'ner': 0.7208225114160242}
Losses {'ner': 0.008778490014671174}
Losses {'ner': 0.0003864226919678239}
Losses {'ner': 0.00014629410472473755}
Losses {'ner': 0.5227124953226107}
Losses {'ner': 0.0064331660718001}
Losses {'ner': 1.9993130682434668}
Losses {'ner': 0.05000071895209791}
Losses {'ner': 0.026871159766756175}
Losses {'ner': 0.049075056599618604}
Losses {'ner': 0.001803028076847685}
Losses {'ner': 0.0195138635503033}
Losses {'ner': 0.006070571867754366}
Losses {'ner': 0.008156292639953093}
Losses {'ner': 8.558768359183997}
Losses {'ner': 1.641399

Losses {'ner': 0.011414838617811968}
Losses {'ner': 7.722044800702813}
Losses {'ner': 1.8833705354099574e-07}
Losses {'ner': 8.218134792277156e-06}
Losses {'ner': 1.3728324696206087e-05}
Losses {'ner': 3.0140326522511462}
Losses {'ner': 0.024008026415802818}
Losses {'ner': 1.0975673374115607}
Losses {'ner': 0.0045202376128754985}
Losses {'ner': 0.001717962398215719}
Losses {'ner': 1.777889947480979}
Losses {'ner': 0.09570329122596279}
Losses {'ner': 1.5939598991312823e-05}
Losses {'ner': 6.745029756157401e-05}
Losses {'ner': 0.007391438029246275}
Losses {'ner': 0.004988484013016065}
Losses {'ner': 1.4121132431056929}
Losses {'ner': 0.009619331635472978}
Losses {'ner': 5.477124610293502e-08}
Losses {'ner': 3.333391579525936e-07}
Losses {'ner': 7.315317206802091e-08}
Losses {'ner': 4.830055584390453}
Losses {'ner': 5.037177099443769e-06}
Losses {'ner': 4.913255339344886}
Losses {'ner': 1.2247071881086175e-06}
Losses {'ner': 2.214730441381557}
Losses {'ner': 1.987797015054492e-05}
Losses 

Losses {'ner': 3.520032838708019}
Losses {'ner': 1.934999560723991e-07}
Losses {'ner': 0.47666351116322275}
Losses {'ner': 3.851916872414562e-07}
Losses {'ner': 2.0661814213380563e-08}
Losses {'ner': 4.175477570216643e-06}
Losses {'ner': 0.039302244962044645}
Losses {'ner': 2.222748116944134}
Losses {'ner': 1.320210939871988}
Losses {'ner': 0.26999731445145253}
Losses {'ner': 5.593522280323779}
Losses {'ner': 0.49937710504305827}
Losses {'ner': 0.011925029128769603}
Losses {'ner': 0.7124823898210036}
Losses {'ner': 0.5400882441058552}
Losses {'ner': 0.002188730307725936}
Losses {'ner': 6.3856426369691094e-06}
Losses {'ner': 0.32543295907022957}
Losses {'ner': 1.7123951483045106}
Losses {'ner': 0.005076297581484348}
Losses {'ner': 1.7689275192835434}
Losses {'ner': 0.00033543829186256794}
Losses {'ner': 0.0013874699305013983}
Losses {'ner': 2.182505606355906e-05}
Losses {'ner': 0.0002499435665439828}
Losses {'ner': 0.00044121547546611484}
Losses {'ner': 2.0000006364058214}
Losses {'ner'

Losses {'ner': 0.0015789373816292212}
Losses {'ner': 7.043130458714631e-07}
Losses {'ner': 2.0838011794575616e-07}
Losses {'ner': 0.029661842828587692}
Losses {'ner': 0.001705231626274408}
Losses {'ner': 2.164482509813339e-05}
Losses {'ner': 7.386414711900024}
Losses {'ner': 1.641435956333333e-10}
Losses {'ner': 0.011344216894850661}
Losses {'ner': 2.7492880265974147e-08}
Losses {'ner': 3.9743775224919182}
Losses {'ner': 7.090085723101892e-05}
Losses {'ner': 2.156377569149731}
Losses {'ner': 0.0005639806205665476}
Losses {'ner': 0.00019689934519686122}
Losses {'ner': 0.04593447168638253}
Losses {'ner': 0.45783921884693807}
Losses {'ner': 6.011547275796434e-05}
Losses {'ner': 0.0002754570962292329}
Losses {'ner': 8.226454670780667e-05}
Losses {'ner': 1.905071017416394}
Losses {'ner': 7.838506946614078e-11}
Losses {'ner': 8.166129143082848e-09}
Losses {'ner': 2.501287565195551e-09}
Losses {'ner': 0.0006287919915946405}
Losses {'ner': 0.08115075674274166}
Losses {'ner': 2.0929474822266734

In [10]:
# from spacy.training import Example
from spacy.scorer import Scorer
from tqdm import tqdm
def evaluate(ner_model, examples):
    gold_list = []
    scorer = Scorer()
    for ex in tqdm(examples):
        input_ = ex[0]
        annot = ex[1]
        doc_gold_text = ner_model.make_doc(input_)
        example = Example.from_dict(doc_gold_text, annot)
        example.predicted = ner_model(input_)
        gold_list.append(example)
        
    return scorer.score(gold_list)
# example run  examples = [     ('Who is Shaka Khan?',      [(7, 17, 'PERSON')]),     ('I like London and Berlin.',      [(7, 13, 'LOC'), (18, 24, 'LOC')]) ]  ner_model = spacy.load(ner_model_path) # for spaCy's pretrained use 'en_core_web_sm' results = evaluate(ner_model, examples) 

In [11]:
evaluate(trained_model, TEST_DATA)

POULTRY & GAME SPECIALIST
Tax Invoice..." with entities "[(661, 669, 'AccountNum'), (143, 154, 'ABN'), (645...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  7.53it/s]


{'token_acc': 1.0,
 'token_p': 1.0,
 'token_r': 1.0,
 'token_f': 1.0,
 'sents_p': None,
 'sents_r': None,
 'sents_f': None,
 'tag_acc': None,
 'pos_acc': None,
 'morph_acc': None,
 'morph_per_feat': None,
 'dep_uas': None,
 'dep_las': None,
 'dep_las_per_type': None,
 'ents_p': 0.7878787878787878,
 'ents_r': 0.4727272727272727,
 'ents_f': 0.5909090909090909,
 'ents_per_type': {'AccountName': {'p': 0.6666666666666666,
   'r': 0.5714285714285714,
   'f': 0.6153846153846153},
  'ABN': {'p': 1.0, 'r': 0.7142857142857143, 'f': 0.8333333333333333},
  'BSB': {'p': 0.7142857142857143,
   'r': 0.45454545454545453,
   'f': 0.5555555555555556},
  'AccountNum': {'p': 0.5, 'r': 0.25, 'f': 0.3333333333333333},
  'SwiftCode': {'p': 1.0, 'r': 0.3333333333333333, 'f': 0.5},
  'BankName': {'p': 1.0, 'r': 0.4, 'f': 0.5714285714285715}},
 'cats_score': 0.0,
 'cats_score_desc': 'macro F',
 'cats_micro_p': 0.0,
 'cats_micro_r': 0.0,
 'cats_micro_f': 0.0,
 'cats_macro_p': 0.0,
 'cats_macro_r': 0.0,
 'cats_ma

In [12]:
for tdata in TEST_DATA:
    text = tdata[0]
    doc = trained_model(text)
    # print(doc)
    for ent in doc.ents:
        print(ent)
        print(type(ent))
        print(ent.text, ent.label_)
    spacy.displacy.render(doc, style="ent", jupyter=True) 

Define Potential Pty Ltd
<class 'spacy.tokens.span.Span'>
Define Potential Pty Ltd AccountName
38 113 072 755
<class 'spacy.tokens.span.Span'>
38 113 072 755 ABN
71 618 216 390
<class 'spacy.tokens.span.Span'>
71 618 216 390 ABN
032 713
<class 'spacy.tokens.span.Span'>
032 713 BSB


28 864 970 579
<class 'spacy.tokens.span.Span'>
28 864 970 579 ABN
183004
<class 'spacy.tokens.span.Span'>
183004 AccountNum
8733
<class 'spacy.tokens.span.Span'>
8733 AccountNum


Wells Fargo Bank
<class 'spacy.tokens.span.Span'>
Wells Fargo Bank AccountName


74 599 608 295
<class 'spacy.tokens.span.Span'>
74 599 608 295 ABN
Reserve Bank of Australia
<class 'spacy.tokens.span.Span'>
Reserve Bank of Australia BankName
Department of Industry, Science, Energy and Resources
<class 'spacy.tokens.span.Span'>
Department of Industry, Science, Energy and Resources AccountName
092-009
<class 'spacy.tokens.span.Span'>
092-009 BSB
118689
<class 'spacy.tokens.span.Span'>
118689 AccountNum
RSBKAU2S
<class 'spacy.tokens.span.Span'>
RSBKAU2S SwiftCode




ZEPP & CO
<class 'spacy.tokens.span.Span'>
ZEPP & CO AccountName
Zepp & Co Pty Ltd
<class 'spacy.tokens.span.Span'>
Zepp & Co Pty Ltd AccountName
0414 259 250
<class 'spacy.tokens.span.Span'>
0414 259 250 BSB
032-
<class 'spacy.tokens.span.Span'>
032- BSB
WPACAU2S
<class 'spacy.tokens.span.Span'>
WPACAU2S SwiftCode


12 497 045 671
<class 'spacy.tokens.span.Span'>
12 497 045 671 ABN


12 080 534 005
<class 'spacy.tokens.span.Span'>
12 080 534 005 ABN
Bevington Consulting Pty Ltd
<class 'spacy.tokens.span.Span'>
Bevington Consulting Pty Ltd AccountName


23 063 641 510
<class 'spacy.tokens.span.Span'>
23 063 641 510 ABN
Bank of America
<class 'spacy.tokens.span.Span'>
Bank of America BankName
232 001
<class 'spacy.tokens.span.Span'>
232 001 BSB
17159011
<class 'spacy.tokens.span.Span'>
17159011 AccountNum


082-057
<class 'spacy.tokens.span.Span'>
082-057 BSB
974252919
<class 'spacy.tokens.span.Span'>
974252919 AccountNum
97 164 598 120
<class 'spacy.tokens.span.Span'>
97 164 598 120 ABN


97 414 746 779
<class 'spacy.tokens.span.Span'>
97 414 746 779 ABN
032-135
<class 'spacy.tokens.span.Span'>
032-135 BSB


19 002 966 001
<class 'spacy.tokens.span.Span'>
19 002 966 001 ABN


1427248
<class 'spacy.tokens.span.Span'>
1427248 AccountNum


In [None]:
create_test_train_split("../../data")

In [32]:
[x for x in TEST_DATA[4][1]['entities'] if x[2] == 'ABN']

[(239, 250, 'ABN')]

In [14]:
from spacy.training import Example
from spacy.scorer import Scorer

def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = Example(doc_gold_text, entities=annot)
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
        return scorer.scores  # example run  examples = [     ('Who is Shaka Khan?',      [(7, 17, 'PERSON')]),     ('I like London and Berlin.',      [(7, 13, 'LOC'), (18, 24, 'LOC')]) ]  ner_model = spacy.load(ner_model_path) # for spaCy's pretrained use 'en_core_web_sm' results = evaluate(ner_model, examples) 

In [None]:
TEST_DATA[0][0]
# TEST_DATA[0][1]

In [80]:
trained_model(TEST_DATA[2][0]).ents

(74 599 608 295,
 Reserve Bank of Australia,
 Department of Industry, Science, Energy and Resources,
 118689,
 RSBKAU2S)

In [18]:
len(TEST_DATA[0])

2

In [23]:
for i, x in enumerate(TEST_DATA[0]):
    print(i, x)

0 ZEPP & CO
PAUL PITTIONI
VOICE OVER
TAX INVOICE
Invoice Date
Zepp & Co Pty Ltd
30 Nov 2021
129 The Comenarra
IAG
Invoice Number
Parkway
INV-04861
Turramurra NSW 2074
SYDNEY NSW 2000
AUSTRALIA
AUSRALIA
ABN
0414 259 250
72 104 433 356
ppittioni@bigpond.com
paulpittioni.com.au
Description
Quantity
Unit Price
Amount AUD
Paul Pittioni Voice Over Session
2.00
500.00
1,000.00
2 x 30" National TVCs - 3 month duration
Subtotal
1,000.00
TOTAL GST 10%
100.00
TOTAL AUD
1,100.00
Due Date: 14 Dec 2021
Terms are 14 days
Payment to:
Bank : Westpac
Acc Name: Zepp & Co Pty Ltd
BSB: 032-090
Acc Number: 368-840
Swift Code :WPACAU2S

1 {'entities': [(570, 577, 'BSB'), (610, 618, 'SwiftCode'), (529, 536, 'BankName'), (217, 231, 'ABN'), (590, 597, 'AccountNum'), (60, 77, 'AccountName')]}


In [25]:
type(TEST_DATA[0])

tuple

In [64]:
from spacy.tokens import Doc
from spacy.training import Example
nlp = spacy.load("en_core_web_sm")
words = ["hello", "world", "!"]
spaces = [True, False, False]
predicted = Doc(nlp.vocab, words=words, spaces=spaces)
reference = Example.reference(my_data)
example = Example(predicted, reference)
example

NameError: name 'my_data' is not defined

In [53]:
Doc(TEST_DATA[0])

TypeError: Argument 'vocab' has incorrect type (expected spacy.vocab.Vocab, got tuple)

In [55]:
ner = nlp.add_pipe("ner")
ner.initialize(lambda: [], nlp=nlp)

ValueError: [E007] 'ner' already exists in pipeline. Existing names: ['tok2vec', 'tagger', 'parser', 'senter', 'attribute_ruler', 'lemmatizer', 'ner']

In [56]:
ner.get_examples

NameError: name 'ner' is not defined

In [1]:
import os
# import sys
import string
import spacy

# sys.path.append('..')

from src.utils import azure_utils

# MODEL_PATH = azure_utils.download_model()
MODEL_PATH = "../"

nlp = spacy.load(MODEL_PATH)
# nlp = spacy.load(m_dir)


def predict(text):
    entities = {}
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(translator)
    doc = nlp(text)
    # print(
    #     "SPACY"
    # )
    # print(doc)
    for ent in doc.ents:
        entities[ent.label_] = {'text': ent.text, 'start': ent.start_char}
    return entities

# data_dir = "../../model/ner_model"


SyntaxError: invalid syntax (Temp/ipykernel_11208/1123235015.py, line 11)