## Install Libraries

In [1]:
!pip install fuzzywuzzy
!pip install sentencepiece
!pip install transformers
!pip install mysql-connector-python
!pip install python-Levenshtein

Collecting fuzzywuzzy
  Downloading https://files.pythonhosted.org/packages/43/ff/74f23998ad2f93b945c0309f825be92e04e0348e062026998b5eefef4c33/fuzzywuzzy-0.18.0-py2.py3-none-any.whl
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 9.1MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.95
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 9.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4

In [2]:
!gdown --id 16_igOe3H0HougjHL2oeMnA89eDcGftge
!unzip pytorch_models.zip

Downloading...
From: https://drive.google.com/uc?id=16_igOe3H0HougjHL2oeMnA89eDcGftge
To: /content/pytorch_models.zip
430MB [00:02, 152MB/s]
Archive:  pytorch_models.zip
   creating: pytorch_models/
   creating: pytorch_models/action/
  inflating: pytorch_models/action/config.json  
  inflating: pytorch_models/action/pytorch_model.bin  
   creating: pytorch_models/actor/
  inflating: pytorch_models/actor/config.json  
  inflating: pytorch_models/actor/pytorch_model.bin  


In [3]:
!gdown --id 1Dux41_n__uPla-XPoUQ0TsJbVhcV390F

Downloading...
From: https://drive.google.com/uc?id=1Dux41_n__uPla-XPoUQ0TsJbVhcV390F
To: /content/parties.csv
802MB [00:10, 28.3MB/s]


## Class and Helper Functions

In [4]:
import re 
import pandas as pd 
import numpy as np
import mysql.connector
from transformers import AutoTokenizer, AutoModelWithLMHead
import torch
import time
from fuzzywuzzy import fuzz

In [5]:
def divide_chunks(total_len, chunks):
    """Split the index range ``[0, total_len)`` into ``chunks`` contiguous slices.

    Boundaries are computed with integer arithmetic (``i * total_len // chunks``)
    rather than by repeatedly adding a float step. Accumulated rounding error in
    the float step can otherwise emit a spurious extra slice or misplace a
    boundary (e.g. ten additions of 0.1 sum to 0.9999999999999999, not 1.0).

    Args:
        total_len: Number of items to split (treated as an integer).
        chunks: Number of slices to produce (treated as an integer).

    Returns:
        A list of ``(start, end)`` integer tuples, half-open and contiguous:
        the first starts at 0, the last ends at ``total_len``. Slices may be
        empty when ``chunks > total_len``. Returns ``[]`` when
        ``total_len <= 0``.

    Raises:
        ZeroDivisionError: If ``chunks`` is 0 (same error type as dividing by
            the chunk count directly).
        ValueError: If ``chunks`` is negative.
    """
    chunks = int(chunks)
    if chunks == 0:
        raise ZeroDivisionError("chunks must be non-zero")
    if chunks < 0:
        raise ValueError("chunks must be a positive integer")

    total_len = int(total_len)
    if total_len <= 0:
        return []

    # chunks + 1 boundaries delimit exactly `chunks` slices.
    bounds = [(i * total_len) // chunks for i in range(chunks + 1)]
    return list(zip(bounds[:-1], bounds[1:]))

In [6]:
def get_action_match_map():
    """Return the ordered table of docket-text patterns for each action label.

    Returns:
        dict: maps an action label (str) to a list of pattern strings. Entries
        wrapped in ``/.../`` are written as regular expressions; entries without
        the slashes are plain phrases. The code that interprets these patterns
        is not part of this function -- presumably slash-delimited entries are
        matched as regexes and the rest as literal text (confirm against the
        caller).

    Notes:
        * Insertion order is significant according to the inline comments
          (earlier labels take precedence), so entries must not be reordered.
        * Patterns containing backslashes are raw strings so that regex escapes
          such as ``[\\s\\S]`` reach the matcher unchanged and do not trigger
          Python's invalid-escape-sequence warning.
        * Labels with an empty list currently have no patterns.
    """
    match_map = {
        # This takes precedence over other events
        'Event cancelled': [
            '/(Event|Trial|Session|review) scheduled .{0,50} (cancelled|canceled)/',
            '/(Motion|DISMISSAL|conference|review).{0,50}SCHEDULED.{0,50}(cancelled|canceled)/',
            '/^PRH FOR .{5,12} CANCELLED /',
        ],
        'Acknowledgement of service': [
            '/^Acknowledge?ment of service/',
        ],
        'Affidavit in support of motion': [
            r'/Affidavit of [\s\S]{1,50} in support of motion to/',
        ],
        'Affidavit filed': [
            r'/^Affidavit of [\s\S]{1,100} filed\.?/',
        ],
        'Affidavit': [
            '/^Affidavit that/',
        ],
        'Agreement for Judgment': [
            '/Agreement for judge?ment /',
            '/^Agreement filed/',
            '/Agreement for Judge?ment filed/',
            '/^.{5,40} Agreement Filed/',
            '/^Agreement/',
        ],
        'Answer filed': [
            '/Answer filed by /',
            r'/Answer of [\s\S]{2,55}/',
            'Answer to trustee summons filed ',
            'Defendant responded to initial complaint',
            '/^no ability to pay.Not working/',
            'Answer to third-party cross-claim',
        ],
        'Answer to counterclaim filed': [
            r'/Answer to (cross-?|counter-?) ?claim of[\s\S]*filed by/',
        ],

        'Amended Answer filed': [
            'Amended answer filed by',
        ],
        'Appearance filed': [
            '/Appearance for .{2,110} filed/',
            'Appearance filed for ',
            '/Appearance as substitute counsel .* filed/',
        ],
        'Application filed': [
            'application filed',
        ],

        'Appointment of agent filed': [  # Probate
            'Appointment of agent',
        ],
        'Arbitration': [
            'Parties have agreed to Arbitration',
        ],
        'Assent': [
            'Assent',
            r'/^Assent of [a-zA-Z-.\s]+$/',
            # Closing delimiter is required for this to be read as a regex.
            '/^[0-9]+ Assents/',
        ],
        'Assent And Waiver Of Notice filed': [
            r'/An Interested Person[\s\S]*Filed MPC 455 to ?Assent And Waiver Of Notice/',
        ],
        'Assent and waiver': [
            '/^Assent and waiver of/',
        ],
        'Bankruptcy stay of proceedings': [
            r'/Bankruptcy stay of proceedings[\s\S]*has filed bankru(ptc|TCP)y/',
            r'/^Misc Entry:[\s\S]*Notice of Bankruptcy Case Filing/',
        ],
        'Bond approved': [
            '/^Bond with(out)? surety?(ies)? approved/',
            r'/^Bond of (Trustee|personal representative) Approved[\s\S]{0,9}/',
            r'/^Bond of [\s\S]{0,90} ?Approved[\s\S]{0,32}/',
        ],

        'Bond with sureties filed': [
            'Bond with sureties',
        ],
        'Bond without sureties filed': [
            'Bond without sureties',
        ],

        'Capias': [
            'Capias issued',
        ],
        'Capias expired': [
            'Capias returned expired',
        ],
        'Case inactivated': [
            'Case Inactivated',
        ],
        'Case management conference': [
            r'/Case management conference[\s\S]*held/',
        ],
        'Capias returned': [
            r'/^Capias returned\.?/',
        ],
        'Capias returned unserved': [
            '/^Capias returned unserved/',
        ],

        # Probate

        'Certificate of Death filed': [
            'Certificate of Death',
        ],

        'Citation Issued': [
            r'/Citation ([\s\S]{1,50}) Issued/',
        ],
        # 'Citation Filed' : [
        #     'Citation Filed'
        # ],

        'Change of address': [
            '/^Change of address for /',
        ],
        'Complaint filed': [
            '/Complaint .* civil action cover sheet filed/',
            'Plff filed complaint',
            'complaint file',
            'complaint filed',
        ],

        'Continuance': [
            '/automatically continued until/',
            'CONTINUED FOR',
            'CONTINUED Until',
            '/ TO BE CONTINUED GENERALLY AT /',
            '/pltff ?(present)? deft no show.{0,100} cont til /',
            '/FIRST APPEARA?NCE IN TRIAL SESSI?O?N SCHEDULED.*RESCHEDULED/',
            '/.{0,20}rescheduled because .{0,20}/',
            '/ CONT FOR PAYMENT TIL /',
            r'/up to date with payments cont[\s\S]{0,8}til/',
            '/ yet cont til /',
            '/ conts payment review til/',
            r'/both parties present cont [0-9]+\/[0-9]+/',
            r'/^(cont\.?|continue) generally ?, for settlement.*payment to clear/',
            'no one told him of the cont ',
            r'/Motion (to continue|for continuance) ?[\s\S]* filed by[\s\S]*and assented to by other/',
            r'/^Stipulation  ?of parties to continue case\.?/',
            r'/^DEFENDANT IS PAYING\. ? ?CONTINUED GENERALLY/',
            r'/^[\s\S]{0,25}continue generally per[\s\S]{0,25}/',
            r'/Off list and continued[\s\S]{0,5}generally\.?/',
            r'/^[\s\S]{5,16}? ?PRH CONTINUED TO/',
            r'/^PRH (FOR|FROM|ON) [\s\S]{5,16} (CONTINUED|DEFERR?ED)/',
            r'/^PRH FOR [\s\S]{5,16}TAKEN OFF LIST AT/',
            r'/CONTINUE PRH TO [\s\S]{5,16}/',
        ],
        'Copy of will filed': [
            '/^Will/',
            r'/^Authenticated Copy of Will and Appointment[\s\S]{0,5}?(along with two codicils)?/',
            r'/^A? ?(Certi?fied )?Copy of will[ 0-9\/\-]{0,20}/',
            '/^A? ?(Certi?fied )?Copy of will and appointment/',

            r'/^A? ?(Certi?fied )?Copy of will \(?(original )?(filed)?[\s\S]{0,20}/',
            '/^A? ?(Certi?fied )?Copy of appointment and will/',
            '/^A? ?(Certi?fied )?Copy of will,? ?dated/',
            r'/^A? ?(Certi?fied )?Copy of will,? ?[\s\S]{0,30}pa?ge?s/',
            r'/^A? ?(Certi?fied|Authenticated)? ?Copy of will from[\s\S]{0,30}/',
            r'/^A? ?(Certi?fied )?Photocopy of will[\s\S]{0,40}/',
            '/^A? ?Copy of Will with Alterations/',
            r'/^Unsigned copy of will[\s\S]{0,40}/',

            r'/^Will( [\s\S]{1,8})?/',
            r'/^Will ? ?\( ?copy of ?\)/',
            '/^Will, dated .{2,20}/',

            'Will, ( entered in error ) see event #4 - authenticated copy of will',
        ],
        'Corporate disclosure statement filed': [
            'Corporate disclosure statement filed',
        ],
        'Counsel added': [
            '/added as .{0,20} Counsel/',
            '/Attorney.{6,30}representing.{6,}as of/',
            r'/On this date .*, Esq\.?[\s\S]{0,25} added for /',
        ],
        'Counsel withdrawal': [
            r'/dismissed\/withdrawn as .{0,20} Counsel/',
            'Motion to withdraw as counsel filed',
            r'/Attorney[\s\S]*dismissed\/withdrawn/',
        ],
        'Counterclaim filed': [
            '/^Counterclaim filed by /',
            r"/^ ?[\s\S]{3,55}'s[\s\S]{0,4}Counterclaim[\s\S]{0,5}/",
        ],
        'Debtor before the court': [
            'Judgment debtor before the court',
        ],
        'Decree and Order of Formal Probate': [
            '/^Decree and Order of Formal Probate/',
        ],
        'Default': [
            '/^.{0,15}Judgment (in SP)? ?by default/',
            '/^.{0,15}Default entered against /',
            'Judgment debtor failed to appear and was defaulted',
            'pltff present deft no show default judgment',
            'Default Judgment entered',
            '/^Defendant defaulted at /',
        ],
        'Default Removed': [
            '/^.{0,15}Default removed against/',
        ],
        'Demand for Jury Trial': [
            '/^Jury trial claim on all issues/',
            '/^Jury claim of /',
            'Jury Demand filed by ',
        ],
        'Deposit in Court': [
            '/^Deposite?d? in court.{0,10}by /',
        ],
        'Dismissal': [
            'DISM PAID IN FULL',
            r'/Dismissed[\s\S]*PAID IN FULL/',
            '/^Supplementary process dismissed /',
            '/^Judgment of dismissal /',
            r'/CASE DISMISSE?D?\.? ? ?/',
            r'/case[\s\S]{0,5}Dismissed at the request of/',
            'Notice of voluntary dismissal',
            'dismissed by agreement of parties',
            'Parties agree to dismiss case upon payment',
            r'/^.{0,20}NPP ?- ?DISM\.?/',
            '/^.{0,12}Neither Party Present,? ?off list/',
        ],
        'Entry of Action filed': [
            '/^.{0,20}Entry of action filed/',
        ],

        'Event resulted': [
            r'/^Event resulted[\s\S]*Result:/',
        ],
        'Event scheduled': [
            r'/^Event scheduled[\s\S]*Date/',
            'hearing scheduled',
            '/^(Motion|Payment review|status review) scheduled/',

            'PRETRIAL CONFERENCE SCHEDULED for',
            'TRIAL SCHEDULED',
            'CASE MANAGEMENT CONFERENCE SCHEDULED',
            '/FIRST APPEARA?NCE IN TRIAL SESSI?O?N SCHEDULED/',
            r'/^Scheduled[\s\S]{0,10}Event:/',
        ],
        'Execution': [
            '/^Execution (Writ )?issued/',
            r'/^Execution Writ for possession [\s\S]*issued[\s\S]*sent to/',
        ],
        'Execution satisfied': [
            '/^Execution Writ (for possession ?)?(returned )?(fully )?satisfied/',
        ],
        'Execution unsatisfied': [
            '/^Execution Writ (returned )?unsatisfied/',
        ],

        'Ex Parte hearing': [
            '/^.{0,20}Ex parte hearing held/',
        ],

        'Fee paid': [
            '/Filing fee .{1,100} paid/',
            r'/^Summary Process:[\s\S]*MGL[\s\S]*Receipt:.*Date/',
            r'/^Surcharge[\s\S]*MGL[\s\S]*Receipt:.*Date/',
        ],
        'Fee due': [
            'Filing Fee due',
            'Filing Fee Surcharge due',
            r'/^Small Claims Filing Fee [\s\S]{0,50}due/',
        ],
        'Fee waived': [
            'filing fee waived on finding of',
            'Filing fee and surcharge waived on',
        ],
        'Hearing': [
            '/.{4,15} 4th session both parties present.{20,}/',
            '/.{4,15} 4th session.*[0-9]{2}(am|pm).{20,}/',
            '/9:28 offlist deft arrived @ 9:31; pltff not pres;/',
            # The dollar sign must be escaped explicitly; a bare "\500" in a
            # normal string literal is an octal escape, not the text "$500".
            r'/rm 125 deft arrived @ 11:06 paid \$500/',
            'both parties present deft has not been living',
            'both parties present; deft brought letter',
            r'/start \/ stop .{0,12}both parties present deft in custody 1st session/',
            r'/ session .{0,20}both parties present[\s\S]/',

            '/child care center (pltff|deft) present (pltff|deft) show/',
            r'/^.{5,12}deft present only\.?/',
            '/^bpp present /',
            'th session, deft. only present',
            '6-20-2011 @ 4:30 p.m. Asst. Clerk-Magistrate',
            'START 1046 / STOP 2165 / BOTH PARTIES PRESENT',
            '3/21/11 plaintiff not present, Deft. only present',
            'child care center pltff present deft no show',
            r'/Room [0-9]+, pltff\.? only present, deft\.? not present/',
            r'/Motion for attachment by trustee process [\s\S]{0,30}heard [\s\S]{0,40}Tape [0-9]th session/',
        ],
        'Interpreter requested': [
            '/^.{0,12}Interpreter requested/',
        ],
        'Judgment': [
            '/^Judgment issued/',
            '/^Judgment by Default for /',
            '/^Judgment for /',
            '/^Judgment Entered:/',
            'Judgment for pltff',
            'judgment for deft',
            r'/Order for Judgment[\s\S]*findings for/',
        ],
        'Judgment amended': [
            '/Amended Default Judgment.*issued/',
        ],
        'Judgment debtor able to pay': [
            '/Judgment debtor.*(found|stipulated).* to be.* able to pay on judgment/',
        ],
        'Judgment debtor unable to pay': [
            '/Judgment debtor.*(found|stipulated).* to be.* unable to pay on judgment/',
        ],
        'Judgment vacated': [
            '/^Judgment entered on .{2,20} vacated/',
        ],
        'Mediation': [
            r'/^Mediation held\.?/',
        ],

        'Memorandum filed': [
            '/^Memorandum filed by /',
        ],
        'Military affidavit filed': [
            '/^Military affidavit filed ?/',
            '/Military Service and (Rule 10 )?Affidavit/',
            'Affidavit as to Military Service',
        ],
        'Must file to avoid judgment': [
            r'/^(Deft|Pltff)\.? to file [\s\S]{5,120}or judgment[\s\S]{1,10}entered/',
        ],

        'Motion allowed': [
            r'/^Motion  ?(to )? ?(acc?ept|allow|amend|appoint|approve|assess|compel|continue|dismiss|execution|issue)[\s\S]*allowed/',
            r'/Motion  ?(for|to) [\s\S]*(is|and)? allowed\.?.{0,170}/',
            '/Motion  ?(for|to) .* heard at which .* were present and ALLOWED/',
            r'/Motion  ?for new capias[\s\S]* allowed/',
            '/Motion  ?for summary judgment allowed/',
            r'/^Motion PHOTOCOPY [\s\S]*Allowed/',
            'pltff present deft no show mtoion allowed capias',
            'Structured settlement transfer approved',
            r'/^Attachment by trustee process[\s\S]{0,50} allowed against/',
        ],
        'Motion denied': [
            r'/(Misc entry:?)? ?Mot(ion)?  ?(for|to) [\s\S]*(is|and)? ?denied\.?/',
            r'/^Motion  ?(to )? ?(acc?ept|allow|amend|appoint|approve|assess|compel|continue|dismiss|execution|issue)[\s\S]*denied/',
        ],
        'Motion withdrawn': [
            r'/^.{0,25}Motion (transfer|to|for) [\s\S]* waived or withdrawn/',
            r'/^.{0,25}Motion (transfer|to|for) [\s\S]* withdrawn by moving/',
        ],
        'Motion filed': [
            # Opening delimiter is a forward slash, like every other regex here.
            '/^Motion filed/',
            '/^Motion  filed/',
        ],

        'Motion to accept copy of will': [
            '/^Motion to (allow|approve|ac?cept) (photo)?copy of will( as original)?/',
        ],

        'Motion to amend judgment': [
            '/^Motion ? ?to amend judgment and execution filed/',
            '/^Motion  ?to Amend Default Judgment.*filed/',
        ],

        'Motion to amend Voluntary Administration Statement': [
            '/^Motion to amend voluntary administration/',
        ],
        'Motion to appoint special process server': [
            '/^Motion to appoint special process server/',
        ],
        'Motion to approve transfer of structured settlement': [
            '/Motion transfer of structured settlement ? ?filed/',
        ],
        # These motions should appear beneath allowed/withdrawn motions above
        'Motion to attach by trustee process': [
            '/^(Ex parte )?motion for attachment by trustee process/',
            '/Motion ? ?to attach wages and for successive service on trustee/',
        ],

        'Motion to assess attorney fees': [
            "/^(on .{0,16})?Motion  ?(to|for)? ?assess(ment)? (of )?attorney('s)? fees/",
        ],

        'Motion to assess damages': [
            '/^(on .{0,16})?Motion ? ?(to|for)? ?assess(ment)? (of )?damages/',
        ],
        'Motion to compel discovery': [
            '/^Motion  ?to compel discovery/',
            '/^Motion  ?to compel production of documents/',
        ],
        'Motion for continuance': [
            r'/^Motion ? for ? continuance ? ?[\s\S]*filed by/',
            '/^Motion to continue trial filed/',
        ],

        'Motion for default judgment': [
            r'/^.{0,16}Application[\s\S]*(for|to enter) default[\s\S]* against /',
            '/Motion for default judgment and req to assess damages filed/',
            'Motion for Entry of Default Judgment filed',
            '/Motion to default (&|and) charge trustee/',
        ],

        'Motion to dismiss counterclaim': [
            '/^Motion ? ?to Dismiss Counterclaim/',
        ],
        'Motion for exemption from time standards': [
            '/^Motion  ?t?o? ? ?(request )?for exemption from time standards filed by/',
        ],
        'Motion for final judgment for relief': [
            r'/^Motion  ?for final judgment for relief[\s\S]{0,50}filed by/',
        ],

        'Motion to inspect records': [
            r'/^Motion  ?for Inspection and Copying of[\s\S]*records/',
        ],
        'Motion to issue execution': [
            '/Motion (to )? ?issue execution filed/',
            '/Motion to Issue Execution Late filed/',
            '/Application.{0,26} for Execution Writ/',
        ],
        'Motion for judgment on the pleadings': [
            r'/^.{0,15}Motion  ?for judgment on the pleadings [\s\S]*filed/',
        ],
        'Motion to lift garnishment': [
            'Motion to lift garnishment filed',
        ],
        'Motion for minor settlement': [
            '/^Motion minor settlement/',
        ],

        'Motion for new capias': [
            r'/Motion *(for new|lost) capias[\s\S]*filed by/',
            '/^Motion expired capias/',
            '/Motion new capias ? ?filed by/',
            '/Motion new capias  ?made in open court/',
            '/Motion (to )?Renew capias filed/',
        ],

        'Motion to remove default': [
            '/^Motion ? ?to remove default and file answer late/',
            r'/^Motion ? ?to remove default[\s\S]*filed by/',
            r'/^Motion by ? ?[\s\S]{1,30} ? ?to remove default/',
        ],
        'Motion to stay execution': [
            'Motion to further stay Execution filed',
        ],
        'Motion to strike answer': [
            '/Motion to strike .*answer /',
        ],
        'Motion to Substitute a Party': [
            '/^Motion.*to substitute.*party/',
        ],
        'Motion for successive service of trustee summons': [
            '/^Motion for successive service of trustee summons/',
        ],
        'Motion for summary judgment': [
            '/^.{0,70}Motion for summary judgment/',
        ],
        'Motion to serve trustee summons': [
            r'/^Motion[\s\S]{0,4}to serve (copy of )?trustee summons/',
        ],
        'Motion to transfer': [
            '/^Motion to transfer .* filed/',
            r'/Copy of Motion to transfer [\s\S]*filed/',
        ],
        'Motion to vacate dismissal': [
            r'/^Motion to vacate dismissal [\s\S]*filed by/',
        ],
        'Motion to vacate judgment': [
            r'/^Motion ? ?to vacate judgment[\s\S]*filed by/',
            r'/^Motion ? ?vacated? judgment[\s\S]*filed by/',
        ],

        'Notice of appeal': [
            'Notice of appeal for trial',
        ],
        'Notice of claim': [
            r'/^Notice of claim:[\s\S]{1,20}/',
        ],
        'Notice of dismissal': [
            '/Notice of dismissal .*filed by /',
            'Notice of dismissal filed',
            '/Stipulation of dismissal.*filed/',
        ],
        'Notice of potential default': [
            r'/parties notified[\s\S]*subjected to defaulting[\s\S]{0,20}failure to answer/',
        ],
        'Notice of potential dismissal': [
            'Notice of Potential Dismissal',
            '/^DISMISSAL .{0,50}potentially applicable/',
            r'/^DISMISSAL FOR FAILURE TO ACT[\s\S]*potentially applicable on/',
        ],

        'Notice of intent to offer medical records': [
            r'/^Affidavit [\s\S]*(under|pursuant to).*233.*79G/',
            r'/^Affidavit [\s\S]*intent[ion]{3}? to offer medical[\s\S]*records/',
            r'/^Affidavit of [\s\S]* for medical Recor?ds from [\s\S]/',
            'Intention to offer certified Medical Records',
            r'/Affidavit in compliance with [\s\S]*233[\s\S]*79G/',
            r'/Notice of Intention to offer [\s\S]*Medical Records/',
        ],
        'Notice sent': [
            r'/^Notice concerning next court event.*sent to parties\.?/',
            '/^Notice of next event sent/',
            'Notice of case management conference sent',
            'Notice of pretrial conf',
            '/^Notice of judgment sent to parties/',
            r'/^Notice sent to parties\.?([\s\S]{0,10}A? ?Noti?ce (sent )?to the Parties[\s\S]{0,20}sent[\s\S]{0,100})?/',
            '/^Notice of court action on .{1,15} sent to parties/',
            '/^Notice of Default sent to parties/',
            '/^notice of PTC? (mailed|sent)/',
        ],

        'Notice of trial': [
            'notice of trial issued',
        ],
        'Notice to quit filed': [
        ],
        'No capias returned': [
            'No Capias returned',
        ],
        'No parties present': [
            '/^NPP.{0,3}/',
            r'/^.{0,12}neither party present\.?/',
            'neither party present offlist',
            'NEITHER PARTY IN COURT',
        ],
        'Opposition to motion': [
            r'/Opposition filed by[\s\S]{1,40}to Motion/',
            r'/Opposition to Motion filed by[\s\S]{1,40}/',
        ],
        'Order for Informal Probate': [
            '/^Order for Informal Probate of Will/',
        ],
        'Order Nisi for Dismissal': [
            '/^Order nisi for judgment of dismissal/',
        ],

        # Unknown actual order, terse (nisi = unless)
        'Order Nisi': [
            r'/^Order:? NISI;? ?parties notified[\s\S]{0,7}/',
        ],
        'Original note filed': [
            '/^Original note or other paper filed/',
        ],
        'Payment order': [
            '/^.{0,20}Payment order/',
            '/^Order, you have until.{2,20}to place the amount of /',
        ],
        'Payment review': [
            # Contains regex wildcards, so it carries the regex delimiters.
            r'/Payment Review scheduled for[\s\S]*has[\s\S]*been[\s\S]*resulted/',
            '/^Payment review held/',
            r'/ Sess\. BPP before .{1,20} for P\.R\./',
            '/ session for payment review/',
            '/amount cont for further payment review /',
        ],

        # Probate matters #

        'Petition for Administration': [
            r'/^Petition for Administration ?(DBN\/)?(CTA)?/',
            'Petition for Appointment of Public Administrator',
        ],
        'Petition for Allowance of Account': ['Petition for Allowance of Account'],
        'Petition for Appointment of Trustee': ['Petition for Appointment of Trustee'],
        'Petition for Formal Probate': ['Petition for Formal Probate'],
        'Petition for Informal Probate': ['/^Petition for Informal/'],
        'Petition for Order of Complete Settlement': ['Petition for Order of Complete Settlement'],
        'Petition for Probate of Foreign Will': [],
        'Petition for Probate of Will and Appointment of Executor': [],
        'Petition for Removal of Personal Representative': [],
        'Petition for Sale of Real Estate': ['Petition for Sale of Real Estate'],
        'Petition for Special Administration': [],

        # END probate matters #

        'Pre-Trial conference': [
            '/Pretrial conference held/',
        ],
        'Pre-Trial Memorandum': [
            '/Pre-Trial Memorandum filed/',
        ],
        'Release of All Demands and Assent to Account': [
            '/^Release of All Demands and Assent to Account/',
        ],
        'Request for admissions filed': [
            '/^Request for admissions/',
        ],
        'Request for continuance': [
            '/ requests that case be continued/',
            '/^Parties jointly request that case be continued/',
            '/^.{0,25}requested a new hearing date/',
        ],

        'Request for Temporary Restraining Order': [
            ' files request for TRO ',
        ],
        'Request to not dismiss': [
            r'/^Pla?i?n?t?i?ff\.?  ?reports that case is active and requests that it[\s\S]*not be dismissed/',
        ],

        'Return of Service': [
            '/^Return of service/',
        ],
        'Satisfaction of judgment': [
            'Satisfaction of judgment filed',
            'pltff present reports judgment satisfied',
            'present judgment is satisfied',
            'pltff reports judgment satisfied',
        ],
        'Settlement agreement': [
            r'/MEDIATED SETTLEMENT[\s\S]*AGREEMENT REACHED/',
            # Contains an alternation group, so it carries the regex delimiters.
            '/both parties present mediated (settlement|agreement)/',
            'Settlement approved between ',
            '/^Settlement reported by parties/',
        ],
        'Statement of small claim filed': [
            '/Statement of small claims? entered/',
            '/statement of small claims? filed/',
        ],
        'Statement of damages filed': [
            'statement of damages filed by',
        ],
        'Statement of Voluntary Administration filed': [
            r'/^Statement of Voluntary Administration[\s\S]/',
        ],
        'Status Review': [
            '/^.{0,15}Status review held/',
        ],
        'Suggestion of bankruptcy filed': [  # in real terms this is a NOTICE of bankruptcy
            '/Suggestion of bank?ruptcy .{0,70}filed/',
            'Notice of Chapter 7 Meeting filed from Bankruptcy Court',
        ],

        'Suggestion of death filed': [
            r'/^Suggestion of Death of[\s\S]{0,40}/',
        ],

        # Housing
        'Summary Process Summons and Complaint filed': [
            r'/^SP Summons and Complaint[\s\S]*(rent|cause)/',
        ],

        'Summons': [
            'summons to ',
            'summons issued for ',
        ],
        'System Notes': [
            'This Case Converted from',
        ],
        'Termination notice filed': [
            r'/^Termination notices? filed[\s\S]{0,40}/',
        ],
        'Third-party cross-claim filed': [
            '/^Third-party cross-claim filed/',
        ],
        'Transfer of case': [
            r'/^Case received at .{0,100} Court.{0,30}from .{0,100} Court\.?/',
            r'/^Case transferred out from [\s\S]* to /',
            '/^Case ordered transferred to /',
            r'/^Misc\.? Entry:? ?Case transferred to/',
            '/^transferr?ed from .{4,20}/',
        ],
        'Trial': [
            '/.{0,14}First appearance in trial session held/',
            'Jury trial held',
            '/both parties present .{0,5}session.{0,15} trial on the merits begins/',
        ],
        'UCC filed': [
            '/Uni?fo?rm Counse?l Certi?fi?ca?tion (form )?filed by/',
        ],
        'Under Advisement': [
            '/Motions? Taken under advisement/',
            r'/Taken under advisement\.?/',
        ],
        "Verification of Defendant's Address filed": [
            '/Veri?fi?cation of Def(endan)?t.{0,3} Address (form )?filed/',
        ],
        'Waiver of Attorneys Fees filed': [
            r'/ files waiver of attorneys fees\.?/',
        ],
        'Waiver of Award of Pre-Judgment Interest filed': [
            '/^Waiver of Award of Pre-Judgment Interest filed/',
        ],
        'Waiver of jury trial': [
            'Waiver of jury filed by ',
        ],
        'Witness list filed': [
            '/^.{3,12} Witness List filed/',
        ],

        # Catch blobs of notes
        'Misc Notes': [
            r'/^deft\.? paid money order.{5,15}and will pay.{5,60}/',
            'Unattested photocopies of court documents.',
            'Blank Summons MGL 262 section 4b',
            '/^Certificate of Orders, Decrees, Rulings, Judgments or Other Proceeding/',
            r'/^Misc Entry: ?[\s\S]{300,}/',
            '/^Misc Entry: ?account closed/',
            "/^Misc Entry: ?pltff's calls/",
            '/^Misc Entry: ?deft drops off /',
            '/^Misc Entry: ?DEFT SENDS /',
            '/^Misc Entry: ?OFF LIST/',
            'Misc Entry: Motion taken off list',
            # NOTE(review): written as a dollar amount ("\$" followed by a
            # digit/period class); the previous text escaped the bracket
            # itself, which looks like a lost "$" -- confirm against real
            # docket entries.
            r'/^Misc Entry: \$[0-9\.]+ money order rec/',
            r'/^Misc Entry: ?Deft\.? called and spoke/',
            "/^REC'D COPY DEFT WILL MAIL PAYMENT TO PLTFF.{0,60}/",
            'PLAINTIFF TO FILE MOTION TO ISSUE EXECUTION IF NEEDED BEFORE',
            r'/^deft\. called would like to speak with pltff/',
            r'/^no entry fee was received.*sent letter to pltff[\s\S]*request them[\s\S]*forward the payment/',

            'dismissed request of pltff in court acm john clough',

            '/^no action was given to clerk by covering atty/',
        ],
    }

    return match_map




In [7]:
def get_actor_match_map():
    '''
        Build the ordered lookup used to infer the actor from a case action description.
        
        Keys are patterns, values are the actor label to assign:
            - a key wrapped in '/.../' is a regular expression (the consumer strips the
              slashes and searches case-insensitively)
            - any other key is a plain substring, compared case-insensitively
        
        Order matters: Normalizer.extract_actor_regex walks the dict in insertion order
        and stops at the first pattern that yields an actor.
        
        Besides party types, two special values appear:
            - 'Notes'
            - '[party-in-previous-case-action]' : placeholder the consumer resolves by
              looking at the actor of a nearby earlier case action
        
        Regex keys containing backslashes are written as raw strings so that sequences
        such as \\s or \\. are not treated as (invalid) Python string escapes.
        
        returns:
            dict mapping pattern string -> actor string
    '''
    match_map = {
        '/.{4,15} 4th session both parties present.{20,}/'    : 'Court',
        '/.{4,15} 4th session.*[0-9]{2}(am|pm).{20,}/'        : 'Court',
        '/9:28 offlist deft arrived @ 9:31; pltff not pres;/' : 'Court',
        r'/rm 125 deft arrived @ 11:06 paid \$500/'           : 'Court',
        '6-20-2011 @ 4:30 p.m. Asst. Clerk-Magistrate'        : 'Court',
        '3/21/11 plaintiff not present, Deft. only present'   : 'Court',
        '/Affidavit of Ngu Huynh Affidavit.*233.*79/'         : 'Defendant',
        '/amount cont for further payment review /'           : 'Court',
        r'/^Acknowledge?ment of service[\s\S]*\(Plaintiff\)/' : 'Plaintiff',
        r'/^Acknowledge?ment of service[\s\S]*\(Defendant\)/' : 'Defendant',
        'Appearance filed for deft'                           : 'Defendant',
        'Appearance as substitute counsel for plaintiff'      : 'Plaintiff',
        'Appearance as substitute counsel for P01'            : 'Plaintiff',
        'application filed'                                   : 'Plaintiff',
        r'/^Affidavit in compliance with Massachusetts General Laws Chapter[\s\S]{2,10}Section 79G$/' : 'Court',
        r'/^Application[\s\S]*(for|to enter) default[\s\S]* against D01/' : 'Plaintiff',
        '/Attorney Wilson.{6,30}representing Portfolio.{6,}as of/'  : 'Plaintiff',
        'Blank Summons MGL 262 section 4b'                    : 'Court',
        r'/both parties present cont [0-9]+\/[0-9]+/'         : 'Court',
        'both parties present mediated agreement'             : 'Court',
        'both parties present deft has not been living'       : 'Court',
        'both parties present; deft brought letter'           : 'Court',
        'th session, deft. only present'                      : 'Court',
        # NOTE(review): a filing by the defendant's attorney is attributed to the
        # Plaintiff here - looks like a copy/paste slip, confirm before changing.
        "by defendant's attorney"                             : 'Plaintiff',
        '/^bpp present /'                                     : 'Court',
        '/^Capias returned/'                                  : 'Court',
        r'/^CASE DISMISSED\.?$/'                              : 'Court',
        'Case Inactivated'                                    : 'Court',
        r'/case[\s\S]{0,5}Dismissed at the request of/'       : 'Court',
        'CASE MANAGEMENT CONFERENCE SCHEDULED'                : 'Court',
        r'/^Case received at .{0,100} Court.{0,30}from .{0,100} Court\.?$/'         : 'Court',
        '/^Certificate of Orders, Decrees, Rulings, Judgments or Other Proceeding/' : 'Court',
        'PRETRIAL CONFERENCE SCHEDULED for'                   : 'Court',
        r'/Case management conference[\s\S]*held/'            : 'Court',

        '/Change of address for P0/'  : 'Plaintiff',
        '/Change of address for D0/'  : 'Defendant',

        'child care center both parties present' : 'Court',
        '/child care center (pltff|deft) present (pltff|deft) show/' : 'Court',

        'CONTINUED FOR'   : 'Court',
        'CONTINUED Until' : 'Court',
        '/ CONT FOR PAYMENT TIL /'    : 'Court',
        '/ conts payment review til/' : 'Court',
        'Capias issued'   : 'Court',
        'Capias returned' : 'Court',
        r'/^cont\.? generally ?, for settlement.*payment to clear/' : 'Court',
        '/Deposited in court:.{1,10} by P0/' : 'Plaintiff',
        '/Deposited in court:.{1,10} by D0/' : 'Defendant',

        '/filed jointly by all parties/' : 'All Parties', # *** special/rare *** #
        '/^Default removed/'   : 'Court',
        'Defendant responded to initial complaint'    : 'Defendant',
        "Defendant's intention to offer Medical"      : 'Defendant',
        r'/^.{5,12}deft present only\.?$/' : 'Court',
        r'/^(Deft|Pltff)\.? to file [\s\S]{5,120}or judgment[\s\S]{1,10}entered/' : 'Court',

        r'/evidence pursuant[\s\S]*Kingsley Ladega[\s\S]*Marianne Langford/'    : 'Plaintiff',
        r'/(added|dismissed\/withdrawn) as .{0,20} Counsel.{0,4}for Defendant/' : 'Defendant',
        r'/(added|dismissed\/withdrawn) as .{0,20} Counsel.{0,4}for Plaintiff/' : 'Plaintiff',
        '/filed .{0,12}by P0/'   : 'Plaintiff',
        '/filed .{0,12}by D0/'   : 'Defendant',

        'filed by P0'        : 'Plaintiff',
        'filed by plainfiff' : 'Plaintiff',
        'filed by plaintiff' : 'Plaintiff',
        'filed by D0'        : 'Defendant',

        'filed by defendant' : 'Defendant',
        'filed by deft'      : 'Defendant',

        'Interpreter requested for next court date.' : 'Court',
        r'/start \/ stop .{0,12}both parties present deft in custody 1st session/' : 'Court',

        'made in open court by P0' : 'Plaintiff',
        'made in open court by D0' : 'Defendant',

        r'/Military affidavit filed as to whether[\s\S]*is in[\s\S]*military service/' : 'Plaintiff',

        r'/Motion  ?for new capias[\s\S]*filed by/' : 'Plaintiff',
        r'/Motion  ?(for|to) [\s\S]*(is|and) denied\.?.{0,200}$/'   : 'Court',
        r'/^Motion[\s\S]{0,4}to serve (copy of )?trustee summons/' : 'Plaintiff',
        '/^no ability to pay.Not working$/' : 'Defendant',
        r'/^Misc Entry:[\s\S]*Notice of Bankruptcy Case Filing$/': 'Court',
        'Notice of Chapter 7 Meeting filed from Bankruptcy Court' : 'Court',

        r'/parties notified[\s\S]*subjected to defaulting[\s\S]{0,20}failure to answer/' : 'Court',
        '/^Notice of court action on .{1,15} sent to parties/' : 'Court',
        'Notice of dismissal filed by ' : 'Plaintiff',
        '/Stipulation of dismissal.*filed/' : 'Plaintiff',

        r'/On this date .*, Esq\.?[\s\S]{0,30}added for Plaintiff/' : 'Plaintiff',
        r'/On this date .*, Esq\.?[\s\S]{0,30}added for Defendant/' : 'Defendant',

        '/^Order, you have until.{2,20}to place the amount of /' : 'Court',
        'Plff files request'     : 'Plaintiff',

        '/pltff ?(present)? deft no show.{0,100} cont til /' : 'Court',
        'pleading by P01'  : 'Plaintiff',
        'pleading by D01'  : 'Defendant',
        'pltff present deft no show mtoion allowed capias' : 'Court',
        r"/^(the )?Plaintiff's motion[\s\S]*filed\.?$/" : 'Plaintiff',
        r"/^(the )?Defendant's motion[\s\S]*filed\.?$/" : 'Defendant',
        '/^(the )?Plaintiff request(s|ed) /'            : 'Plaintiff',
        '/^(the )?Defendant request(s|ed) /'            : 'Defendant',
        # Ambiguous, could be plaintiff or trustee:
        # '/^Return of service/' : 'Plaintiff',

        r'/Room [0-9]+, pltff\.? only present, deft\.? not present/' : 'Court',
        'Satisfaction of judgment filed'        : 'Plaintiff',
        'Suggestion of banruptcy of Joanne V Sacilotto filed' : 'Defendant',
        '/^.{0,15}Status review held/' : 'Court',
        'pltff present reports judgment satisfied': 'Plaintiff',
        'pltff reports judgment satisfied' : 'Plaintiff',
        'Plaintiff files '  : 'Plaintiff',
        'present judgment is satisfied see letter by pltff': 'Plaintiff',
        r'/ session .{0,20}both parties present[\s\S]*/' : 'Court',
        r'/ Sess\. BPP before .{1,20} for P\.R\./'       : 'Court',
        '/ session for payment review/'          : 'Court',
        'Settlement approved between '           : 'Court',
        '/Statement of small claims? entered/'   : 'Plaintiff',
        '/statement of small claims? filed/'     : 'Plaintiff',
        'statement of damages filed by'          : 'Plaintiff',
        # Was a plain string containing the regex quantifier 'k?', which could never
        # match as a literal substring; it is now a real '/.../' regex entry.
        '/Suggestion of bank?ruptcy of D0/'      : 'Defendant',
        r'/bankruptcy stay[\s\S]*see enclosed letter[\s\S]*from pltff/' : 'Plaintiff',
        'Supplementary process dismissed '       : 'Court',
        "by plaintiff's attorney"                : 'Plaintiff',

        'Uniform Counsel Certification filed by' : 'Plaintiff',

        'This Case Converted from' : 'Notes',
        'TRIAL SCHEDULED' : 'Court',
        'Unattested photocopies of court documents.': 'Court',
        r'/up to date with payments cont[\s\S]{0,8}til/': 'Court',
        '/^Waiver of Award of Pre-Judgment Interest filed/' : 'Defendant',
        "/(Plaintiff's) Witness List filed/"  : 'Plaintiff',
        "/(Defendant's|Deft's) Witness List filed/"  : 'Defendant',
        '/ yet cont til /'  : 'Court',
        r'/ by moving party\.?[\s\S]{0,60}$/' : '[party-in-previous-case-action]',
        '/^Original note or other paper filed/' : '[party-in-previous-case-action]',
        '/withdrawn for Plaintiff/' : 'Plaintiff',
        '/withdrawn for Defendant/' : 'Defendant',
    }
    
    return match_map
    


In [8]:
def get_action_actor_dict(lowercase=True):
    '''
        Build the actor -> [normalized action names] lookup.
        
        The dict is keyed by actor label; each value lists the actions that imply that
        actor. The last key is a '--'-separated list of candidate probate party types.
        Note that some actions ('Fee due', 'Fee paid') are listed under more than one key.
        
        params:
            lowercase: when True (default) every action name is lowercased; the keys
                       are left untouched
        returns:
            dict mapping actor label -> list of action names
    '''
    
    court_actions = [
        'Event cancelled',
        'Agreement for Judgment',
        'Arbitration',
        'Bond approved',
        'Capias',
        'Capias expired',
        'Case inactivated',
        'Case management conference',
        'Capias returned',
        'Capias returned unserved',
        'Continuance',
        'Debtor before the court',
        'Default',
        'Default Removed',
        'Dismissal',
        'Event resulted',
        'Event scheduled',
        'Execution',
        'Execution satisfied',
        'Execution unsatisfied',
        'Ex Parte hearing',
        'Fee due',
        'Fee waived',
        'Hearing',
        'Judgment',
        'Judgment amended',
        'Judgment debtor able to pay',
        'Judgment debtor unable to pay',
        'Judgment vacated',
        'Mediation',
        'Motion allowed',
        'Motion denied',
        'Notice of dismissal',
        'Notice of potential default',
        'Notice of potential dismissal',
        'Notice sent',
        'Notice of trial',
        'No capias returned',
        'No parties present',
        'Order for Informal Probate',
        'Order Nisi for Dismissal',
        'Order Nisi',
        'Payment order',
        'Payment review',
        'Pre-Trial conference',
        'Pre-Trial Memorandum',
        'Return of Service',
        'Satisfaction of judgment',
        'Settlement agreement',
        'Status Review',
        'Summons',
        'Transfer of case',
        'Trial',
        'Under Advisement',
        'Misc Notes',
    ]

    plaintiff_actions = [
        'Complaint filed',
        'Entry of Action filed',
        'Fee due',
        'Fee paid',
        'Motion to serve trustee summons',
        'Motion to attach by trustee process',
        'Request to not dismiss',
        'Summary Process Summons and Complaint filed',
        'Termination notice filed',
        'Verification of Defendant\'s Address filed',
    ]

    # Probate has various parties, this will select ONE of the following,
    # in the order they actually exist in the case
    probate_parties = 'Executor/trix--Executor/rix--Administrator/trix CTA--Administrator/trix----Administrator/rix CTA--Administrator/rix--Special Administrator/trix--Petitioner--Proposed Fiduciary--Trustee--Personal Representative--Special Personal Representative--Interested Person'
    probate_actions = [
        'Appointment of agent filed',
        'Bond with sureties filed',
        'Bond without sureties filed',
        'Certificate of Death filed',
        'Copy of will filed',
        'Decree and Order of Formal Probate',
        'Fee due',
        'Fee paid',
        'Military Affidavit filed',
        'Motion to accept copy of will',
        'Motion to amend Voluntary Administration Statement',
        'Petition for Administration',
        'Petition for Allowance of Account',
        'Petition for Appointment of Trustee',
        'Petition for Formal Probate',
        'Petition for Informal Probate',
        'Petition for Order of Complete Settlement',
        'Petition for Probate of Foreign Will',
        'Petition for Probate of Will and Appointment of Executor',
        'Petition for Removal of Personal Representative',
        'Petition for Sale of Real Estate',
        'Petition for Special Administration',
        'Statement of Voluntary Administration filed',
    ]

    # Insertion order is kept: Court, Defendant, Notes, Plaintiff, probate parties
    action_to_actor = {
        'Court'         : court_actions,
        'Defendant'     : [],
        'Notes'         : ['System Notes'],
        'Plaintiff'     : plaintiff_actions,
        probate_parties : probate_actions,
    }

    if not lowercase:
        return action_to_actor

    return {
        actor_label: [action_name.lower() for action_name in action_names]
        for actor_label, action_names in action_to_actor.items()
    }

In [9]:
class Normalizer():
    '''
        Fills in the missing `action` and `actor` of a case action row.
        
        Each field is first looked up with hand-written rules (the match maps and the
        action -> actor dict); when no rule applies, a fine-tuned seq2seq model
        generates the value from the description text.
    '''
    
    # fuzz.partial_ratio returns an integer score between 0 and 100
    MIN_PARTY_SIMILARITY = 25       # minimum score to accept a fuzzy party-name match
    MIN_APPEARANCE_SIMILARITY = 80  # stricter score for the "Appearance for ... filed" rule
    
    def __init__(self, model_dir_action,model_dir_actor):
        '''
            params:
                model_dir_action: directory of the model that generates actions
                model_dir_actor: directory of the model that generates actors
        '''
        
        # connect to db
        self.db = self.init_db()
        self.cursor = self.db.cursor()
        
        # get match maps
        self.action_match_map = get_action_match_map()
        self.actor_match_map = get_actor_match_map()
        self.action_actor_dict = get_action_actor_dict()
        
        # load models
        self.model_dir_action = model_dir_action
        self.model_dir_actor = model_dir_actor
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.tokenizer,self.model_action,self.model_actor = self.load_tokenizer_and_models()

        # load parties
        self.parties = pd.read_csv('./parties.csv')
        
    # NOTE(review): database credentials are hardcoded as parameter defaults; they
    # should come from the environment or a secrets store. Left in place because
    # the defaults are part of the method's existing interface.
    def init_db(self,host = '73.38.248.152',user = 'buspark',password = 'U@5p1r3!'):
        '''
            This function opens a connection to the MySQL server
            params:
                host, user, password: connection settings
            returns:
                open mysql.connector connection
            raises:
                whatever mysql.connector.connect raised, after logging it. Every
                caller immediately calls .cursor() on the result, so returning
                None here would only hide the real cause of the failure.
        '''
        try:
            return mysql.connector.connect(host=host, user=user, password=password)
        except Exception as e:
            print("Exception in connecting to the database!")
            print(e)
            raise
            
    @staticmethod
    def _strip_regex_delimiters(pattern):
        '''Return the regex inside a '/.../' entry (leading '/' and optional trailing '/' removed).'''
        body = pattern[1:]
        if body.endswith('/'):
            body = body[:-1]
        return body
    
    def extract_action_regex(self,description):  
        '''
            This function extracts action using regex
            params:
                description: case action description
            returns:
                action string (empty string if regex does not match)
        '''
        
        out = ''
        description = description.lower()
        # simple (non-regex) entries are only looked for in the first 90 characters
        description_head = description[:90]
        
        # NOTE(review): an exact match on a normalized value stops the scan, but a
        # pattern match only stops the scan of that value's own patterns. So when
        # several values match by pattern, the LAST one in the map wins. Kept
        # as-is; confirm whether first-match was intended.
        for normalized_val ,vals in self.action_match_map.items():
            # Already matches normalized value exactly
            if description == normalized_val.lower():
                out = normalized_val
                break
            for v in vals:
                try:
                    # Regex
                    if v.startswith('/'):
                        if re.search(self._strip_regex_delimiters(v),description,re.IGNORECASE):
                            out = normalized_val
                            break
                    # simple
                    elif v.lower() in description_head:
                        out = normalized_val
                        break
                except re.error:
                    # a malformed pattern in the map must not abort the lookup
                    continue
        return out

    def load_tokenizer_and_models(self):
        
        '''
            This function loads tokenizer and models
            params:
            
            returns:
                tokenizer
                model to extract action
                model to extract actor
        '''

        tokenizer = AutoTokenizer.from_pretrained("deep-learning-analytics/wikihow-t5-small")
        model_action = AutoModelWithLMHead.from_pretrained(self.model_dir_action).to(self.device)
        model_actor = AutoModelWithLMHead.from_pretrained(self.model_dir_actor).to(self.device)
        
        return tokenizer,model_action,model_actor

    def _generate_label(self,model,description):
        '''Run `model` on the description (stripped, newlines removed) and decode the first generated sequence.'''
        
        preprocess_description = description.strip().replace("\n","")
        tokenized_description = self.tokenizer.encode(preprocess_description, return_tensors="pt").to(self.device)
        
        generated_ids = model.generate(
              tokenized_description,
              max_length=10, 
              num_beams=2,
              repetition_penalty=2.5, 
              length_penalty=1.0, 
              early_stopping=True
          )

        return self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    def extract_action_model(self,description):
        
        '''
            This function extracts action using trained model
            params:
                description: case action description
            returns:
                action string
        '''
        return self._generate_label(self.model_action,description)
    
    def extract_action(self,description):
        '''
            This function will extract action from description
            (rules first, model when no rule matches)
            params:
                description: case action description
            returns:
                action string
        '''
        
        action = self.extract_action_regex(description)
        if action == "":
            action = self.extract_action_model(description)
            
        return action
    
    def extract_actor_from_action(self,description,action,parties):
        
        '''
            This function will attempt to extract actor based on action
            params:
                description: case action description (not used by this rule)
                action: case action
                parties: dataframe of parties associated with the case
            returns:
                actor string (empty string when the action implies no actor)
        '''
        actor = ""
        action = action.lower()
        for key,actions in self.action_actor_dict.items():
            if action in actions:
                actor = key
                break
                
        # The probate key is a '--'-separated list of candidate party types: pick
        # the first candidate that exists among this case's parties. The key is
        # capitalised ('Petitioner'), so the check has to be case-insensitive.
        if '--' in actor and 'petitioner' in actor.lower():
            party_types = set(parties['party_type'].astype(str).str.lower())
            party_types.add('notes')
            
            resolved = ''
            for candidate in actor.split('--'):
                if candidate and candidate.lower() in party_types:
                    resolved = candidate
                    break
            # no candidate is a party of this case -> no actor from this rule
            actor = resolved
                
        return actor
    
    def verify_actor(self,actor,parties):
        '''
            This function will verify whether the extracted actor exists in the parties
            ('Court' and 'Notes' are always accepted). The check is a case-insensitive
            substring test of the actor against each party type.
            params:
                actor: actor to verify
                parties: dataframe of parties associated with the case
            returns:
                True/False
        '''
        party_types = [str(party_type) for party_type in parties['party_type'].to_list()]
        party_types.append('Court')
        party_types.append('Notes')
        
        wanted = actor.lower()
        return any(wanted in party_type.lower() for party_type in party_types)
    
    def get_case_actions_(self,case_id):
        '''
            This function will query the database for every action of a case
            params:
                case_id: case id
            returns:
                dataframe of case actions
        '''
        
        # parameterized so case_id is never spliced into the SQL text
        query = "SELECT * \
                     FROM wp_courtdocs.cdocs_case_action_index \
                     WHERE wp_courtdocs.cdocs_case_action_index.case_id = %s"
        self.cursor.execute(query,(str(case_id),))
        
        return pd.DataFrame(self.cursor.fetchall(),columns=['case_action_id','case_id','actor','action','description','date_time','file_reference_number','last_indexed'])
    
    def get_case_actions(self,case_id):
        '''
            Same as get_case_actions_, but reconnects and retries once when the
            first attempt fails.
        '''
        
        try:
            return self.get_case_actions_(case_id)
        except Exception:
            self.db = self.init_db()
            self.cursor = self.db.cursor()
            return self.get_case_actions_(case_id)
        
    
    def extract_actor_from_hardcoded(self,description,parties):
        '''
            This function will pull a candidate party name out of the description
            using fixed phrases ("filed by ...", "...'s counterclaim", ...) and
            fuzzy-match it against the names of the parties in the case
            params:
                description: case action description
                parties: dataframe of parties associated with the case
            returns:
                party type of the best matching party, 'Notes' for
                "no capias returned" descriptions, otherwise empty string
        '''
        
        description = description.lower()
        actor = ''
        
        # phrases followed by the party name (at most 40 characters are kept)
        before_tokens = ['answer of','claim of','filed by','motion by','acknowledgement of service']
        for tok in before_tokens:
            if tok in description:
                actor = description.split(tok)[1].strip()[:40]
                break
                
        # phrases preceded by the party name
        if actor == '':
            after_tokens = ["'s counterclaim"]
            for tok in after_tokens:
                if tok in description:
                    actor = description.split(tok)[0].strip()
                    break
                    
        if actor == '':
            if ('added for ' in description) and ('esq' in description):
                actor = description.split('added for')[1].strip()[:40]
                    
            elif 'affidavit of ' in description:
                actor = description
                if ' certified mail' in description:
                    actor = description.split('certified mail')[0].strip()
                actor = actor.replace('affidavit of ','').replace(' filed','').strip()
            
            elif 'endorsed upon pleading by' in description:
                # the name is what follows the phrase, up to an optional "(...)"
                after_phrase = description.split('endorsed upon pleading by')[1]
                actor = after_phrase.split('(')[0].strip()
        
        if actor!='':
            if 'applies to:' in description:
                return ''
            
            corporate_description = 'corporate party' in description
            best_score = self.MIN_PARTY_SIMILARITY
            best_party_type = ''
            
            for party in parties.to_dict('records'):
                name = str(party['party_name'])
                # NOTE(review): when the description mentions a corporate party,
                # parties whose name contains llc/corp/inc are SKIPPED (case-
                # sensitive test on the stored name). This mirrors the existing
                # rule; confirm it is not inverted.
                if corporate_description:
                    if ('llc' in name) or ('corp' in name) or ('inc' in name):
                        continue
                
                # the candidate was taken from the lowercased description, so
                # compare it against the lowercased party name
                similarity = fuzz.partial_ratio(name.lower(),actor)
                if similarity > best_score:
                    best_score = similarity
                    best_party_type = party['party_type']
                    
            # most similar party above the threshold, '' when there is none
            actor = best_party_type
        
        if actor =='':
            pattern = 'Appearance for (.{2,80}) filed'
            match = re.match(pattern,description,re.IGNORECASE)
            if match:
                for party in parties.to_dict('records'):
                    party_name = str(party['party_name']).lower()
                    if fuzz.partial_ratio(party_name,match[1]) > self.MIN_APPEARANCE_SIMILARITY:
                        actor = party['party_type']
                        break
                        
        if actor == '' and 'no capias returned' in description:
            actor = 'Notes'
            
        return actor
    
    def _actor_from_previous_action(self,case_id,case_action_id):
        '''
            Return the actor of the nearest case action that follows
            `case_action_id` in the reversed query result (within 4 positions) and
            whose actor is set and is neither 'Notes' nor 'Court'; '' when none.
        '''
        case_actions = self.get_case_actions(case_id).to_dict('records')
        case_actions.reverse()
        
        current_position = None
        for position, case_action in enumerate(case_actions):
            previous_actor = case_action['actor']
            if (
                current_position is not None
                and isinstance(previous_actor,str)
                and previous_actor != ''
                and previous_actor not in ['Notes','Court']
                and abs(position - current_position) < 5
            ):
                return previous_actor
                
            if case_action['case_action_id'] == case_action_id:
                current_position = position
                
        return ''
            
    def extract_actor_regex(self,description,action,parties,case_id,case_action_id):
        '''
            This function will use regex patterns to extract actor from description
            params:
                description: case action description
                action: case action
                parties: dataframe of parties in this case
                case_id: case id 
                case_action_id: case_action id
            returns:
                actor string
        '''
        actor = self.extract_actor_from_action(description,action,parties)
        if actor=="":
            description_lower = description.lower()
            for pattern,actor_val in self.actor_match_map.items():
                # regex
                if pattern.startswith('/'):
                    if re.search(self._strip_regex_delimiters(pattern),description,re.IGNORECASE):
                        actor = actor_val
                        
                # simple 
                elif pattern.lower() in description_lower:
                    actor = actor_val
                
                # placeholder: take the actor from a nearby earlier case action
                # ('' when none qualifies, in which case the scan continues)
                if actor == '[party-in-previous-case-action]':
                    actor = self._actor_from_previous_action(case_id,case_action_id)
                
                if actor!='':
                    break
        
        if not self.verify_actor(actor,parties):
            actor = ''
        
        if actor == '' and len(parties) > 0:
            actor = self.extract_actor_from_hardcoded(description,parties)    

        return actor
    
    def extract_actor_model(self,description):
        '''
            This function extracts actor using trained model
            params:
                description: case action description
            returns:
                actor string
        '''
        return self._generate_label(self.model_actor,description)
    
    def extract_actor(self,description,action,parties,case_id,case_action_id):
        '''
            This function will extract actor from description (rules first, model
            when no rule matches). A model answer that is not a party type of the
            case (nor Court/Notes) is discarded.
            params:
                description: case action description
                action: case action
                parties: dataframe of parties in this case
                case_id: case id
                case_action_id: case_action id
            returns:
                actor string (may be empty)
        '''
        
        actor = self.extract_actor_regex(description,action,parties,case_id,case_action_id)
        if actor=="":
            actor = self.extract_actor_model(description)
            if not self.verify_actor(actor,parties):
                actor = ''
        return actor
        
    def get_parties_(self,case_id):
        '''
            This function will query the database for the parties of a case
            params:
                case_id: case id
            returns:
                dataframe with columns party_type, party_name
        '''
        
        # parameterized so case_id is never spliced into the SQL text
        query = "SELECT wp_courtdocs.cdocs_party_assignment_index.party_type, wp_courtdocs.cdocs_party_index.party_name \
                     FROM wp_courtdocs.cdocs_party_assignment_index JOIN wp_courtdocs.cdocs_party_index \
                     ON wp_courtdocs.cdocs_party_assignment_index.party_id = wp_courtdocs.cdocs_party_index.post_id \
                     WHERE wp_courtdocs.cdocs_party_assignment_index.case_id = %s"
        
        self.cursor.execute(query,(str(case_id),))
        return pd.DataFrame(self.cursor.fetchall(),columns=['party_type','party_name'])

    
    def get_parties(self,case_id):
        '''
         This function will return the parties associated with the case, read from
         the preloaded parties csv; the database is queried only if that lookup raises
            params:
                case_id: case id 
            returns:
                dataframe of parties associated with the case
        '''
        try:
            return self.parties[self.parties['case_id'] == case_id].copy()
        except Exception:
            self.db = self.init_db()
            self.cursor = self.db.cursor()
            return self.get_parties_(case_id)


    def normalize(self,row):
        '''
            This function will take a row as input and extract the actor and action from the description
            (only fields that are empty strings are filled in)
            params:
                row: pandas series object or dict - row to normalize
            returns:
                normalized row
        '''
        
        # extract action
        if row['action'] == '':
            row['action'] = self.extract_action(row['description'])
            
        # extract actor
        if row['actor'] == '':
            ## get parties 
            parties = self.get_parties(row['case_id'])
            ## extract actor
            row['actor'] = self.extract_actor(row['description'],row['action'],parties,row['case_id'],row['case_action_id'])
        
        return row
        

## Initialize Normalizer Class

In [10]:
# Directories of the action / actor models unpacked from pytorch_models.zip
model_dir_action = './pytorch_models/action/'
model_dir_actor = './pytorch_models/actor/'

# Building the Normalizer connects to the db, loads the tokenizer and both models,
# and reads parties.csv
normalizer = Normalizer(
    model_dir_action=model_dir_action,
    model_dir_actor=model_dir_actor,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=736.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1786.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=25.0, style=ProgressStyle(description_w…




  if self.run_code(code, result):


## Connect to DB

In [11]:
# NOTE(review): database credentials are hardcoded as parameter defaults; they
# should come from the environment or a secrets store. Left in place because the
# defaults are part of the function's existing interface.
def connect(host='73.38.248.152', user='buspark', password='U@5p1r3!'):
    '''
        Open a connection to the MySQL server and create a cursor on it.
        params:
            host, user, password: connection settings
        returns:
            (connection, cursor)
        raises:
            whatever mysql.connector.connect / cursor() raise on failure
    '''
    mydb = mysql.connector.connect(host=host,user=user,password=password)
    # ask the connection for its state; the truthiness of the connection object
    # says nothing about whether the link is actually up
    if mydb.is_connected():
        print("Connection Successful")
    else:
        print("Connection Unsuccessful")
    mycursor = mydb.cursor()
    
    return mydb,mycursor

In [12]:
# Module-level connection and cursor; later cells in this notebook read and
# reassign these names when they need to reconnect.
mydb,mycursor = connect()

Connection Successful


## Normalize Data

In [None]:
chunk_size = 100
i = 0
complete = False


def ensure_connection():
    '''Reconnect the module-level connection and cursor when the connection has dropped.'''
    global mydb, mycursor
    if not (mydb.is_connected()):
        mydb,mycursor = connect()


def set_maintainer_status(status, id_rows):
    '''
        Set status_row to `status` for a batch of maintainer rows and commit.
        params:
            status: new status_row value ('available', 'in progress' or 'done')
            id_rows: list of 1-tuples, each holding a case_action_id
    '''
    query = "UPDATE wp_courtdocs_NORMALIZED.maintainer \
        SET status_row = %s \
        WHERE case_action_id = %s"
    ensure_connection()
    mycursor.executemany(query,[(status,) + tuple(id_row) for id_row in id_rows])
    mydb.commit()


# Work through the maintainer table chunk by chunk:
#   claim a chunk ('available' -> 'in progress'), normalize it, insert the
#   result, then mark the chunk 'done'.
# NOTE(review): claiming is a SELECT followed by an UPDATE, so two notebooks
# running at once can claim the same rows. Also, a chunk that fails before the
# insert is put back to 'available' and is retried on the next pass, without limit.
while(True):
        
    written_maintainer = False
    written_normalized = False
    # reset per pass so the except branch never reads values from an earlier chunk
    df_case_action_ids = None
    vals_case_action_ids = []
    try:
        # get case_action_ids
        query = "SELECT * \
            FROM wp_courtdocs_NORMALIZED.maintainer \
            WHERE status_row = 'available' \
            LIMIT " + str(chunk_size)

        ensure_connection()
        df_case_action_ids = pd.read_sql(query,con = mydb)
        if len(df_case_action_ids) < 1:
            complete=True
            break

        # Change status to in progress
        temp_list = df_case_action_ids['case_action_id'].tolist()
        vals_case_action_ids = [(x,) for x in temp_list]
        set_maintainer_status('in progress',vals_case_action_ids)
        written_maintainer = True 

        # get data to normalize
        # one placeholder per id: str(tuple(ids)) renders a single id as "(5,)",
        # which is not valid SQL
        placeholders = ','.join(['%s'] * len(temp_list))
        query = "SELECT * FROM wp_courtdocs.cdocs_case_action_index as c_a_index \
                  WHERE case_action_id in (" + placeholders + ")"
        ensure_connection()
        df = pd.read_sql(query,con=mydb,params=tuple(temp_list))

        # normalize
        for j in range(len(df)):
            df.iloc[j] = normalizer.normalize(df.iloc[j].copy())

        # write normalized data to db
        val_string = "(%s,%s,%s,%s,%s,%s,%s,%s)"
        col_names_string = '(' + ','.join(df.columns) + ')' 
        query = "INSERT IGNORE INTO wp_courtdocs_NORMALIZED.cdocs_case_action_index " + col_names_string + ' VALUES ' + val_string

        # NOTE(review): astype(str) also turns missing values into text such as
        # 'None' / 'NaT' before they are inserted - confirm this is acceptable.
        df = df.astype(str)
        values = [tuple(x) for x in df.to_records(index=False)]
        ensure_connection()
        mycursor.executemany(query,values)
        mydb.commit()
        written_normalized = True
        
        # update status
        set_maintainer_status('done',vals_case_action_ids)

        i+=1
    except Exception as e:
        print(e)
        if df_case_action_ids is not None:
            print(df_case_action_ids.head())
        print(written_maintainer,written_normalized)
        if written_normalized:
            # the rows were inserted, only the final status update failed
            set_maintainer_status('done',vals_case_action_ids)
            
        elif written_maintainer:
            # nothing was inserted: release the chunk again
            set_maintainer_status('available',vals_case_action_ids)
        
        i+=1

In [None]:
# `complete` is set to True by the normalization loop cell once no 'available'
# rows are left in the maintainer table.
if complete:
    print('DONE!!')