In [1]:
import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification, BertConfig, BertModel
import pandas as pd
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.utils import class_weight
import numpy as np
from torch.optim import Adam
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import json
import random
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
import nltk
import re
from tokenizers import BertWordPieceTokenizer

In [2]:
# Widen pandas' display limits so long OCR texts and long tables are shown in full.
for _display_option in ("display.max_colwidth", "display.max_rows"):
    pd.set_option(_display_option, 3000)

In [3]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs (slowly) on
# machines without CUDA instead of failing at the first `.to(device)` call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [9]:
# Earlier input file, kept for reference:
#data = pd.read_csv('./Backup master data/master_data_6.csv', usecols=['ADR', 'text', 'category', 'type', 'date'])
MASTER_DATA_CSV = './master_data_V3_0.csv'
data = pd.read_csv(MASTER_DATA_CSV)
data.shape

  data = pd.read_csv('./master_data_V3_0.csv')


(314492, 9)

In [None]:
# Number of rows per distinct value of the 'type' column.
data['type'].value_counts()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [10]:
# Rows without any text cannot be tokenized, so discard them.
data = data.dropna(subset=['text'])
data.shape

(312843, 9)

In [None]:
# Number of rows per distinct value of the 'category' column.
data['category'].value_counts()

In [19]:
# Whitespace-separated word count of every text.
data["length"] = data["text"].map(lambda page_text: len(page_text.split()))

In [34]:
# Summary statistics of the per-row whitespace word counts stored in 'length'.
data["length"].describe()

count    312843.000000
mean        428.645004
std         359.406909
min           1.000000
25%         209.000000
50%         343.000000
75%         567.000000
max        5798.000000
Name: length, dtype: float64

In [20]:
# Export every column except the (large) raw text to length.csv.
d = data.drop(columns='text')
d.to_csv("length.csv", index=False)

In [None]:
# Number of rows per distinct value of the 'category' column.
data['category'].value_counts()

In [13]:
# Raw category names grouped by the consolidated category they are folded into.
_category_groups = {
    'Bank Statement': [
        'Assets - Source of funds',
        'Bank Statement',
        'Bank Statement(s)',
        'Gift Funds',
    ],
    'Bankruptcy Papers': [
        'BK Papers',
    ],
    'Divorce Decree / Child Support': [
        'Child Support/Alimony Agreement',
        'Divorce Decree',
        'Divorce Decree / Child Support',
    ],
    'Purchase Agreement': [
        'Construction Agreement/Contract',
        'Contractor Bids',
        'Contractor Docs',
        'Earnest Money Deposit',
        'Purchase Agreement',
        'Purchase Agreement Addendum',
        'Purchase Document Other',
        'Sales Checklist',
        'Sales Contract and Addendums',
        'Sales Contract/Purchase Agreement',
        'Sales Forms',
        'Sales Stips',
    ],
    'Hazard Insurance': [
        'Hazard Insurance Contact Information',
        'Hazard Insurance Dec Page - Final',
        'Hazard Insurance Dec Page - Incomplete',
        'Hazard Insurance Dec Page - Initial',
        'Hazard Insurance Dec Page - Insufficient Coverage',
        'Hazard Insurance Declaration',
        'Insurance - Cost Estimator',
        'Insurance - Hazard Checklist',
        'Insurance - Proof Paid',
        'Insurance Dec Page, Other',
        'Insurance Document - Other',
    ],
    'Retirement Account Statement(s)': [
        'Investment Account Statements',
        'Retirement Account Statement(s)',
    ],
    'Mortgage Statement': [
        'Mortgage Statement',
        'Mortgage Statement of Inspection of Subject Property',
        'Mortgage Statement/Coupon, 1st',
        'Mortgage Statement/Coupon, 2nd',
        'Other Property 1st Mtg Stmt',
        'Other Property 2nd Mtg Stmt',
    ],
    'Rental Agreements(s)': [
        'Rental Agreements(s)',
    ],
}

# Flat lookup: raw category name -> consolidated category name.
mappings = {raw_name: target
            for target, raw_names in _category_groups.items()
            for raw_name in raw_names}

In [14]:
# Fold the raw category names into their consolidated names; values that are not
# keys of `mappings` are left unchanged by `replace`.
data['category'] = data['category'].replace(mappings)

In [None]:
# Category distribution after the consolidation above.
data['category'].value_counts()

### Stopword removal

In [None]:
# Tokens to discard: English stopwords plus single punctuation characters.
remove_words = set(stopwords.words('english')) | set(string.punctuation)
_noise_chars = re.compile(r'[~^0-9]')


def _clean_text(raw_text):
    """Drop '~', '^' and digits, lowercase, tokenize, and remove stopwords/punctuation tokens."""
    stripped = _noise_chars.sub('', raw_text)
    kept_tokens = [token for token in word_tokenize(stripped.lower()) if token not in remove_words]
    return " ".join(kept_tokens)


alterted_text = [_clean_text(text) for text in data.text.values]

In [None]:
# Overwrite the raw text with the stopword-stripped version built in the previous cell.
data['text']= alterted_text

### Training

In [None]:
# Full list of consolidated categories, kept for reference:
# cats = ['Retirement Account Statement(s)',
#         'Mortgage Statement',
#         'Bank Statement',
#         'Purchase Agreement',
#         'Rental Agreements(s)',
#         'Hazard Insurance',
#         'Divorce Decree / Child Support',
#         'Bankruptcy Papers']

# Categories treated as the positive ("focused") class; everything else becomes "others".
cats = ['Hazard Insurance',
        'Divorce Decree / Child Support',
        'Bankruptcy Papers']

temp = ['focused' if category in cats else 'others' for category in data['category']]
data['new_category'] = temp

In [None]:
# Distinct values present in the binary 'new_category' column.
data['new_category'].unique()

In [None]:
# Integer class id for each binary category name.
label_dict = {'focused': 1, 'others': 0}
label_dict

In [None]:
# possible_labels = data.new_category.unique()

# label_dict = {}
# label_dict['others']=0
# for i,cat in enumerate(cats):
#     label_dict[cat]=i+1
# label_dict

In [None]:
# Map category names to integer class ids. `map` yields a numeric column directly, whereas
# `replace` on a string column relies on pandas' deprecated silent downcasting and can leave
# an object-dtype column that `torch.tensor(...)` cannot convert.
data['label'] = data.new_category.map(label_dict)
data.head(1)

In [None]:
# NOTE(review): frac=0.0 samples zero rows, so `df` is empty and the next cell puts every row
# of `data` into `test`. The class-weight and train_test_split cells further down operate on
# `df`; presumably a non-zero fraction is intended for training — confirm before re-running.
df = data.sample(frac=0.0, random_state=200)
df.shape

In [None]:
# Hold-out set: every row of `data` that was not sampled into `df`.
_held_out = ~data.index.isin(df.index)
test = data.loc[_held_out].reset_index(drop=True)
test.shape

In [None]:
# Class balance of the hold-out set.
test['new_category'].value_counts()

In [None]:
# Class balance of the sampled frame `df`.
df['new_category'].value_counts()

In [None]:
# nn.CrossEntropyLoss(weight=...) indexes the weight tensor by integer class id, so the
# weights must be ordered by label id (0, 1, ...). Computing them on the integer 'label'
# column with sorted classes guarantees that; the order in which category names first
# appear in the data does not.
_class_ids = np.unique(df['label'].to_numpy())  # sorted label ids present in df
classweight = torch.tensor(class_weight.compute_class_weight(class_weight='balanced',
                                                             classes=_class_ids,
                                                             y=df['label'].to_numpy()), dtype=torch.float)
classweight

In [None]:
# Renumber rows 0..n-1 so the index values can be used as split identifiers below.
df = df.reset_index(drop=True)
df.head(1)

In [None]:
# Stratified 85/15 split of the row indices (not the texts) by class label.
_row_ids = df.index.values
_row_labels = df.label.values
X_train, X_val, y_train, y_val = train_test_split(
    _row_ids,
    _row_labels,
    test_size=0.15,
    random_state=42,
    stratify=_row_labels,
)

In [None]:
# Placeholder value; overwritten with 'train' / 'val' in the next cell.
df = df.assign(data_type='not_set')
df.head(2)

In [None]:
# Tag each row with the split it was assigned to by train_test_split.
for _split_name, _split_rows in (('train', X_train), ('val', X_val)):
    df.loc[_split_rows, 'data_type'] = _split_name

In [None]:
# Row counts per (category, label, split) combination — a sanity check of the stratified split.
df.groupby(['new_category', 'label', 'data_type']).count()

In [55]:
# Tokenizer loaded from the 'ProsusAI/finbert' checkpoint; the bert-base-uncased
# alternative is kept below for reference.
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/758 [00:00<?, ?B/s]

In [56]:
# Write the tokenizer's vocabulary file into the given directory (returns the written path(s)).
tokenizer.save_vocabulary("/data/mmortgage/amal_workspace/multi_page_classification_preprocessing/tokens")

('/data/mmortgage/amal_workspace/multi_page_classification_preprocessing/tokens/vocab.txt',)

In [None]:
# tokenizerBW = BertWordPieceTokenizer.from_pretrained("bert-base-uncased")
# # tokenized_sequenceBW = tokenizerBW.encode(sequence)

In [18]:
# Take an explicit copy of the first five "Hazard Insurance" rows so that adding a column
# modifies only this small frame (and does not trigger pandas' SettingWithCopyWarning).
data_h = data[data["category"]=="Hazard Insurance"].head(5).copy()
data_h["length"] = data_h["text"].str.split().apply(len)
data_h

Unnamed: 0,ADR,text,category,type,date,ogr/aug,page#,angle,renamed category,length
22923,ADR-2022-100004515340,"PHH MORTGAGE P O BOX 5954, SPRINGFIELD, OH 45501-5954 Tel: 1-888-882-1855 Fax: 1-937-525-4210 May 12, 2022 ROBERT K WASHINGTON 2610 INDIAN TRL DR MISSOURI CITY, TX 77489-5200 SUBJECT: Account Number: 0038071999 Hazard Insurance Expiration Date: 05/01/2023 Property Address: 2610 INDIAN TRL DR MISSOURI CITY, TX 77489-5200 Dear Customer: Why We Are Sending This Letter? Enclosed is a policy that renews the lender-placed insurance policy on the account. We originally placed this policy on your property because acceptable proof of coverage was not provided. The annual premium to renew is shown on the policy. This premium will be charged to the escrow account. Under the terms of the mortgage documents, adequate insurance is required to be maintained on the property at all times. Failure to do so is a breach of the agreements of the mortgage. We obtained this policy because we did not receive proof that there was a sufficient insurance policy on the property. We have taken this action to protect our mutual interests in the property. What Needs To Be Done? Any lender-placed policy we have obtained may be cancelled at any time by providing us proof of sufficient insurance coverage. If you are able to provide proof of alternate, sufficient coverage, charges for the lender-placed coverage will be limited to the time periods for which you cannot provide proof of coverage. Any unused premium amounts will be refunded to the escrow account. If your account was previously non-escrow and you would like your account to go back to non-escrow, contact customer care and they can assist with de-escrowing your account. We strongly urge you to contact an agent or company of your choice to purchase coverage. Please send the policy to the address shown below, or fax a copy of the policy to 1-937-525-4210. 
You may also update your hazard coverage directly to our web site at www.MyCoverageInfo.com/mortgagefamily or send via email to MortgageFamily@MyCoverageinfo.com. PHH MORTGAGE SERVICES ITS SUCCESSORS AND/OR ASSIGNS AS THEIR INTEREST MAY APPEAR PO BOX 5954 SPRINGFIELD, OH 45501-5954 2114H4-1221",Hazard Insurance,LD,20220722123724,org,1,-0.1007,No,336
22924,ADR-2022-100004515340,"What We Will Do? The lender-placed coverage may not provide as much coverage as insurance you may be able to purchase directly and may cost significantly more than insurance you may purchase directly because our carrier has issued this coverage without the benefit of normal underwriting guidelines. Please read the policy carefully to make sure you understand its terms and conditions. If you have information to verify that the amount of coverage should be different, please notify us in writing and include the account number on your letter. This lender-placed policy insures your house structure only. It does not protect your personal property nor does it protect you from liability against accidents that occur on your property. For example, if your house were burglarized, it would not cover the stolen property. Thank you for taking the time to help us resolve this matter. We appreciate the opportunity to serve you and look forward to meeting your mortgage needs. If you have any questions regarding this matter, please call us at 1-888-882-1855 Monday through Friday between 8:00 a.m. and 9:00 p.m. Eastern Time, and on Saturday between 8:00 a.m. and 5:00 p.m. Eastern Time, and our Customer Service Representatives will be happy to help you. Sincerely, Loan Servicing If your insurance is not currently paid from an escrow account, and you are unable to pay your insurance premium in full, please call us as soon as possible to see if we can establish an escrow account to pay the insurance premium. If you choose this option and are eligible for escrow, we would establish a monthly escrow for the payment of future insurance premiums as well as any amounts already advanced. We will need the contact information for your insurance agent or company as well as the amount of the premium currently due. It is important that you call us immediately if you need our assistance. 
We cannot pay the insurance premium for coverage you obtain without your cooperation. IMPORTANT MESSAGE Please note that the coverage under the Lender Placed Policy referenced in this letter will be cancelled as of the payoff date of the account. If you retain ownership in the property after payoff, we strongly advise you to obtain your own insurance policy to avoid any lapse in coverage as a result of this cancellation. Any applicable refund will be deposited into the escrow account and the net proceeds will be sent to the last mailing address on record. Texas Property Owners: COMPLAINTS REGARDING THE SERVICING OF YOUR MORTGAGE SHOULD BE SENT TO THE DEPARTMENT OF SAVINGS AND MORTGAGE LENDING, 2601 NORTH LAMAR, SUITE 201, AUSTIN, TX 78705. A TOLL-FREE CONSUMER HOTLINE IS AVAILABLE AT 877-276-5550. A complaint form and instructions may be downloaded and printed from the department's website located at www.sml.texas.gov or obtained from the department upon request by: mail to the aforementioned address, telephone through their toll-free consumer hotline listed, or email at smlinfo@sml.texas.go...",Hazard Insurance,LD,20220722123724,org,2,0.0,No,595
22925,ADR-2022-100004515340,"DECLARATIONS AMERICAN SECURITY INSURANCE COMPANY CERTIFICATE NUMBER: MLR21149729747 PO BOX 50355, ATLANTA, GA 30302 A Stock Insurance Company CERTIFICATE PERIOD: Issued under the provisions of EFFECTIVE DATE EFFECTIVE TIME EXPIRATION DATE Master Policy No .: 05/01/2022 12:01 am 05/01/2023 MIP-RCH-02114-00 NAMED INSURED and Mailing Address: For Company Use: PHH MORTGAGE SERVICES Bosis: Territory: 0003 ITS SUCCESSORS AND/OR ASSIGNS Class: AS THEIR INTEREST MAY APPEAR Other: FIR SFD 021140160 P O BOX 5954 SPRINGFIELD, OH 45501-5954 DESCRIBED LOCATION. The property covered by this Certificate is at the described location unless otherwise stated: 2610 INDIAN TRL DR MISSOURI CITY, TX 77489-5200 COVERAGE AND LIMITS OF LIABILITY - Coverage is provided only where a premium is shown for the coverage, subject to all conditions of this Certificate. RESIDENTIAL PROPERTY: LIMIT OF LIABILITY DEDUCTIBLES PREMIUM Coverage A - $95,890 Windstorm, Hail or Hurricane: 2% of the Limit of Liability or $1,701.00 Coverage B - 10% of Coverage A $2,000, whichever is greater. All Other Perils: $1,000 TOTAL PREMIUM $1,701.00 COMMERCIAL PROPERTY: LIMIT OF LIABILITY DEDUCTIBLES PREMIUM Building - Windstorm, Hail or Hurricane: % of the Limit of Liability or , whichever is greater. All Other Perils: TOTAL PREMIUM Optional Coverages, Assessments, Surcharges, Taxes, Fees (if applicable): TOTAL AMOUNT $1,701.00 FORMS AND ENDORSEMENTS which are made a part of this Certificate at the time of issuance: MIP 223 AS (01-12),MIP 233 (01-12),MIP 05 TX (01-12),MIP 243 TX (04-18),NOTI1083 (01-20) NT0278 (08-19),MIP 219 (02-20),MIP 239 TX (02-12) BORROWER - Name and address: ROBERT K WASHINGTON 2610 INDIAN TRL DR MISSOURI CITY, TX 77489-5200 Loan No .: 0038071999 CLAIMS: 1-800-326-2845 Issue Date: 05/12/2022 ALL OTHER INQUIRIES: 1-888-882-1855 Countersignature (where required) MIP 04 AS (01-12) Page 1 of 1 MIPO4ASR-1116",Hazard Insurance,LD,20220722123724,org,3,0.0,No,282
22926,ADR-2022-100004515340,"AMERICAN SECURITY INSURANCE COMPANY P.O. BOX 50355, ATLANTA, GA 30302 A Stock Insurance Company Home Office: Wilmington, DE Residential Dwelling Certificate This Certificate only covers buildings and structures. Please read your Certificate and all endorsements carefully. THIS CERTIFICATE JACKET TOGETHER WITH THE DWELLING FORM AND ENDORSEMENTS, IF ANY, ISSUED TO FORM A PART THEREOF, COMPLETES THIS CERTIFICATE. QUICK REFERENCE Beginning Beginning On Page On Page AGREEMENT 1 CONDITIONS DEFINITIONS 1 Certificate Period 5 COVERAGES Insurable Interests 5 1 OTHER COVERAGES 1 Concealment or Fraud 5 Other Structures Your Duties After Loss 1 5 Debris Removal 2 Loss Settlement 6 Loss to a Pair or Set Reasonable Repairs 2 6 Property Removed 2 Glass Replacement 6 Appraisal Collapse 2 6 Glass or Safety Glazing Material 2 Other Insurance 6 Ordinance or Law 3 Subrogation 6 PERILS INSURED AGAINST Action Against Us 7 GENERAL EXCLUSIONS Loss Payment 7 Ordinance or Law 4 Deductible 7 Earth Movement 4 Abandonment of Property 7 Water Damage 4 No Benefit to Bailee 7 Power Failure 5 Cancellation 7 Neglect 5 Non-Renewal 7 War 5 Liberalization Clause 7 5 Waiver or Change of Certificate Provisions Nuclear Hazard 8 Intentional Loss 5 Assignment 8 Weather conditions 5 Nuclear Hazard Clause 8 Acts or decisions 5 Salvage and Recoveries 8 Faulty, inadequate or defective 5 Volcanic Eruption Period 8 Premiums 8 READ YOUR CERTIFICATE CAREFULLY MIP 223 AS (01-12) MIP223A5-0212",Hazard Insurance,LD,20220722123724,org,4,0.0,No,231
22927,ADR-2022-100004515340,"Mortgagee's Interest Protection Residential Dwelling Certificate AGREEMENT COVERAGE A - Dwelling We will provide the insurance described in this 1. Property Covered We cover Certificate in return for the premium and compliance with all applicable provisions of this Certificate and the a. The 1-4 family dwelling on the described location shown in the Declarations, used Declarations, which is attached to and forms a part of this Certificate. principally for dwelling purposes, including structures attached to the dwelling; . Materials and supplies located on or next to the DEFINITIONS described location used to construct, alter or repair the dwelling or other structures on the In this Certificate, you and your refer to the financial institution shown as named insured in the described location; and If not otherwise covered in this Certificate, Declarations. We, us and our refer to the Company c. building equipment and outdoor equipment providing this insurance. In addition, certain words and used for the service of and located on the phrases are defined as follows: 1. Borrower refers to the person or persons who have described location. 2. Property Not Covered entered into a lien or mortgage agreement with the a. Personal property of any kind. named insured for the property shown as the b. Outdoor trees, shrubs, plants and lawns. described location in the Declarations. 2. Residential Property means the dwelling and other . Outdoor swimming pools; fences, piers, structures covered by this Certificate and shown as wharves and docks; beach or diving platforms or appurtenances; retaining walls not the described location in the Declarations. 3. Net Loan Balance means and is limited to only the constituting a part of buildings; walks, roadway; and other paved surfaces. unpaid principal balance plus earned interest as of the date of loss. . Cost of excavations, grading or filling. Ti e. Foundations of buildings, machinery, boilers or 4. 
Actual Cash Value means the cost of replacing the damaged or destroyed property with a comparable engines which foundations are below the surface of the ground. new property, minus depreciation and obsolescence. Described Location. The legal description, or f. Pilings, piers, pipes, flues and drains which are underground. common street address, of real property which has g. Pilings which are below the low water mark. been pledged under a valid deed of trust, mortgage h. Land, including land on which the residential document, or any other mortgage instrument as property is located. security for a loan made, assumed or serviced by an insured and upon which a building is situated. COVERAGE B - Other Structures 6. Dwelling. A building designed for use as a residence for no more than four families or a We cover other structures on the described location, mobile home. 7. Mobile Home means: set apart from the dwelling by clear space. This includes structures connected to the dwelling by only a a. A building which satisfies the...",Hazard Insurance,LD,20220722123724,org,5,0.0,No,646


In [24]:
# Encode the sample texts to fixed-length (512) id tensors.
# `padding='max_length'` is the supported replacement for the deprecated
# `pad_to_max_length=True`; together with truncation it yields the same 512-wide output.
encoded_data = tokenizer.batch_encode_plus(
    data_h.text.values.tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    max_length=512,
    truncation=True,
    return_tensors='pt'
)

In [53]:
# Display the encoded batch (input ids, token type ids, attention mask).
# type(encoded_data)
encoded_data

{'input_ids': tensor([[  101,  6887,  2232,  ...,     0,     0,     0],
        [  101,  2054,  2057,  ...,  1997, 10995,   102],
        [  101,  8170,  2015,  ...,  6070,  2475,   102],
        [  101,  2137,  3036,  ...,     0,     0,     0],
        [  101, 14344,  2063,  ...,  2029,  1037,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [26]:
# Token ids of the third encoded sample text.
encoded_data["input_ids"][2]#==100

tensor([  101,  8170,  2015,  2137,  3036,  5427,  2194,  8196,  2193,  1024,
        19875,  2099, 17465, 16932,  2683,  2581, 24594,  2581, 22610, 13433,
         3482,  2753, 19481,  2629,  1010,  5865,  1010, 11721, 19988,  2692,
         2475,  1037,  4518,  5427,  2194,  8196,  2558,  1024,  3843,  2104,
         1996,  8910,  1997,  4621,  3058,  4621,  2051,  4654, 16781,  3058,
         3040,  3343,  2053,  1012,  1024,  5709,  1013,  5890,  1013, 16798,
         2475,  2260,  1024,  5890,  2572,  5709,  1013,  5890,  1013, 16798,
         2509,  2771,  2361,  1011, 22110,  2232,  1011,  6185, 14526,  2549,
         1011,  4002,  2315, 16021, 12165,  1998,  5653,  2075,  4769,  1024,
         2005,  2194,  2224,  1024,  6887,  2232, 14344,  2578,  8945,  6190,
         1024,  3700,  1024,  2199,  2509,  2049, 18530,  1998,  1013,  2030,
        24022,  2465,  1024,  2004,  2037,  3037,  2089,  3711,  2060,  1024,
        21554, 16420,  2094,  6185, 14526, 12740, 16048,  2692, 

In [None]:
# Number of WordPiece tokens each text produces (before any truncation).
# Iterate the 'text' column by name instead of `row[1]`: positional access on a
# label-indexed row is deprecated in pandas and silently picks another column if the
# column order ever changes. Passing `total` lets tqdm show a percentage and an ETA.
token_len = [len(tokenizer.tokenize(text))
             for text in tqdm(data['text'], total=len(data))]

len(token_len)

306343it [41:12, 136.98it/s]

In [42]:
# Use a dedicated name here: `df` holds the train/validation frame that the encoding cells
# filter on `data_type`, so it must not be overwritten by this summary frame.
token_len_df = pd.DataFrame({'col':token_len})
token_len_df.describe()

Unnamed: 0,col
count,312843.0
mean,671.715531
std,489.760179
min,1.0
25%,346.0
50%,566.0
75%,892.0
max,8417.0


### Findings
1. On average, BERT reads and classifies based on only about 80% of the OCR text.
2. More than 50% of the OCR texts exceed the maximum limit of 512 tokens.
3. Based on the min/max token counts, data cleaning is required.

In [50]:
# 45th percentile of the per-text token counts.
np.percentile(token_len, 45)

515.0

In [52]:
# Bucket the token counts into (0, 650], (650, 700] and (700, 10000] and count each bucket.
_token_bins = [0, 650, 700, 10000]
out = pd.cut(token_len, bins=_token_bins)
out.value_counts().sort_index()

(0, 650]        182424
(650, 700]       12566
(700, 10000]    117853
dtype: int64

In [None]:
def _encode_split(split_name):
    """Tokenize the texts of the rows of `df` whose `data_type` equals `split_name`.

    Every text is truncated/padded to exactly 512 token ids and returned as PyTorch
    tensors. `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    """
    return tokenizer.batch_encode_plus(
        df[df.data_type == split_name].text.values.tolist(),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        max_length=512,
        truncation=True,
        return_tensors='pt'
    )


encoded_data_train = _encode_split('train')
encoded_data_val = _encode_split('val')

In [None]:
# Unpack the encoder output and build the matching label tensors for each split.
_train_rows = df.data_type == 'train'
_val_rows = df.data_type == 'val'

input_ids_train, attention_masks_train = (
    encoded_data_train['input_ids'], encoded_data_train['attention_mask'])
labels_train = torch.tensor(df.loc[_train_rows, 'label'].values)

input_ids_val, attention_masks_val = (
    encoded_data_val['input_ids'], encoded_data_val['attention_mask'])
labels_val = torch.tensor(df.loc[_val_rows, 'label'].values)

In [None]:
# Each dataset item is an (input_ids, attention_mask, label) triple.
dataset_train, dataset_val = (
    TensorDataset(*split_tensors)
    for split_tensors in (
        (input_ids_train, attention_masks_train, labels_train),
        (input_ids_val, attention_masks_val, labels_val),
    )
)

In [None]:
batch_size = 32

# Settings shared by both loaders; only the training loader shuffles.
_loader_settings = dict(num_workers=0, batch_size=batch_size)

dataloader_train = DataLoader(dataset_train, shuffle=True, **_loader_settings)
dataloader_validation = DataLoader(dataset_val, shuffle=False, **_loader_settings)

In [None]:
# Number of batches in one pass over the training loader.
len(dataloader_train)

In [None]:
# Upper bound on the number of training samples per epoch (the last batch may be smaller).
# Uses `batch_size` rather than a hard-coded 32 so it stays correct if the batch size changes.
len(dataloader_train)*batch_size

In [None]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    """BERT encoder built from a `BertConfig`, followed by dropout and a linear head.

    The encoder is constructed from a configuration object (`BertModel(configuration)`),
    not loaded with `from_pretrained`.

    Args:
        dropout: Dropout probability applied to the pooled output before the linear head.
        BERTconfig: If True, build the encoder from the custom settings below; otherwise
            use the library-default `BertConfig()` and ignore those settings.
        attention_heads: Number of attention heads per layer (custom config only).
        hidden_layers: Number of transformer encoder layers (custom config only).
        numclass: Number of output logits.
        hidden_dropout_prob: Dropout probability inside the encoder (custom config only).
        hidden_act: Activation function name used inside the encoder (custom config only).
        position_embedding_type: Position embedding type (custom config only).
    """

    def __init__(self,
                 dropout=0.1,
                 BERTconfig=False,
                 attention_heads=12,
                 hidden_layers=12,
                 numclass=1,
                 hidden_dropout_prob=0.1,
                 hidden_act='gelu',
                 position_embedding_type='absolute'):

        super(BertClassifier, self).__init__()

        if BERTconfig:
            # Forward every encoder-related argument; previously hidden_dropout_prob,
            # hidden_act and position_embedding_type were accepted but silently ignored.
            configuration = BertConfig(num_attention_heads=attention_heads,
                                       num_hidden_layers=hidden_layers,
                                       hidden_dropout_prob=hidden_dropout_prob,
                                       hidden_act=hidden_act,
                                       position_embedding_type=position_embedding_type)
        else:
            configuration = BertConfig()

        self.bert = BertModel(configuration)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the config instead of hard-coding 768.
        self.linear = nn.Linear(configuration.hidden_size, numclass, bias=True)

    def forward(self, input_id, mask):
        """Return the classification logits for a batch.

        Args:
            input_id: Token id tensor passed to the encoder as `input_ids`.
            mask: Attention mask tensor passed to the encoder as `attention_mask`.

        Returns:
            Output of the linear head applied to the (dropped-out) pooled encoder output.
        """
        # With return_dict=False the encoder returns a tuple; the second item is the pooled output.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)

        return linear_output

In [None]:
# Category name -> integer class id; its size sets the number of output logits below.
label_dict

In [None]:
# Architecture hyper-parameters (also embedded in the checkpoint file names).
ATTENTION = 12
HIDDEN_LAYER = 1
CLASSES = len(label_dict)

_model_settings = dict(
    dropout=0.3,
    BERTconfig=True,
    attention_heads=ATTENTION,
    hidden_layers=HIDDEN_LAYER,
    numclass=CLASSES,
    hidden_dropout_prob=0.3,
    position_embedding_type='absolute',
)

# Build the classifier, move it to the target device and switch it to training mode.
model = BertClassifier(**_model_settings)
model.to(device)
model.train()

In [None]:
EPOCH = 2

# Evaluate on train + validation data and write a checkpoint every `save_model` optimizer steps.
save_model = 250

criterion = nn.CrossEntropyLoss(weight=classweight.to(device), reduction='mean')

optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  eps=1e-8,
                  no_deprecation_warning=True)


scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=250,
                                            num_training_steps=len(dataloader_train)*EPOCH)


def _evaluate(model, dataloader, criterion, device):
    """Score `model` on every batch of `dataloader` without updating it.

    Runs in eval mode (dropout disabled) and under `torch.no_grad()` (no autograd graph is
    built), then restores the mode the model was in before the call.

    Returns:
        (accuracy, weighted F1, mean per-batch loss)
    """
    was_training = model.training
    model.eval()

    batch_labels, batch_preds = [], []
    loss_sum = 0.0
    with torch.no_grad():
        for batch in dataloader:
            input_id = batch[0].to(device)
            mask = batch[1].to(device)
            label = batch[2].to(device)
            output = model(input_id, mask)
            loss_sum += criterion(output, label).item()
            batch_labels.append(label.cpu())
            batch_preds.append(output.argmax(dim=1).cpu())

    model.train(was_training)

    labels = torch.cat(batch_labels).numpy()
    preds = torch.cat(batch_preds).numpy()
    # The criterion already averages within a batch (reduction='mean'), so the epoch-level
    # loss is the mean over batches; dividing by the sample count again would shrink it
    # by a factor of the batch size.
    return (accuracy_score(labels, preds),
            f1_score(labels, preds, average='weighted'),
            loss_sum / len(dataloader))


logs = []
i=0  # global optimizer-step counter across epochs

model.train()
for epoch in range(EPOCH):

    print('epoch--- ',epoch+1)

    for train_input in tqdm(dataloader_train):
        i=i+1
        model.zero_grad()
        input_id = train_input[0].to(device)
        mask = train_input[1].to(device)
        train_label = train_input[2].to(device)
        output = model(input_id, mask)
        batch_loss = criterion(output, train_label)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        if(i%save_model==0):
            total_acc_train, total_f1_train, total_loss_train = _evaluate(
                model, dataloader_train, criterion, device)
            total_acc_val, total_f1_val, total_loss_val = _evaluate(
                model, dataloader_validation, criterion, device)

            print('train_accuracy: ', total_acc_train, ', train_f1: ', total_f1_train, ', train_loss: ', total_loss_train)
            print('val_accuracy: ', total_acc_val, ',  val_f1: ', total_f1_val, ', val_loss: ', total_loss_val)

            torch.save(model.state_dict(),
                       f'/mnt/bert_models/BERT_encoder{HIDDEN_LAYER}_attention{ATTENTION}_epoch{i}.model')

            # The 'epoch' key holds the optimizer-step count `i`; the plotting cells read it under this name.
            logs.append({'epoch': i,
                     'train_accuracy': total_acc_train,
                     'train_f1': total_f1_train,
                     'train_loss': total_loss_train,
                     'val_accuracy': total_acc_val,
                     'val_f1': total_f1_val,
                     'val_loss': total_loss_val})

In [None]:
# One row per evaluation checkpoint; losses rounded to 3 decimals, F1 scores to 2.
_round_digits = {'val_loss': 3, 'train_loss': 3, 'train_f1': 2, 'val_f1': 2}
log = pd.DataFrame(logs).round(_round_digits)
#log.index = log.epoch
#del log['epoch']
log

In [None]:
import matplotlib.pyplot as plt

fig,ax = plt.subplots(figsize=(20,10))

# F1 curves on the left axis (solid lines).
f1_lines = ax.plot(log.epoch, log.train_f1, color="red", label="train_f1")
f1_lines += ax.plot(log.epoch, log.val_f1, color="green", label="val_f1")

# set x-axis label
ax.set_xlabel("iterations", fontsize = 14)

# set y-axis label
ax.set_ylabel("F1", fontsize=14)

ax.set_xticks(log.epoch.values)

# Loss curves on the right axis. Same colour per split, but dashed, so they can be
# told apart from the F1 curves.
ax2=ax.twinx()
loss_lines = ax2.plot(log.epoch, log.train_loss, color="red", linestyle="--", label="train_loss")
loss_lines += ax2.plot(log.epoch, log.val_loss, color="green", linestyle="--", label="val_loss")

ax2.set_ylabel("LOSS", fontsize=14)

# Single legend covering the curves of both axes.
all_lines = f1_lines + loss_lines
ax.legend(all_lines, [line.get_label() for line in all_lines], loc="upper left")

plt.grid(color = 'green', linestyle = '--', linewidth = 0.5)
plt.show()

In [None]:
# Train vs. validation F1 at every evaluation checkpoint.
_, f1_ax = plt.subplots(figsize=(20,10))
for _column in ('train_f1', 'val_f1'):
    f1_ax.plot(log.epoch, log[_column], label=_column)
f1_ax.legend(loc="upper left")
plt.xticks(log.epoch, rotation=90)
f1_ax.grid(True)
plt.show()

In [None]:
# Train vs. validation loss for every logged checkpoint.
fig_loss, ax_loss = plt.subplots(figsize=(20, 10))
for column in ('train_loss', 'val_loss'):
    ax_loss.plot(log.epoch, log[column], label=column)
ax_loss.legend(loc="upper right")
ax_loss.set_xticks(log.epoch)
ax_loss.tick_params(axis='x', labelrotation=90)
ax_loss.grid(True)
plt.show()

In [None]:
'''EPOCH = 5

criterion = nn.CrossEntropyLoss()

optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8,
                  no_deprecation_warning=True)
                  

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=500,
                                            num_training_steps=len(dataloader_train)*EPOCH)

logs = []

for epoch in range(EPOCH):
    
    print('epoch--- ',epoch+1)
    total_loss_train = 0
    label_train =  torch.tensor([], dtype=torch.uint8).to(device)
    pred_train = torch.tensor([], dtype=torch.uint8).to(device)

    for train_input in tqdm(dataloader_train):
        model.zero_grad()
        input_id = train_input[0].to(device)
        mask = train_input[1].to(device)
        train_label = train_input[2].to(device)
        output = model(input_id, mask)
        label_train = torch.cat((label_train, train_label), 0)
        batch_loss = criterion(output, train_label)
        total_loss_train += batch_loss.item()
        pred_train = torch.cat((pred_train, output.argmax(dim=1)), 0)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()
        #break
    
    label_train = label_train.detach().cpu().numpy()
    pred_train = pred_train.detach().cpu().numpy()
    total_acc_train = accuracy_score(label_train, pred_train)
    total_f1_train = f1_score(label_train, pred_train, average='weighted')
    total_loss_train = total_loss_train/(len(dataloader_train)*batch_size)

    label_val =  torch.tensor([], dtype=torch.uint8).to(device)
    pred_val = torch.tensor([], dtype=torch.uint8).to(device)
    total_loss_val = 0

    with torch.no_grad():
        for val_input in tqdm(dataloader_validation):
            input_id = val_input[0].to(device)
            mask = val_input[1].to(device)
            val_label = val_input[2].to(device)
            output = model(input_id, mask)
            label_val = torch.cat((label_val, val_label), 0)
            batch_loss = criterion(output, val_label)
            total_loss_val += batch_loss.item()
            pred_val = torch.cat((pred_val, output.argmax(dim=1)), 0)
            #break
    
    label_val = label_val.detach().cpu().numpy()
    pred_val = pred_val.detach().cpu().numpy()
    total_acc_val = accuracy_score(label_val, pred_val)
    total_f1_val = f1_score(label_val, pred_val, average='weighted')
    total_loss_val = total_loss_val/(len(dataloader_validation)*batch_size)
    
    torch.save(model.state_dict(), f'./model/BERT_encoder{HIDDEN_LAYER}_attention{ATTENTION}_epoch{epoch+1}.model')
    
    print('train_accuracy: ', total_acc_train, ', train_f1: ', total_f1_train, ', train_loss: ', total_loss_train)
    print('val_accuracy: ', total_acc_val, ',  val_f1: ', total_f1_val, ', val_loss: ', total_loss_val)
    print('\n')
    
    logs.append({'epoch': epoch+1,
                 'train_accuracy': total_acc_train,
                 'train_f1': total_f1_train,
                 'train_loss': total_loss_train,
                 'val_accuracy': total_acc_val,
                 'val_f1': total_f1_val,
                 'val_loss': total_loss_val})'''

In [None]:
label_dict

### Evaluation

In [None]:
#test['label'] = test.new_category.replace(label_dict)
test.head(2)

In [None]:
test['new_category'].value_counts()

In [None]:
def BERTEncode(df, size, shuffle=False, text_col='text', max_length=512):
    """Tokenize one text column of ``df`` and wrap it in a DataLoader.

    Uses the notebook-level ``tokenizer``; every text is padded/truncated to
    ``max_length`` tokens.

    Args:
        df: DataFrame with the column named by ``text_col`` and a ``label`` column.
        size: batch size of the returned DataLoader.
        shuffle: whether the DataLoader shuffles the samples.
        text_col: name of the column to tokenize (defaults to ``'text'``, the
            column this function always used).
        max_length: fixed sequence length passed to the tokenizer (default 512).

    Returns:
        DataLoader whose batches are ``(input_ids, attention_mask, labels)``.
    """
    encoded_data = tokenizer.batch_encode_plus(
        df[text_col].values,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors='pt')
    input_ids = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']
    labels = torch.tensor(df['label'].values)
    tensordataset = TensorDataset(input_ids, attention_masks, labels)
    dataloader = DataLoader(tensordataset,
                            shuffle=shuffle,
                            num_workers=0,
                            batch_size=size)
    return dataloader

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataloader_test = BERTEncode(test, 32)

In [None]:
# Rebuild the classifier and load a saved checkpoint for evaluation on the test split.
# NOTE(review): the architecture values below presumably have to match the ones the
# checkpoint was trained with ("encoder1_attention12" in the file name) - confirm.
ATTENTION = 12
HIDDEN_LAYER = 1
CLASSES = len(label_dict)

model = BertClassifier(dropout=0.2,
                       BERTconfig=True,
                       attention_heads=ATTENTION,
                       hidden_layers=HIDDEN_LAYER,
                       numclass=CLASSES,
                       hidden_dropout_prob=0.1,
                       position_embedding_type='absolute')
#model = BertClassifier(BERTconfig=False, numclass= CLASSES)

model.to(device)

# Map the checkpoint onto the notebook-wide `device` rather than a second,
# separately hard-coded CUDA device, so the two can never disagree.
model.load_state_dict(torch.load('/mnt/bert_models/BERT_encoder1_attention12_epoch3750.model', map_location=device))
model.eval()

In [None]:
def validate(dataloader, model):
    """Run ``model`` over ``dataloader`` and report accuracy and weighted F1.

    The model is not switched to eval mode here; the caller is expected to have
    done so. Batches are moved to the notebook-level ``device``.

    Args:
        dataloader: iterable of ``(input_ids, attention_mask, labels)`` batches.
        model: callable taking ``(input_ids, attention_mask)`` and returning
            per-class logits, one row per sample.

    Returns:
        Tuple ``(actual, predicted, cr)``: the true labels and argmax predictions
        as numpy arrays, and ``cr``, a list with the highest softmax probability
        of each sample (``np.float64`` values), all in dataloader order.
    """
    label_batches = []
    pred_batches = []
    cr = []

    for batch in tqdm(dataloader):
        input_id = batch[0].to(device)
        mask = batch[1].to(device)
        labels = batch[2].to(device)
        with torch.no_grad():
            output = model(input_id, mask)
            # Highest class probability per row, computed for the whole batch at once
            # instead of one softmax and one device-to-host copy per sample.
            confidence = torch.nn.functional.softmax(output, dim=1).max(dim=1)[0]
        # Collect per-batch results and concatenate once at the end; growing a
        # tensor with torch.cat on every batch copies all earlier results again.
        label_batches.append(labels.detach().cpu())
        pred_batches.append(output.argmax(dim=1).detach().cpu())
        cr.extend(confidence.detach().cpu().numpy().astype(np.float64))

    if label_batches:
        actual = torch.cat(label_batches, 0).numpy()
        predicted = torch.cat(pred_batches, 0).numpy()
    else:
        # Empty dataloader: same empty uint8 arrays the accumulators started as.
        actual = np.array([], dtype=np.uint8)
        predicted = np.array([], dtype=np.uint8)

    total_acc_test = accuracy_score(actual, predicted)
    total_f1_test = f1_score(actual, predicted, average='weighted')
    print('Test Accuracy:', round(total_acc_test,2))
    print('Test f1:', round(total_f1_test,2))
    return (actual, predicted, cr)

In [None]:
actual, predicted, cr =validate(dataloader_test, model)

In [None]:
# Reverse lookup: integer class id -> category name.
label_dict_inv = {class_id: name for name, class_id in label_dict.items()}
label_dict_inv

In [None]:
# Per-class report on the test predictions: how many samples of each true class
# were predicted correctly, how many were not, and the median confidence of the
# misses. `classes` collects what the missed samples of true class 1 were
# predicted as (inspected in the next cell).
checkfor = range(len(label_dict))
#checkfor = [0]
final_predicted = []

classes = []
samples = list(zip(actual, predicted, cr))
for check in checkfor:
    members = [(p, c) for a, p, c in samples if a == check]
    missed = [(p, c) for p, c in members if p != check]
    total = len(members)
    notclassified = len(missed)
    correct = total - notclassified
    score = [c for _, c in missed]
    if check == 1:
        classes.extend(p for p, _ in missed)

    if total != 0:
        acc = round(correct/total, 2)
        print('accuracy for {}: {}/{}={}'.format(label_dict_inv[check], correct, total, acc))
        print('notclassified {}'.format(notclassified))
        if notclassified != 0:
            print('notclassified score for {}'.format(np.median(score)))
    print('\n')

In [None]:
np.unique(classes, return_counts=True)

### GLOBAL_VALIDATION

In [None]:
global_validate = pd.read_csv('./global_validate.csv')
global_validate.head(2)

In [None]:
#global_validate = global_validate[global_validate['new_text_format']=='new']

In [None]:
global_validate['category'].value_counts()

In [None]:
#global_validate.drop(global_validate[global_validate['category']=='Tax Return'].index, inplace=True)

In [None]:
# Count the validation rows whose (ADR, text) pair also occurs in `data`, and
# remember their index labels in `index`.
# The pairs from `data` are collected into a set once, so each validation row is
# a constant-time lookup instead of re-listing and re-filtering `data` per row.
known_pairs = set(zip(data['ADR'].tolist(), data['text'].tolist()))
index = [i
         for i, adr, text in zip(global_validate.index,
                                 global_validate['ADR'].tolist(),
                                 global_validate['text'].tolist())
         if (adr, text) in known_pairs]
count = len(index)
count

In [None]:
#del global_validate['new_category']
#global_validate.drop(index=index, inplace=True)
global_validate.shape

In [None]:
#global_validate['category'].value_counts()

In [None]:
global_validate.to_csv('./global_validate.csv', header=True, index=False)

In [None]:
# Categories the classifier distinguishes; every other category becomes 'others'.
cats = ['Retirement Account Statement(s)',
        'Mortgage Statement',
        'Bank Statement',
        'Purchase Agreement',
        'Rental Agreements(s)',
        'Hazard Insurance',
        'Divorce Decree / Child Support',
        'Bankruptcy Papers']

# cats = ['Retirement Account Statement(s)', 
#         'Mortgage Statement',
#         'Bank Statement']

temp = [category if category in cats else 'others'
        for category in global_validate['category']]
global_validate['new_category'] = temp
global_validate.head(2)

In [None]:
global_validate['new_category'].value_counts()

In [None]:
global_validate['label'] = global_validate.new_category.replace(label_dict)
global_validate.head(1)

In [None]:
global_validate['text']= [str(text) for text in global_validate['text']]

In [None]:
def BERTEncode(df, size, shuffle=False, text_col='new_format_text', max_length=512):
    """Tokenize one text column of ``df`` and wrap it in a DataLoader.

    This redefinition replaces the earlier ``BERTEncode`` in the notebook; it
    reads ``'new_format_text'`` by default, and ``text_col`` selects any other column.
    Uses the notebook-level ``tokenizer``; every text is padded/truncated to
    ``max_length`` tokens.

    NOTE(review): the preceding cell converts the ``'text'`` column to ``str``,
    not ``'new_format_text'`` - confirm the intended column.

    Args:
        df: DataFrame with the column named by ``text_col`` and a ``label`` column.
        size: batch size of the returned DataLoader.
        shuffle: whether the DataLoader shuffles the samples.
        text_col: name of the column to tokenize (default ``'new_format_text'``).
        max_length: fixed sequence length passed to the tokenizer (default 512).

    Returns:
        DataLoader whose batches are ``(input_ids, attention_mask, labels)``.
    """
    encoded_data = tokenizer.batch_encode_plus(
        df[text_col].values,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors='pt')
    input_ids = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']
    labels = torch.tensor(df['label'].values)
    tensordataset = TensorDataset(input_ids, attention_masks, labels)
    dataloader = DataLoader(tensordataset,
                            shuffle=shuffle,
                            num_workers=0,
                            batch_size=size)
    return dataloader

In [None]:
dataloader = BERTEncode(global_validate, 32, False)

In [None]:
# Rebuild the classifier and load a saved checkpoint for the global-validation run.
# NOTE(review): this cell loads a different checkpoint (epoch3000 under /mnt/model)
# than the test-evaluation cell (epoch3750 under /mnt/bert_models) - confirm that
# is intended.
ATTENTION = 12
HIDDEN_LAYER = 1
CLASSES = len(label_dict)


model = BertClassifier(dropout=0.2,
                       BERTconfig=True,
                       attention_heads=ATTENTION,
                       hidden_layers=HIDDEN_LAYER,
                       numclass=CLASSES,
                       hidden_dropout_prob=0.1,
                       position_embedding_type='absolute')

model.to(device)
# Map the checkpoint onto the notebook-wide `device` rather than a second,
# separately hard-coded CUDA device, so the two can never disagree.
model.load_state_dict(torch.load('/mnt/model/BERT_encoder1_attention12_epoch3000.model', map_location=device))
model.eval()

In [None]:
actual, predicted, cr = validate(dataloader, model)

In [None]:
global_validate.shape

In [None]:
# Reverse lookup: integer class id -> category name.
label_dict_inv = {class_id: name for name, class_id in label_dict.items()}
label_dict_inv

In [None]:
label_dict

In [None]:
# Sweep the confidence threshold from 0% to 90% in steps of 2. For each threshold,
# predictions whose confidence is not above it are replaced by class 0; then the
# per-class accuracies (rounded to 2 decimals) are summed and divided by the number
# of distinct `new_category` values, and the misclassified samples are counted.
threshholds = range(0,92,2)
checkfor = range(len(label_dict))

actual_arr = np.asarray(actual)
predicted_arr = np.asarray(predicted)
confidence = np.asarray(cr)
n_categories = len(global_validate['new_category'].unique())

accuracy = []
notclassified = []
for thresh in threshholds:

    final_predicted = np.where(confidence > (thresh/100), predicted_arr, 0)

    acc = 0
    incorrect = 0
    for check in checkfor:
        in_class = actual_arr == check
        total = int(in_class.sum())
        correct = int((final_predicted[in_class] == check).sum())
        incorrect += total - correct
        if total != 0:
            acc = acc + round(correct/total, 2)
    #accuracy.append(acc/len(checkfor))
    accuracy.append(acc/n_categories)
    notclassified.append(incorrect)

In [None]:
# Plot the sweep results: accuracy on the left axis (red) and the number of
# misclassified samples on the right axis (blue), both against the threshold.
fig, ax = plt.subplots(figsize=(20, 10))

ax.plot(threshholds, accuracy, color="red", marker="o")
ax.set_xlabel("threshold", fontsize=14)
ax.set_ylabel("accuracy", color="red", fontsize=14)
ax.set_xticks(list(threshholds))

# Twin axis sharing the x-axis, for the misclassification counts.
ax2 = ax.twinx()
ax2.plot(threshholds, notclassified, color="blue", marker="o")
ax2.set_ylabel("notclassified", color="blue", fontsize=14)

plt.grid(color='green', linestyle='--', linewidth=0.5)
plt.show()

In [None]:
# Per-class report after applying a 0.90 confidence threshold: a prediction is
# kept only when its confidence is above 0.90, otherwise it becomes class 0.
checkfor = range(len(label_dict))
#checkfor = [0]
final_predicted = [p if c > 0.90 else 0 for _, p, c in zip(actual, predicted, cr)]

samples = list(zip(actual, final_predicted, cr))
for check in checkfor:
    members = [(p, c) for a, p, c in samples if a == check]
    # Every sample of this class whose thresholded prediction differs counts as
    # "notclassified", whatever its confidence was.
    missed = [(p, c) for p, c in members if p != check]
    total = len(members)
    notclassified = len(missed)
    correct = total - notclassified
    score = [c for _, c in missed]

    if total != 0:
        acc = round(correct/total, 2)
        print('accuracy for {}: {}/{}={}'.format(label_dict_inv[check], correct, total, acc))
        print('notclassified {}'.format(notclassified))
        if notclassified != 0:
            print('notclassified score for {}'.format(np.median(score)))
    print('\n')

In [None]:
global_validate['predlabel'] = list(final_predicted)
global_validate['predicted'] = [label_dict_inv[i] for i in final_predicted]
global_validate['cr'] = list(cr)
#global_validate.head(2)

In [None]:
global_validate['category'][(global_validate['new_category']=='others')&
                            (global_validate['predicted']!='others')].value_counts()

In [None]:
global_validate['new_category'].value_counts()

In [None]:
global_validate[(global_validate['label']==0)&(global_validate['predlabel']!=0)].shape

In [None]:
global_validate['new_category'].value_counts()

In [None]:
global_validate.shape

### after deployment validation

In [None]:
global_validate = pd.read_csv('/data/mmortgage/amal_workspace/multi_page_classification_preprocessing/domain_testing/BERT Testing_10.08.2022_Results.csv')
global_validate.head(1)

In [None]:
# count=0
# index=[]
# for i,adr in enumerate(global_validate['ADR']):
#     if adr in data['ADR'].values.tolist():
#         count+=1
#         index.append(i)
# global_validate.drop(index=index, inplace=True)
global_validate.shape

In [None]:
label = {
    'others': 0,
    'Retirement Account Statement(s)': 1, 
    'Mortgage Statement': 2, 
    'Bank Statement': 3, 
    'Purchase Agreement': 4, 
    'Rental Agreements(s)': 5,
    'Divorce Decree / Child Support': 6,
    'Hazard Insurance': 7,
    'Bankruptcy Papers': 8
}

In [None]:
# Reverse lookup for `label`: integer class id -> category name.
label_inv = {class_id: name for name, class_id in label.items()}
label_inv

In [None]:
# Collapse every 'Actual Classification' value that is not a key of `label`
# into 'others'.
cats = list(label)

temp = [category if category in cats else 'others'
        for category in global_validate['Actual Classification']]
global_validate['new_category'] = temp

In [None]:
global_validate['label'] = global_validate.new_category.replace(label)
global_validate.head(2)

In [None]:
global_validate['new_category'].value_counts()

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    The encoder is built from a ``BertConfig`` via ``BertModel(configuration)``,
    not from ``from_pretrained``, so this class loads no pretrained weights itself.

    Args:
        dropout: dropout probability applied to the pooled output before the head.
        BERTconfig: when True, the encoder is configured from the arguments below;
            when False, a default ``BertConfig()`` is used and they are ignored.
        attention_heads: number of attention heads per encoder layer.
        hidden_layers: number of encoder layers.
        numclass: number of output classes (width of the linear head).
        hidden_dropout_prob: dropout probability inside the encoder.
        hidden_act: activation function name used inside the encoder.
        position_embedding_type: position embedding type used by the encoder.
    """

    def __init__(self,
                 dropout=0.1,
                 BERTconfig=False,
                 attention_heads=12,
                 hidden_layers=12,
                 numclass=1,
                 hidden_dropout_prob=0.1,
                 hidden_act='gelu',
                 position_embedding_type='absolute'):

        super().__init__()

        if BERTconfig:
            # Forward every encoder option; hidden_dropout_prob, hidden_act and
            # position_embedding_type used to be accepted but silently dropped.
            configuration = BertConfig(num_attention_heads=attention_heads,
                                       num_hidden_layers=hidden_layers,
                                       hidden_dropout_prob=hidden_dropout_prob,
                                       hidden_act=hidden_act,
                                       position_embedding_type=position_embedding_type)
        else:
            configuration = BertConfig()
        #self.bert = BertModel.from_pretrained('bert-base-uncased', output_attentions=attention)
        self.bert = BertModel(configuration)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the configuration instead of a literal 768 so it
        # always matches the encoder's pooled output width.
        self.linear = nn.Linear(configuration.hidden_size, numclass, bias=True)

    def forward(self, input_id, mask):
        """Return the class logits for a batch.

        Args:
            input_id: token ids, passed to the encoder as ``input_ids``.
            mask: attention mask, passed to the encoder as ``attention_mask``.

        Returns:
            Output of the linear head applied to the dropped-out pooled output.
        """
        # With return_dict=False the encoder returns a tuple; only the second
        # element (the pooled output) is used.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)

        return linear_output

In [None]:
import configparser
import ast

# Build one classifier per section of ./config.ini and load its weights.
# The five lists below are parallel: entry k of each belongs to the k-th section.
device = torch.device('cuda')

configs = configparser.ConfigParser()
# ConfigParser.read() skips unreadable files silently and returns the list of
# files it actually parsed, so fail loudly instead of ending up with no models.
if not configs.read('./config.ini'):
    raise FileNotFoundError('./config.ini could not be read')
labels = []
BERT_MODELS = []
THRESHOLDS = []
GARBAGE_REMOVAL = []

for config in configs.sections():
    section = configs[config]
    # categoryMAP is stored as a Python literal in the ini file.
    label_ = ast.literal_eval(section.get('categoryMAP'))
    labels.append(label_)
    THRESHOLDS.append(float(section.get('cr_thresh')))
    # bool('False') is True because any non-empty string is truthy; getboolean
    # parses yes/no, true/false, on/off, 1/0. A missing option still means False.
    GARBAGE_REMOVAL.append(section.getboolean('garbageremoval', fallback=False))
    HIDDEN_LAYERS = int(section.get('hidden_layers'))
    ATTENTIONS = int(section.get('attentions'))
    CLASSES = len(label_)
    MODEL_FOLDER = section.get('modelfolder')
    model = BertClassifier(BERTconfig=True,
                           attention_heads=ATTENTIONS,
                           hidden_layers=HIDDEN_LAYERS,
                           numclass=CLASSES)
    model.to(device)
    model.load_state_dict(torch.load('./model/{}/BERT.model'.format(MODEL_FOLDER), map_location=device))
    model.eval()
    BERT_MODELS.append(model)

In [None]:
# Tokens dropped during garbage removal: English stopwords plus punctuation marks.
remove_words = set(stopwords.words('english')) | set(string.punctuation)

def predict(model, text, labels, thresh, grabageremoval):
    """Classify one document with one model and apply a confidence threshold.

    Args:
        model: classifier called as ``model(input_ids, attention_mask)``.
        text: document text (converted with ``str``).
        labels: lookup indexed by the winning class index to get the class name.
        thresh: the class name is returned only if the top probability is above this.
        grabageremoval: when truthy, strip ``~``, ``^`` and digits, lowercase,
            and drop stopword/punctuation tokens before encoding.

    Returns:
        ``(class_name, top_probability)``; ``class_name`` is ``'others'`` when the
        top probability does not exceed ``thresh``.
    """
    if grabageremoval:
        stripped = re.sub(r'[~^0-9]', '', str(text))
        kept_tokens = [token for token in word_tokenize(stripped.lower())
                       if token not in remove_words]
        text = " ".join(kept_tokens)

    encoding = tokenizer.encode_plus(
        str(text),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        max_length=512,
        truncation=True,
        return_tensors='pt')

    ids_on_device = encoding['input_ids'].to(device)
    mask_on_device = encoding['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(ids_on_device, mask_on_device)

    # Probabilities of the single encoded document (row 0 of the batch).
    scores = torch.nn.functional.softmax(logits.detach().cpu(), dim=1).numpy()[0]
    top_score = np.max(scores)

    if top_score > thresh:
        return (labels[np.argmax(scores)], top_score)
    return ('others', top_score)

In [None]:
txts = ['LibertyGuard Deluxe Homeowner Policy Declarations Liberty Mutual Personal Insurance Company Liberty Mutual. INSURANCE FAX: ATTN: POLICY NUMBER: H3V-251-492955-70 NAME & ADDRESS Charles Corbett Dolores McKeehan 403 Poplar Ridge Rd Chapmansboro, TN 37035-5334 RESIDENCE PREMISES INSURED 4127 Meadow View Cir Pleasant View, TN 37146-8198 THESE DECLARATIONS EFFECTIVE 12/21/2021 Same as Residence POLICY PERIOD 12/21/2021 through 12/21/2022 RESIDENCE PREMISES 403 Poplar Ridge Rd Chapmmsboro, TN 37035-5334 SECTIONI AND II: COVERAGES AND LIMITS UNDER YOUR LIBERTY GUARD HOMEOWNERS POLICY I: COVERAGE A - YOUR DWELLING COVERAGE B - OTHER STRUCTURES ON RESIDENCE PREMISES COVERAGE C - PERSONAL PROPERTY COVERAGE D - LOSS OF USE OF YOUR RESIDENCE PREMISES $ 280,500 $ 28,050 $ 210,380 Actual Loss Sustained II: COVERAGE E - PERSONAL LIABILITY (EACH OCCURRENCE) COVERAGE F - MEDICAL PAYMENTS TO OTHERS (EACH PERSON) $ 300,000 5,000 DEDUCTIBLE: LOSSES COVERED UNDER SECTION I ARE SUBJECT TO A DEDUCTIBLE OF 1% Wind/Hail (if applicable) 1% NET PREMIUM: $1,359.00 PAID IN FULL NO Replacement Cost Coverage X Yes No Expanded Replacement Cost [X] 20% No Functional Replacement Roof Replacement Cost Coverage Yes [X] No Mortgagee 1 HIGHLANDS RESIDENTIAL MORTGAGE, LTD. Loan # 7018195910 Issoe/atime C/O Conlar P.O. Box 202028 Florence, SC 29502 Jans mathe Market President Secretary Countersigned by: Date: December 09, 2021']

In [None]:
from concurrent.futures import ThreadPoolExecutor
import time

# Run every loaded model on each text and combine the results: model 0 acts as a
# gate - only when it answers 'focused' are the remaining models consulted;
# otherwise the document is labelled 'others'.
# NOTE(review): this iterates over `txts`; the accuracy cell further down compares
# `pred_label` with global_validate['label'], so iterate over
# global_validate['text'].values instead when running the full evaluation.
start_time = time.time()

pred_label = []
i = 0
# One thread pool for the whole run instead of creating and tearing one down
# for every document.
with ThreadPoolExecutor() as executor:
    #for txt in global_validate['text'].values:
    for i, txt in enumerate(txts, start=1):
        print(i)
        # One predict() call per model, all on the same text.
        predictions = list(executor.map(predict,
                                        BERT_MODELS,
                                        [txt]*len(labels),
                                        labels,
                                        THRESHOLDS,
                                        GARBAGE_REMOVAL))
        print(predictions)
        if predictions[0][0] == 'focused':
            classifications = [pred for pred in predictions[1:] if pred[0] != 'others']
            # Prefer the most confident non-'others' answer; if every model said
            # 'others', take the most confident answer overall.
            candidates = classifications if classifications else predictions[1:]
            final_class = max(candidates, key=lambda x: x[1])
            # Disabled alternative: downgrade final_class to 'others' unless its
            # score is above 0.90.
        else:
            final_class = ('others', predictions[0][1])
        pred_label.append(label[final_class[0]])
        print(final_class)
        print('\n')

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
label

In [None]:
acc = accuracy_score(global_validate['label'].values, np.array(pred_label))
f1 = f1_score(global_validate['label'].values, np.array(pred_label), average='weighted')
acc,f1

In [None]:
label_inv

In [None]:
range(len(label_inv))

In [None]:
# Per-class accuracy of `pred_label` against the labelled validation sheet;
# classes with no samples are skipped.
checkfor = range(len(label_inv))
true_labels = global_validate['label'].values
predicted_labels = np.array(pred_label)
for check in checkfor:
    members = [p for a, p in zip(true_labels, predicted_labels) if a == check]
    total = len(members)
    correct = sum(1 for p in members if p == check)
    notclassified = total - correct
    score = []

    if total != 0:
        acc = round(correct/total, 2)
        print('accuracy for {}: {}/{}={}'.format(label_inv[check], correct, total, acc))
        #print('notclassified {}'.format(notclassified))
        print('\n')

In [None]:
global_validate['pred_label'] = pred_label
global_validate.head(1)

In [None]:
global_validate['pred_category'] = global_validate['pred_label'].replace(label_inv )
global_validate.head(1)

In [None]:
global_validate['new_category'].values

In [None]:
# Mark each row PASS when the predicted category equals the expected one, else FAIL.
temp = ['PASS' if expected == predicted_name else 'FAIL'
        for expected, predicted_name in zip(global_validate['new_category'].values,
                                            global_validate['pred_category'].values)]
global_validate['local_P/F'] = temp

In [None]:
global_validate.to_csv('./valid.csv', header=True, index=False)

In [None]:
global_validate[(global_validate['label']==0)&
                            (global_validate['pred_label']!=0)].shape

In [None]:
bank 
mortgage 
Retirement

In [None]:
Purchase
Rental

In [None]:
tax return
k1
w2
paystub

In [None]:
import nlpaug.augmenter.sentence as nas
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac

In [None]:
augc= nac.OcrAug() 
'''aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', 
                                model_type='bert',
                                action="insert", 
                                aug_p=0.80)'''
                                
#augs = nas.ContextualWordEmbsForSentenceAug(model_path='gpt2', top_p=0.5, batch_size=4)
#randomSentAug = nas.random.RandomSentAug(mode='neighbor', action='swap') <---

In [None]:
txt = [data['text'][1],data['text'][0]]
augmented_data = augc.augment(txt)
#augmented_data = randomWordAug.augment(txt)

In [None]:
txt

In [None]:
augmented_data