In [1]:
import pandas as pd
# Load the partial acronym lexicon from disk.
lexicon = pd.read_csv("../data/acronym_lexicon_partial.csv")

# Flag rows whose `tags` field is anything other than the literal string '[]'
# (presumably a serialised empty tag list -- confirm against the file that
# produced this CSV). The vectorised comparison replaces a Python-level loop.
lexicon["has_tags"] = lexicon["tags"] != '[]'

# Take an explicit copy of the filtered rows: new columns are added to
# `tagged_lexicon` later on, and assigning onto a boolean-mask slice would
# otherwise raise pandas' SettingWithCopyWarning.
tagged_lexicon = lexicon[lexicon["has_tags"]].copy()

In [2]:
from transformers import pipeline
# Zero-shot classification pipeline backed by the facebook/bart-large-mnli
# checkpoint; used by the classification helpers defined further down.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

In [3]:
# Row count of the tagged subset, then the number of distinct acronyms in it.
print(tagged_lexicon.shape[0])
len(set(tagged_lexicon["acronym"].tolist()))

129


48

In [4]:
# Row count of the full lexicon, then the number of distinct acronyms in it.
print(lexicon.shape[0])
len(set(lexicon["acronym"].tolist()))

500


61

In [7]:
from typing import List

def zeroshot_classify(
    text: str,
    candidate_labels: List[str] = ['finance', 'regulation', 'government institute', 'bank', 'policy', 'guideline', 'regulatory body', 'tax', 'company'],
):
    """Score `text` against each candidate label with the zero-shot classifier.

    Calls the module-level `classifier` with `multi_label=True` and pairs up
    the "labels" and "scores" entries of its result.

    Args:
        text: The text to classify.
        candidate_labels: Labels to score the text against. The default list
            is only read here, never modified.

    Returns:
        A dict mapping each returned label to its score, in the order the
        classifier reported them.
    """
    prediction = classifier(text, candidate_labels, multi_label=True)
    return dict(zip(prediction["labels"], prediction["scores"]))

def assign_class(text, finance_threshold=0.4, label_threshold=0.5):
    """Assign a coarse semantic class to `text` from its zero-shot label scores.

    Decision rule:
      1. If the 'finance' score is below `finance_threshold`, the class is
         "Non-Financial Entity".
      2. Otherwise the highest-scoring label other than 'finance' is taken.
         If its score is below `label_threshold` the class is
         "Other Financial Entity"; otherwise the class is that label.

    Args:
        text: Text passed to `zeroshot_classify` (called with its default
            candidate labels, which include 'finance').
        finance_threshold: Minimum 'finance' score for the text to count as
            financial. Defaults to 0.4.
        label_threshold: Minimum score the best non-'finance' label needs in
            order to be used as the class. Defaults to 0.5.

    Returns:
        A `(output_class, scores)` tuple, where `scores` is the label -> score
        dict from `zeroshot_classify`. NOTE: for "Non-Financial Entity" the
        dict still contains the 'finance' entry; in the other two cases that
        entry has been removed.

    Raises:
        KeyError: If the scores returned by `zeroshot_classify` have no
            'finance' entry.
    """
    classification = zeroshot_classify(text)

    # Guard clause: not financial enough, so no further label is considered.
    if classification["finance"] < finance_threshold:
        return "Non-Financial Entity", classification

    # 'finance' only acts as the gate; it must not compete for the final class.
    del classification["finance"]

    # max() returns the first of any tied labels, which matches taking the
    # head of a stable descending sort, without sorting every entry.
    max_class, max_score = max(classification.items(), key=lambda item: item[1])

    if max_score < label_threshold:
        output_class = "Other Financial Entity"
    else:
        output_class = max_class

    return output_class, classification


In [10]:
# Example usage: classify one short ad-hoc phrase and show the assigned class
# followed by the score dict it was derived from.
#text = tagged_lexicon["summary"].to_list()[0].split(".")
text = "Goverment deparment financial"
classification, data = assign_class(text)
print(classification, data, sep="\n")


[('regulatory body', 0.9633584022521973), ('policy', 0.8513957858085632), ('regulation', 0.5462090373039246), ('bank', 0.3800763785839081), ('guideline', 0.36723998188972473), ('tax', 0.11873148381710052), ('government institute', 0.08829646557569504), ('company', 0.03803764656186104)]
regulatory body
{'regulatory body': 0.9633584022521973, 'policy': 0.8513957858085632, 'regulation': 0.5462090373039246, 'bank': 0.3800763785839081, 'guideline': 0.36723998188972473, 'tax': 0.11873148381710052, 'government institute': 0.08829646557569504, 'company': 0.03803764656186104}


In [11]:
from tqdm import tqdm


# Classify every summary in the tagged lexicon. Each call runs the zero-shot
# model, so this loop is slow (the recorded progress bar shows several seconds
# per row).
semantic_type = []
classification_data = []
for summary in tqdm(tagged_lexicon["summary"]):
    classification, data = assign_class(summary)
    semantic_type.append(classification)
    classification_data.append(data)

# Attach the results with .assign(), which returns a new DataFrame. Assigning
# columns directly onto `tagged_lexicon` (a filtered slice of `lexicon`) would
# trigger pandas' SettingWithCopyWarning.
tagged_lexicon = tagged_lexicon.assign(
    semantic_type=semantic_type,
    classification_data=classification_data,
)


  1%|▎                                               | 1/129 [00:06<13:54,  6.52s/it]

[('regulation', 0.9457217454910278), ('guideline', 0.8812936544418335), ('policy', 0.7326834201812744), ('company', 0.729239284992218), ('bank', 0.5681201219558716), ('regulatory body', 0.4712848663330078), ('government institute', 0.14986301958560944), ('tax', 0.05059107765555382)]


  2%|▋                                               | 2/129 [00:12<13:02,  6.16s/it]

[('bank', 0.8064061403274536), ('regulatory body', 0.7751412987709045), ('regulation', 0.6663729548454285), ('policy', 0.6225541234016418), ('government institute', 0.5584531426429749), ('company', 0.5555528402328491), ('guideline', 0.2599217891693115), ('tax', 0.07821773737668991)]


  3%|█▍                                              | 4/129 [00:26<13:34,  6.51s/it]

[('company', 0.8514012098312378), ('guideline', 0.47469210624694824), ('policy', 0.41347047686576843), ('regulation', 0.29572615027427673), ('regulatory body', 0.24901707470417023), ('tax', 0.20877550542354584), ('government institute', 0.14505250751972198), ('bank', 0.09984436631202698)]


  3%|█▍                                              | 4/129 [00:29<15:31,  7.45s/it]


KeyboardInterrupt: 

In [94]:
# Inspect the lexicon, which now carries the semantic_type and
# classification_data columns.
tagged_lexicon

Unnamed: 0.1,Unnamed: 0,acronym,summary,tags,has_tags,semantic_type,classification_data
2,2,AML,The know your customer or know your client (KY...,"[{'match': 'financial', 'norm': 'financial', '...",True,guideline,"{'guideline': 0.8812936544418335, 'policy': 0...."
12,12,SFC,The Galway county football team represents Gal...,"[{'match': 'board', 'norm': 'board', 'case': '...",True,Non-Financial Entity,"{'regulatory body': 0.8917790651321411, 'guide..."
13,13,FATF,The Financial Action Task Force (on Money Laun...,"[{'match': 'money', 'norm': 'money', 'case': '...",True,policy,"{'policy': 0.9388455152511597, 'regulatory bod..."
14,14,ASIC,An application-specific integrated circuit (AS...,"[{'match': 'standard', 'norm': 'standard', 'ca...",True,Non-Financial Entity,"{'guideline': 0.2505337595939636, 'regulatory ..."
20,20,EBA,The European Banking Authority (EBA) is a regu...,"[{'match': 'financial', 'norm': 'financial', '...",True,regulatory body,"{'regulatory body': 0.9875544905662537, 'bank'..."
...,...,...,...,...,...,...,...
847,847,ANZ,The Australia and New Zealand Banking Group Li...,"[{'match': 'financial', 'norm': 'financial', '...",True,bank,"{'bank': 0.8740919232368469, 'regulatory body'..."
848,848,GST,Goods and Services Tax (GST) is an indirect ta...,"[{'match': 'finance', 'norm': 'finance', 'case...",True,regulatory body,"{'regulatory body': 0.7395630478858948, 'polic..."
850,850,PFI,The private finance initiative (PFI) was a Uni...,"[{'match': 'finance', 'norm': 'finance', 'case...",True,policy,"{'policy': 0.6511884927749634, 'regulatory bod..."
853,853,SVF,Serial Vector Format (SVF) is a file format th...,"[{'match': 'standard', 'norm': 'standard', 'ca...",True,Non-Financial Entity,"{'bank': 0.5123648047447205, 'policy': 0.47108..."


In [96]:
# Save the classified lexicon. index=False keeps the DataFrame index out of the
# file, so re-reading the CSV does not pick up yet another "Unnamed: 0" column
# (the loaded lexicon already carries two of them from earlier round-trips).
tagged_lexicon.to_csv("../data/financial_taxonomy.csv", index=False)