In [None]:
from os import listdir
from os.path import isfile, join
import re
import pandas as pd

from lexnlp.extract.en.entities import nltk_re
from lexnlp.nlp.en.segments import sentences
from lexnlp.utils import parse_df

In [None]:
# Directory that holds the .txt documents loaded in the next cell.
# Built with os.path.join so the separator matches the host OS and the literal
# no longer contains the invalid escape sequences "\\d", "\\i" and "\\s".
DATA_PATH = join("..", "data", "interim", "sfcr")

In [None]:
# Names of the regular files in DATA_PATH that carry a ".txt" extension.
# Sorted because listdir returns entries in arbitrary order; sorting makes
# `files` (and therefore `documents[0]`) the same on every run.
files = sorted(
    f for f in listdir(DATA_PATH)
    if isfile(join(DATA_PATH, f)) and f.endswith('.txt')
)

# documents[i] is the UTF-8 decoded content of files[i].
documents = []
for file_name in files:
    # The context manager closes the handle even when read/decode raises.
    # Binary read + explicit decode keeps line endings exactly as stored.
    with open(join(DATA_PATH, file_name), "rb") as file:
        text = file.read().decode('utf-8')
    documents.append(text)

In [None]:
# Sanity check: how many documents were loaded
print(f"{len(documents)}")

# Get companies

In [None]:
# Company entities found in the first document.
# nltk_re.get_companies locates them with a complex regular expression;
# its results are materialised into a list so they can be sliced below.
l = [*nltk_re.get_companies(documents[0])]

In [None]:
# Preview the first five extracted companies
l[:5]

# Accounting firms

Dutch accounting firms are supervised by the AFM (the Dutch Authority for the Financial Markets), which publishes a register of licensed accounting firms: https://www.afm.nl/en/professionals/registers/vergunningenregisters/accountantsorganisaties

In [None]:
# AFM register of accounting firms (file downloaded directly from the AFM website)
f = "..//dictionary//nl//AFM//accountantsorganisaties.csv"
df = pd.read_csv(f, sep=';', encoding='latin-1')
df = df.set_index('Naam organisatie')

# Companion file with aliases of firm names
# (for example EY instead of Ernst & Young Accountants)
f = "..//dictionary//nl//AFM//accountantsorganisaties-alias.csv"
alias = pd.read_csv(f, encoding='latin-1', sep=',')
alias = alias.set_index('Naam organisatie')

# Attach the alias columns by firm name, then turn 'Naam organisatie'
# back into a regular column
df = df.join(alias).reset_index()

# Settings handed to parse_df.DataframeEntityParser in the extraction cell.
# NOTE(review): presumably parse_columns are the columns matched against the
# text, result_columns renames columns in the returned entity, and
# preformed_entity holds constant fields added to every hit -- confirm
# against the lexnlp documentation.
parse_columns = (
    'Naam organisatie',
    'Korte naam',
    'Afkorting',
)
result_columns = dict([('Naam organisatie', 'name')])
preformed_entity = dict(
    entity_type='accounting firm',
    source='AFM',
    country='NL',
)

In [None]:
# Scan every sentence of every document for entities listed in `df` and
# collect one row per hit: file name, the entity's fields, and the sentence.

# The parser is built once because its arguments never change inside the loop.
# NOTE(review): assumes DataframeEntityParser keeps no state between
# get_entities_from_text calls -- confirm against lexnlp.
entity_parser = parse_df.DataframeEntityParser(df, parse_columns, result_columns, preformed_entity)

# Hits are gathered in a plain list and converted to a DataFrame once.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, which made the original loop quadratic in the number of hits.
records = []
for idx, document in enumerate(documents):
    for sentence in sentences.get_sentence_list(document):
        for i in entity_parser.get_entities_from_text(sentence):
            records.append({'file name': files[idx], **i, 'sentence': str(sentence)})

# Same columns as before ('file name', entity fields, 'sentence'); the index is
# now a plain 0..n-1 range instead of a repeated 0.
df_results = pd.DataFrame(records)
df_results

# Insurance undertakings

EIOPA's register includes all European insurance undertakings.

In [None]:
# EIOPA register export, reduced to the rows whose 'Name of NCA' is
# 'De Nederlandsche Bank'
f = "..//dictionary//common//EIOPA//DATINS_Export_637110803393817886.csv"
df = pd.read_csv(f, sep=';', encoding='latin-1')
df = df.loc[df['Name of NCA'].eq('De Nederlandsche Bank')]

In [None]:
# Settings handed to parse_df.DataframeEntityParser in the extraction cell.
# NOTE(review): presumably parse_columns are the columns matched against the
# text, result_columns renames columns in the returned entity, and
# preformed_entity holds constant fields added to every hit -- confirm
# against the lexnlp documentation.
parse_columns = (
    'International Name',
    'Official name of the entity',
)
result_columns = dict([
    ('International Name', 'name'),
    ('Home Country', 'country'),
])
preformed_entity = dict(
    entity_type='insurance undertaking',
    source='EIOPA',
)

In [None]:
# Scan the sentences of the first document for entities listed in `df` and
# collect one row per hit: file name, the entity's fields, and the sentence.
# NOTE(review): only documents[0:1] is processed here, unlike the accounting
# firm cell which covers all documents -- confirm this limit is intentional.

# The parser is built once because its arguments never change inside the loop.
# NOTE(review): assumes DataframeEntityParser keeps no state between
# get_entities_from_text calls -- confirm against lexnlp.
entity_parser = parse_df.DataframeEntityParser(df, parse_columns, result_columns, preformed_entity)

# Hits are gathered in a plain list and converted to a DataFrame once.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, which made the original loop quadratic in the number of hits.
records = []
for idx, document in enumerate(documents[0:1]):
    for sentence in sentences.get_sentence_list(document):
        for i in entity_parser.get_entities_from_text(sentence):
            records.append({'file name': files[idx], **i, 'sentence': str(sentence)})

# Same columns as before ('file name', entity fields, 'sentence'); the index is
# now a plain 0..n-1 range instead of a repeated 0.
df_results = pd.DataFrame(records)
df_results