In [1]:
from os import listdir
from os.path import isfile, join
import re
import pandas as pd

from lexnlp.extract.en.entities import nltk_re
from lexnlp.nlp.en.segments import sentences
from lexnlp.utils import parse_df

In [2]:
# Folder with the interim SFCR text files, relative to the working directory.
# Built with join() so the separator matches the host OS: the previous literal
# "..\data\interim\sfcr" only resolved on Windows and relied on unrecognised
# backslash escapes (\d, \i, \s), which newer Python versions warn about.
# On Windows the resulting string is identical to the old one.
DATA_PATH = join("..", "data", "interim", "sfcr")

In [3]:
# Names of all regular files in DATA_PATH that carry a .txt extension.
# listdir() returns entries in arbitrary order, so the list is sorted to make
# the document order (and therefore documents[0]) reproducible across runs.
files = sorted(
    f for f in listdir(DATA_PATH)
    if isfile(join(DATA_PATH, f)) and f.endswith('.txt')
)

# documents[i] holds the decoded contents of files[i].
documents = []
for file_name in files:
    # Read the raw bytes and decode explicitly: line endings stay exactly as
    # stored on disk (text mode would translate them). The with-block closes
    # the handle even when reading or decoding fails.
    with open(join(DATA_PATH, file_name), "rb") as file:
        documents.append(file.read().decode('utf-8'))

In [4]:
# Sanity check: number of documents that were read in
document_count = len(documents)
print(document_count)

13


# Get companies

In [5]:
# Company extraction on the first document only. According to the original
# note, nltk_re applies an elaborate regular expression to find company
# entities; the generator it returns is materialised into a list.
first_document = documents[0]
l = list(nltk_re.get_companies(first_document))

In [6]:
# Preview the first five extracted items
l[:5]

[Achmea B.V., (19993, 20008), Achmea Reinsurance 
 Company N.V., (20988, 21021), Achmea B.V., (21548, 21563), Achmea B.V., (22175, 22187), Levensverzekeringen N.V., (22207, 22232)]

# Accounting firms

Dutch accounting firms are supervised by the Dutch Authority for the Financial Markets (AFM), which publishes a register of licensed accounting firms: https://www.afm.nl/en/professionals/registers/vergunningenregisters/accountantsorganisaties

In [7]:
# Register of accounting firms, downloaded as-is from the AFM website
f = "..//dictionary//nl//AFM//accountantsorganisaties.csv"
df = pd.read_csv(f, sep=';', encoding='latin-1')
df = df.set_index('Naam organisatie')

# Companion file with aliases of firm names
# (for example EY instead of Ernst & Young Accountants)
f = "..//dictionary//nl//AFM//accountantsorganisaties-alias.csv"
alias = pd.read_csv(f, sep=',', encoding='latin-1')
alias = alias.set_index('Naam organisatie')

# Attach the aliases via the shared 'Naam organisatie' index, then turn that
# index back into an ordinary column
df = df.join(alias)
df = df.reset_index()

# Settings for the entity parser. Judging by the names: the columns whose
# values are searched for, the renaming of columns in the results, and fixed
# fields added to every hit -- TODO confirm against the parser's documentation.
parse_columns = (
    'Naam organisatie',
    'Korte naam',
    'Afkorting',
)
result_columns = {
    'Naam organisatie': 'name',
}
preformed_entity = dict(
    entity_type='accounting firm',
    source='AFM',
    country='NL',
)

In [8]:
# Scan every document for accounting-firm names; one result row per hit.
# The parser is configured from the dictionary frame and column settings only,
# so it is built once rather than once per document
# (assumes the parser keeps no per-text state -- TODO confirm).
parser = parse_df.DataframeEntityParser(df, parse_columns, result_columns, preformed_entity)

# Hits are gathered in a plain list and converted to a DataFrame in one step:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call. The result now carries a 0..n-1 index instead of the label 0
# repeated on every row.
records = []
for file_name, document in zip(files, documents):
    for entity in parser.get_entities_from_text(document):
        # 'file name' first, followed by the entity's own fields
        records.append({'file name': file_name, **entity})

df_results = pd.DataFrame(records)
df_results

Unnamed: 0,file name,location_start,location_end,source,name,entity_type,country
0,2018_Achmea B.V._SFCR.txt,25077,25101,AFM,PricewaterhouseCoopers Accountants N.V.,accounting firm,NL
0,2018_Achmea B.V._SFCR.txt,39296,39320,AFM,PricewaterhouseCoopers Accountants N.V.,accounting firm,NL
0,2018_Aegon Levensverzekering_SFCR.txt,27752,27793,AFM,PricewaterhouseCoopers Accountants N.V.,accounting firm,NL
0,2018_Aegon Levensverzekering_SFCR.txt,27752,27776,AFM,PricewaterhouseCoopers Accountants N.V.,accounting firm,NL
0,2018_Aegon Schadeverzekering_SFCR.txt,16715,16756,AFM,PricewaterhouseCoopers Accountants N.V.,accounting firm,NL
0,2018_Aegon Schadeverzekering_SFCR.txt,16715,16739,AFM,PricewaterhouseCoopers Accountants N.V.,accounting firm,NL
0,2018_Aegon Spaarkas_SFCR.txt,22764,22805,AFM,PricewaterhouseCoopers Accountants N.V.,accounting firm,NL
0,2018_Aegon Spaarkas_SFCR.txt,22807,22848,AFM,PricewaterhouseCoopers Accountants N.V.,accounting firm,NL
0,2018_Aegon Spaarkas_SFCR.txt,22764,22788,AFM,PricewaterhouseCoopers Accountants N.V.,accounting firm,NL
0,2018_Aegon Spaarkas_SFCR.txt,22807,22831,AFM,PricewaterhouseCoopers Accountants N.V.,accounting firm,NL


# Insurance undertakings

EIOPA's register includes all European insurance undertakings.

In [9]:
# EIOPA register export: semicolon-separated, read as Latin-1
f = "..//dictionary//common//EIOPA//DATINS_Export_637110803393817886.csv"
df = pd.read_csv(
    f,
    sep=';',
    encoding='latin-1',
)

In [10]:
# Settings for the entity parser on the EIOPA register. Judging by the names:
# the columns whose values are searched for, the renaming of columns in the
# results, and fixed fields added to every hit -- TODO confirm.
parse_columns = (
    'International Name',
    'Official name of the entity',
)
result_columns = {
    'International Name': 'name',
    'Home Country': 'country',
}
preformed_entity = dict(
    entity_type='insurance undertaking',
    source='EIOPA',
)

In [11]:
# Scan for insurance undertakings listed in the EIOPA register; one result row
# per hit. Only the first document is processed here (documents[0:1]).
# The parser is configured from the register frame and column settings only,
# so it is built once, outside the loop
# (assumes the parser keeps no per-text state -- TODO confirm).
parser = parse_df.DataframeEntityParser(df, parse_columns, result_columns, preformed_entity)

# Hits are gathered in a plain list and converted to a DataFrame in one step:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call. The result now carries a 0..n-1 index instead of the label 0
# repeated on every row.
records = []
for file_name, document in zip(files, documents[0:1]):
    for entity in parser.get_entities_from_text(document):
        # 'file name' first, followed by the entity's own fields
        records.append({'file name': file_name, **entity})

df_results = pd.DataFrame(records)
df_results

Unnamed: 0,file name,location_start,location_end,source,name,country,entity_type
0,2018_Achmea B.V._SFCR.txt,22232,22265,EIOPA,Achmea Schadeverzekeringen N.V.,NL,insurance undertaking
0,2018_Achmea B.V._SFCR.txt,22265,22281,EIOPA,N.V. Hagelunie,NL,insurance undertaking
0,2018_Achmea B.V._SFCR.txt,22284,22317,EIOPA,Achmea Reinsurance Company N.V.,NL,insurance undertaking
0,2018_Achmea B.V._SFCR.txt,33762,33795,EIOPA,Achmea Schadeverzekeringen N.V.,NL,insurance undertaking
0,2018_Achmea B.V._SFCR.txt,35599,35632,EIOPA,Achmea Reinsurance Company N.V.,NL,insurance undertaking
0,2018_Achmea B.V._SFCR.txt,35917,35963,EIOPA,Achmea Pensioen- en Levensverzekeringen N.V.,NL,insurance undertaking
0,2018_Achmea B.V._SFCR.txt,35964,35997,EIOPA,Achmea Schadeverzekeringen N.V.,NL,insurance undertaking
0,2018_Achmea B.V._SFCR.txt,35998,36014,EIOPA,N.V. Hagelunie,NL,insurance undertaking
0,2018_Achmea B.V._SFCR.txt,36015,36048,EIOPA,Achmea Reinsurance Company N.V.,NL,insurance undertaking
0,2018_Achmea B.V._SFCR.txt,36092,36123,EIOPA,Achmea Zorgverzekeringen N.V.,NL,insurance undertaking
