In [1]:
from os import listdir
from os.path import isfile, join
import re
import pandas as pd
from lexnlp.extract.en.entities import nltk_re
from lexnlp.nlp.en.segments import sentences
from lexnlp.utils import parse_df



In [2]:
# Folder holding the interim SFCR files (*.txt documents and *.pickle frames).
# Built with join() instead of a backslash literal: "\d", "\i" and "\s" are
# invalid escape sequences in a normal string, and join() picks the right
# separator on every OS.
DATA_PATH = join("..", "data", "interim", "sfcr")

In [3]:
# Raw report texts: every regular file in DATA_PATH whose name ends in 'txt',
# decoded as UTF-8. txt_files[i] is the file name that produced documents[i].
txt_files = [f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f)) and f[-3:] == 'txt']
documents = []
for file_name in txt_files:
    # Read as bytes and decode explicitly so line endings are kept untouched;
    # `with` closes the handle even when read() or decode() raises.
    with open(join(DATA_PATH, file_name), "rb") as file:
        documents.append(file.read().decode('utf-8'))

# Pickled DataFrames stored next to the texts, stacked into one frame.
# NOTE(review): later cells read a 'text' column and count rows as sentences,
# so each row is presumably one sentence -- confirm against the producer.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
pickle_files = [f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f)) and f[-6:] == 'pickle']
frames = [pd.read_pickle(join(DATA_PATH, file_name)) for file_name in pickle_files]
# One concat instead of DataFrame.append in a loop: append was removed in
# pandas 2.0 and copied the whole frame on every iteration. concat raises on
# an empty list, so fall back to an empty frame when there are no pickles.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Quick sanity check on what was loaded: texts and rows of the sentence frame.
print(f"Number of documents: {len(documents)}")
print(f"Number of sentences: {len(df.index)}")

# Get companies (nltk approach)

In [None]:
# Company extraction with lexnlp's nltk_re module (a regular-expression based
# finder for company entities), applied to the first document only.
l = [*nltk_re.get_companies(documents[0])]

In [None]:
# Peek at the first five extracted company matches.
l[:5]

# Accounting firms

Dutch accounting firms are supervised by the AFM (Autoriteit Financiële Markten), which publishes a register of licensed accounting firms: https://www.afm.nl/en/professionals/registers/vergunningenregisters/accountantsorganisaties

In [None]:
# Register of accounting firms, downloaded unchanged from the AFM website
# (semicolon-separated, Latin-1 encoded), keyed by the official firm name.
f = "..//dictionary//nl//AFM//accountantsorganisaties.csv"
df_accounting = pd.read_csv(f, sep=';', encoding='latin-1')
df_accounting = df_accounting.set_index('Naam organisatie')

# Companion file with aliases of firm names (for example "EY" instead of
# "Ernst & Young Accountants"), keyed by the same official name.
f = "..//dictionary//nl//AFM//accountantsorganisaties-alias.csv"
alias = pd.read_csv(f, encoding='latin-1', sep=',')
alias = alias.set_index('Naam organisatie')

# Left-join the aliases onto the register, then turn the name back into a
# regular column so it can be used as a parse column below.
df_accounting = df_accounting.join(alias)
df_accounting = df_accounting.reset_index()

# Arguments handed to DataframeEntityParser in the cells below.
# NOTE(review): 'Korte naam' and 'Afkorting' are presumably supplied by the
# alias file -- confirm against the CSV headers.
parse_columns = ('Naam organisatie', 'Korte naam', 'Afkorting')
result_columns = {'Naam organisatie': 'name'}
preformed_entity = dict(entity_type='accounting firm', source='AFM', country='NL')

In [None]:
# Find accounting-firm mentions in every sentence of every document.
# The parser's arguments never change inside the loops, so it is built once
# rather than once per sentence.
accounting_doc_parser = parse_df.DataframeEntityParser(
    df_accounting, parse_columns, result_columns, preformed_entity)

# Collect one record per match and build the frame in a single call:
# DataFrame.append was removed in pandas 2.0 and, used in a loop, copied the
# whole frame on every match.
accounting_doc_records = []
for idx, document in enumerate(documents):
    for sentence in sentences.get_sentence_list(document):
        for entity in accounting_doc_parser.get_entities_from_text(sentence):
            # Columns: source file, the entity's own fields, then the sentence
            # in which it was found.
            accounting_doc_records.append(
                {'file name': txt_files[idx], **entity, 'text': sentence})

# An empty record list yields an empty frame, as the original did.
df_results1 = pd.DataFrame(accounting_doc_records)

In [None]:
# Number of accounting-firm matches found in the raw documents.
len(df_results1)

In [None]:
# Same accounting-firm search, but over the rows of the sentence frame `df`
# (its 'text' column) instead of re-segmenting the raw documents.
# The parser's arguments never change inside the loop, so it is built once.
accounting_sentence_parser = parse_df.DataframeEntityParser(
    df_accounting, parse_columns, result_columns, preformed_entity)

# Collect one record per match and build the frame in a single call:
# DataFrame.append was removed in pandas 2.0 and, used in a loop, copied the
# whole frame on every match.
accounting_sentence_records = []
for sent_id in df.index:
    row = df.loc[sent_id]
    for entity in accounting_sentence_parser.get_entities_from_text(row['text']):
        # All columns of the sentence row first, followed by the entity fields.
        accounting_sentence_records.append({**row.to_dict(), **entity})

# An empty record list yields an empty frame, as the original did.
df_results2 = pd.DataFrame(accounting_sentence_records)

In [None]:
# Number of accounting-firm matches found in the sentence frame.
len(df_results2)

# Insurance undertakings

EIOPA's register includes all European insurance undertakings.

In [None]:
# EIOPA register export (semicolon-separated, Latin-1 encoded), reduced to the
# rows whose 'Name of NCA' column equals 'De Nederlandsche Bank'.
f = "..//dictionary//common//EIOPA//DATINS_Export_637110803393817886.csv"
df_insurers = (
    pd.read_csv(f, sep=';', encoding='latin-1')
    .loc[lambda register: register['Name of NCA'] == 'De Nederlandsche Bank']
)

In [None]:
# Arguments handed to DataframeEntityParser for the insurer search below.
# NOTE(review): presumably parse_columns are the register columns matched
# against the text, result_columns maps register columns to output keys, and
# preformed_entity holds constant fields added to every match -- confirm
# against the lexnlp parse_df documentation.
parse_columns = ('International Name', 'Official name of the entity')
result_columns = {
    'International Name': 'name',
    'Home Country': 'country',
}
preformed_entity = dict(entity_type='insurance undertaking', source='EIOPA')

In [None]:
# Find insurance-undertaking mentions in the first four documents only
# (the documents[0:4] slice keeps this exploratory run short).
# The parser's arguments never change inside the loops, so it is built once.
insurer_parser = parse_df.DataframeEntityParser(
    df_insurers, parse_columns, result_columns, preformed_entity)

# Collect one record per match and build the frame in a single call:
# DataFrame.append was removed in pandas 2.0 and, used in a loop, copied the
# whole frame on every match. Building once also gives a 0..n-1 index like the
# other result frames; the original omitted ignore_index, so every row ended
# up with index 0.
insurer_records = []
for idx, document in enumerate(documents[0:4]):
    for sentence in sentences.get_sentence_list(document):
        for entity in insurer_parser.get_entities_from_text(sentence):
            # Columns: source file, the entity's own fields, then the sentence
            # (as a string) in which it was found.
            insurer_records.append(
                {'file name': txt_files[idx], **entity, 'sentence': str(sentence)})

df_results = pd.DataFrame(insurer_records)
df_results

In [None]:
# text to html file
#with open("text.html", "w", encoding='utf-8') as e:
#    e.write(str(documents[0]))

In [4]:
import pickle

In [5]:
from lexnlp.nlp.en.segments import titles
from src.nlp.en.segments import solvency2_titles



In [6]:
# Remove three private-use code points (U+F0B7, U+F0A7, U+F0FC) from every
# document, updating the list in place.
# NOTE(review): these look like bullet/check-mark glyphs left over from text
# extraction -- confirm.
for idx, cleaned in enumerate(documents):
    for glyph in ('\uf0b7', '\uf0a7', '\uf0fc'):
        cleaned = cleaned.replace(glyph, "")
    documents[idx] = cleaned

In [None]:
# Build the title model from the labelled CSV and the files in DATA_PATH;
# build_model returns a feature frame and a target frame.
# NOTE(review): this is an absolute, machine-specific path. The CSV appears to
# sit next to the solvency2_titles module inside the repository, so a
# repository-relative path would make the notebook portable -- confirm the
# layout before changing it.
_title_model_csv = "h:/30_code/python/github/solvency2-nlp/src/nlp/en/segments/solvency2_titles_model.csv"
df_feature, df_target = solvency2_titles.build_model(_title_model_csv, DATA_PATH)

In [None]:
# Show the entries of the target that are labelled 1.0.
df_target[df_target.eq(1.0)]

In [13]:
# Score one document (index 7) with the section-segmenter model.
feature_data = solvency2_titles.build_document_title_features(documents[7])
# predict_proba output is labelled below as (prob_false, prob_true) per row.
predicted_lines = solvency2_titles.SECTION_SEGMENTER_MODEL.predict_proba(feature_data)
predicted_df = pd.DataFrame(predicted_lines, columns=["prob_false", "prob_true"])
# Keep the row numbers whose "title" probability reaches the 0.05 threshold.
title_lines = predicted_df[predicted_df["prob_true"] >= 0.05].index.tolist()
title_lines

[0]

In [None]:
# Inspect document 3 line by line (str.splitlines already returns a list).
documents[3].splitlines()

In [17]:
# Materialise the titles detected in document 9 at a 0.05 score threshold.
[*solvency2_titles.get_titles(documents[9], score_threshold=0.05)]

[]

In [18]:
# Inspect the raw lines of document 9, for which get_titles returned no titles above.
documents[9].splitlines()

['2018',
 'SFCR ',
 'ASR Nederland ',
 'N.V.',
 '',
 '',
 'ASR Nederland N.V.',
 '',
 'Archimedeslaan 10',
 'P.O. Box 2072',
 '3500 HB  Utrecht',
 'The Netherlands',
 'www.asrnl.com',
 '',
 '',
 '2018 ',
 '',
 'SFCR ',
 'ASR Nederland N.V.',
 '',
 '',
 ' ',
 ' ',
 '',
 'SFCR ASR Nederland N.V.  |  Contents ',
 '',
 '4 ',
 '4',
 '',
 'Contents',
 '',
 '7',
 '',
 '8',
 '8',
 '8',
 '9',
 '10',
 '11',
 '',
 'Introduction ',
 ' ',
 'Summary ',
 'A   Business and performance ',
 'B   System of governance ',
 'C   Risk profile ',
 'D   Valuation for Solvency purposes ',
 'E   Capital management ',
 ' ',
 '12',
 'A     Business and performance ',
 '12',
 'A.1   Business ',
 '12',
 'A.1.1  Profile ',
 '14',
 'A.1.2  General information ',
 '15',
 'A.1.3  Structure ',
 '19',
 'A.2   Underwriting performance ',
 '19',
 'A.2.1  Financial Performance ',
 '20',
 'A.2.2  Financial Performance Non-life segment ',
 '27',
 'A.2.3  Financial Performance Life segment ',
 '31',
 'A.3  ',
 '31',
 'A.3.1  Re