# KPMG analyse



In [4]:

#!python -m spacy download nl_core_news_md
import pandas as pd
import spacy

# Load the Dutch spaCy pipeline "nl_core_news_md"; it has to be installed first
# (see the commented-out `spacy download` command above).
nlp = spacy.load("nl_core_news_md")
# Stop-word collection taken from the pipeline's language defaults.
stopwords = nlp.Defaults.stop_words

# When True, the functions below print their own name and signature on entry.
DEBUG = False

In [5]:
# Debug-only helper: it exists only when DEBUG is switched on, so every call
# site has to be guarded by `if DEBUG` as well.
if DEBUG:
    import inspect

    def getname():
        """Return the name of the function that called getname()."""
        return inspect.currentframe().f_back.f_code.co_name

In [24]:
def nlp_cleanandlemmatize(txtdoc: str, list_wordstoskip: "list | str" = '', onlynouns: bool = True) -> "tuple[dict, list]":
    """Run *txtdoc* through the spaCy pipeline and count the lemmas that survive filtering.

    A token is dropped when it is punctuation, whitespace or a stop word, when
    its text contains a digit, ends with '.', or is at most two characters
    long, or when its lower-cased lemma is a stop word, a month name, a day
    name or one of the caller-supplied words to skip.

    Args:
        txtdoc: The raw text to analyse.
        list_wordstoskip: Extra lemmas to discard. A plain string is split on
            whitespace into separate words; the empty string skips nothing.
        onlynouns: When True (the default) only tokens tagged NOUN are kept.

    Returns:
        A tuple ``(dict_uniqwords, list_tokens)``: a dict mapping each kept
        lower-cased lemma to its number of occurrences, and the list of the
        first token seen for every such lemma (used for similarity checks).
    """
    if DEBUG : print("In function: ",getname(), inspect.signature(globals()[getname()]))

    LANG = 'nl'
    # words to discard
    months = {'nl': ['januari', 'februari', 'maart', 'april', 'mei', 'juni', 'juli',
                     'augustus', 'september', 'oktober', 'november', 'december'],
              'fr': ['janvier', 'fevrier', 'mars', 'avril', 'mai', 'juin', 'juillet',
                     'aout', 'septembre', 'octobre', 'novembre', 'decembre']}
    days = {'nl': ['maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag', 'zondag'],
            'fr': ['lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi', 'dimanche']}

    # Normalise the skip words to a set of lower-cased strings. A bare string
    # would otherwise be searched as a substring instead of as a word list.
    if list_wordstoskip is None:
        words_to_skip = set()
    elif isinstance(list_wordstoskip, str):
        words_to_skip = set(list_wordstoskip.lower().split())
    else:
        words_to_skip = {str(word).lower() for word in list_wordstoskip}
    discard = set(months[LANG]) | set(days[LANG]) | words_to_skip

    # NOTE: this raises the length limit on the shared pipeline object.
    nlp.max_length = 10000000
    nlp_doc = nlp(txtdoc)

    dict_uniqwords = {}
    list_tokens = []
    for token in nlp_doc:
        text = token.text
        lemma_lower = token.lemma_.lower()
        if token.is_punct or token.is_space or token.is_stop:
            continue
        # stopwords holds strings, so compare the lemma and not the Token object
        if lemma_lower in stopwords:
            continue
        # any digit in the token (this also covers purely numeric tokens)
        if any(char.isdigit() for char in text):
            continue
        if text.endswith('.') or len(text) <= 2:
            continue
        if lemma_lower in discard:
            continue
        if onlynouns and token.pos_ != 'NOUN':
            continue
        # count the lemma; keep the first token per lemma for vector comparison
        if lemma_lower not in dict_uniqwords:
            dict_uniqwords[lemma_lower] = 1
            list_tokens.append(token)
        else:
            dict_uniqwords[lemma_lower] += 1
    return dict_uniqwords, list_tokens

#compare a token against a tokenslist
def token_compare(token_totest, list_tokens, min_score: float = 0.6):
    """Compare *token_totest* with every entry of *list_tokens* via ``similarity``.

    Returns a tuple ``(istax, total, matches)``:
        istax   -- True when at least one similarity reached *min_score*
        total   -- sum of the similarities that reached *min_score*
        matches -- list of ``[token_totest, token, similarity]`` for those hits
    """
    if DEBUG : print("In function: ",getname(), inspect.signature(globals()[getname()]))

    matches = []
    score_sum = 0
    for candidate in list_tokens:
        score = token_totest.similarity(candidate)
        # only scores at or above the threshold are recorded and summed
        if score >= min_score:
            matches.append([token_totest, candidate, score])
            score_sum += score
    # a non-empty match list is exactly the "positive" case
    return bool(matches), score_sum, matches

def createlistofkeywords(numberofdocumenttoscan:int = 50, similar_doc:int = 0, min_score:float = 0.6, list_keywords:list = None) -> "tuple[list, list, list]":
    """Score documents of the global ``df`` against a keyword list and harvest similar words.

    Each of the first *numberofdocumenttoscan* texts in ``df['cleantextnl']``
    is cleaned with ``nlp_cleanandlemmatize`` and every kept token is compared
    with every keyword through ``token_compare``.

    Args:
        numberofdocumenttoscan: Number of documents to scan; 0 means all. A
            value larger than the corpus is capped at the corpus size.
        similar_doc: Minimum summed similarity a document needs to be listed
            in the first returned list.
        min_score: Minimum similarity for a token/keyword pair to count.
        list_keywords: Keywords to compare against (not modified); None means
            no keywords.

    Returns:
        ``(taxdocs, tax_words, alldocs)`` where *taxdocs* is a list of
        ``[doc_index, score]`` for documents with at least one hit and a score
        of at least *similar_doc*, *tax_words* is the list of distinct
        lower-cased lemmas that matched a keyword, and *alldocs* is a list of
        ``[doc_index, score]`` for every scanned document.
    """
    if DEBUG : print("In function: ",getname(), inspect.signature(globals()[getname()]))

    keywords = [] if list_keywords is None else list_keywords
    # Parse each keyword once instead of once per document.
    keyword_docs = [nlp(word) for word in keywords]
    list_wordstoskip = ['blabla', 'blablabla']  # add here words to discard

    texts = df['cleantextnl'].values
    # take all docs if 0; never read past the end of the corpus
    if numberofdocumenttoscan == 0 or numberofdocumenttoscan > len(texts):
        numberofdocumenttoscan = len(texts)

    list_keep_tax_words = []
    seen_tax_words = set()  # O(1) duplicate check next to the ordered list
    list_keep_pointer_taxdocs = []
    list_keep_pointer_alldocs = []

    for n in range(numberofdocumenttoscan):
        print("Documents analyzed: ",numberofdocumenttoscan,n)

        dict_uniqwords, list_tokens = nlp_cleanandlemmatize(texts[n], list_wordstoskip)

        tot_similar_doc = 0      # score built from keywords that produced a hit
        tot_similar_doc_all = 0  # score over all keywords
        istax_doc = False

        for keyword_doc in keyword_docs:
            istax, tot_similar_word, list_res_tokens = token_compare(keyword_doc, list_tokens, min_score)
            tot_similar_doc_all += tot_similar_word
            if not istax:
                continue
            istax_doc = True
            tot_similar_doc += tot_similar_word
            for _keyword, matched_token, _score in list_res_tokens:
                lemma = matched_token.lemma_.lower()
                if lemma not in seen_tax_words:
                    seen_tax_words.add(lemma)
                    list_keep_tax_words.append(lemma)

        # store score for all
        list_keep_pointer_alldocs.append([n, tot_similar_doc_all])

        # store score for taxdocs
        if istax_doc and tot_similar_doc >= similar_doc:
            list_keep_pointer_taxdocs.append([n, tot_similar_doc])

    return list_keep_pointer_taxdocs, list_keep_tax_words, list_keep_pointer_alldocs

def create_pickle_keywords_and_docscores(list_keywords:list = None, file_keywords:str = "", file_docscores:str = "") -> "tuple[list, list]":
    """Grow a keyword list in several passes over the corpus and optionally pickle the results.

    Every pass calls ``createlistofkeywords`` with the current keywords and
    adds the words it found, so later passes search with a larger list.

    Args:
        list_keywords: Seed keywords; defaults to ``['belasting']``. The
            caller's list is copied and never modified.
        file_keywords: Pickle path for the keyword DataFrame (column
            'keywords'); the empty string disables writing.
        file_docscores: Pickle path for the document-score DataFrame (columns
            'docpointer', 'docscores'); the empty string disables writing.

    Returns:
        ``(keywords, docscores)``: the final de-duplicated keyword list (seed
        order first, then discovery order) and the ``[doc_index, score]`` list
        of the last pass.
    """
    if DEBUG : print("In function: ",getname(), inspect.signature(globals()[getname()]))

    # Work on a copy so neither the caller's list nor a default is mutated.
    keywords = ['belasting'] if list_keywords is None else list(list_keywords)

    # settings: [number of documents to scan (0 for all),
    #            min similarity score for a doc to get into the taxlist,
    #            min similarity score for keywords]
    settings = [[5, 5, 0.97],    # step1
                [10, 10, 0.95],  # step2
                [15, 20, 0.90],  # step3
                [0, 40, 0.87],   # step4 All documents
                ]

    list_docscores = []
    for n_docs, min_doc_score, min_word_score in settings:
        _taxdocs, found_words, list_docscores = createlistofkeywords(n_docs, min_doc_score, min_word_score, keywords)
        # no duplicates, and a stable order from one run to the next
        keywords = list(dict.fromkeys(keywords + found_words))

        # no file no pickle; files are rewritten after every step
        if file_keywords != "":
            pd.DataFrame(keywords, columns=['keywords']).to_pickle(file_keywords)

        if file_docscores != "":
            pd.DataFrame(list_docscores, columns=['docpointer', 'docscores']).to_pickle(file_docscores)

    return keywords, list_docscores

def score_text(txt:str,language:str = 'nl', min_score:float = 0.3, file_keywords:str = "../data/tax_keywords_nl.pkl") -> float :
    """Score *txt* against the pickled keyword list.

    The text is cleaned with ``nlp_cleanandlemmatize``; every kept token is
    compared with every keyword and the similarities that reach *min_score*
    are summed.

    Args:
        txt: Text to score.
        language: Only 'nl' is supported.
        min_score: Minimum similarity for a token/keyword pair to count.
        file_keywords: Pickle file holding a DataFrame with a 'keywords'
            column. NOTE: unpickling executes code, so only load trusted files.

    Returns:
        The summed similarity (0 when nothing matches), or -999 for an
        unsupported language, the same sentinel ``score_text_byvector`` uses.
    """
    if DEBUG : print("In function: ",getname(), inspect.signature(globals()[getname()]))

    if language != 'nl':
        print("Language selection not supported for now!")
        # Returning the sentinel avoids the UnboundLocalError the missing
        # score would otherwise cause.
        return -999

    df_keywords = pd.read_pickle(file_keywords)
    list_keywords = list(df_keywords['keywords'])

    # clean txt / get tokens
    dict_uniqwords, list_tokens = nlp_cleanandlemmatize(txt, [])
    if not list_tokens:
        # nothing to compare with, so skip parsing the keywords
        return 0

    docscore = 0
    for word in list_keywords:
        token_word = nlp(word)
        for token in list_tokens:
            similar = token_word.similarity(token)
            # only similarities >= min_score are taken into account
            if similar >= min_score:
                docscore += similar

    return docscore

def create_initial_keywordlist(language:str ='nl') -> None:
    """Load the corpus into the global ``df`` and build the keyword and doc-score pickles.

    The corpus is read for every language; the keyword search itself only
    runs for 'nl', any other language just prints a notice.
    """
    global df
    # read all docs
    df = pd.read_pickle("../data/Staatsblad_nl_fr.pkl")

    if language != 'nl':
        print("Language selection not supported for now!")
        return

    # start search for nl keywords
    seed_keywords = ['belasting','tax','fisc']
    create_pickle_keywords_and_docscores(seed_keywords, "../data/tax_keywords_nl.pkl", "../data/tax_docscores_nl.pkl")


#unsupervised keyword search
def get_keywordsunsupervised(txt:str, sim:float = 0.90) -> dict:
    """Extract keywords from *txt* without a predefined keyword list.

    Every noun chunk is compared with all noun chunks of the text. A chunk is
    kept when the similarities lying in ``[sim, 1)`` add up to more than 1.
    The lemmas of the kept chunks are then cleaned and counted with
    ``nlp_cleanandlemmatize``.

    Returns the dict of lemma -> count produced by that final cleaning step.
    """
    chunks = list(nlp(txt).noun_chunks)

    kept_lemmas = []
    for chunk in chunks:
        similarities = (chunk.similarity(other) for other in chunks)
        # scores of 1 or more are left out, which excludes the chunk itself
        support = sum(score for score in similarities if sim <= score < 1)
        if support > 1:
            kept_lemmas.append(chunk.lemma_ + ' ')

    dict_txt, _ = nlp_cleanandlemmatize(txtdoc="".join(kept_lemmas), list_wordstoskip='', onlynouns=True)
    return dict_txt

def score_topic_list(txt:str,language:str = 'nl', min_score:float = 0.3, list_topics:list = ['belasting','tax']) -> list :
    """Return one score per entry of *list_topics* for the text *txt*.

    The text is cleaned with ``nlp_cleanandlemmatize``. A topic's score is the
    sum of the similarities between the topic and each kept token that reach
    *min_score*. For any language other than 'nl' a notice is printed and an
    empty list is returned.
    """
    if DEBUG : print("In function: ",getname(), inspect.signature(globals()[getname()]))

    if language != 'nl':
        print("Language selection not supported for now!")
        return []

    # clean txt / get tokens
    _, kept_tokens = nlp_cleanandlemmatize(txt, [])

    topic_score = []
    for topic in list_topics:
        topic_doc = nlp(topic)
        similarities = (topic_doc.similarity(token) for token in kept_tokens)
        topic_score.append(sum(score for score in similarities if score >= min_score))
    return topic_score


def score_text_byvector(txt:str,language:str = 'nl', min_score:float = 0.3, file_keywords:str = "../data/tax_keywords_nl.pkl") -> float :
    """Score *txt* with a single similarity between two parsed texts.

    One text is built from all pickled keywords, the other from the
    lower-cased lemmas of the tokens kept by ``nlp_cleanandlemmatize``.
    *min_score* is accepted but not used here. Returns -999 (after printing a
    notice) for any language other than 'nl'.
    """
    if language != 'nl':
        print("Language selection not supported for now!")
        return -999

    keywords = list(pd.read_pickle(file_keywords)['keywords'])
    keywords_doc = nlp("".join(' ' + keyword for keyword in keywords))

    # clean txt
    _, kept_tokens = nlp_cleanandlemmatize(txt, [])
    text_doc = nlp("".join(' ' + token.lemma_.lower() for token in kept_tokens))

    return text_doc.similarity(keywords_doc)


def get_topic_byvector(txt:str, language:str = 'nl') -> list :
    """Return the best-matching tax topic for *txt*.

    The text is cleaned with ``nlp_cleanandlemmatize`` and the lower-cased
    lemmas of the kept tokens are parsed as one document. That document is
    compared with each topic phrase through ``similarity``.

    Args:
        txt: Text to classify.
        language: Only 'nl' is supported.

    Returns:
        ``[similarity, [topic phrase]]`` for the highest-scoring topic, or
        -999 (after printing a notice) for an unsupported language.
    """
    if language != 'nl':
        print("Language selection not supported for now!")
        return -999
    list_topic_keywords = [ ['inkomstenbelasting'],
                            ['personenbelasting'],
                            ['vennootschapsbelasting'],
                            ['rechtspersonenbelasting'],
                            ['belasting van niet-inwoners'],
                            ['belasting op de toegevoegde waarde'],
                            ['internationale belastingrecht'],
                            ['registratierechten'],
                            ['successierechten'],
                            ['douanerechten'],
                            ['verkeersbelasting'],
                            ['loonbelasting'],
                            ['dividendbelasting'],
                            ['erfbelasting'],
                            ['schenkbelasting'],
                            ['kansspelbelasting'],
                            ['gokbelasting'],
                            ['vermogensrendementsheffing']
                            ]

    # clean text
    _, list_tokens = nlp_cleanandlemmatize(txt, [])
    txt_doc = "".join(' ' + token.lemma_.lower() for token in list_tokens)
    # The document is the same for every topic, so parse it only once.
    token_doc = nlp(txt_doc)

    topicscore = []
    for list_keywords in list_topic_keywords:
        token_txt = nlp("".join(' ' + keyword for keyword in list_keywords))
        topicscore.append([token_doc.similarity(token_txt), list_keywords])

    # highest similarity wins; ties fall back to comparing the keyword lists
    return max(topicscore)

    

In [None]:
#This creates the initial keyword list (takes about 7 minutes on my old laptop) and only needs to be run once!
#df = pd.read_pickle("../data/Staatsblad_nl_fr.pkl") 

#create_initial_keywordlist()

In [26]:
# Example usage of get_topic_byvector: print the best topic for the
# documents labelled 0..9 of the corpus.
df = pd.read_pickle("../data/Staatsblad.pkl")
texts_nl = df['cleantextnl']

for doc_nr in range(10):
    topicscore = get_topic_byvector(texts_nl[doc_nr])
    print("Doc. nr:",doc_nr," --->> ",topicscore)
    print('------------------------------------------------------')



  topicscore.append([token_doc.similarity(token_txt),list_keywords])


Doc. nr: 0  --->>  [0.7487207824085366, ['vennootschapsbelasting']]
------------------------------------------------------
Doc. nr: 1  --->>  [0.6901101525735599, ['successierechten']]
------------------------------------------------------
Doc. nr: 2  --->>  [0.4549312785212403, ['registratierechten']]
------------------------------------------------------
Doc. nr: 3  --->>  [0.7398864433592595, ['internationale belastingrecht']]
------------------------------------------------------
Doc. nr: 4  --->>  [0.7552442384831771, ['vennootschapsbelasting']]
------------------------------------------------------
Doc. nr: 5  --->>  [0.7532072980154575, ['vennootschapsbelasting']]
------------------------------------------------------
Doc. nr: 6  --->>  [0.7793690473749172, ['vermogensrendementsheffing']]
------------------------------------------------------
Doc. nr: 7  --->>  [0.7524580376003799, ['internationale belastingrecht']]
------------------------------------------------------
Doc. nr:

In [14]:
# Score two single text strings.
# This uses the pickled keyword list written by create_initial_keywordlist().
txt='de belastingen zijn er weer\n Deze tax keer meer belastingen en meer tax te betalen!\n meer en meer belastingen tax is nodig en fisc'
txt2='this text does not contain any ... related words\n '
d, d2 = score_text(txt), score_text(txt2)
print("score for d:",d,"score for d2:",d2)

score for d: 2.9043948372239012 score for d2: 0


  similar=token_word.similarity(token)


In [92]:
#GETSCORE FOR THE DIFFERENT TYPES OF TAXES
#
# soorten belastingen
def get_topic(txt:str ,topic:list = ['inkomstenbelasting',
                'personenbelasting',
                'vennootschapsbelasting',
                'rechtspersonenbelasting',
                'belasting van niet-inwoners',
                'belasting op de toegevoegde waarde',
                'internationale belastingrecht',
                'registratierechten',
                'successierechten',
                'douanerechten',
                'verkeersbelasting',
                'loonbelasting',
                'dividendbelasting',
                'erfbelasting',
                'schenkbelasting',
                'kansspelbelasting',
                'gokbelasting',
                'vermogensrendementsheffing'
                ]) -> tuple:
    """Return the highest-scoring topic for *txt*.

    Scores come from ``score_topic_list`` with a minimum similarity of 0.001.

    Args:
        txt: Text to classify.
        topic: Candidate topic phrases (only read, never modified).

    Returns:
        ``(maxscore, maxtopic)``. When several topics share the best score the
        first one wins. When no topic scores above 0 the result is
        ``(0, None)`` instead of an UnboundLocalError.
    """
    scores = score_topic_list(txt, 'nl', 0.001, topic)

    maxscore = 0
    maxtopic = None  # stays None when nothing scores above 0
    for name, value in zip(topic, scores):
        if value > maxscore:
            maxscore = value
            maxtopic = name

    return maxscore, maxtopic

# Quick test: print the best topic for at most the first 100 documents.
remaining = 100
for txt in df['cleantextnl']:
    print(get_topic(txt))
    remaining -= 1
    if remaining == 0:
        break

  similar=token_word.similarity(token)


(20.797576331427283, 'vennootschapsbelasting')
(10.702925898996895, 'successierechten')
(12.932751534976378, 'registratierechten')
(20.9291968132176, 'internationale belastingrecht')
(15.127752742486264, 'vennootschapsbelasting')
(10.571855618677969, 'vennootschapsbelasting')
(83.10764506730264, 'successierechten')
(21.81420090009421, 'internationale belastingrecht')
(12.927745560924786, 'vennootschapsbelasting')
(12.927745560924786, 'vennootschapsbelasting')
(4.606601795191464, 'vennootschapsbelasting')
(39.71346430634192, 'successierechten')
(24.621960825340512, 'successierechten')
(6.807182539653486, 'registratierechten')
(20.127441643780777, 'vennootschapsbelasting')
(31.355287047152935, 'vennootschapsbelasting')
(33.43220662498416, 'internationale belastingrecht')
(6.064743018773677, 'inkomstenbelasting')
(3.9201014971712604, 'vennootschapsbelasting')
(9.630507982377221, 'inkomstenbelasting')
(5.014051644103372, 'vennootschapsbelasting')
(6.968780967554487, 'successierechten')
(6.

In [15]:
# Example for the unsupervised keyword function.
# First run: a file containing NL text.
with open('../data/text.txt', encoding="utf8") as file:
    txt = file.read()
dict_text = get_keywordsunsupervised(txt, 0.9)
print("TOPIC DOC {}:{}\n".format(0,dict_text))

# Second run: two texts from the pickled corpus, at labels 100 and 101.
df = pd.read_pickle("../data/Staatsblad_nl_fr.pkl")
for position in (100, 101):
    dict_text = get_keywordsunsupervised(df['cleantextnl'][position], 0.90)
    print(dict_text)
    print('-----------------------')

TOPIC DOC 0:{'houdbaarheidsdatum': 2, 'bewoner': 1, 'applicatie': 1, 'medewerker': 1, 'koffiekoek': 1, 'reactie': 1, 'overschot': 1}



  simil=c.similarity(t)


{'onder_wijswetgeving': 1, 'onderwijs': 9, 'artikel': 3, 'regering': 12, 'voorwaarde': 3, 'organisatie': 8, 'onderwijsactiviteit': 2, 'studie': 7, 'type': 7, 'inspecteur': 1, 'akkoordbevinding': 1, 'minister': 2, 'lid': 17, 'wet': 1, 'opschorting': 1, 'noodzakelijkheid': 3, 'vast_stellen': 1, 'maatregel': 3, 'aandacht': 1, 'instelling': 5, 'omzendbrief': 2, 'nummer': 1, 'aanneming': 1, 'voordracht': 1, 'gezondheidscrisis': 1, 'regeling': 2, 'student': 7, 'onderwijseenheden': 1, 'einddatum': 1, 'eenheid': 1, 'lesti_jd': 1, 'lestijd': 1, 'punt': 1, 'ond_erwijswetgeving': 1, 'herinsc_hrijving': 1, 'onderwijseenhed': 1, 'opsluiting': 1, 'zitting': 3, 'onderwijsinstelling': 2, 'paragraaf': 1, 'evaluatie': 8, 'leerresultaten': 3, 'directie': 2, 'toelating': 2, 'lee_rresultat': 2, 'nazicht': 2, 'onderwijseenheid': 2, 'regel': 3, 'plan': 1, 'evaluatiedatum': 1, 'aard': 1, 'kenmerk': 1, 'materieel': 1, 'omstandigheid': 1, 'mededeling': 1, 'beoordeling': 1, 'eindevaluatie': 2, 'begind_atum': 3, 

In [None]:
#THIS IS THE END FOR NOW ...