# KPMG analyse



In [1]:

#!python -m spacy download nl_core_news_md
import pandas as pd
import spacy

# Load the Dutch spaCy pipeline "nl_core_news_md" (install it with the
# commented-out download command above if it is missing).
nlp = spacy.load("nl_core_news_md")
# Stop-word collection taken from the pipeline's language defaults.
stopwords = nlp.Defaults.stop_words

# When True, every function prints its own name and signature on entry.
DEBUG = False

In [2]:
# Debugging helpers: these names only exist when DEBUG is enabled.
if DEBUG:
    import inspect

    def getname():
        """Return the name of the function that called getname()."""
        import sys

        # Frame 0 is getname itself, frame 1 is its caller.
        caller_frame = sys._getframe(1)
        return caller_frame.f_code.co_name

In [3]:
def nlp_cleanandlemmatize(txtdoc: str, list_wordstoskip: "list[str] | str" = ''):
    """Run *txtdoc* through the spaCy pipeline and keep only relevant nouns.

    A token is discarded when it is punctuation, whitespace or a stop word,
    contains a digit, ends with a dot, is at most two characters long, is a
    month or day name, has a lemma listed in *list_wordstoskip*, or is not
    tagged as a NOUN.

    Parameters
    ----------
    txtdoc:
        Raw text to analyse.
    list_wordstoskip:
        Lower-case lemmas to discard. Normally a list of words; the empty
        string default means "skip nothing".

    Returns
    -------
    (dict_uniqwords, list_tokens)
        ``dict_uniqwords`` maps each kept lower-case lemma to its number of
        occurrences; ``list_tokens`` holds the first token seen for each of
        those lemmas (used later for vector similarity).
    """
    if DEBUG : print("In function: ",getname(), inspect.signature(globals()[getname()]))

    LANG = 'nl'
    # Words to discard. 'juli' (nl) and 'novembre' (fr) were missing before.
    months = {
        'nl': {'januari', 'februari', 'maart', 'april', 'mei', 'juni', 'juli',
               'augustus', 'september', 'oktober', 'november', 'december'},
        'fr': {'janvier', 'fevrier', 'mars', 'avril', 'mai', 'juin', 'juillet',
               'aout', 'septembre', 'octobre', 'novembre', 'decembre'},
    }
    days = {
        'nl': {'maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag',
               'zaterdag', 'zondag'},
        'fr': {'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi',
               'dimanche'},
    }

    # Raise the pipeline's text-length limit so long documents are accepted.
    nlp.max_length = 10000000
    nlp_doc = nlp(txtdoc)

    dict_uniqwords = {}
    list_tokens = []
    for token in nlp_doc:
        text = token.text
        lemma_lower = token.lemma_.lower()

        if token.is_punct or token.is_space or token.is_stop:
            continue
        # Compare strings, not the Token object: a Token is never a member
        # of a collection of stop-word strings.
        if text.lower() in stopwords or lemma_lower in stopwords:
            continue
        # Any digit disqualifies the token (this also covers pure numbers).
        if any(char.isdigit() for char in text):
            continue
        if text.endswith('.'):
            continue
        if len(text) <= 2:
            continue
        if lemma_lower in months[LANG] or lemma_lower in days[LANG]:
            continue
        if lemma_lower in list_wordstoskip:
            continue
        # Pass only nouns.
        if token.pos_ != 'NOUN':
            continue

        # Count occurrences per lemma; remember the first token of each lemma
        # for vector comparison.
        if lemma_lower not in dict_uniqwords:
            dict_uniqwords[lemma_lower] = 1
            list_tokens.append(token)
        else:
            dict_uniqwords[lemma_lower] += 1
    return dict_uniqwords, list_tokens

#compare a token against a tokenslist
def token_compare(token_totest,list_tokens,min_score:float = 0.6):
    """Compare *token_totest* with every token in *list_tokens*.

    Parameters
    ----------
    token_totest:
        Object exposing ``similarity(other)``; callers in this file pass the
        result of ``nlp(keyword)``.
    list_tokens:
        Tokens to compare against.
    min_score:
        Minimum similarity (inclusive) for a token to count as a match.

    Returns
    -------
    (istax, tot_similar_word, list_tokens_toreturn)
        ``istax`` is True when at least one token reached *min_score*;
        ``tot_similar_word`` is the sum of the matching similarities (0 when
        nothing matched); ``list_tokens_toreturn`` holds one
        ``[token_totest, token, similarity]`` entry per match.
    """
    if DEBUG : print("In function: ",getname(), inspect.signature(globals()[getname()]))

    list_tokens_toreturn = []
    tot_similar_word = 0
    for token in list_tokens:
        similar = token_totest.similarity(token)
        # Only scores >= min_score are kept and added up.
        if similar >= min_score:
            list_tokens_toreturn.append([token_totest, token, similar])
            tot_similar_word += similar

    # Positive for tax as soon as one token matched.
    istax = bool(list_tokens_toreturn)
    return istax, tot_similar_word, list_tokens_toreturn

def createlistofkeywords(numberofdocumenttoscan:int = 50,similar_doc:int = 0,min_score:float = 0.6, list_keywords:list = None) -> (list,list,list):
    """Score documents of the global ``df`` against *list_keywords*.

    Reads the texts from ``df['cleantextnl']``, extracts their noun tokens
    with ``nlp_cleanandlemmatize`` and compares them with every keyword via
    ``token_compare``.

    Parameters
    ----------
    numberofdocumenttoscan:
        Number of leading documents to scan; 0 (or a value larger than the
        number of rows in ``df``) scans all of them.
    similar_doc:
        Minimum accumulated score a document needs to enter the tax list.
    min_score:
        Minimum similarity between a keyword and a token to count as a match.
    list_keywords:
        Keywords to compare against; ``None`` means no keywords.

    Returns
    -------
    (list_keep_pointer_taxdocs, list_keep_tax_words, list_keep_pointer_alldocs)
        ``[doc_index, score]`` pairs of the documents that matched and reached
        *similar_doc*; the distinct lower-case lemmas that matched a keyword
        (in order of first appearance); ``[doc_index, score]`` pairs for every
        scanned document.
    """
    if DEBUG : print("In function: ",getname(), inspect.signature(globals()[getname()]))

    if list_keywords is None:
        list_keywords = []

    list_keep_tax_words = []
    seen_tax_words = set()  # fast duplicate check for list_keep_tax_words
    list_keep_pointer_taxdocs = []
    list_keep_pointer_alldocs = []

    # Take all docs if 0; never scan past the end of the frame.
    total_docs = len(df)
    if numberofdocumenttoscan == 0 or numberofdocumenttoscan > total_docs:
        numberofdocumenttoscan = total_docs

    list_wordstoskip = ['blabla','blablabla'] #add here words to discard
    # The keywords do not change per document, so parse them only once.
    keyword_docs = [nlp(word) for word in list_keywords]
    texts = df['cleantextnl'].values

    for n in range(numberofdocumenttoscan):

        print("Documents analyzed: ",numberofdocumenttoscan,n)

        dict_uniqwords, list_tokens = nlp_cleanandlemmatize(texts[n], list_wordstoskip)

        tot_similar_doc = 0      # score from keywords that matched (istax=True)
        tot_similar_doc_all = 0  # score over all keywords
        istax_doc = False

        for token_word in keyword_docs:
            istax, tot_similar_word, list_res_tokens = token_compare(token_word, list_tokens, min_score)
            tot_similar_doc_all += tot_similar_word

            if istax:
                for taxtoken in list_res_tokens:
                    lemma = taxtoken[1].lemma_.lower()
                    if lemma not in seen_tax_words:
                        seen_tax_words.add(lemma)
                        list_keep_tax_words.append(lemma)
                tot_similar_doc += tot_similar_word
            istax_doc |= istax

        # Store score for all docs.
        list_keep_pointer_alldocs.append([n, tot_similar_doc_all])

        # Store score for tax docs.
        if istax_doc and (tot_similar_doc >= similar_doc):
            list_keep_pointer_taxdocs.append([n, tot_similar_doc])

    return list_keep_pointer_taxdocs, list_keep_tax_words, list_keep_pointer_alldocs

def create_pickle_keywords_and_docscores(list_keywords:list = None, file_keywords:str = "", file_docscores:str = "")-> (list,list):
    """Grow a keyword list in several passes and optionally pickle the results.

    Each pass calls ``createlistofkeywords`` with one row of ``settings`` and
    adds the newly found words to the keyword list used by the next pass.
    After every pass the current keywords and document scores are written to
    the given pickle files (when a file name is supplied), so the files end
    up holding the result of the last pass.

    Parameters
    ----------
    list_keywords:
        Seed keywords; defaults to ``['belasting']``. The caller's list is
        not modified.
    file_keywords:
        Pickle path for the keyword DataFrame (column ``keywords``); empty
        string disables writing.
    file_docscores:
        Pickle path for the score DataFrame (columns ``docpointer`` and
        ``docscores``); empty string disables writing.

    Returns
    -------
    (list_keywords, list_docscores)
        The final de-duplicated keywords (seed order first, then order of
        discovery) and the ``[doc_index, score]`` pairs of the last pass.
    """
    if DEBUG : print("In function: ",getname(), inspect.signature(globals()[getname()]))

    # Work on a copy: the original `+=` extended the caller's list (and the
    # shared default list) in place.
    list_keywords = ['belasting'] if list_keywords is None else list(list_keywords)

    #settings : [numberofdocumenttoscan (0 for all), min similarity score for doc to get into the taxlist, min similarity score for keywords]
    settings = [
        [5, 5, 0.97],    #step1
        [10, 10, 0.95],  #step2
        [15, 20, 0.90],  #step3
        [0, 40, 0.87],   #step4 All documents
    ]

    list_docscores = []
    for docs_to_scan, min_doc_score, min_word_score in settings:
        list_keep_pointer_taxdocs, list_keep_tax_words, list_keep_pointer_alldocs = createlistofkeywords(
            docs_to_scan, min_doc_score, min_word_score, list_keywords)

        # Remove duplicates while keeping a stable, reproducible order.
        list_keywords = list(dict.fromkeys(list_keywords + list_keep_tax_words))
        list_docscores = list_keep_pointer_alldocs

        # No file name, no pickle.
        if file_keywords != "":
            df_keywords = pd.DataFrame(list_keywords, columns=['keywords'])
            df_keywords.to_pickle(file_keywords)

        if file_docscores != "":
            df_docscores = pd.DataFrame(list_docscores, columns=['docpointer','docscores'])
            df_docscores.to_pickle(file_docscores)

    return list_keywords, list_docscores

def score_text(txt:str,language:str = 'nl', min_score:float = 0.3, file_keywords:str = "../data/tax_keywords_nl.pkl") -> float :
    """Return a keyword-similarity score for *txt*.

    The keywords are read from the pickled DataFrame *file_keywords* (column
    ``keywords``). Every keyword is compared with every noun token extracted
    from *txt*; similarities of at least *min_score* are added up.

    Parameters
    ----------
    txt:
        Text to score.
    language:
        Only ``'nl'`` is supported; any other value prints a message and
        yields a score of 0.
    min_score:
        Minimum similarity (inclusive) for a keyword/token pair to count.
    file_keywords:
        Path of the pickled keyword DataFrame.

    Returns
    -------
    The accumulated similarity, or 0 when nothing matched or the language is
    not supported.
    """
    if DEBUG : print("In function: ",getname(), inspect.signature(globals()[getname()]))

    # Defined up front so every path has a value to return (previously an
    # unsupported language ended in an UnboundLocalError).
    docscore = 0

    if language != 'nl':
        print("Language selection not supported for now!")
        return docscore

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    df_keywords = pd.read_pickle(file_keywords)
    list_keywords = list(df_keywords['keywords'])

    #clean txt / get tokens
    dict_uniqwords, list_tokens = nlp_cleanandlemmatize(txt, [])
    if not list_tokens:
        # Nothing to compare against, so skip parsing the keywords.
        return docscore

    for word in list_keywords:
        token_word = nlp(word)
        for token in list_tokens:
            similar = token_word.similarity(token)
            # similarity >= min_score to be taken into account
            if similar >= min_score:
                docscore += similar

    return docscore

def create_initial_keywordlist(language:str ='nl') -> None:
    """Build the keyword pickle and the document-score pickle.

    Loads all documents into the module-level ``df`` and, for Dutch, starts
    the keyword search from a small set of seed words. Any other language
    only prints a notice.
    """
    global df
    # Read all docs; the frame is shared through the global name.
    df = pd.read_pickle("../data/Staatsblad_nl_fr.pkl")

    if language != 'nl':
        print("Language selection not supported for now!")
        return

    # Start the search for nl keywords from these seeds.
    seed_keywords = ['belasting','tax','fisc']
    create_pickle_keywords_and_docscores(
        seed_keywords,
        "../data/tax_keywords_nl.pkl",
        "../data/tax_docscores_nl.pkl",
    )


In [None]:
#This creates the initial keyword list (takes some time to run, about 7 min on my old laptop) and only needs to be run once!
#df = pd.read_pickle("../data/Staatsblad_nl_fr.pkl") 
create_initial_keywordlist()

In [4]:
#Get the score for a single text string
#######################################
#This uses the pickled keyword list created by the create_initial_keywordlist() function
#txt repeats the seed words (belasting, tax, fisc); txt2 is a control sentence without them
txt='de belastingen zijn er weer\n Deze tax keer meer belastingen en meer tax te betalen!\n meer en meer belastingen tax is nodig en fisc'
txt2='this text does not contain any ... related words\n '
d = score_text(txt)
d2 = score_text(txt2)
#Print both scores next to each other for comparison
print("score for d:",d,"score for d2:",d2)

  similar=token_word.similarity(token)


score for d: 32.10280970434261 score for d2: 0


In [None]:
#THIS IS THE END FOR NOW ...