# Keyword extraction notebook

*These keywords will be used as targets for our clustering model*

1. Extract keywords for all documents from the meta_data as provided in `CLA_meta_from_2018.csv`
2. Put them in a dataframe with the text added
3. Save this dataframe to `CLA_targets_NL.csv`


In [12]:
import pandas as pd
import re 
from os import listdir

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

from rake_nltk import Metric, Rake

## Create Dataframe df_target_full

We create a dataframe which will contain the extracted keywords from the themes which are written in the metadata .csv file. For each document there is metadata available.

**Method:**
1. Loop through all the files and create a dataframe with file_id
2. Loop through the metadata from all the files 
- extract metadata keywords with NLTK and exclude stopwords
- add the keywords to the dataframe


In [13]:
# reading csv file
df=pd.read_csv("../data/raw/CLA_meta_from_2018.csv") 

# Add custom stopwords
stopwords = stopwords.words('dutch')

custom_stopwords=['waarvoor','wegens','sommige','betreffende','maatregel','stelsel','excl','aanv','adv','artikel','hoofdstuk','2020','2019','2018','2021','2022','uren']
stopwords.extend(custom_stopwords)
print (stopwords)

rake_nltk_var = Rake(language='dutch',stopwords=stopwords,include_repeated_phrases=False,ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO)

# Go through all the files
input_path='../data/processed/NL'
file_list=listdir(input_path)

doc_list=[]
for file in file_list:
    if 'NL_' in file:
        document_id=file[3:-4]
    else:
        document_id=file[:-4]
    doc_list.append(document_id) 

# Make a dataframe which will have document_id and the keywords in seperate columns
df_target_full=pd.DataFrame()

for document_id in doc_list:

    # get the themes_text (NL)
    themes_text=df.loc[df['filename'].str.contains(document_id),'themes_nl'].values.astype(str).tolist()

    # Check if value is filled and extract keywords
    if pd.isnull(df.loc[df['filename'].str.contains(document_id),'themes_nl'].values)!=True:
        themes_text = re.sub("\)\\\\","",themes_text[0])
        themes_text = re.sub("\)\.","",themes_text)
        themes_text = re.sub(" \)","",themes_text)
        themes_text = re.sub("\),",",",themes_text)
        themes_text = re.sub("-,",",",themes_text)
        themes_text = re.sub(",-",",",themes_text)
        rake_nltk_var.extract_keywords_from_text(themes_text)
        keyword_extracted = rake_nltk_var.get_ranked_phrases()

    theme_keywords=df.loc[df['filename'].str.contains(document_id),'themes_nl'].values[0]

    # Keyword_extracted is a list, so split them into seperate words for the columns 
    doc_keywords=[document_id] 
    targets=set(keyword_extracted)

    # First column will be document_id, so put it as first element
    doc_keywords=[document_id]

    # Exclude keywords with lenght < 4 
    for x in targets:
        if len(x)>3:
            doc_keywords.append(x)

    # Put the keywords to a dataframe and transpose to fit with full_target dataframe
    df_target=pd.DataFrame(doc_keywords)
    df_target=df_target.transpose()

    df_target_full=pd.concat([df_target_full,df_target],ignore_index=True)

['de', 'en', 'van', 'ik', 'te', 'dat', 'die', 'in', 'een', 'hij', 'het', 'niet', 'zijn', 'is', 'was', 'op', 'aan', 'met', 'als', 'voor', 'had', 'er', 'maar', 'om', 'hem', 'dan', 'zou', 'of', 'wat', 'mijn', 'men', 'dit', 'zo', 'door', 'over', 'ze', 'zich', 'bij', 'ook', 'tot', 'je', 'mij', 'uit', 'der', 'daar', 'haar', 'naar', 'heb', 'hoe', 'heeft', 'hebben', 'deze', 'u', 'want', 'nog', 'zal', 'me', 'zij', 'nu', 'ge', 'geen', 'omdat', 'iets', 'worden', 'toch', 'al', 'waren', 'veel', 'meer', 'doen', 'toen', 'moet', 'ben', 'zonder', 'kan', 'hun', 'dus', 'alles', 'onder', 'ja', 'eens', 'hier', 'wie', 'werd', 'altijd', 'doch', 'wordt', 'wezen', 'kunnen', 'ons', 'zelf', 'tegen', 'na', 'reeds', 'wil', 'kon', 'niets', 'uw', 'iemand', 'geweest', 'andere', 'waarvoor', 'wegens', 'sommige', 'betreffende', 'maatregel', 'stelsel', 'excl', 'aanv', 'adv', 'artikel', 'hoofdstuk', '2020', '2019', '2018', '2021', '2022', 'uren']


Let's look at our dataframe

In [14]:
df_target_full.head(-3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,79,80,81,82,83,84,85,86,87,88
0,10202-2018-012766,syndicale vormingmaatregel,overuren,actieve werknemer,indexeringsbepalingen,bediendenstelsel,cheque,premie eigen,jaarbasis,onderneming,...,,,,,,,,,,
1,10202-2020-013175,syndicale vormingmaatregel,overuren,loonsverhogingen,actieve werknemer,indexeringsbepalingen,bediendenstelsel,cheque,premie eigen,jaarbasis,...,,,,,,,,,,
2,10205-2018-004963,loonsverhogingen,bedienden,eenmalige premie,actieve werknemer,landingsbanen,indexeringsbepalingen,bediendenstelsel,premie eigen,jaarbasis,...,,,,,,,,,,
3,10206-2019-003872,vergoedingen,actieve werknemer,groepsverzekeringen,alle premies,aanvullende pensioenen,,,,,...,,,,,,,,,,
4,10206-2020-000814,aanvullende pensioenen,loonsverhogingen,commerceanciënniteitspremie,bedienden,actieve werknemer,indexeringsbepalingen,jaarbasis,onderneming,jonge werknemers,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5523,341-2021-005291,hospitalisatieverzekering,ecocheques,vergoedingen,beloningswijze,actieve werknemer,alternatief voordeel,betaalde verlofdag,cheque,alle premies,...,,,,,,,,,,
5524,341-2021-015425,geschenken,vergoedingen,actieve werknemer,alle premies,premie eigen,cultuurcheques,sport,onderneming,sector,...,,,,,,,,,,
5525,341-2021-015811,bedrijfstoeslag,swtindexeringsbepalingen,werkloosheid,lonen,,,,,,...,,,,,,,,,,
5526,341-2021-015813,ouderschapsverlof,persoonlijke redenen,verlof,,,,,,,...,,,,,,,,,,


#### Write this dataframe to a .csv file

In [16]:
df_target_full.to_csv('../csv/CLA_meta_keywords.csv', sep=";", index=False) 

#### Function to clean the text 

In [17]:
def clean_text(text):
  text = text.lower()
  text = re.sub("[^a-zA-Z\'\-éòóôëè]", " ", text) 
  return " ".join(word_tokenize(text)[:])

### Complete the dataframe with the document text

With this function we add the full text of the document to the dataframe.

The output is written to the file `CLA_targets_NL.csv`

In [20]:
# Add the text of the files (CLA) to the dataframe
import os
def add_CLA_text_to_df():
    input_path=os.path.join('..','data','processed','NL')
    file_list=listdir(input_path)
    doc_list=[]

    for file in file_list:
        if 'NL_' in file:
            document_id=file[3:-4]
        else:
            document_id=file[:-4]
        doc_list.append(document_id) 

    df=pd.read_csv('../csv/CLA_meta_keywords.csv',sep=';')

    for document_id in doc_list:

        try: 
            with open(os.path.join(input_path,f'NL_{document_id}.txt'),encoding="utf-8") as f:
                text = f.read() 
                df.loc[df['0']==document_id,'text']=clean_text(text)
        except FileNotFoundError:
            with open(os.path.join(input_path,f'{document_id}.txt'),encoding="utf-8") as f:
                text = f.read() 
                df.loc[df['0']==document_id,'text']=clean_text(text)


    df.reset_index(inplace=True)
    df.to_csv('../csv/CLA_targets_NL.csv', sep=";", index=False)
    
add_CLA_text_to_df()