In [2]:
import re
import os
from os.path import join
import json
import pandas as pd

In [92]:
# Input JSON to read and the directory the Excel exports are written to,
# both relative to the notebook's working directory.
data_p = join('..', 'data', 'afgørelser_split.json')
output_p = join('..', 'output')

# Decode explicitly as UTF-8: the texts contain Danish letters (æ, ø, å), and
# without an explicit encoding ``open`` falls back to the platform default,
# which is not UTF-8 everywhere.
with open(data_p, 'r', encoding='utf-8') as f:
    # Later cells iterate over ``data`` and call ``.get``/``.update`` on each
    # item, i.e. they treat it as a list of dicts.
    data = json.load(f)

In [4]:
# Patterns used to pull case metadata out of the letter text.
# All of them are matched case-insensitively.

# CPR line at the start of a line: captures the two digit runs that follow
# "Cpr. nr." (group 1 and group 2).
cpr_re = re.compile(
    r'\nCpr\.\s?nr\.\s+(\d+)\s+(\d+)',
    flags=re.IGNORECASE,
)

# Municipality: the (optionally hyphenated) word between "Du har klaget over"
# and a following word that starts with "K".
kommune_re = re.compile(
    r'Du har klaget over ([a-zæøå]+(?:\-[a-zæøå]+)?)\s\s?[K]\w+',
    flags=re.IGNORECASE,
)

# Case number: "<digits>-<digits>" after "j.nr" / "j.nr.".
jnr_re = re.compile(
    r'j\.nr\.?\s+([0-9]+\-[0-9]+)',
    flags=re.IGNORECASE,
)

# Caseworker: two or three whitespace-separated words after "venlig hilsen".
caseworker_re = re.compile(
    r'venlig hilsen\s{1,5}([a-zæøå]+\s[a-zæøå]+(\s[a-zæøå]+)?)',
    flags=re.IGNORECASE,
)

In [106]:
def get_info(text, century_cutoff=21):
    """Extract case metadata from the text of one letter.

    Each field is looked up with its module-level regex (``jnr_re``,
    ``cpr_re``, ``kommune_re``, ``caseworker_re``). A field whose pattern
    does not match is reported as the string ``'not found'``.

    Parameters
    ----------
    text : str
        Full text of the letter. Any non-string value (e.g. ``None``) is
        treated as text in which nothing matches.
    century_cutoff : int, optional
        First-group values from the CPR line greater than this are prefixed
        with ``'19'``, all others with ``'20'``. Defaults to 21.

    Returns
    -------
    dict
        Keys ``'jnr'``, ``'birthyear'``, ``'gender'``, ``'kommune'`` and
        ``'caseworker'``, all with string values.
    """
    not_found = 'not found'

    # Non-string input used to be absorbed by bare ``except:`` clauses; handle
    # it explicitly so that real errors are no longer silenced along with it.
    if not isinstance(text, str):
        text = ''

    def _first_group(pattern):
        """Return group 1 of the first match of ``pattern``, or 'not found'."""
        match = pattern.search(text)
        return match.group(1) if match else not_found

    birthyear = not_found
    gender = not_found

    # Search the CPR line once and derive both fields from the same match.
    cpr_match = cpr_re.search(text)
    if cpr_match:
        # Presumably the two-digit birth year -- TODO confirm against the data.
        year_digits = cpr_match.group(1)
        century = '19' if int(year_digits) > century_cutoff else '20'
        birthyear = century + year_digits

        # An even second number is reported as female, an odd one as male.
        gender = 'female' if int(cpr_match.group(2)) % 2 == 0 else 'male'

    return {
        'jnr': _first_group(jnr_re),
        'birthyear': birthyear,
        'gender': gender,
        'kommune': _first_group(kommune_re),
        'caseworker': _first_group(caseworker_re),
    }

In [107]:
# Enrich every record in place with the metadata parsed from its text.
for record in data:
    parsed_fields = get_info(record.get('text'))
    record.update(parsed_fields)

In [108]:
# Tabulate the enriched records, keeping only the overview columns
# (this drops everything else, including the full text).
info_df = pd.DataFrame.from_records(data).loc[
    :, ['filename', 'n', 'jnr', 'birthyear', 'gender', 'kommune', 'caseworker']
]

In [109]:
# Preview the first five rows of the overview table.
info_df.head(5)

Unnamed: 0,filename,n,jnr,birthyear,gender,kommune,caseworker
0,5 - Brev fra Ankestyrelsen (D3165783).pdf,1,21-24495,1997,female,Aarhus,Per Anthony
1,5 - Brev fra Ankestyrelsen (D3165783).pdf,2,21-24846,1979,female,Viborg,Per Anthony
2,5 - Brev fra Ankestyrelsen (D3165783).pdf,3,21-22188,1986,female,Ringkøbing-Skjern,Freja Lee Lilja
3,5 - Brev fra Ankestyrelsen (D3165783).pdf,4,21-23191,1978,female,Horsens,Nuzaht Yasmin Ahmad
4,5 - Brev fra Ankestyrelsen (D3165783).pdf,5,21-23206,1989,male,Glostrup,Per Anthony


In [110]:
## Grounds ("begrundelser")
### The paragraphs of interest open with phrases such as:
### - "Vi lægger vægt …"
### - "Vi lægger også vægt på …"
### - "Vi lægger desuden vægt på …"

# Matches, right after a newline, "vi lægger [<3-10 word chars>] vægt på" and
# everything up to (not including) the first run of
# "<1-3 whitespace>\n<1-3 whitespace>\n". DOTALL lets a match span lines.
important_regex = re.compile(r'(?<=\n)(vi lægger (?:\w{3,10})? ?vægt på.*?)(?=\s{1,3}\n\s{1,3}\n)', re.IGNORECASE|re.DOTALL) 

# One record per entry: its case number and the list of matched paragraphs.
# ``or ''`` makes an entry with missing/empty text yield no grounds instead of
# a TypeError from ``findall``.
grounds = [
    {
        'jnr': entry.get('jnr'),
        'grounds': important_regex.findall(entry.get('text') or ''),
    }
    for entry in data
]

# Naming the columns keeps ``explode`` working when ``grounds`` is empty.
# ``explode`` gives one row per paragraph; entries without any match become
# NaN rows, which are then dropped.
grounds_df = (
    pd.DataFrame(grounds, columns=['jnr', 'grounds'])
    .explode('grounds')
    .dropna(subset=['grounds'])
)

In [112]:
# export

# Create the output directory up front: ``to_excel`` does not create missing
# parent directories and would otherwise fail when the folder is absent.
os.makedirs(output_p, exist_ok=True)

info_out = join(output_p, 'afgørelser_oversigt.xlsx')
# NOTE(review): 'agørelser' looks like a typo for 'afgørelser'. The name is
# kept unchanged so that anything already reading this file keeps working.
grounds_out = join(output_p, 'agørelser_begrundelser.xlsx')

# Write both tables without the DataFrame index column.
info_df.to_excel(info_out, index=False)
grounds_df.to_excel(grounds_out, index=False)