In [1]:
# Run the blobfuse shell script for the raadsinformatie data.
# NOTE(review): presumably this mounts the Azure blob storage and is only needed
# when running on Azure (my_run == "azure") — confirm; the path is machine-specific.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [2]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "local"

import my_secrets as sc
import settings as st

# Pick the config module that belongs to the selected environment.
# running_demo is passed on to save_balanced_split further down in the notebook.
if my_run == "azure":
    import config_azure as cf
    running_demo = False
elif my_run == "local":
    import config as cf
    running_demo = True
else:
    # Fail fast: otherwise `cf` and `running_demo` stay undefined and a later
    # cell breaks with a NameError that is hard to trace back to this setting.
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')


import os
if my_run == "azure":
    # Make sure the cache directory exists (parents included, no error when it
    # is already there) and expose it through the TRANSFORMERS_CACHE variable.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

import pandas as pd


## Notebook Overview
Goal: extract the text from all files in the data folder.
Local: extract the text from local_data/data_files.
Azure: see load_txt_azure.ipynb.

1. Create a DataFrame with the folder path belonging to each class. Each folder contains all documents for that class.
2. Create a DataFrame with the file paths of all documents from all classes. Assign an id to each document and split the data.
3. Extract the text from the PDFs and save the DataFrame.


In [3]:

# Get all folder paths in data folder + assign label to that folder
def get_all_paths_to_folders(data_folder=None):
    """Return a DataFrame with one row per class folder inside the data folder.

    Parameters
    ----------
    data_folder : str, optional
        Folder whose sub-folders are the classes. Defaults to
        ``f"{cf.output_path}/data_files"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``label`` (name of the sub-folder) and ``path``
        (``f"{data_folder}/{label}"``), sorted by label.

    Raises
    ------
    OSError
        If ``data_folder`` cannot be listed (e.g. it does not exist).
    """
    if data_folder is None:
        data_folder = f"{cf.output_path}/data_files"

    # os.listdir returns entries in arbitrary order and also lists plain files
    # and hidden entries (e.g. .DS_Store, .ipynb_checkpoints). Keep only real,
    # non-hidden sub-folders and sort them so the result is reproducible.
    labels = sorted(
        entry for entry in os.listdir(data_folder)
        if not entry.startswith('.') and os.path.isdir(f"{data_folder}/{entry}")
    )
    paths = [f"{data_folder}/{label}" for label in labels]

    return pd.DataFrame({'label': labels, 'path': paths}, columns=['label', 'path'])


# Build the overview of class folders (columns: label, path) and show it
folder_paths_df = get_all_paths_to_folders()
display(folder_paths_df)

Unnamed: 0,label,path
0,agenda,../local_data/data_files/agenda
1,motie,../local_data/data_files/motie
2,onderzoeksrapport,../local_data/data_files/onderzoeksrapport
3,raadsadres,../local_data/data_files/raadsadres
4,schriftelijke vraag,../local_data/data_files/schriftelijke vraag
5,voordracht,../local_data/data_files/voordracht


In [4]:
import pandas as pd
import os 
import sys

# load functions to split the data into subsets (train, test, val, dev)
sys.path.append('../src/') 
from data_split import save_split, save_balanced_split

# get all txtfile paths and label them with the class
# add ID and split data into test, train and val

def get_txt_files_path(files_paths):
    """Collect every ``.ocr`` text file of every class folder and split the data.

    Parameters
    ----------
    files_paths : pandas.DataFrame
        One row per class folder, with the columns ``label`` and ``path``.

    Returns
    -------
    pandas.DataFrame
        A DataFrame with the columns ``label``, ``path`` and ``id`` (one row
        per ``.ocr`` file, ids are consecutive integers starting at 0) after
        it has been passed through ``save_split`` and ``save_balanced_split``.
        NOTE(review): judging by the notebook output those helpers add the
        ``4split``, ``2split`` and ``balanced_split`` columns — confirm in
        ``data_split``.
    """
    # Collect plain dicts and build the DataFrame once; appending row by row
    # with df.loc[len(df)] re-allocates the frame on every insert.
    records = []

    for label, folder in zip(files_paths['label'], files_paths['path']):
        # os.listdir order is arbitrary; sorting keeps the id of a document
        # the same between runs and machines.
        ocr_files = sorted(name for name in os.listdir(folder) if name.endswith('.ocr'))

        for file_name in ocr_files:
            records.append({'label': label, 'path': f'{folder}/{file_name}', 'id': len(records)})

    df = pd.DataFrame(records, columns=['label', 'path', 'id'])

    df = save_split(df)
    df = save_balanced_split(df, demo=running_demo)
    return df

# List every .ocr file of every class folder, with id and split columns, and show it
txtfile_paths = get_txt_files_path(folder_paths_df)
display(txtfile_paths)


Unnamed: 0,label,path,id,4split,2split,balanced_split
0,motie,../local_data/data_files/motie/https:||amsterd...,18,train,train,train
1,schriftelijke vraag,../local_data/data_files/schriftelijke vraag/2...,48,dev,train,val
2,agenda,../local_data/data_files/agenda/545033.pdf.ocr,5,test,test,test
3,raadsadres,../local_data/data_files/raadsadres/562829.pdf...,40,train,train,val
4,voordracht,../local_data/data_files/voordracht/9770645.pd...,60,train,train,test
...,...,...,...,...,...,...
57,raadsadres,../local_data/data_files/raadsadres/562806.pdf...,38,train,train,train
58,onderzoeksrapport,../local_data/data_files/onderzoeksrapport/age...,22,train,train,train
59,onderzoeksrapport,../local_data/data_files/onderzoeksrapport/ams...,30,test,test,train
60,schriftelijke vraag,../local_data/data_files/schriftelijke vraag/2...,51,train,train,train


In [5]:
# Count how many documents ended up in each subset of the balanced split
from collections import Counter
print(Counter(txtfile_paths['balanced_split']))

Counter({'train': 50, 'val': 6, 'test': 6})


In [6]:
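# load the txt files of the paths in 'input_df', tokenize them and count the pages of the matching PDFs
# NOTE: the option to only load part of the data ('load') is not implemented in load_txt_files below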

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pandas as pd
from PyPDF2 import PdfReader
import numpy as np

def count_pages(pdf_path):
    """Return the number of pages of the PDF at ``pdf_path``.

    This is a best-effort count: whenever the file cannot be opened or parsed
    (any ``Exception``), ``np.nan`` is returned instead of raising.
    """
    try:
        with open(pdf_path, 'rb') as pdf_handle:
            page_total = len(PdfReader(pdf_handle).pages)
    except Exception:
        # Missing or unreadable PDF: report "unknown" rather than failing.
        return np.nan
    return page_total

def clean_tokens(tokens, stop_words=None):
    """Remove stop words, punctuation tokens and single-character tokens.

    Parameters
    ----------
    tokens : iterable of str
        The tokens to clean.
    stop_words : collection of str, optional
        Lower-cased stop words to remove; tokens are lower-cased before the
        lookup. Defaults to the Dutch NLTK stop word list. Pass a prepared set
        to avoid reloading that list on every call.

    Returns
    -------
    list of str
        The tokens, in their original order and casing, that are longer than
        one character, are not stop words and do not consist solely of
        punctuation characters.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('dutch'))

    # `word in string.punctuation` is a substring test, so multi-character
    # punctuation tokens such as "``", "''", "..." or "--" slipped through.
    # Test every character instead.
    punctuation_chars = set(string.punctuation)

    return [
        word for word in tokens
        if len(word) > 1
        and word.lower() not in stop_words
        and not all(char in punctuation_chars for char in word)
    ]

def load_txt_files(input_df, min_text_length=5):
    """Read, tokenize and describe the text file of every row in ``input_df``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        One row per text file; needs at least a ``path`` column holding the
        path to the ``.ocr`` file. It is not modified.
    min_text_length : int, optional
        Files whose text is not longer than this many characters are skipped.
        Defaults to 5.

    Returns
    -------
    pandas.DataFrame
        All columns of ``input_df`` plus ``text``, ``tokens``, ``token_count``,
        ``clean_tokens``, ``clean_tokens_count``, ``pdf_path`` and
        ``num_pages``; one row per file that passed the length check, indexed
        from 0.

    Raises
    ------
    OSError
        If a text file cannot be opened.
    """
    ocr_suffix = '.ocr'
    added_columns = ['text', 'tokens', 'token_count', 'clean_tokens',
                     'clean_tokens_count', 'pdf_path', 'num_pages']
    columns_list = list(input_df.columns.values) + added_columns

    # Collect plain dicts and build the DataFrame once at the end; appending
    # with df.loc[len(df)] re-allocates the frame on every insert.
    records = []

    for record in input_df.to_dict('records'):
        ocr_path = record['path']

        # Explicit encoding so the result does not depend on the platform default.
        with open(ocr_path, encoding='utf-8') as txt_file:
            text = txt_file.read()

        # Skip files that are (nearly) empty.
        if len(text) <= min_text_length:
            continue

        tokens = word_tokenize(text)
        cleaned_tokens = clean_tokens(tokens)

        # The PDF sits next to the text file: same path without the trailing
        # '.ocr'. Only the suffix is stripped, so '.ocr' elsewhere in the path
        # (e.g. in a folder name) is left alone.
        if ocr_path.endswith(ocr_suffix):
            pdf_path = ocr_path[:-len(ocr_suffix)]
        else:
            pdf_path = ocr_path

        # Carry over every input column instead of a hard-coded subset.
        record.update({
            'text': text,
            'tokens': tokens,
            'token_count': len(tokens),
            'clean_tokens': cleaned_tokens,
            'clean_tokens_count': len(cleaned_tokens),
            'pdf_path': pdf_path,
            'num_pages': count_pages(pdf_path),
        })
        records.append(record)

    return pd.DataFrame(records, columns=columns_list)

# Load and tokenize the text of every file listed in txtfile_paths, then show the result
txt_files_df = load_txt_files(txtfile_paths)
display(txt_files_df)

# save file (currently disabled: uncomment the line below to pickle the DataFrame)
# txt_files_df.to_pickle(f"{cf.output_path}/txtfiles_notcleaned.pkl")

Unnamed: 0,label,path,id,4split,2split,balanced_split,text,tokens,token_count,clean_tokens,clean_tokens_count,pdf_path,num_pages
0,motie,../local_data/data_files/motie/https:||amsterd...,18,train,train,train,x Gemeente Amsterdam R\nGemeenteraad\nx% Gemee...,"[x, Gemeente, Amsterdam, R, Gemeenteraad, x, %...",211,"[Gemeente, Amsterdam, Gemeenteraad, Gemeentebl...",110,../local_data/data_files/motie/https:||amsterd...,
1,schriftelijke vraag,../local_data/data_files/schriftelijke vraag/2...,48,dev,train,val,> Gemeente\nAmsterdam\nSchriftelijke vragen\nD...,"[>, Gemeente, Amsterdam, Schriftelijke, vragen...",278,"[Gemeente, Amsterdam, Schriftelijke, vragen, D...",143,../local_data/data_files/schriftelijke vraag/2...,
2,agenda,../local_data/data_files/agenda/545033.pdf.ocr,5,test,test,test,N Gemeente Amsterdam\n% Stadsdeel Amsterdam-No...,"[N, Gemeente, Amsterdam, %, Stadsdeel, Amsterd...",279,"[Gemeente, Amsterdam, Stadsdeel, Amsterdam-Noo...",171,../local_data/data_files/agenda/545033.pdf,2.0
3,raadsadres,../local_data/data_files/raadsadres/562829.pdf...,40,train,train,val,s a-\naren oi gevestigd in het Aalsmeerder Vee...,"[s, a-, aren, oi, gevestigd, in, het, Aalsmeer...",1526,"[a-, aren, oi, gevestigd, Aalsmeerder, Veerhui...",783,../local_data/data_files/raadsadres/562829.pdf,2.0
4,voordracht,../local_data/data_files/voordracht/9770645.pd...,60,train,train,test,VN2021-004350 N Gemeente Raadscommissie voor B...,"[VN2021-004350, N, Gemeente, Raadscommissie, v...",424,"[VN2021-004350, Gemeente, Raadscommissie, Bouw...",234,../local_data/data_files/voordracht/9770645.pdf,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,raadsadres,../local_data/data_files/raadsadres/562806.pdf...,38,train,train,train,Ondernemers in de Jordaan klagen weer luid ove...,"[Ondernemers, in, de, Jordaan, klagen, weer, l...",183,"[Ondernemers, Jordaan, klagen, weer, luid, bep...",79,../local_data/data_files/raadsadres/562806.pdf,1.0
58,onderzoeksrapport,../local_data/data_files/onderzoeksrapport/age...,22,train,train,train,nl n el\nTeen Ni Ge 8\nOEE rl NN mT\nmene | (E...,"[nl, n, el, Teen, Ni, Ge, 8, OEE, rl, NN, mT, ...",18760,"[nl, el, Teen, Ni, OEE, rl, NN, mT, mene, ES, ...",9358,../local_data/data_files/onderzoeksrapport/age...,52.0
59,onderzoeksrapport,../local_data/data_files/onderzoeksrapport/ams...,30,test,test,train,"GR En i L Ee, tt À rde k Me EEP BEE | NE KN È ...","[GR, En, i, L, Ee, ,, tt, À, rde, k, Me, EEP, ...",28872,"[GR, Ee, tt, rde, EEP, BEE, NE, KN, Ni, sail, ...",14792,../local_data/data_files/onderzoeksrapport/ams...,63.0
60,schriftelijke vraag,../local_data/data_files/schriftelijke vraag/2...,51,train,train,train,> Gemeente\nAmsterdam\n\nSchriftelijke vragen\...,"[>, Gemeente, Amsterdam, Schriftelijke, vragen...",633,"[Gemeente, Amsterdam, Schriftelijke, vragen, D...",343,../local_data/data_files/schriftelijke vraag/2...,
