In [1]:
# Run the blobfuse shell script for the raadsinformatie storage before any path below is read.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [2]:
import sys
sys.path.append("..")

# Select where to run the notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

# Load the configuration module that matches the selected run target.
if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    # Fail fast: without this branch `cf` would stay undefined and the first
    # cell that touches `cf.output_path` would raise an unrelated NameError.
    raise ValueError(f"Unknown my_run value {my_run!r}; expected 'azure' or 'local'")


import os
from collections import Counter

import pandas as pd

## Notebook overview
Goal: get dataframe with txt files
- Starting point: class_overview.csv (excel file) with an overview of the classes and where they are located
- Checkpoint 1: folder_paths.csv -> get all folder paths based on class_overview. This code is partly hardcoded, because two classes had nested folders.
- Checkpoint 2: txtfile_paths.csv -> file with all paths to .ocr of the documents. These are the ones we want to have in the end dataframe
- Ending: txtfiles_notcleaned.pkl -> file with the text extracted from the documents


Previous notebook - none, first notebook


Next notebook - clean_data.ipynb

In [5]:
# Define all paths
# Overview of all the classes that need to be included; a little messy with some paths.
CLASS_OVERVIEW_PATH = f"{cf.output_path}/path_files/class_overview.csv"
# Clean file, similar to the class overview, but with one row per folder
# (especially necessary for classes that have files in multiple folders).
FOLDER_PATH = f"{cf.output_path}/path_files/folder_paths.csv"
# File with the path to each txt file.
TXTFILES_PATH =f"{cf.output_path}/path_files/txtfile_paths.csv"
# File with the text extracted from the txt files, including the data split.
TXTFILES_NOTCLEANED_PATH = f"{cf.output_path}/txtfiles_notcleaned.pkl"

#### Checkpoint 1: Get all folder paths -> folder_paths.csv

In [8]:
# specifically adjusted for Moties and Onderzoeksrapporten
# Goal: add row for each path, so that each path leads to files, and not folders
def get_all_paths_from_nested_folders(input_df, base_path=None):
    """Append one 'FULL_PATH' row for every folder that directly contains files.

    Rows with ``path_end == 'FOLDERS'`` are expanded by listing their
    sub-folders: a path ending in 'Moties' is expanded two levels deep
    (year/month), every other path one level deep (year). Rows with
    ``path_end == 'FILES'`` get a single row with the base path prepended.
    The original rows are kept; the new rows are appended after them.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Class overview with at least the columns 'Category', 'Status',
        'Converted', 'Path' and 'path_end'. It is not modified.
    base_path : str, optional
        Prefix that is concatenated (without separator) in front of each
        'Path' value. Defaults to ``cf.blobfuse_path``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_df`` followed by the generated 'FULL_PATH' rows.
    """
    if base_path is None:
        base_path = cf.blobfuse_path

    df = input_df.copy()
    folder_rows = df.loc[df['path_end'] == 'FOLDERS']
    file_rows = df.loc[df['path_end'] == 'FILES']

    def _full_path_row(source_row, full_path):
        # New rows inherit the class metadata; the document count is unknown here.
        return {
            'Category': source_row['Category'],
            'Scraped docs': 'missing',
            'Status': source_row['Status'],
            'Converted': source_row['Converted'],
            'Path': full_path,
            'path_end': 'FULL_PATH',
        }

    # Collect rows in a list and concatenate once: growing the frame with
    # df.loc[len(df)] copies it on every insert.
    new_rows = []

    # Paths that lead into folders instead of files (Moties and Onderzoeksrapporten).
    for _, row in folder_rows.iterrows():
        root = f"{base_path}{row['Path']}"
        nested_in_months = row['Path'].endswith('Moties')
        # os.listdir returns entries in arbitrary order; sort for reproducible output.
        for year in sorted(os.listdir(root)):
            if nested_in_months:
                for month in sorted(os.listdir(f"{root}/{year}")):
                    new_rows.append(_full_path_row(row, f"{root}/{year}/{month}"))
            else:
                new_rows.append(_full_path_row(row, f"{root}/{year}"))

    # Paths that already lead straight to files only need the base path prepended.
    for _, row in file_rows.iterrows():
        new_rows.append(_full_path_row(row, f"{base_path}{row['Path']}"))

    if not new_rows:
        return df

    additions = pd.DataFrame(new_rows, columns=df.columns)
    return pd.concat([df, additions], ignore_index=True)

# Checkpoint 1 driver: read the class overview and expand it to one row per folder.
overview_df = pd.read_csv(CLASS_OVERVIEW_PATH)
new_df = get_all_paths_from_nested_folders(overview_df)
display(new_df)
# NOTE(review): saving is disabled, but the Checkpoint 2 cell reads FOLDER_PATH;
# uncomment the line below whenever that checkpoint file has to be (re)generated.
# new_df.to_csv(FOLDER_PATH, index=False)

Unnamed: 0,Category,Scraped docs,Status,Converted,Path,path_end
0,Actualiteit,996,Collected BUT MESSY!,DONE,raadsinformatie/raadsinformatie/search_results...,FILES
1,Adviesaanvraag,2480,Collected,DONE,raadsinformatie/raadsinformatie/search_results...,FILES
2,Agenda,missing,Collected,DONE,raadsinformatie/raadsinformatie/search_results...,FILES
3,Agenda,missing,Collected,DONE,raadsinformatie/raadsinformatie/search_results...,FILES
4,Amendement,1978,Collected,DONE,raadsinformatie/raadsinformatie/search_results...,FILES
...,...,...,...,...,...,...
167,Verslag,missing,Collected (cant get more than this),DONE,/home/azureuser/cloudfiles/code/blobfuse/raads...,FULL_PATH
168,Voordracht,missing,Collected,DONE,/home/azureuser/cloudfiles/code/blobfuse/raads...,FULL_PATH
169,Voordracht,missing,Collected,DONE,/home/azureuser/cloudfiles/code/blobfuse/raads...,FULL_PATH
170,Schriftelijke Vragen,missing,Collected,DONE,/home/azureuser/cloudfiles/code/blobfuse/raads...,FULL_PATH


#### Checkpoint 2: Get all file paths -> txtfile_paths.csv
Get all txt file paths and split them into test, training, validation and development sets.

In [20]:
import pandas as pd
import os 
import sys

# load functions to split data into subsets (train, test, val, dev)
sys.path.append('../scripts/') 
from data_split import save_split, save_balanced_split

# Maps the category names used in the overview file to the class labels used
# downstream. Several categories are merged: 'Termijnagenda' is labelled
# 'Agenda' and 'Verslag' is labelled 'Raadsnotulen'.
label_mapping = {
    'Actualiteit': 'Actualiteit',
    'Agenda': 'Agenda',
    'Besluit': 'Besluit',
    'Brief': 'Brief',
    'Factsheets': 'Factsheet',
    'Motie': 'Motie',
    'Onderzoeksrapport': 'Onderzoeksrapport',
    'Raadsadres': 'Raadsadres',
    'Raadsnotulen': 'Raadsnotulen',
    'Schriftelijke Vragen': 'Schriftelijke Vraag',
    'Termijnagenda': 'Agenda',
    'Voordracht': 'Voordracht',
    'Adviesaanvraag': 'Adviesaanvraag',
    'Amendement': 'Amendement',
    'Begroting': 'Begroting',
    'Verslag': 'Raadsnotulen',
}
# Keep the parallel lists available under their original names.
keys = list(label_mapping.keys())
values = list(label_mapping.values())

# get all txtfile paths and label them with the class
# add ID and split data into test, train and val

def get_txt_files_path(overview):
    """Build a dataframe with one row per '.ocr' file and add the data splits.

    Only overview rows with ``path_end == 'FULL_PATH'`` are used. Every file
    in such a folder whose name ends in '.ocr' becomes a row holding the
    mapped class label, the path to the file and a running integer id. The
    frame is then passed through ``save_split`` and ``save_balanced_split``.

    Parameters
    ----------
    overview : pandas.DataFrame
        Folder overview with the columns 'Category', 'Path' and 'path_end'.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``save_balanced_split``.

    Raises
    ------
    KeyError
        If a folder containing '.ocr' files has a 'Category' that is not a
        key of ``label_mapping``.
    """
    # only paths that lead straight to files are listed
    full_path_rows = overview.loc[overview['path_end'] == 'FULL_PATH']

    # Collect plain dicts and build the frame once; appending with
    # df.loc[len(df)] copies the frame on every insert.
    records = []
    for _, row in full_path_rows.iterrows():
        folder = row['Path']
        # os.listdir returns names in arbitrary order; sorting makes the
        # assigned ids reproducible between runs.
        ocr_files = sorted(name for name in os.listdir(f"{folder}") if name.endswith('.ocr'))
        first_id = len(records)
        records.extend(
            [
                {'label': label_mapping[row['Category']], 'path': f'{folder}/{name}', 'id': file_id}
                for file_id, name in enumerate(ocr_files, start=first_id)
            ]
        )

    # each row corresponds to one txt file
    df = pd.DataFrame(records, columns=['label', 'path', 'id'])

    df = save_split(df)
    df = save_balanced_split(df)
    return df

# Checkpoint 2 driver: read the folder paths and collect the labelled txt file paths.
folder_paths_df = pd.read_csv(FOLDER_PATH)
txt_df = get_txt_files_path(folder_paths_df)
display(txt_df)
# NOTE(review): saving is disabled, but the final cell reads TXTFILES_PATH;
# uncomment the line below whenever that checkpoint file has to be (re)generated.
# txt_df.to_csv(TXTFILES_PATH, index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  remaining_df['balanced_split'] = 'discard'


Unnamed: 0,label,path,id,4split,2split,balanced_split
0,Besluit,/home/azureuser/cloudfiles/code/blobfuse/raads...,20738,train,train,train
1,Motie,/home/azureuser/cloudfiles/code/blobfuse/raads...,959,test,test,discard
2,Agenda,/home/azureuser/cloudfiles/code/blobfuse/raads...,25937,train,train,train
3,Raadsadres,/home/azureuser/cloudfiles/code/blobfuse/raads...,25107,train,train,train
4,Amendement,/home/azureuser/cloudfiles/code/blobfuse/raads...,16954,test,test,train
...,...,...,...,...,...,...
33123,Voordracht,/home/azureuser/cloudfiles/code/blobfuse/raads...,28114,train,train,val
33124,Amendement,/home/azureuser/cloudfiles/code/blobfuse/raads...,18283,test,test,train
33125,Agenda,/home/azureuser/cloudfiles/code/blobfuse/raads...,15869,train,train,train
33126,Motie,/home/azureuser/cloudfiles/code/blobfuse/raads...,3955,train,train,test


#### Ending: Load txt files -> txtfiles_notcleaned.pkl
Takes as input a df where each row contains the path to the document and the label of the doc. The Checkpoint 2 code above builds all the file paths from the folder paths.

In [16]:
# load txt files of the paths in 'input_df'
# choose to only load part of the data -> 'load'

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pandas as pd
from PyPDF2 import PdfReader
import numpy as np

def count_pages(pdf_path):
    """Return the number of pages of the PDF at ``pdf_path``.

    Counting is best-effort: when the file cannot be opened or parsed, the
    error is printed and ``np.nan`` is returned instead of raising.
    """
    try:
        with open(pdf_path, 'rb') as pdf_handle:
            page_total = len(PdfReader(pdf_handle).pages)
    except Exception as err:
        print(f"Error counting pages for '{pdf_path}': {err}")
        page_total = np.nan
    return page_total

# Stop-word sets that were already loaded, keyed by language name.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='dutch'):
    """Return the NLTK stop-word set for ``language``, loading it only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = set(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def clean_tokens(tokens, stop_words=None):
    """Remove stop words, punctuation and one-character tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to the Dutch NLTK stop-word list,
        which is loaded on first use and then reused for later calls.

    Returns
    -------
    list of str
        The tokens that remain, in their original order and casing.
    """
    if stop_words is None:
        stop_words = _get_stop_words('dutch')

    # NOTE(review): `word not in string.punctuation` is a substring test on
    # the punctuation string, so a token such as '--' is kept while '()' is
    # dropped. Kept as-is to leave existing token counts unchanged.
    return [
        word
        for word in tokens
        if word.lower() not in stop_words
        and word not in string.punctuation
        and len(word) > 1
    ]

def load_txt_files(input_df):
    """Read every txt file listed in ``input_df`` and add text statistics.

    For each row the file at 'path' is read. Files with 5 characters or
    fewer are skipped. For the remaining files the text is tokenised, the
    tokens are cleaned with ``clean_tokens`` and the pages of the matching
    PDF are counted with ``count_pages``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        One row per txt file with at least the columns 'label' and 'path'.
        All input columns are carried over to the result. The frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        The input columns followed by 'text', 'tokens', 'token_count',
        'clean_tokens', 'clean_tokens_count', 'pdf_path', 'num_pages' and
        'old_label' (the label before 'Termijnagenda' is merged into 'Agenda').
    """
    extra_columns = ['text', 'tokens', 'token_count', 'clean_tokens', 'clean_tokens_count', 'pdf_path', 'num_pages']
    columns_list = list(input_df.columns.values) + extra_columns

    ocr_suffix = '.ocr'

    # Collect plain dicts and build the frame once; appending with
    # df.loc[len(df)] copies the whole frame, including all texts, per row.
    records = []
    for record in input_df.to_dict('records'):
        # extract text
        # NOTE(review): no encoding is passed, so the platform default is
        # used -- confirm that it matches how the .ocr files were written.
        with open(record['path']) as txt_file:
            text = txt_file.read()

        # skip documents whose text is not longer than 5 characters
        if len(text) <= 5:
            continue

        tokens = word_tokenize(text)
        cleaned_tokens = clean_tokens(tokens)

        # The PDF path is the txt path without its '.ocr' suffix. Only the
        # suffix is removed, so '.ocr' elsewhere in the path is left alone.
        ocr_path = record['path']
        pdf_path = ocr_path[:-len(ocr_suffix)] if ocr_path.endswith(ocr_suffix) else ocr_path

        # `record` already holds every input column (including columns such
        # as 'balanced_split'), so none of them is lost in the result.
        record.update({
            'text': text,
            'tokens': tokens,
            'token_count': len(tokens),
            'clean_tokens': cleaned_tokens,
            'clean_tokens_count': len(cleaned_tokens),
            'pdf_path': pdf_path,
            'num_pages': count_pages(pdf_path),
        })
        records.append(record)

    return_df = pd.DataFrame(records, columns=columns_list)

    # combine termijnagenda with agenda
    return_df['old_label'] = return_df['label']
    return_df.loc[return_df['label'] == 'Termijnagenda', 'label'] = 'Agenda'
    return return_df

# Final driver: read the txt file paths and load the text of every document.
txtfile_paths = pd.read_csv(TXTFILES_PATH)
txt_files_df = load_txt_files(txtfile_paths)
display(txt_files_df)

# save to blobfuse
# txt_files_df.to_pickle(TXTFILES_NOTCLEANED_PATH)

incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)


Error counting pages for '/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/raadsinformatie/search_results/amendement/2275509.pdf': startxref not found


incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)


Error counting pages for '/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/OpenResearch/onderzoek-in-de-gemeenteraad/onderzoeken-rapporten-2023-gemeenteraad/jaarverslag-amsterdams-fonds-voor-de-kunst-2022.pdf': PyCryptodome is required for AES algorithm


incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)


Error counting pages for '/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/raadsinformatie/Moties/2013/10/https:||amsterdam.raadsinformatie.pdf': [Errno 2] No such file or directory: '/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/raadsinformatie/Moties/2013/10/https:||amsterdam.raadsinformatie.pdf'


incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)


Error counting pages for '/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/OpenResearch/onderzoek-in-de-gemeenteraad/onderzoeken-rapporten-2020-gemeenteraad/discriminatiecijfers-2017.pdf': PyCryptodome is required for AES algorithm


incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)


Error counting pages for '/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/raadsinformatie/Moties/2013/1/https:||amsterdam.raadsinformatie.pdf': [Errno 2] No such file or directory: '/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/raadsinformatie/Moties/2013/1/https:||amsterdam.raadsinformatie.pdf'


incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)


Error counting pages for '/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/OpenResearch/onderzoek-in-de-gemeenteraad/onderzoeken-rapporten-2023-gemeenteraad/rapportage-algoritme-analyse---pilot-gedigitaliseerde-fraudepreventie.pdf': PyCryptodome is required for AES algorithm


incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrec

Error counting pages for '/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/OpenResearch/onderzoek-in-de-gemeenteraad/onderzoeken-rapporten-2022-gemeenteraad/rapport-over-de-grens.pdf': PyCryptodome is required for AES algorithm


incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(1)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)


Error counting pages for '/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/OpenResearch/onderzoek-in-de-gemeenteraad/onderzoeken-rapporten-2023-gemeenteraad/pfas-in-nederlands-drinkwater.pdf': PyCryptodome is required for AES algorithm


incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(1)
incorrect startxref pointer(3)


Error counting pages for '/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/OpenResearch/onderzoek-in-de-gemeenteraad/onderzoeken-rapporten-2020-gemeenteraad/lachgas---risicobeoordeling-rapport-cam.pdf': PyCryptodome is required for AES algorithm


incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(1)


Error counting pages for '/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/OpenResearch/onderzoek-in-de-gemeenteraad/onderzoeken-rapporten-2022-gemeenteraad/pfas-in-nederlands-drinkwater.pdf': PyCryptodome is required for AES algorithm
Error counting pages for '/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/raadsinformatie/SchriftelijkeVragen/2013-10-https:||amsterdam.raadsinformatie.nl|document|4428416|1|1064B_13_bijlage_2_beantwoording_schriftelijke_vragen_Schimmelpennink_NO_Windtunnelonderzoek_A10_West_Amsterdam': PyCryptodome is required for AES algorithm


incorrect startxref pointer(3)


Error counting pages for '/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/OpenResearch/onderzoek-in-de-gemeenteraad/onderzoeken-rapporten-2023-gemeenteraad/samen-werken-aan-de-stad.pdf': PyCryptodome is required for AES algorithm


incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
Object ID 2,0 ref repaired
Object ID 4,0 ref repaired
Object ID 10,0 ref repaired
Object ID 14,0 ref repaired
Object ID 19,0 ref repaired
Object ID 23,0 ref repaired
Object ID 31,0 ref repaired
Object ID 35,0 ref repaired
Object ID 43,0 ref repaired
Object ID 47,0 ref repaired
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
incorrect startxref pointer(3)


KeyboardInterrupt: 