In [7]:
import os
import camelot
import pandas as pd
import re
import json

In [3]:
data_folder_path = "data/"

# The digits directly in front of "_RM" in a file name are used as the europa number.
rm_file_name_pattern = re.compile(r'(\d+)_RM')


def find_rm_file_paths(root_folder):
    """Recursively collect the "<digits>_RM" files found under ``root_folder``.

    Parameters:
        root_folder: directory to walk (its sub-folders are visited too).

    Returns:
        A list of ``{'europa_number': str, 'file_path': str}`` dicts, one per
        matching file. A missing or empty folder yields an empty list.

    Files whose name contains "_RM" without digits right before it are skipped
    instead of raising ``AttributeError`` on a failed regex match.
    """
    found = []
    for folder_path, sub_folders, files in os.walk(root_folder):
        # Sorting in place makes os.walk descend in a stable order, so the
        # result (and which duplicate number wins later) is reproducible.
        sub_folders.sort()
        for file_name in sorted(files):
            match = rm_file_name_pattern.search(file_name)
            if match is None:
                continue
            found.append({'europa_number': match.group(1),
                          'file_path': os.path.join(folder_path, file_name)})
    return found


rm_file_paths = find_rm_file_paths(data_folder_path)


In [4]:

# Regex for a section-title cell (applied with re.match, so anchored at the
# start): one capital letter, a literal dot, one non-digit character, any text,
# and a final character that is not a dot.
# Raw string: "\." in a normal string literal is an invalid escape sequence.
rm_content_title_pattern = r'[A-Z]\.[^0-9].*[^\.]$'

def extract_as_whole_table(file_path):
    """Read every table camelot finds in the PDF and stack them into one frame.

    Parameters:
        file_path: path of the PDF handed to ``camelot.read_pdf`` (all pages).

    Returns:
        A single DataFrame made of each table's ``.df`` concatenated row-wise,
        re-indexed from 0.
    """
    table_frames = []
    for parsed_table in camelot.read_pdf(file_path, pages='all'):
        table_frames.append(parsed_table.df)
    return pd.concat(table_frames, ignore_index=True)

def filter_rows_for_table1(row):
    """Return True when the cells at keys 0 and 1 of ``row`` both have length > 0.

    The check stops at the first empty cell, so cell 1 is not inspected when
    cell 0 is already empty.
    """
    return all(len(row[column]) > 0 for column in (0, 1))

def extract_table1_from_rm(whole_table_df):
    """Build the header mapping from the first two columns of the stacked tables.

    Parameters:
        whole_table_df: frame with (at least) columns ``0`` and ``1``.

    Returns:
        A dict mapping column 0 to column 1 for every row where both cells are
        present and non-empty. If a key occurs more than once, the last row wins.
    """
    # Only columns 0 and 1 matter here. A plain dropna() would also discard
    # two-column rows whenever another table in the frame is wider, because
    # concatenation pads the narrower rows with NaN in the extra columns.
    table1_df = whole_table_df.dropna(subset=[0, 1])
    keep_row = table1_df.apply(filter_rows_for_table1, axis=1)
    table1_df = table1_df[keep_row]
    return table1_df.set_index(0)[1].to_dict()

def extract_contents_from_rm(whole_table_df, title_pattern=None):
    """Split column 0 of the stacked tables into titled sections.

    Parameters:
        whole_table_df: frame whose column ``0`` holds the text cells.
        title_pattern: regex (used with ``re.match``) that recognises a title
            cell; defaults to the module-level ``rm_content_title_pattern``.

    Returns:
        A list of ``{'title': cell, 'content': [cells...]}`` dicts, one per
        title cell, where ``content`` is every cell after that title up to the
        next title (or the end of the column). Cells before the first title
        are not part of any section; no title at all gives an empty list.
    """
    if title_pattern is None:
        title_pattern = rm_content_title_pattern

    cells = whole_table_df[0].tolist()
    # Work with row positions rather than index labels, so slicing stays
    # correct even when the frame does not carry a default 0..n-1 index.
    title_positions = [
        position for position, cell in enumerate(cells)
        if re.match(title_pattern, cell)
    ]
    # The end of the column closes the last section.
    boundaries = title_positions + [len(cells)]

    return [
        dict(title=cells[start], content=cells[start + 1:stop])
        for start, stop in zip(boundaries, boundaries[1:])
    ]

def extract_whole_document_from_rm(file_path):
    """Parse one RM PDF into its header mapping and its list of titled sections.

    Parameters:
        file_path: path of the PDF to parse.

    Returns:
        ``{'header': ..., 'contents': ...}`` where ``header`` comes from
        ``extract_table1_from_rm`` and ``contents`` from
        ``extract_contents_from_rm``, both computed on the same stacked table.
    """
    stacked_tables = extract_as_whole_table(file_path)
    document = {}
    document['header'] = extract_table1_from_rm(stacked_tables)
    document['contents'] = extract_contents_from_rm(stacked_tables)
    return document


In [5]:
# Parse every discovered RM file. Results are keyed by europa number; files
# that fail to parse are collected in error_path_list instead of stopping the run.
parsed_rm_dict = dict()
error_path_list = []
print(len(rm_file_paths))
previous_line_length = 0
for i, rm_dict in enumerate(rm_file_paths):
    europa_number = rm_dict['europa_number']
    file_path = rm_dict['file_path']
    progress_line = f'{i} : {file_path}'
    # Pad to the previous line's width: '\r' only moves the cursor back, so a
    # shorter path would otherwise leave the tail of the longer one on screen.
    print(progress_line.ljust(previous_line_length), end='\r')
    previous_line_length = len(progress_line)
    try:
        parsed_rm_dict[europa_number] = extract_whole_document_from_rm(file_path)
    except Exception as error:
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working,
        # and the message records why the file was skipped.
        print(f'error: {file_path} ({type(error).__name__}: {error})')
        error_path_list.append(file_path)


208
207 : data/12312/12312_RM.pdf7_env_005_reach_refit_en.pdfuation_en.pdf

In [8]:
# Persist the parsed documents as one JSON object keyed by europa number.
file_path = 'data_out/rm.json'
# Create the output folder first so a fresh checkout does not fail with
# FileNotFoundError when 'data_out/' is missing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(parsed_rm_dict, json_file)