# Prepare file list
Prepares a file list (CSV) of all jaarverslagen (annual reports) found in the input folder. Afterwards, a human should open each PDF and fill in its language (`dutch` or `english`) in the file list.

All jaarverslagen should be PDFs placed directly in the input folder, not in subfolders. This script does not search subfolders recursively.

In [31]:
import pandas as pd
from os import listdir, makedirs
from os.path import join, isfile
import re

# Parameters
# Input folder: its direct contents are listed for PDF files, and the file list
# CSV is read from and written to this same folder.
folder = '../jaarverslagen'
# Name of the file list CSV stored inside `folder`.
file_list_name = 'file_list.csv'

In [32]:
# Read the folder content (sorted, only PDF).
# The pattern is a raw string (so `\.` is a regex escape, not an invalid string
# escape) and is applied with fullmatch, so only names that END in '.pdf' are
# accepted. re.match anchors only at the start and would also accept names such
# as 'report.pdf.bak'.
pdf_pattern = re.compile(r'(.+)\.pdf')
filenames = [f for f in sorted(listdir(folder)) if pdf_pattern.fullmatch(f)]
print('Files found:', len(filenames))

# To data frame
df_found_files = pd.DataFrame({'filename': filenames})

# Attach the filename without extension (capture group 1 of the pattern)
df_found_files['filename_no_extension'] = [
    pdf_pattern.fullmatch(f).group(1) for f in filenames
]

# Each jaarverslag gets its own output folder
output_folder = '../output'
df_found_files['output_folder'] = [
    join(output_folder, name_no_extension)
    for name_no_extension in df_found_files['filename_no_extension']
]

# Also create each output folder if it doesn't exist yet.
# A plain loop is used because this is done purely for its side effect.
for report_output_folder in df_found_files['output_folder']:
    makedirs(report_output_folder, exist_ok=True)

# Show it
df_found_files

Files found: 8


Unnamed: 0,filename,filename_no_extension,output_folder
0,ABNAMRO_2017.pdf,ABNAMRO_2017,../output/ABNAMRO_2017
1,AEGON_2017.pdf,AEGON_2017,../output/AEGON_2017
2,Akzonobel_2017.pdf,Akzonobel_2017,../output/Akzonobel_2017
3,Heineken_2017.pdf,Heineken_2017,../output/Heineken_2017
4,ING_Groep_2017.pdf,ING_Groep_2017,../output/ING_Groep_2017
5,KPN_2017.pdf,KPN_2017,../output/KPN_2017
6,Philips_2017.pdf,Philips_2017,../output/Philips_2017
7,Unilever_2017.pdf,Unilever_2017,../output/Unilever_2017


In [33]:
# Load the existing file list from the input folder, or start with an empty
# one (columns only) when no file list has been saved there yet.
file_list_path = join(folder, file_list_name)
if not isfile(file_list_path):
    # Nothing on disk yet: create a new, empty file list
    df_file_list = pd.DataFrame(columns=['filename', 'language'])
else:
    df_file_list = pd.read_csv(file_list_path)
    print('Entries already found in file list:', len(df_file_list))

Entries already found in file list: 8


In [34]:
# Left-join the languages recorded in the existing file list onto the files
# found on disk, matching on filename; every found file is kept.
known_languages = df_file_list[['filename', 'language']]
df = pd.merge(df_found_files, known_languages, on='filename', how='left')
df

Unnamed: 0,filename,filename_no_extension,output_folder,language
0,ABNAMRO_2017.pdf,ABNAMRO_2017,../output/ABNAMRO_2017,english
1,AEGON_2017.pdf,AEGON_2017,../output/AEGON_2017,english
2,Akzonobel_2017.pdf,Akzonobel_2017,../output/Akzonobel_2017,english
3,Heineken_2017.pdf,Heineken_2017,../output/Heineken_2017,english
4,ING_Groep_2017.pdf,ING_Groep_2017,../output/ING_Groep_2017,english
5,KPN_2017.pdf,KPN_2017,../output/KPN_2017,english
6,Philips_2017.pdf,Philips_2017,../output/Philips_2017,english
7,Unilever_2017.pdf,Unilever_2017,../output/Unilever_2017,english


In [35]:
# Write the merged file list back to the input folder as CSV, without the
# data frame index column.
file_list_csv = join(folder, file_list_name)
df.to_csv(file_list_csv, index=False)