# Prepare file list
Prepares a file list (CSV) of all jaarverslagen (annual reports) in our input folder. Afterwards, a human should open each file and fill in its language (`dutch` or `english`) in the file list.

All jaarverslagen should be PDFs placed directly in the input folder, not in subfolders: this script does not search subfolders recursively.

In [37]:
import pandas as pd
from os import listdir
from os.path import join, isfile
import re

# Settings: name of the CSV file list, and the input folder that is scanned for PDFs
file_list_name = 'file_list.csv'
folder = './jaarverslagen'

In [43]:
# Read the folder content (sorted, only PDF).
# fullmatch anchors the pattern at both ends, so only names that really end in
# '.pdf' are kept (re.match would also accept e.g. 'report.pdf.bak'); the raw
# string avoids the invalid-escape warning for '\.'.
pdf_pattern = re.compile(r'.+\.pdf')
filenames = [f for f in sorted(listdir(folder)) if pdf_pattern.fullmatch(f)]
print('Files found:', len(filenames))

# To data frame: one row per PDF, single column 'filename'
df_found_files = pd.DataFrame({'filename': filenames})
df_found_files

Files found: 8


Unnamed: 0,filename
0,ABNAMRO_2017.pdf
1,AEGON_2017.pdf
2,Akzonobel_2017.pdf
3,Heineken_2017.pdf
4,ING_Groep_2017.pdf
5,KPN_2017.pdf
6,Philips_2017.pdf
7,Unilever_2017.pdf


In [44]:
# Read the file list (if it exists) so languages filled in earlier are kept
file_list_path = join(folder, file_list_name)
if isfile(file_list_path):
    df_file_list = pd.read_csv(file_list_path)
    # Report the size of the file list itself; `df` is not defined until the
    # merge step further down, so it must not be used here.
    print('Entries found in file list:', len(df_file_list))
else:
    # Create a new, empty one with the expected columns
    df_file_list = pd.DataFrame(columns=['filename', 'language'])

Entries found in file list: 8


In [45]:
# Left-join on 'filename': every PDF found in the folder is kept, and the
# columns of the existing file list are attached where a matching entry exists
df = pd.merge(df_found_files, df_file_list, how='left', on='filename')
df

Unnamed: 0,filename,language
0,ABNAMRO_2017.pdf,english
1,AEGON_2017.pdf,english
2,Akzonobel_2017.pdf,english
3,Heineken_2017.pdf,english
4,ING_Groep_2017.pdf,english
5,KPN_2017.pdf,english
6,Philips_2017.pdf,english
7,Unilever_2017.pdf,english


In [46]:
# Write the merged list to the CSV in the input folder, without the index column
output_path = join(folder, file_list_name)
df.to_csv(output_path, index=False)