-
Notifications
You must be signed in to change notification settings - Fork 0
/
groundtruth.py
40 lines (34 loc) · 1.36 KB
/
groundtruth.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# -*- coding: utf-8 -*-
"""
- author: Floriane Chiffoleau
- date: April 2022
- description: Retrieving the content of the groundtruth
- input : XML files
- output: TXT file
- usage :
======
python name_of_this_script.py arg1 arg2
arg1: folder containing the groundtruth files in XML format (ALTO or PAGE)
arg2: path of the output TXT file (the extracted text is appended to it)
"""
import os
import sys
from bs4 import BeautifulSoup
# Walk the input folder (sys.argv[1]) and append the text found in every file
# to the output file (sys.argv[2]).
for root, _dirs, files in os.walk(sys.argv[1]):
    for filename in files:
        # Build the path from the directory currently being walked, so that it
        # is valid when the folder argument has no trailing separator and when
        # the file sits in a sub-folder.
        xml_path = os.path.join(root, filename)
        # Binary mode lets the XML parser pick the encoding up from the XML
        # declaration instead of relying on the platform default encoding.
        with open(xml_path, 'rb') as xml_file:
            soup = BeautifulSoup(xml_file, 'xml')

        processed_text_as_list = []
        # Both extractions below run on every file; comment out the one that
        # does not match the type of files being processed if needed.
        # Extract content from the XML ALTO files (CONTENT attribute of String)
        for string in soup.find_all("String"):
            content = string.get("CONTENT")
            if content is not None:
                processed_text_as_list.append(content)
        # Extract content from the PAGE XML files (Unicode inside TextEquiv)
        for text_equiv in soup.find_all("TextEquiv"):
            unicode_tag = text_equiv.find("Unicode")
            # A missing or empty Unicode element has no text (None); skip it,
            # otherwise the join below would raise a TypeError.
            if unicode_tag is not None and unicode_tag.string is not None:
                processed_text_as_list.append(unicode_tag.string)
        new_text = "\n".join(processed_text_as_list)

        # Warning: the output file is opened in append mode, which means the
        # groundtruth is added to it file after file (and run after run).
        # NOTE(review): new_text has no trailing newline, so the first line of
        # the next file continues right after the last line of this one.
        with open(sys.argv[2], "a", encoding="utf-8") as file_out:
            file_out.write(new_text)