-
Notifications
You must be signed in to change notification settings - Fork 0
/
redactor.py
163 lines (135 loc) · 5.03 KB
/
redactor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import argparse
import glob
import os
from pathlib import Path
import sys
from project1 import redact_pipeline
from project1.models import Settings
def main(settings: Settings):
    """Redact every file matched by the input globs using the project1 module.

    Each matched file is read, passed through ``redact_pipeline`` and the
    result is either echoed to a standard stream or written below the output
    directory as ``<stem>.redacted``. Per-file stats are collected and emitted
    once at the end.

    Parameters
    ----------
    settings : Parsed command line args as an instance of Settings class.
        This function reads ``input`` (iterable of glob patterns), ``output``
        and ``stats`` ('stdout', 'stderr' or a path) and forwards ``redacts``
        and ``concepts`` to ``redact_pipeline``.
    """
    std_streams = ('stdout', 'stderr')

    # Expand every glob pattern into concrete file paths
    input_files = []
    for pattern in settings.input:
        input_files.extend(glob.glob(pattern))

    print('\nFiles processing ->', input_files, '\n')

    process_stats = []
    for input_file in input_files:
        input_file_path = Path(input_file).resolve()

        # A failure on one file must not abort the remaining files. Only
        # Exception is caught so that Ctrl-C (KeyboardInterrupt) and
        # SystemExit still stop the whole run.
        try:
            # NOTE(review): the read uses the platform default encoding while
            # redacted output is written as UTF-8 -- confirm whether the input
            # encoding should be pinned as well.
            with open(input_file_path, 'r') as in_f:
                unredacted_txt = in_f.read()
            # The file is already closed here; the pipeline only needs text
            redacted_txt, stats = redact_pipeline(
                unredacted_txt,
                redacts=settings.redacts,
                concepts=settings.concepts)
            process_stats.append(
                f"========== Stats:{input_file} ==========\n{stats}")
        except Exception as err:
            # Report the reason on stderr and continue with the next file
            sys.stderr.write(
                f'Could not read and redact {input_file} '
                f'({type(err).__name__}: {err})\n\n')
            continue

        if settings.output in std_streams:
            print(f'Redacting {input_file} ->')
            output_to_std(redacted_txt, settings.output)
        else:
            # Mirror the input's folder layout below the output directory.
            # An absolute input path would otherwise replace the output
            # directory entirely when joined, so its anchor ('/' or 'C:\\')
            # is stripped first.
            source = Path(input_file)
            if source.is_absolute():
                source = source.relative_to(source.anchor)
            output_file_folder = (Path(settings.output) / source).parent
            output_file_folder.mkdir(parents=True, exist_ok=True)
            output_file = output_file_folder / f'{source.stem}.redacted'

            print(f'Redacting {input_file} -> {output_file.resolve()}')
            output_to_file(redacted_txt, output_file)

    print(f'Stats added to {settings.stats}\n')
    all_stats = "\n\n".join(process_stats)
    if settings.stats in std_streams:
        output_to_std(all_stats, settings.stats)
    else:
        # A relative stats path is taken relative to the working directory
        output_to_file(all_stats, Path.cwd() / settings.stats)
def output_to_std(content: str, output: str):
    """Writes the content to a standard stream (stdout or stderr)

    Parameters
    ----------
    content : Content to be written
    output : Name of the standard stream; 'stderr' selects sys.stderr, any
        other value writes to sys.stdout
    """
    # Write to the chosen stream directly instead of temporarily rebinding
    # sys.stdout: if a write raised, the global sys.stdout would otherwise be
    # left pointing at stderr for the rest of the program.
    stream = sys.stderr if output == 'stderr' else sys.stdout
    stream.write(content)
    # Blank line separates consecutive outputs
    stream.write('\n\n')
def output_to_file(content: str, output_file: Path):
    """Writes the content to the given file as UTF-8, replacing any old content

    Parameters
    ----------
    content : Content to be written
    output_file : File to which content has to be written; missing parent
        folders are created
    """
    target = output_file.resolve()
    # Without this a path inside a folder that does not exist yet fails with
    # FileNotFoundError and the content is lost
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'w', encoding='utf-8') as out_f:
        out_f.write(content)
if __name__ == '__main__':
    # Command line entry point: declare the options, turn the parsed
    # arguments into a Settings object and run the redaction.
    arg_parser = argparse.ArgumentParser(
        description='Redacts sensitive content in a given file.')

    # Source and destination options; --input may be repeated
    arg_parser.add_argument(
        "--input",
        required=True,
        action="append",
        help="<Required> glob of input files (eg:'*.txt')")
    arg_parser.add_argument(
        "--output",
        required=True,
        help="<Required> directory to store redacted files "
             "(or stdout/stderr)")

    # Boolean switches, all off unless given on the command line
    arg_parser.add_argument(
        "--names",
        action="store_true",
        default=False,
        help="redacts names (Human, Location and Organizations)")
    arg_parser.add_argument(
        "--genders",
        action="store_true",
        default=False,
        help="redacts gender revealing words (He, She, Father, Mother)")
    arg_parser.add_argument(
        "--dates",
        action="store_true",
        default=False,
        help="redacts dates (Feb 3rd, 03/02/2022, February 3, 2022)")
    arg_parser.add_argument(
        "--phones",
        action="store_true",
        default=False,
        help="redacts phone numbers (only 10 digit phone numbers)")
    arg_parser.add_argument(
        "--address",
        action="store_true",
        default=False,
        help="redacts US based physical addresses")

    # --concept may be repeated, each occurrence is appended to a list
    arg_parser.add_argument(
        "--concept",
        action='append',
        help='<Required> redacts sentence based on given concept',
        required=True)
    arg_parser.add_argument(
        "--stats",
        required=True,
        help="<Required> where to write the stats: stdout, stderr or a "
             "file path")

    args = arg_parser.parse_args()
    settings = Settings.parse(vars(args))

    try:
        main(settings)
    except Exception as e:
        # Top-level boundary: say what went wrong and exit with a non-zero
        # status so calling scripts can detect the failure.
        sys.stderr.write(
            f"Unexpected error occurred! ({type(e).__name__}: {e})\n")
        sys.exit(1)