-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathclean_comments.py
89 lines (70 loc) · 3.25 KB
/
clean_comments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import glob
import re
from smart_open import smart_open
# set the directory of your exported project (interactive prompt; forward
# slashes work on any OS)
webanno_project = input("Path to exported WebAnno project: (e.g. 'C:/.../curation')")
# destination folder that the cleaned TSV copies will be written into
write_directory = input("Path to folder to write new TSVs to: (e.g. 'C:/.../clean_TSVs')")
# note that if the folder you write to does not already exist, this may cause an error
# later, when the cleaned files are actually written out
def getcontents(directory):
    """
    Return the paths of everything directly inside *directory*, with all
    backslashes normalized to forward slashes.

    Equivalent to glob.glob on ``directory + '/*'`` plus the slash fix-up,
    so Windows-style results compare cleanly against '/'-joined paths.
    :param directory: the path to a folder (without a '/' at the end)
    :return: a list of the contents of that folder
    """
    matches = glob.glob(directory + '/*')
    return [match.replace('\\', '/') for match in matches]
# find the folders in your project
folders = getcontents(webanno_project)
# get the paths to each file in the project
files1 = [getcontents(doc) for doc in folders]
# keep the first file found inside each document folder; folders that contain
# nothing are skipped explicitly (the original bare `except: continue` would
# also have hidden unrelated errors, not just the IndexError on empty lists)
files = [contents[0] for contents in files1 if contents]
# clean the files so they can be read as tsv's
# generate new names for cleaned files
commonsource = webanno_project + '/'
commonname = '/CURATION_USER.tsv'
# each entry of `files` looks like <webanno_project>/<docname>.<ext>/CURATION_USER.tsv;
# strip the project prefix, the fixed /CURATION_USER.tsv suffix, and the
# document's file extension, then tag the remaining name as cleaned.
# (Plain prefix/suffix slices replace the original length-arithmetic slices,
# which were hard to read and misbehave on degenerate equal-length inputs.)
cleannames = [name[len(commonsource):] for name in files]  # -> <docname>.<ext>/CURATION_USER.tsv
cleannames = [name[:-len(commonname)] for name in cleannames]  # cut out the /CURATION_USER.tsv part
cleannames = [re.sub(r"\..*", "", name) for name in cleannames]  # strip the file extension
cleannames = [name + '_cleaned.tsv' for name in cleannames]
# generate new directories for cleaned files
cleandirs = [write_directory.replace('\\', '/') + '/' + name for name in cleannames]
# actually clean those comments
def cleancomment(path):
    """
    Cleans a file of any lines beginning with '#' - these lines prevent the file from being read properly into a Pandas
    dataframe.
    :param path: the path to a file
    :return: the contents of the file, with any lines starting with '#' removed
    """
    # str.startswith replaces the regex test (re.match('#', ...) only ever
    # checked the first character), and ''.join replaces the original
    # character-by-character concatenation, which was accidentally quadratic.
    with smart_open(path, 'r') as f:
        kept = [line for line in f if not line.startswith('#')]
    return ''.join(kept)
def cleancomments(readdirs, writedirs, readnames=()):
    """
    Cleans the comments in readdirs and writes them to writedirs. Be sure the two lists are the same length and order,
    or it will raise an IndexError.
    :param readdirs: a list of files to clean
    :param writedirs: a list of files to write to; i.e. paths to the new, clean files
    :param readnames: a list of names used to report which file has been cleaned. If unspecified, will not report that
        any files have been cleaned (but will still clean them). An immutable tuple
        default replaces the original mutable-list default argument.
    :return: None
    """
    # iterate to the longer length on purpose so a mismatch between the two
    # lists fails loudly with an IndexError instead of silently skipping files
    for i in range(max(len(readdirs), len(writedirs))):
        with smart_open(writedirs[i], 'w') as f:
            f.write(cleancomment(readdirs[i]))
        if readnames:
            print(readnames[i] + ' cleaned')
# Write cleaned comments to assigned folder
# (passing readnames makes cleancomments print a '<name> cleaned' line per file)
cleancomments(files, cleandirs, readnames=cleannames)