/
link_ingreds_meds_prefixes.py
73 lines (55 loc) · 3.05 KB
/
link_ingreds_meds_prefixes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
'''
Date: 6/29/2012
Author: Scott Halgrim, halgrim.s@ghc.org
Functionality:
Flattens a file that has BN/INs followed by all the drug names in which they
appear into one combo per line while also adding a third column that has the
prefix, that part of the drug name that precedes the BN/IN.
'''
from std_import import *
import re, pdb
# get module logger
logger = logging.getLogger('org.ghri.sharp.drug_ner.link_ingreds_meds_prefixes')


def find_prefix(bnin, longer):
    '''
    Return the part of the drug name `longer` that precedes the BN/IN `bnin`.

    The BN/IN must be preceded by whitespace (so a name that merely starts
    with the BN/IN does not count) and followed by whitespace or the end of
    the string (so it is not matched as the beginning of a longer word).

    bnin   -- the BN/IN string to look for; it is matched literally
    longer -- the longer drug name in which to look for it

    Returns the text before the first such occurrence with surrounding
    whitespace stripped, or None if there is no such occurrence.
    '''
    # re.escape neutralizes every regex metacharacter in the BN/IN, not just
    # parentheses and plus signs, so names containing '.', '[', '*', etc. are
    # matched as plain text instead of being interpreted as a pattern
    prefixre = r'(?<=\s)%s(?=(\s|$))' % re.escape(bnin)
    match = re.search(prefixre, longer)

    if match is None:  # re.search returns None when nothing matched
        return None

    # do the strip because there's probably a trailing space before the BN/IN
    return longer[:match.start()].strip()


def flatten_rows(cols):
    '''
    Flatten rows of [BN/IN, longer name, longer name, ...] into sorted
    tab-separated lines of the form BN/IN <tab> longer name <tab> prefix.

    cols -- iterable of lists of strings; the first item of each list is the
            BN/IN and the remaining items are names in which it should appear

    Pairs for which no prefix can be found (see find_prefix) are reported on
    the module logger and left out of the result rather than aborting the run.

    Returns a new sorted list of strings.
    '''
    flattened = []  # initialize flattened list

    for line in cols:  # for each row of the bulky file
        if not line:  # nothing to do for a row with no columns at all
            continue

        bnin = line[0]  # get BN/IN

        for longer in line[1:]:  # for each longer name of which it's a part
            prefix = find_prefix(bnin, longer)

            if prefix is None:
                # say which pair was bad so the input file can be fixed
                logger.error('BN/IN %r not found after whitespace in %r; '
                             'skipping', bnin, longer)
                continue

            # new tab sep line that is BN/IN then long name then prefix
            flattened.append('\t'.join((bnin, longer, prefix)))

    flattened.sort()  # sort the output lines
    return flattened


if __name__ == '__main__':  # if run as main, not if imported
    # NOTE(review): opts, mylogger, myos and ConfigParser are not imported
    # explicitly in this file; presumably they are supplied by
    # "from std_import import *" -- verify

    # usage string to give if user asks for help or gets command line wrong
    usageStr = '%(prog)s configfile [options]'
    parser = opts.ConfigFileParser(usage=usageStr)  # create cmd line parser
    options = parser.parse_args()  # parse command line

    # start logging at root according to command line
    mylogger.config(logfn=options.logfile, logmode=options.logmode,
                    loglevel=options.loglevel)
    logger.setLevel(options.loglevel)  # set module logging level to input

    # TODO: subclass this so i can set defaults and let the get function
    # have some error checking, etc.
    cp = ConfigParser.SafeConfigParser()  # create config file parser
    cp.read(options.configfn)  # read in config file

    # get name of tab-separated file that contains all of the BNs and INs in
    # the first column and the n-grams of which they are a part in later
    # columns
    bulkyfn = cp.get('Main', 'BulkyFile')
    flatfn = options.outfn  # get name of output file

    # read in lines of bulky file and split by tabs
    bulkylines = [line.strip() for line in myos.readlines(bulkyfn)]
    cols = [line.split('\t') for line in bulkylines]

    # flatten to one BN/IN + name + prefix combo per line and write it out
    myos.writelines(flatten_rows(cols), flatfn)