Skip to content

Commit

Permalink
closes #3
Browse files Browse the repository at this point in the history
  • Loading branch information
David Underdown committed Feb 11, 2018
1 parent 1aaadbb commit e64c1fb
Showing 1 changed file with 35 additions and 0 deletions.
35 changes: 35 additions & 0 deletions discovery_api_SearchRecords.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,13 @@
## initialise list of the individual regex groups
descfields_list=[]

## as part of checking if we have all labels, will be useful to know length of longest supplied label
longest_label_length=0

## Go through the label list for each label in turn, construct a normalised label id, and construct the high level regex group for that label and its related text
for label in labels :
if len(label) > longest_label_length :
longest_label_length=len(label)
## construct the normalised label_id, add to list of label_ids
label_id=label
## remove punctuation characters
Expand All @@ -57,6 +62,7 @@
desc_fields=regex.compile(redescfields, flags=regex.POSIX|regex.VERSION1) ## revised version using regex library to get left longest match using POSIX flag under VERSION1
## Confirm the regex to be used
print("regex for extracting data from description:",desc_fields.pattern)
# label_start=regex.compile(r"(?r)(?<start>:|(. )|\[|\])")

def search_for_match(v) :
'''Find match for each row (to be saved in temporary column in DataFrame)'''
Expand Down Expand Up @@ -95,6 +101,32 @@ def no_extracted_data(v) :
print("no data extracted from description for",v["reference"],v["description"])
return no_extracted_data

def other_possible_labels(v) :
desc_without_known_labels=v["description"]
other_possible_labels=[]
for label in labels :
desc_without_known_labels=desc_without_known_labels.replace(label+":","")
max_possible_other_labels=desc_without_known_labels.count(":")
if max_possible_other_labels > 0 :
start_pos=0
for i in range(max_possible_other_labels) :
colon_pos=desc_without_known_labels.find(":",start_pos)
start_pos=colon_pos+1
# begin_label_slice=max(0,colon_pos-longest_label_length)
# match=label_start.search(desc_without_known_labels,colon_pos-1)
# if match :
# begin_label_slice=match.end("start")
# print(str(begin_label_slice),str(colon_pos))
begin_label_slice=max(desc_without_known_labels.rfind(".",0,colon_pos-1),desc_without_known_labels.rfind("[",0,colon_pos-1))
print(str(begin_label_slice),str(colon_pos-1))
new_label_candidate=desc_without_known_labels[begin_label_slice:colon_pos].strip("".join((string.whitespace,string.punctuation,string.digits)))
other_possible_labels.append(new_label_candidate)
if len(other_possible_labels) > 0 :
print("Additional possible data labels found in",v["reference"],str(other_possible_labels))
return other_possible_labels
else :
return None

## Now construct the API call.
## For use via the Python requests library the parameters (following the ? in the URLs above) are expressed as a Python dictionary of key-value pairs,
## if a parameter is used with several different values (as in the first URL), the multiple values are expressed as Python list as in the first example.
Expand Down Expand Up @@ -186,6 +218,9 @@ def no_extracted_data(v) :
## Look for rows that seem to have no data extracted at all.
df["no_extracted_data"]=df.apply(no_extracted_data,axis=1)

## check for any other possible labels in text that we didn't include in original list of labels
df["other_possible_labels"]=df.apply(other_possible_labels,axis=1)

## Match object doesn't give us anything useful to include in overall output so delete the column, keeping everything else in original dataframe.
df.drop(labels="match", axis=1, inplace=True)

Expand Down

0 comments on commit e64c1fb

Please sign in to comment.