Commit

dedup fixes
hpiwowar committed Mar 13, 2015
1 parent 7071660 commit 3c7bf8c
Showing 3 changed files with 58 additions and 12 deletions.
20 changes: 17 additions & 3 deletions totalimpact/providers/pubmed.py
@@ -9,9 +9,23 @@
 import logging
 logger = logging.getLogger('ti.providers.pubmed')

-def clean_pmid(pmid):
-    pmid = remove_nonprinting_characters(pmid)
-    pmid = pmid.lower().replace("pmid:", "")
+def clean_pmid(input_pmid):
+    try:
+        pmid = remove_nonprinting_characters(input_pmid)
+        pmid = pmid.lower().replace("pmid:", "")
+        match = re.match("^(\d{3,15})$", pmid)
+        if match:
+            pmid = match.group(1)
+        else:
+            pmid = None
+
+    except AttributeError:
+        pmid = None
+
+    if not pmid:
+        logger.debug(u"MALFORMED PMID {input_pmid}".format(
+            input_pmid=input_pmid))
+
     return pmid


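Editor's note (not part of the commit): a minimal sketch of how the reworked clean_pmid is expected to behave. It assumes remove_nonprinting_characters passes ordinary ASCII strings through unchanged, that re is imported earlier in pubmed.py, and that non-string input fails with AttributeError somewhere in the string handling (which is what the new try/except appears to be for). The PMID values are made up for illustration.

from totalimpact.providers.pubmed import clean_pmid

clean_pmid("PMID:22996933")                 # "22996933": prefix stripped, 3-15 digit id kept
clean_pmid("12")                            # None: fewer than three digits fails the regex
clean_pmid("10.1371/journal.pone.0000308")  # None: a DOI is not a PMID; logged as MALFORMED PMID
clean_pmid(None)                            # None: non-string input is caught instead of raising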
19 changes: 16 additions & 3 deletions totalimpactwebapp/aliases.py
@@ -123,11 +123,16 @@ def merge_alias_dicts(aliases1, aliases2):
             merged_aliases[ns] = [nid]
     return merged_aliases

-def matches_alias(product1, product2):
+def matches_alias(product1, product2, exclude=[]):
     alias_tuple_list1 = [alias_row.my_alias_tuple_for_comparing for alias_row in product1.alias_rows]
     alias_tuple_list2 = [alias_row.my_alias_tuple_for_comparing for alias_row in product2.alias_rows]
-    any_matches = any([alias_tuple1 in alias_tuple_list2 for alias_tuple1 in alias_tuple_list1])
-    return any_matches
+    has_matches = False
+    for alias_tuple1 in alias_tuple_list1:
+        if alias_tuple1:
+            (ns, nid) = alias_tuple1
+            if alias_tuple1 in alias_tuple_list2 and ns not in exclude:
+                has_matches = True
+    return has_matches
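
Editor's note (not part of the commit): matches_alias now takes an exclude list of alias namespaces to ignore when comparing two products. A minimal sketch with hypothetical stand-ins; FakeRow and FakeProduct are not in the codebase and only mimic the alias_rows / my_alias_tuple_for_comparing interface the function reads.

from totalimpactwebapp.aliases import matches_alias

class FakeRow(object):
    def __init__(self, ns, nid):
        self.my_alias_tuple_for_comparing = (ns, nid)

class FakeProduct(object):
    def __init__(self, rows):
        self.alias_rows = rows

p1 = FakeProduct([FakeRow("url", "http://example.com/paper")])
p2 = FakeProduct([FakeRow("url", "http://example.com/paper")])

matches_alias(p1, p2)                    # True: the shared url alias counts
matches_alias(p1, p2, exclude=["url"])   # False: matches in the "url" namespace are ignored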



@@ -232,6 +237,14 @@ def display_arxiv(self):
         except AttributeError:
             return None

+    @cached_property
+    def has_formal_alias(self):
+        # has something other than urls and mendeley uuids etc
+        if self.display_arxiv or self.display_doi or self.display_pmid or self.display_pmc:
+            return True
+        else:
+            return False
+
     @cached_property
     def resolved_url(self):
         try:
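Editor's note (not part of the commit): has_formal_alias asks whether a product carries any identifier stronger than a landing-page url or a Mendeley uuid. A sketch of the same truth table with plain values standing in for the display_* properties; has_formal_alias_logic is a hypothetical helper, not in the commit.

def has_formal_alias_logic(display_arxiv, display_doi, display_pmid, display_pmc):
    # mirrors the cached_property above: any one formal identifier is enough
    return bool(display_arxiv or display_doi or display_pmid or display_pmc)

has_formal_alias_logic(None, "10.1371/journal.pone.0000308", None, None)  # True: has a doi
has_formal_alias_logic(None, None, None, None)                            # False: url-only product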
31 changes: 25 additions & 6 deletions totalimpactwebapp/product.py
@@ -880,10 +880,10 @@ def aliases_not_in_existing_products(retrieved_aliases, tiids_to_exclude):
         found = False
         if ns=="biblio":
             temp_product = put_biblio_in_product(temp_product, nid, provider_name="bibtex")
-            found = any([matches_biblio(temp_product, product2) for product2 in products_to_exclude])
+            found = has_equivalent_biblio_in_list(temp_product, products_to_exclude)
         else:
             temp_product = put_aliases_in_product(temp_product, [alias_tuple])
-            found = any([matches_alias(temp_product, product2) for product2 in products_to_exclude])
+            found = has_equivalent_alias_in_list(temp_product, products_to_exclude)
         if not found:
             new_aliases += [alias_tuple]

@@ -941,13 +941,32 @@ def import_and_create_products(profile_id, provider_name, importer_input, analyt
     return products


+def has_dedupable_genres(product1, product2):
+    if product1.genre==product2.genre:
+        if product1.is_preprint==product2.is_preprint:
+            return True
+    return False
+
+
 def has_equivalent_alias_in_list(product1, duplicate_products_group):
-    is_equivalent = any([matches_alias(product1, product2) for product2 in duplicate_products_group])
-    return is_equivalent
+    if product1.aliases.has_formal_alias:
+        exclude = ["url"]
+    else:
+        exclude = []
+
+    has_equivalent = False
+    for product2 in duplicate_products_group:
+        if matches_alias(product1, product2, exclude=exclude) and has_dedupable_genres(product1, product2):
+            has_equivalent = True
+    return has_equivalent


 def has_equivalent_biblio_in_list(product1, duplicate_products_group):
-    is_equivalent = any([matches_biblio(product1, product2) for product2 in duplicate_products_group])
-    return is_equivalent
+    has_equivalent = False
+    for product2 in duplicate_products_group:
+        if matches_biblio(product1, product2) and has_dedupable_genres(product1, product2):
+            has_equivalent = True
+    return has_equivalent

 def build_duplicates_list(products):
     distinct_groups = defaultdict(list)
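Editor's note (not part of the commit): taken together, the dedup rules introduced here are (1) a product with a formal identifier is never merged on a url match alone, while a url-only product still can be, and (2) candidates must also agree on genre and preprint status. A minimal sketch with hypothetical stand-ins; FakeRow, FakeAliases, and FakeProduct mimic only the attributes has_equivalent_alias_in_list and matches_alias read, and the import path assumes the modules as they appear in this commit.

from totalimpactwebapp.product import has_equivalent_alias_in_list

class FakeRow(object):
    def __init__(self, ns, nid):
        self.my_alias_tuple_for_comparing = (ns, nid)

class FakeAliases(object):
    def __init__(self, has_formal_alias):
        self.has_formal_alias = has_formal_alias

class FakeProduct(object):
    def __init__(self, rows, genre, is_preprint, has_formal_alias):
        self.alias_rows = rows
        self.genre = genre
        self.is_preprint = is_preprint
        self.aliases = FakeAliases(has_formal_alias)

pmid_row = FakeRow("pmid", "22996933")
url_row = FakeRow("url", "http://example.com/x")

article    = FakeProduct([pmid_row], "article", False, has_formal_alias=True)
same       = FakeProduct([pmid_row], "article", False, has_formal_alias=True)
preprint   = FakeProduct([pmid_row], "article", True,  has_formal_alias=True)
url_only_a = FakeProduct([url_row],  "article", False, has_formal_alias=False)
url_only_b = FakeProduct([url_row],  "article", False, has_formal_alias=False)

has_equivalent_alias_in_list(article, [same])           # True: shared pmid, same genre and preprint status
has_equivalent_alias_in_list(article, [preprint])       # False: is_preprint differs, so not dedupable
has_equivalent_alias_in_list(url_only_a, [url_only_b])  # True: url-only products still dedup on a shared url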
