In [2]:
from itertools import chain
from json import load

In [3]:
# Load the predicted clusters. Judging by how later cells index it, this is a
# mapping {cluster id: list of signature ids}.
# The `with` block closes the file handle as soon as parsing is done instead
# of leaving it open until the garbage collector gets to it.
with open("/data01/beard/predicted_clusters.json") as clusters_file:
    clusters = load(clusters_file)
print(len(clusters))

399602


In [4]:
# Load the signature records and index them by their "signature_id" field so
# that later cells can look a signature up directly from a cluster member id.
# The JSON file holds a sequence of records (dicts); the index is built in a
# single step so `signatures` is never bound to the intermediate list, and the
# `with` block closes the file handle once parsing is finished.
with open("/data01/beard/signatures.json") as signatures_file:
    signatures = {s["signature_id"]: s for s in load(signatures_file)}
print(len(signatures))

8958311


In [4]:
# Peek at one predicted cluster: print how many signatures it groups, then
# display its first ten signature ids as the cell result.
cluster = "370011"
cluster_signature_ids = clusters[cluster]
print(len(cluster_signature_ids))
cluster_signature_ids[:10]

1113


[u'1000048_Ellis, Jonathan Richard_3262364',
 u'1000177_Ellis, Jonathan Richard_3262984',
 u'1001210_Ellis, Jonathan Richard_3266914',
 u'1001460_Ellis, Jonathan Richard_3267662',
 u'1001565_Ellis, Jonathan Richard_3267740',
 u'1001569_Ellis, Jonathan Richard_3267743',
 u'1002686_Ellis, Jonathan Richard_3270135',
 u'100545_Ellis, Jonathan Richard_13778',
 u'1006573_Ellis, Jonathan Richard_3286656',
 u'1007488_Ellis, Jonathan Richard_3290021']

In [10]:
# Display the full record stored for one example signature id.
example_signature_id = "526969_Ellis, John_1826426"
signatures[example_signature_id]

{u'author_affiliation': u'CERN',
 u'author_name': u'Ellis, John',
 u'publication_id': u'526969',
 u'signature_id': u'526969_Ellis, John_1826426'}

In [40]:
import re
from unidecode import unidecode

# Pattern applied to the ASCII transliteration of an author name. Every
# repetition of the group consumes at least one letter, so a match requires
# the name to start with a letter and to contain at least three letters before
# the first character that is neither a letter nor one of the separators
# - ' ` , . ~ or whitespace.
p = re.compile(r"^([a-zA-Z]+[-'`,.~\s]*[a-zA-Z]*){3,}", flags=re.UNICODE)

# http://fabzter.com/blog/remove-nonspacing-characters-text-python
print(unidecode(u"áéíóú äëïöü ø ñÑ û"))  # Test removing accents and stuff

# Match invalid author names in signatures
invalid_pairs = []  # pair representing (record_id, author_name)
for signature in signatures.itervalues():
    # Strings are immutable: lstrip() returns a new string, so its result has
    # to be kept. Calling it without assigning left the leading white spaces
    # in place, and such names then failed the `^`-anchored pattern.
    # The stripped name is also the one recorded in invalid_pairs.
    author_name = signature["author_name"].lstrip()

    if not p.match(unidecode(author_name)):
        invalid_pairs.append((signature["publication_id"], author_name))

print(len(invalid_pairs))

aeiou aeiou o nN u
4989


In [69]:
# Signature ids are formatted as
# '<publication id>_<author fullname>_<continuous number>', so searching for
# the substring below picks out signatures by the beginning of the author name.
# Print the id of every cluster that holds at least one such signature.
name_fragment = "_Ellis, John"
for cluster, member_signature_ids in clusters.iteritems():
    has_match = any(name_fragment in signature_id
                    for signature_id in member_signature_ids)
    if has_match:
        print(cluster)

370011


In [5]:
# Format: {cluster: list of signature ids having an authority id}
clusters_having_cds_id = {}
clusters_having_cern_id = {}
clusters_having_inspire_id = {}

# One row per kind of authority id:
# (label used in the report, signature field holding the id, dict to fill).
# Driving the grouping and the report from this table replaces three
# copy-pasted try/except blocks and three copy-pasted print statements.
authority_id_groups = (
    ("CDS", "author_cds_id", clusters_having_cds_id),
    ("CERN", "author_cern_id", clusters_having_cern_id),
    ("INSPIRE", "author_inspire_id", clusters_having_inspire_id),
)

for cluster in clusters:
    for signature_id in clusters[cluster]:
        signature = signatures[signature_id]

        for label, field, clusters_having_id in authority_id_groups:
            # .get() treats a missing field and an empty value the same way:
            # the signature simply has no authority id of that kind.
            if signature.get(field):
                # setdefault creates the per-cluster list on first use.
                clusters_having_id.setdefault(cluster, []).append(signature_id)

for label, field, clusters_having_id in authority_id_groups:
    print("{} authority ids: {} signatures in {} clusters.".format(
        label,
        sum(len(signature_ids)
            for signature_ids in clusters_having_id.itervalues()),
        len(clusters_having_id)))

# Distinct signatures: a signature carrying several kinds of authority id is
# counted once. The per-cluster lists are streamed straight into the set
# instead of first being concatenated into three large temporary lists.
print(len(set(chain.from_iterable(chain(
    clusters_having_cds_id.itervalues(),
    clusters_having_cern_id.itervalues(),
    clusters_having_inspire_id.itervalues())))))

# Distinct clusters: iterating a dict yields its keys, so chaining the three
# dicts yields every cluster id that has at least one authority id of any kind.
print(len(set(chain(
    clusters_having_cds_id,
    clusters_having_cern_id,
    clusters_having_inspire_id))))

CDS authority ids: 1136 signatures in 580 clusters.
CERN authority ids: 1121167 signatures in 7673 clusters.
INSPIRE authority ids: 2922943 signatures in 16288 clusters.
2927170
17373
