In [1]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install requests




In [2]:
import requests
import re
from IPython.display import display

In [110]:
# Download the raw JSON of the published notebook that is analysed below.
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(
    'https://raw.githubusercontent.com/jdh-observer/jdh001-4yxHGiqXYRbX/main/TopicSpecificCorpusBuilding.ipynb',
    timeout=30,
)
# Fail loudly on an HTTP error (e.g. 404) instead of trying to parse an
# error page as JSON.
r.raise_for_status()
notebook = r.json()
def flatten(t):
    """Concatenate the items of every sub-iterable of ``t`` into one list.

    Only one level of nesting is removed, and the items keep their
    original order.
    """
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

Get the citation keys and the listed authors from the notebook metadata:

In [111]:
# The bibliography is read from the notebook metadata, under
# metadata -> cite2c -> citations; its keys are used as the citation ids.
cite2c_metadata = notebook.get('metadata').get('cite2c')
citations = cite2c_metadata.get('citations')
citations_ids = list(citations.keys())

# One (author, citation id) pair for every author listed on every citation.
# Citations without an 'author' entry contribute no pairs.
authors_citations_links = [
    (a, k)
    for k, x in citations.items()
    for a in x.get('author', [])
]
print(authors_citations_links[:1])

[({'family': 'Patwardhan', 'given': 'Siddharth'}, '6142573/2FLWKIR9')]


`authors_citations_links` is a list of `(author, reference)` tuples, one for each author of each work cited in the paper.
Let's see where the citations appear by looping over the notebook cells.

In [113]:
# Build one record per author holding: the author dict ('a'), the keys of the
# references the author appears on ('refs'), how many there are
# ('count_refs'), and a citation counter that starts at zero.
# NOTE(review): the grouping key is "<family>-<given>"; authors that have
# neither field would all share the key "None-None" - confirm this cannot
# happen with the bibliography in use.
authors_citations_grouped_by_author = {}
for author, ref_key in authors_citations_links:
    group_key = f"{author.get('family')}-{author.get('given')}"
    if group_key not in authors_citations_grouped_by_author:
        authors_citations_grouped_by_author[group_key] = {
            'a': author,
            'refs': [],
            'count_refs': 0,
            'count_citations': 0,
        }
    record = authors_citations_grouped_by_author[group_key]
    record['refs'].append(ref_key)
    record['count_refs'] += 1

# Rank the authors by number of references, largest first. The sort is
# stable, so authors with equal counts keep their insertion order.
hall_of_fame = list(authors_citations_grouped_by_author.values())
hall_of_fame.sort(key=lambda entry: entry.get('count_refs'), reverse=True)
print(hall_of_fame[:2])

[{'a': {'family': 'Pfanzelter', 'given': 'Eva'}, 'refs': ['6142573/5FI5SV3F', '6142573/DHFC4A24', '6142573/FRE6XIFV', '6142573/JMD7CSSP', '6142573/NBV4BG2G'], 'count_refs': 5, 'count_citations': 0}, {'a': {'family': 'Zosa', 'given': 'Elaine'}, 'refs': ['6142573/5ZGM8EKM', '6142573/SCBME2FU', '6142573/VJZ89VCM'], 'count_refs': 3, 'count_citations': 0}]


In [114]:
cells = notebook.get('cells')

# The records in `hall_of_fame` are updated in place, so start every counter
# from zero: without this reset, re-running the cell would add the new
# matches on top of the totals left by the previous run.
for author in hall_of_fame:
    author['count_citations'] = 0

# Count, for each author, how many cite2c markers in the visible cells point
# to one of the author's references.
for cell in cells:
    # Tolerate cells that carry no 'metadata' entry at all.
    tags = cell.get('metadata', {}).get('tags', [])
    if 'hidden' in tags:
        continue
    source = ''.join(cell.get('source', []))

    # Does it contain a cite2c marker? Each marker yields the cited key;
    # an empty source simply yields no matches.
    notes = re.findall(r'data-cite=\"([^\"]+)\"', source)
    for note_key in notes:
        for author in hall_of_fame:
            if note_key in author['refs']:
                author['count_citations'] += 1

# Sort by number of in-text citations, most cited first (ties keep the
# order of `hall_of_fame`).
authors = sorted(hall_of_fame, key=lambda x: x.get('count_citations'), reverse=True)

for a in authors:
    name = f"{a.get('a').get('family')}, {a.get('a').get('given')}"
    print(f"{name} : {a.get('count_citations')} {a.get('refs')}")

Pfanzelter, Eva : 4 ['6142573/5FI5SV3F', '6142573/DHFC4A24', '6142573/FRE6XIFV', '6142573/JMD7CSSP', '6142573/NBV4BG2G']
Zosa, Elaine : 3 ['6142573/5ZGM8EKM', '6142573/SCBME2FU', '6142573/VJZ89VCM']
Tolonen, Mikko : 3 ['6142573/5ZGM8EKM', '6142573/FRE6XIFV', '6142573/VJZ89VCM']
Oberbichler, Sarah : 3 ['6142573/JMD7CSSP', '6142573/NBV4BG2G', '6142573/YQFJLI5C']
Blei, David M. : 3 ['6142573/CVSFNSE2', '6142573/VPCBKFBD']
Navigli, Roberto : 3 ['6142573/LVI27PCC', '6142573/U48CR9BT']
Hasko, Victoria : 3 ['6142573/DXVIWVKG']
Lu, Jinghui : 3 ['6142573/YJ6WDMIM']
Henchion, Maeve : 3 ['6142573/YJ6WDMIM']
Namee, Brian Mac : 3 ['6142573/YJ6WDMIM']
Föhr, Pascal : 3 ['8918850/AH3TIH3N']
Hengchen, Simon : 2 ['6142573/5ZGM8EKM', '6142573/VJZ89VCM']
Marjanen, Jani : 2 ['6142573/5ZGM8EKM', '6142573/VJZ89VCM']
Pivovarova, Lidia : 2 ['6142573/5ZGM8EKM', '6142573/VJZ89VCM']
Doucet, Antoine : 2 ['6142573/FRE6XIFV', '6142573/G3B3QXYX']
Granroth-Wilding, Mark : 2 ['6142573/FRE6XIFV', '6142573/SCBME2FU']
Fic