In [None]:
%load_ext autoreload
%autoreload 2

In [11]:
import ast
import os
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

from foobar.PyScite.scite import Scite
from foobar import data_dir, als_dir

Loading data queried from PMC.

The query used was "...".

In [6]:
# Load ALS DOIs
# Column names are supplied explicitly, so the first line of the file is read as data.
als_dois_path = als_dir / "all_dois.txt"
als_dois = pd.read_csv(als_dois_path, names=["DOI"])

## Processing of results from Scite

In [9]:
# Credentials and identifiers are read from the environment; os.getenv yields None
# for any variable that is not set.
SCITE_KEY, TOOL, EMAIL = (os.getenv(name) for name in ("SCITE_KEY", "TOOL", "EMAIL"))

# Client used for all Scite requests below.
scite = Scite(SCITE_KEY)

In [None]:
# Collect data
# The DOIs to query come from the `doi` column of the sample file.
als_sample = pd.read_csv(als_dir / "als_sample_5000.csv")
sample = als_sample["doi"].tolist()

# One Scite request per DOI; the decoded JSON body is stored under that DOI.
# Filling the dict incrementally keeps partial results if the loop is interrupted.
results = dict()
for doi in tqdm(sample):
    r = scite.doi(doi)
    results[doi] = r.json()

In [None]:
# Export raws
# `results` is keyed by DOI, so transposing puts one DOI per row.
raws = pd.DataFrame.from_dict(results).T

# Keep only rows without a "message" value.
# NOTE(review): presumably "message" marks an API error payload — confirm.
if "message" in raws.columns:
    raws = raws.loc[raws["message"].isna()]

raws.to_csv(als_dir / "5000_dois_raws.csv")

In [26]:
# Reload the exported raw responses; the first CSV column (the DOI) becomes the index.
raws_csv_path = als_dir / "5000_dois_raws.csv"
raws = pd.read_csv(raws_csv_path, index_col=0)

### Extract datasets from raws

In [50]:
# citations
# After the CSV round trip each raws.citations cell is a string holding the repr of
# a list of citation records. ast.literal_eval parses that text back into Python
# objects but, unlike eval, accepts literals only and cannot execute code.
citation_frames = raws.citations.progress_map(
    lambda cell: pd.DataFrame.from_records(ast.literal_eval(cell))
)

# Stack the per-article frames into one table of citation statements and export it.
refs = pd.concat(citation_frames.tolist())
refs.to_csv(als_dir / "citations.csv", index=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [446]:
# Spot check: all citation statements made by one citing article.
refs.loc[refs["source"] == "10.1101/2020.02.25.951905"]

Unnamed: 0,id,negative,neutral,positive,refLocation,section,snippet,source,target,type,typeConfidence,expertClassification
6,1820380048,0.010193,1.0,0.011883,b82/1,Differential selection between marine and plum...,Local adaptations related to skeletal organic ...,10.1101/2020.02.25.951905,10.1038/s41396-017-0005-9,mentioning,1.0,


In [457]:
# Preview the raw `papers` column (one dict-like string per queried DOI).
raws.papers

10.1038/s41396-017-0005-9      {'10.1101/2020.02.25.951905': {'abstract': '\n...
10.1097/md.0000000000010320    {'10.3389/fncel.2019.00411': {'authors': [{'af...
10.3389/fneur.2017.00356                                                      {}
10.1194/jlr.m086991            {'10.1042/bcj20190647': {'abstract': 'Choleste...
10.1113/jp272591               {'10.1002/mus.26464': {'abstract': '\nABSTRACT...
                                                     ...                        
10.1007/978-90-368-1615-1_1                                                   {}
10.1177/2054358117725294       {'10.1177/2054358119879777': {'abstract': 'Bac...
10.1177/1179069518795874       {'10.1101/683359': {'abstract': '\nTSC2 inacti...
10.1177/0269216318784474       {'10.1016/j.jpainsymman.2019.04.013': {'abstra...
10.3791/56102                  {'10.3389/fnins.2018.00494': {'abstract': 'Per...
Name: papers, Length: 5000, dtype: object

In [463]:
# Build one long table of citing-paper metadata: one row per (target, citing paper).
# Each raws.papers cell is a string holding the repr of a dict keyed by citing DOI
# (the frame was reloaded from CSV). ast.literal_eval parses it without executing
# code, unlike eval.
paper_frames = []
for target, papers_repr in tqdm(raws.papers.items(), total=len(raws)):
    # Transpose so that each citing paper becomes a row, then tag it with the cited DOI.
    frame = pd.DataFrame.from_dict(ast.literal_eval(papers_repr)).T
    frame["target"] = target
    paper_frames.append(frame)

# Concatenate once rather than calling DataFrame.append per iteration: append copies
# the whole frame each time (quadratic) and, like Series.iteritems, was removed in
# pandas 2.0. ignore_index=True gives the same fresh 0..n-1 index as before.
papers = pd.concat(paper_frames, ignore_index=True) if paper_frames else pd.DataFrame()
papers.to_csv(als_dir / "papers.csv", index=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [434]:
# tallies
# Each raws.tallies cell is a string holding the repr of a dict whose values are
# tally records. ast.literal_eval parses it without executing code, unlike eval.
tally_frames = raws.tallies.progress_map(
    lambda cell: pd.DataFrame.from_records(list(ast.literal_eval(cell).values()))
)

# Stack all records, drop exact duplicate rows and index the result by DOI.
# NOTE(review): drop_duplicates only removes fully identical rows, so a DOI with
# differing tallies would still appear more than once in the index — confirm.
citing_articles = (
    pd.concat(tally_frames.tolist())
    .drop_duplicates()
    .set_index("doi")
)
citing_articles.to_csv(als_dir / "tallies.csv")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5000.0), HTML(value='')))




### Articles

In [109]:
# Per-article metrics, indexed like `raws`; articles absent from `refs` get NaN.
citing_sources_by_target = refs.groupby("target")["source"]

articles = pd.DataFrame(index=raws.index)
# Number of citation statements pointing at the article.
articles["references"] = citing_sources_by_target.size()
# Number of distinct citing articles.
articles["citations"] = citing_sources_by_target.nunique()

articles

Unnamed: 0,references,citations
10.1038/s41396-017-0005-9,31.0,16.0
10.1097/md.0000000000010320,2.0,1.0
10.3389/fneur.2017.00356,,
10.1194/jlr.m086991,1.0,1.0
10.1113/jp272591,30.0,5.0
...,...,...
10.1007/978-90-368-1615-1_1,,
10.1177/2054358117725294,2.0,1.0
10.1177/1179069518795874,1.0,1.0
10.1177/0269216318784474,15.0,9.0


In [195]:
# weighted citations

# Count how many times each citing article (source) mentions each target, and
# flatten the result into columns target / source / mentions.
source_target_counts = (
    refs.groupby("target")["source"]
    .value_counts()
    .to_frame("mentions")
    .reset_index()
)

In [199]:
# merge with total reference counts for citing articles
# The inner join keeps only sources that have a row in `citing_articles`; their
# `total` column is carried over under the name `total_mentions`.
source_target_counts = (
    source_target_counts
    .set_index("source")
    .join(citing_articles["total"], how="inner")
    .rename(columns={"total": "total_mentions"})
)

In [208]:
# Calculate the weighted mentions
# Share of the citing article's total mentions that go to this target.
source_target_counts["weighted_mentions"] = source_target_counts["mentions"].div(
    source_target_counts["total_mentions"]
)

In [228]:
# Group the weighted mentions by cited article once and reuse the grouping.
weighted_by_target = source_target_counts.groupby("target")["weighted_mentions"]

# Aggregate weighted_counts
articles["agg_weighted_refs"] = weighted_by_target.sum()

# Mean weighted_counts
articles["mean_weighted_refs"] = weighted_by_target.mean()

In [233]:
# Spearman rank correlation between every pair of columns in `articles`.
articles.corr(method="spearman")

Unnamed: 0,references,citations,agg_weighted_refs,mean_weighted_refs
references,1.0,0.946791,0.846871,0.152227
citations,0.946791,1.0,0.801417,0.030568
agg_weighted_refs,0.846871,0.801417,1.0,0.500135
mean_weighted_refs,0.152227,0.030568,0.500135,1.0


In [243]:
# Percentile ranks of every metric (descending, so the largest value ranks first),
# restricted to articles with no missing metric and ordered by the rank of
# mean_weighted_refs.
df = (
    articles
    .dropna()
    .rank(ascending=False, pct=True)
    .sort_values("mean_weighted_refs")
)

In [278]:
# check reference count
# Fetch all citation records Scite returns for one article and keep the rows in
# which that article is the citing side (source).
doi = "10.1002/jbmr.3944"
r = scite.citations_all(doi)
df = pd.DataFrame.from_dict(r.json()["citations"])
df.loc[df["source"] == doi]

Unnamed: 0,id,negative,neutral,positive,section,snippet,source,target,type,typeConfidence
0,1704316655,0.013982,1.000000,0.009554,Introduction,Osteocytes are derived from terminally differe...,10.1002/jbmr.3944,10.1002/jbmr.320,mentioning,1.000000
1,1704316657,0.026231,0.979877,0.087343,Introduction,"Therefore, because the survival of osteocytes ...",10.1002/jbmr.3944,10.1210/jc.82.9.3128,mentioning,0.979877
2,1704316658,0.026231,0.979877,0.087343,Introduction,"Therefore, because the survival of osteocytes ...",10.1002/jbmr.3944,10.1172/jci2799,mentioning,0.979877
3,1704316659,0.028239,1.000000,0.030341,Introduction,Glucocorticoid treatment and sex steroid defic...,10.1002/jbmr.3944,10.1002/jbmr.2807,mentioning,1.000000
4,1704316660,0.024215,1.000000,0.053893,Introduction,"Moreover, the osteocyte death observed in the ...",10.1002/jbmr.3944,10.1172/jci6610,mentioning,1.000000
...,...,...,...,...,...,...,...,...,...,...
79,1765289424,0.048704,0.934383,0.109974,Discussion,Massive apoptosis has been observed in Tfam kn...,10.1002/jbmr.3944,10.1038/ng0398-231,mentioning,0.934383
80,1765289425,0.029932,1.000000,0.034638,Discussion,"<cite data-doi=""10.1038/ng0398-231"">(14)</cite...",10.1002/jbmr.3944,10.1038/s41598-017-02557-8,mentioning,1.000000
81,1765289426,0.017366,1.000000,0.011576,Discussion,"Of note, our results suggest that irisin has t...",10.1002/jbmr.3944,10.1016/b978-0-12-397166-1.00015-1,mentioning,1.000000
82,1765289427,0.019405,1.000000,0.048436,Discussion,"As for PTH, which exerts both catabolic and an...",10.1002/jbmr.3944,10.1073/pnas.1516622112,mentioning,1.000000
