In [1]:
%load_ext autoreload
%autoreload 2

In [26]:
import os
from pathlib import Path
import random

import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

from foobar.PyScite.scite import Scite, SciteException

### Setup

Choose a sample of 1000 DOIs from the ALS dataset (which contains full references for 5000 articles)

In [4]:
# Directories
data_dir = Path("../data")
als_dir = data_dir / "ALS_data"

nb_dir = data_dir / "weighted_citations"

Select a random sample and save it as the base dataset for this notebook. Copy & paste this code block to create a different DOI selection

```python
selection = als_dois.DOI.sample(1000).tolist()

print("n references:", len(refs[refs.target.isin(selection)]))
print("total citing papers:", refs[refs.target.isin(selection)].source.nunique())

print("missing citing papers:", refs[refs.target.isin(selection)].source.nunique()-len(selection))

pd.DataFrame({"doi":selection}).to_csv(nb_dir / "1000_doi_sample.csv", index=False)
```

### Load datasets

In [5]:
# Load ALS DOIs
sample_refs = pd.read_csv(als_dir / "citations.csv")

selection = pd.read_csv(nb_dir / "1000_doi_sample.csv").doi

sel_refs = sample_refs[sample_refs.target.isin(selection)]

#### Cited articles

In [7]:
articles = pd.DataFrame(index=selection)

articles['references'] = sel_refs.groupby("target")["source"].size()
articles['citations'] = sel_refs.groupby("target").source.nunique()

articles

Unnamed: 0_level_0,references,citations
doi,Unnamed: 1_level_1,Unnamed: 2_level_1
10.1371/journal.pone.0159593,1,1
10.1080/21541248.2016.1276999,44,28
10.1101/gad.281030.116,29,14
10.3389/fnins.2016.00235,1,1
10.1007/s00439-016-1683-5,203,137
...,...,...
10.2105/ajph.2016.303344,2,2
10.15252/embj.201899023,9,3
10.1038/s41598-017-09257-3,8,4
10.1155/2018/6920213,7,3


#### Citing articles

In [39]:
citing_articles = pd.DataFrame({"doi":sel_refs.source.unique().tolist()})

In [9]:
SCITE_KEY = os.getenv("SCITE_KEY")
TOOL = os.getenv("TOOL")
EMAIL = os.getenv("EMAIL")

scite = Scite(SCITE_KEY)

In [56]:
refs

Unnamed: 0,queried_doi


In [None]:
failed_dois = {}
ref_counts = pd.DataFrame(columns=["total_references", "total_citations"])

for doi in tqdm(citing_articles.doi.tolist()):
    try:
        refs = scite.get_doi(doi, df=True)
    except SciteException as e:
        failed_dois[doi]: str(e)
        continue
    
    if refs.empty:
        continue
    
    refs['queried_doi'] = doi
    refs = refs[refs.source==doi]
    ref_counts.loc[doi] = [len(refs), refs.target.nunique()]
    
    with open(nb_dir / "source_citations.csv", 'a') as f:
        refs.to_csv(f, mode='a', header=f.tell()==0, index=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9945.0), HTML(value='')))

In [51]:
incoming_refs = pd.read_csv(nb_dir / "source_citations.csv")

ParserError: Error tokenizing data. C error: Expected 11 fields in line 133088, saw 12


### Weighted Citations

In [105]:
# weighted citations

source_target_counts = sel_refs.groupby("target")["source"].value_counts()
source_target_counts = source_target_counts.to_frame("mentions").reset_index()

In [106]:
# merge with total reference counts for citing articles
source_target_counts = source_target_counts.set_index("source").join(citing_articles["total"], how="inner")
source_target_counts = source_target_counts.rename(columns={"total":"total_mentions"})

KeyError: 'total'

In [208]:
# Calculate the weighted mentions
source_target_counts["weighted_mentions"] = source_target_counts["mentions"] / source_target_counts["total_mentions"]

In [228]:
# Aggregate weighted_counts
articles["agg_weighted_refs"] = source_target_counts.groupby("target")["weighted_mentions"].sum()

# Mean weighted_counts
articles["mean_weighted_refs"] = source_target_counts.groupby("target")["weighted_mentions"].mean()

In [233]:
articles.corr(method="spearman")

Unnamed: 0,references,citations,agg_weighted_refs,mean_weighted_refs
references,1.0,0.946791,0.846871,0.152227
citations,0.946791,1.0,0.801417,0.030568
agg_weighted_refs,0.846871,0.801417,1.0,0.500135
mean_weighted_refs,0.152227,0.030568,0.500135,1.0


In [243]:
df = articles.dropna().rank(ascending=False, pct=True).sort_values("mean_weighted_refs")