In [1]:
from pathlib import Path
import random

import pandas as pd
from tqdm.auto import tqdm

from typing import List
import ndjson

import numpy as np

In [21]:
# Register tqdm's progress_* methods on pandas objects; the article-pattern
# cell further down calls DataFrame.progress_apply and needs this to have run.
tqdm.pandas()

In [2]:
# Project data layout, rooted one level above the notebook
data_dir = Path("..") / "data"

# Raw inputs, with one sub-folder each for the pubmed and scite files
raw_dir = data_dir.joinpath("raw")
pubmed_dir, scite_dir = raw_dir / "pubmed", raw_dir / "scite"

# Tables written by this notebook go to processed_dir
processed_dir = data_dir.joinpath("processed")

# Not used by the cells shown here
results_dir = data_dir.joinpath("results")

### Load datasets

In [3]:
# Seed DOIs: the "doi" column of the seed CSV, as a plain list
seed_f = pubmed_dir / "seed.csv"
seed_dois = pd.read_csv(seed_f)["doi"].tolist()

# Incoming citations of the seed articles (JSON Lines, one record per line)
seed_citations_f = scite_dir / "seed_incoming_citations.jsonl"
seed_citations = pd.read_json(seed_citations_f, lines=True, orient="records")

# Metadata of the seed articles
seed_articles_file = scite_dir / "seed_papers.jsonl"
seed_articles = pd.read_json(seed_articles_file, lines=True)

# Distinct DOIs that appear as the source of a seed citation
source_dois = seed_citations["source"].unique().tolist()

In [4]:
# Metadata of the citing (source) articles
source_articles_file = scite_dir / "incoming_papers.jsonl"
source_articles = pd.read_json(source_articles_file, lines=True)

# Citations recorded for the citing articles
source_citations_file = scite_dir / "incoming_citations.jsonl"
source_citations = pd.read_json(source_citations_file, lines=True)

# Keep only the rows whose source DOI is one of the seed-citing DOIs
from_citing_article = source_citations["source"].isin(source_dois)
source_citations = source_citations[from_citing_article]

## Preprocess collected data

In [5]:
# Merge paper metadata: stack seed and citing-article records, keep the first
# record per DOI, and index the result by DOI
articles = (
    pd.concat([seed_articles, source_articles])
    .drop_duplicates(subset="doi")
    .set_index("doi")
)

In [6]:
# Stack seed and source citations and keep one row per citation id.
# NOTE (original author): needs to be intersection!
citations = (
    pd.concat([seed_citations, source_citations])
    .drop_duplicates(subset="id")
)

In [7]:
# Sources that occur in the seed citations but have no rows in source_citations
has_source_rows = seed_citations.source.isin(source_citations.source)
extra_dois = seed_citations.loc[~has_source_rows, "source"]

# Remove every citation whose source is one of those DOIs
from_extra_source = citations.source.isin(extra_dois)
citations = citations.loc[~from_extra_source]

In [8]:
#####################################
# Filter out article types/keywords #
#####################################

# An article is kept only if its type is listed in good_types and none of its
# keywords is exactly equal to an entry of bad_keywords.
bad_keywords = ["Erratum", "Correction", "COMMENTARY"]
good_types = ["journal-article", "proceedings-article", "book-chapter", "posted-content"]


def _has_bad_keyword(keywords) -> bool:
    """Return True when `keywords` is a list containing any entry of bad_keywords."""
    # Non-list values (e.g. a missing keywords field) never count as a match
    return isinstance(keywords, list) and any(k in keywords for k in bad_keywords)


# Row-wise masks over the DOI-indexed article table
bad_type = ~articles["type"].isin(good_types)
bad_keyword = articles["keywords"].map(_has_bad_keyword).astype(bool)

# DOIs to remove, in table order; each DOI is listed exactly once even when an
# article matches several bad keywords
dropped_articles = articles.index[bad_type | bad_keyword].tolist()

# Remove the articles themselves and every citation they are the source of
articles = articles.drop(index=dropped_articles)
citations = citations[~citations.source.isin(dropped_articles)]

In [9]:
# Citing articles which cite another citing article
print("total citing articles:", citations.source.nunique())

print("articles that cite seed:", citations[citations.target.isin(seed_dois)].source.nunique())
print("articles that cite another source citations", citations[(citations.source.isin(source_dois)) & (citations.target.isin(source_dois))].source.nunique())
print("articles that cite other target articles", citations[(~citations.target.isin(seed_dois)) & (~citations.target.isin(source_dois))].source.nunique())

total citing articles: 9848
articles that cite seed: 9848
articles that cite another source citations 4459
articles that cite other target articles 9847


In [10]:
# Distinct cited (target) DOIs: overall, among the seed DOIs, and among the citing DOIs
cited = citations.target
print("total cited articles", cited.nunique())
print("seed articles that were cited by source", cited[cited.isin(seed_dois)].nunique())
print("source articles that were cited by source", cited[cited.isin(source_dois)].nunique())

total cited articles 439959
seed articles that were cited by source 998
source articles that were cited by source 3148


In [11]:
# Remove two DOIs from the seed set which weren't found in the full citations list
final_dois = citations[citations.target.isin(seed_dois)].target.unique().tolist()
seed_citations = seed_citations[seed_citations.target.isin(final_dois)]

## Contexts

In [12]:
# Write the filtered article metadata to disk, then preview the first rows
contexts_file = processed_dir / "contexts.csv"
articles.to_csv(contexts_file)
articles.head(3)

Unnamed: 0_level_0,slug,type,title,abstract,authors,keywords,year,shortJournal,publisher,issue,volume,page,retracted,memberId,issns,editorialNotices,journalSlug,journal,rwStatus
doi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
10.1371/journal.pone.0159593,shared-genetic-factors-involved-in-j81d1l,journal-article,Shared Genetic Factors Involved in Celiac Dise...,Background and ObjectivesGenome-wide associati...,"[{'family': 'Mostowy', 'given': 'Joanna'}, {'f...",[Research Article],2016.0,PLoS ONE,Public Library of Science (PLoS),8,11,e0159593,0.0,340,[1932-6203],[],plos-one-lZbEk,Plos One,
10.1080/21541248.2016.1276999,molecular-control-of-rab-activity-kpJgay,journal-article,"Molecular control of Rab activity by GEFs, GAP...",ABSTRACTRab proteins are the major regulators ...,"[{'family': 'Müller', 'given': 'Matthias P.'},...",[Review],2017.0,Small GTPases,Informa UK Limited,1-2,9,5-21,0.0,301,"[2154-1248, 2154-1256]",[],small-gtpases-YZrGD,Small GTPases,
10.1101/gad.281030.116,codon-usage-affects-the-structure-Zwy9Zp,journal-article,Codon usage affects the structure and function...,\nCodon usage bias is a universal feature of a...,"[{'family': 'Fu', 'given': 'Jingjing'}, {'fami...","[[Keywords: circadian clock, codon usage, Dros...",2016.0,Genes Dev.,Cold Spring Harbor Laboratory,15,30,1761-1775,0.0,246,"[0890-9369, 1549-5477]",[],genes-development-6828p,Genes & Development,


## Traces

In [13]:
# One row per seed citation, keyed by citation id, with its text location fields
trace_columns = ["id", "source", "target", "snippet", "refLocation", "section"]
traces = seed_citations.loc[:, trace_columns].set_index("id")

traces.to_csv(processed_dir / "traces.csv")
traces

Unnamed: 0_level_0,source,target,snippet,refLocation,section
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1603221744,10.3390/toxins11080443,10.3389/fneur.2017.00535,Most chronic inflammatory CNS disorders have a...,b14-toxins-11-00443/1,1. Introduction
1603221770,10.3390/toxins11080443,10.3389/fneur.2017.00535,The belief that fungi play an important role i...,b14-toxins-11-00443/2,3. Discussion
1589211783,10.3389/fneur.2019.00758,10.3389/fneur.2017.00535,The potential role of microbes and viruses in ...,b18/1,Introduction
1589211918,10.3389/fneur.2019.00758,10.3389/fneur.2017.00535,MS has many direct links with the immune respo...,b18/2,Malassezia Primer
1589211932,10.3389/fneur.2019.00758,10.3389/fneur.2017.00535,"MS is moderately associated with SpA (113, <ci...",b18/3,Malassezia Primer
...,...,...,...,...,...
1071932729,10.1159/000489645,10.3389/fcimb.2017.00216,"To date, approximately 20 tick cystatins, all ...",ref48/1,Discussion
1478093524,10.3389/fcimb.2017.00476,10.3389/fcimb.2017.00216,Protease inhibitors has also been extensively ...,b23/1,Discussion
1506573001,10.1093/gbe/evx279,10.3389/fcimb.2017.00216,"2014, <cite data-doi=""10.1074/jbc.m112.339051""...",evx279-b25/1,Results
1506573004,10.1093/gbe/evx279,10.3389/fcimb.2017.00216,"In nematodes, cystatins modulate host immune r...",evx279-b25/2,Results


## Patterns for source articles

In [14]:
# Metrics for citing (source) article
source_articles = pd.DataFrame(index=citations.source.drop_duplicates())
source_articles["total_source_mentions"] = citations.groupby("source").target.size()
source_articles["total_source_refs"] = citations.groupby("source").target.nunique()

source_articles["mean_source_mentions"] = source_articles["total_source_mentions"] / source_articles["total_source_refs"]

source_articles.to_csv(processed_dir / "source_pattern.csv")
source_articles

Unnamed: 0_level_0,total_source_mentions,total_source_refs,mean_source_mentions
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10.3390/toxins11080443,44,37,1.189189
10.3389/fneur.2019.00758,280,125,2.240000
10.3389/fmicb.2018.02968,69,64,1.078125
10.15419/bmrat.v6i12.579,53,39,1.358974
10.3389/fmicb.2018.03249,142,100,1.420000
...,...,...,...
10.1101/357681,69,57,1.210526
10.1371/journal.pntd.0006235,86,41,2.097561
10.1159/000489645,46,41,1.121951
10.3389/fcimb.2017.00476,155,105,1.476190


## Mention patterns

In [15]:
# Columns exported for every seed citation, keyed by citation id
mention_pattern_cols = [
    "id", "source", "target",
    "negative", "neutral", "positive",
    "type", "typeConfidence", "expertClassification",
]
mention_patterns = seed_citations.loc[:, mention_pattern_cols].set_index("id")

mention_patterns.to_csv(processed_dir / "mention_patterns.csv")
mention_patterns

Unnamed: 0_level_0,source,target,negative,neutral,positive,type,typeConfidence,expertClassification
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1603221744,10.3390/toxins11080443,10.3389/fneur.2017.00535,0.003697,1.0,0.008609,mentioning,1.0,
1603221770,10.3390/toxins11080443,10.3389/fneur.2017.00535,0.005248,1.0,0.019753,mentioning,1.0,
1589211783,10.3389/fneur.2019.00758,10.3389/fneur.2017.00535,0.008267,1.0,0.005143,mentioning,1.0,
1589211918,10.3389/fneur.2019.00758,10.3389/fneur.2017.00535,0.004740,1.0,0.006388,mentioning,1.0,
1589211932,10.3389/fneur.2019.00758,10.3389/fneur.2017.00535,0.050479,1.0,0.010640,mentioning,1.0,
...,...,...,...,...,...,...,...,...
1071932729,10.1159/000489645,10.3389/fcimb.2017.00216,0.002939,1.0,0.004096,mentioning,1.0,
1478093524,10.3389/fcimb.2017.00476,10.3389/fcimb.2017.00216,0.009755,1.0,0.062056,mentioning,1.0,
1506573001,10.1093/gbe/evx279,10.3389/fcimb.2017.00216,0.026799,1.0,0.060917,mentioning,1.0,
1506573004,10.1093/gbe/evx279,10.3389/fcimb.2017.00216,0.025794,1.0,0.053954,mentioning,1.0,


## Citation Patterns

In [16]:
# Number of mentions for every (source, target) pair found in the traces
citation_patterns = traces.groupby(["source", "target"]).size().reset_index(name="mentions")

# Reciprocal of the number of distinct references made by each source
refs_norm = (1 / source_articles["total_source_refs"]).rename("refs_norm")

# Attach refs_norm by source DOI and drop rows left with a missing value,
# i.e. pairs whose source has no entry in source_articles
citation_patterns = citation_patterns.join(refs_norm, on="source").dropna()

# Attach the remaining per-source metrics in one lookup
citation_patterns = citation_patterns.join(
    source_articles[["total_source_mentions", "mean_source_mentions"]],
    on="source",
)

# Normalize by total mentions
citation_patterns["mentions_norm_total"] = citation_patterns["mentions"] / citation_patterns["total_source_mentions"]

# Normalize by mean mentions
citation_patterns["mentions_norm_mean"] = citation_patterns["mentions"] / citation_patterns["mean_source_mentions"]

citation_patterns.to_csv(processed_dir / "citation_patterns.csv")
citation_patterns

Unnamed: 0,source,target,mentions,refs_norm,total_source_mentions,mean_source_mentions,mentions_norm_total,mentions_norm_mean
0,10.1001/jama.2017.20885,10.1093/jnen/nlx101,1,0.040000,34,1.360000,0.029412,0.735294
1,10.1001/jama.2019.16161,10.1038/ng.3482,3,0.111111,15,1.666667,0.200000,1.800000
2,10.1001/jamadermatol.2018.4673,10.1186/s13023-017-0718-x,1,0.050000,28,1.400000,0.035714,0.714286
3,10.1001/jamanetworkopen.2019.6972,10.1089/jpm.2017.0548,1,0.023810,46,1.095238,0.021739,0.913043
4,10.1001/jamaneurol.2018.0035,10.1172/jci.insight.89530,1,0.034483,56,1.931034,0.017857,0.517857
...,...,...,...,...,...,...,...,...
11011,10.7916/tohm.v0.712,10.3390/ijms17020189,1,0.034483,39,1.344828,0.025641,0.743590
11012,10.9734/afsj/2020/v14i330131,10.1016/j.tins.2016.09.002,1,0.111111,9,1.000000,0.111111,1.000000
11013,10.9734/ejmp/2019/v28i230129,10.1155/2017/9208489,1,0.166667,6,1.000000,0.166667,1.000000
11014,10.9734/ejnfs/2019/v11i330148,10.1186/s12284-017-0157-2,1,0.333333,3,1.000000,0.333333,1.000000


## Article Patterns

In [17]:
def compute_eng_consistency(doi: str, pattern: str = "mentions_norm_mean") -> float:
    """Area under the normalized, rank-ordered `pattern` curve of one cited DOI.

    Selects the rows of the module-level `citation_patterns` frame whose
    `target` equals `doi`, orders them by `pattern`, and integrates the
    resulting curve with the trapezoidal rule:

    * x: zero-based rank of each row, divided by the largest rank
    * y: the `pattern` value divided by its maximum

    Args:
        doi: Target DOI to select from `citation_patterns`.
        pattern: Name of the numeric column of `citation_patterns` to integrate.

    Returns:
        The trapezoidal area as a float. With fewer than two matching rows
        there is no interval to integrate over and the result is 0.0.
    """
    # Explicit copy: the helper columns below are written to a private frame,
    # never to (a view of) citation_patterns
    df = citation_patterns.loc[citation_patterns["target"] == doi].sort_values(pattern).copy()

    # Normalize both dimensions
    df["x_perc"] = df[pattern].rank(method="first") - 1
    df["x_perc"] = df["x_perc"] / df["x_perc"].max()
    df[pattern] = df[pattern] / df[pattern].max()

    # NumPy 2.0 renamed trapz to trapezoid; fall back on older versions
    integrate = getattr(np, "trapezoid", None) or np.trapz
    return float(integrate(df[pattern], df["x_perc"]))

In [22]:
# Roll the per-(source, target) patterns up to one row per seed article
by_target = citation_patterns.groupby("target")

article_patterns = pd.DataFrame(index=final_dois)

# Aggregate counts: number of citing pairs and summed mentions per target
article_patterns["refs_agg"] = by_target.size()
article_patterns["mentions_agg"] = by_target["mentions"].sum()

# Normalized aggregates: summed refs_norm and mean of mentions_norm_mean
article_patterns["refs_norm_agg"] = by_target["refs_norm"].sum()
article_patterns["eng_norm_avg"] = by_target["mentions_norm_mean"].mean()


def _consistency_for(row):
    """Score one article; the row label is the article's DOI."""
    return compute_eng_consistency(row.name)


# Score (with a progress bar)
article_patterns["consistency"] = article_patterns.progress_apply(_consistency_for, axis=1)

article_patterns.to_csv(processed_dir / "article_patterns.csv")
article_patterns

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=998.0), HTML(value='')))




Unnamed: 0,refs_agg,mentions_agg,refs_norm_agg,eng_norm_avg,consistency
10.3389/fneur.2017.00535,6,13,0.091612,1.347773,0.587056
10.1016/j.clinph.2018.04.747,3,4,0.091580,1.000784,0.740572
10.3390/ijms18081638,18,23,0.352699,0.925604,0.530720
10.1038/srep19230,15,49,0.306009,1.656534,0.235366
10.1186/s13287-018-0890-5,4,6,0.127793,1.190518,0.716494
...,...,...,...,...,...
10.2174/1570159x15666170102145257,3,3,0.070047,0.902576,0.969760
10.1177/0306312715619783,4,9,0.198035,1.323061,0.587949
10.3390/toxins10010006,2,2,0.023598,0.588253,0.725322
10.1155/2017/3978595,5,6,0.066101,0.965926,0.522249


In [23]:
# Summary statistics (count, mean, std, quartiles, min/max) of every article-pattern column
article_patterns.describe()

Unnamed: 0,refs_agg,mentions_agg,refs_norm_agg,eng_norm_avg,consistency
count,998.0,998.0,998.0,998.0,998.0
mean,10.933868,17.905812,0.240949,1.079449,0.487036
std,18.486072,31.884495,0.416229,0.499001,0.265953
min,1.0,1.0,0.002513,0.363636,0.0
25%,3.0,4.0,0.062509,0.828663,0.322304
50%,6.0,9.0,0.133779,0.96994,0.488198
75%,12.0,19.0,0.261283,1.177741,0.678664
max,299.0,547.0,5.946808,7.602674,1.0
