In [None]:
import re
import polars as pl

from helpers import bronze_dir, silver_dir

In [None]:
# Read the bronze-layer Crossref sample; `ignore_errors=True` is passed through to the NDJSON reader
raw = pl.read_ndjson(bronze_dir / "sample_10000.jsonl", ignore_errors=True)

# `published.date-parts` is a nested list: take its first inner list, then that
# list's first element, which this notebook treats as the publication year.
published_year = (
    pl.col("published")
    .struct.field("date-parts")
    .list.first()
    .list.first()
    .alias("year")
)

# Keep only the fields used downstream, with the derived year as the last column
sample = raw.select(
    "DOI",
    "type",
    "reference-count",
    "is-referenced-by-count",
    "reference",
    published_year,
)

# Persist the trimmed sample to the silver layer, then display it
sample.write_ndjson(silver_dir / "sample.jsonl")
sample

DOI,type,reference-count,is-referenced-by-count,reference,year
str,str,i64,i64,list[struct[14]],i64
"""10.29173/jaed204""","""journal-article""",0,0,,2005
"""10.1145/191246.191249""","""proceedings-article""",0,0,,1994
"""10.1172/jci34472ds1""","""component""",0,0,,
"""10.1177/003463738808500173""","""journal-article""",0,0,,1988
"""10.1007/s00464-002-4290-6""","""journal-article""",0,12,,2003
…,…,…,…,…,…
"""10.2307/j.ctv11smw3n.16""","""book-chapter""",0,0,,2001
"""10.1055/s-0032-1316389""","""journal-article""",0,0,,2012
"""10.1016/s0014-3057(96)00007-9""","""journal-article""",0,4,,1997
"""10.9755/ejfa.2019.v31.i9.1999""","""journal-article""",0,0,,2019


In [25]:
# Four-digit year in 1800-2099 that is not part of a longer run of digits.
# Digit lookarounds are used instead of `\b` so that suffixed citation years
# such as "1999a" are still recognised (there is no word boundary between a
# digit and a letter, so `\b` would reject them).
_YEAR_PATTERN = re.compile(r"(?<!\d)(?:18|19|20)\d{2}(?!\d)")


def extract_year(date_str):
    """Return the first plausible year found in ``date_str``.

    Args:
        date_str: Text that may contain a year, e.g. ``"2000"``,
            ``"1999a"`` or ``"Published 1999-05-01"``. ``None`` is accepted
            and non-string values are converted with ``str()`` before matching.

    Returns:
        The first four-digit number starting with 18, 19 or 20 as an ``int``,
        or ``None`` when the input is ``None`` or contains no such number.
    """
    # Guard against missing values so the function is safe to call directly,
    # not only through `map_elements` (which skips nulls by default).
    if date_str is None:
        return None
    found = _YEAR_PATTERN.search(str(date_str))
    return int(found.group(0)) if found else None

In [28]:
# Build the citation table (one row per reference entry) and export it to CSV.

# Rows with a null DOI or a null reference list are dropped, then each list
# entry becomes its own row and the reference struct is spread into columns.
reference_rows = (
    sample.select(pl.col("DOI").alias("citing"), "reference")
    .drop_nulls()
    .explode("reference")
    .unnest("reference")
)

# The reference's `year` field is parsed into an integer year by `extract_year`
reference_year = pl.col("year").map_elements(extract_year, return_dtype=pl.Int32)

# After unnesting, `DOI` is the referenced work's DOI; rename it to `cited`
citations = reference_rows.select(
    "citing",
    "key",
    pl.col("DOI").alias("cited"),
    reference_year,
)
citations.write_csv(silver_dir / "citations.csv")
citations

citing,key,cited,year
str,str,str,i32
"""10.1016/s0025-7753(01)71961-6""","""10.1016/S0025-7753(01)71961-6_…","""10.1016/S0025-7753(00)71590-9""",2000
"""10.1016/s0025-7753(01)71961-6""","""10.1016/S0025-7753(01)71961-6_…","""10.1093/oxfordjournals.eurhear…",1991
"""10.1016/s0025-7753(01)71961-6""","""10.1016/S0025-7753(01)71961-6_…",,1999
"""10.1016/s0025-7753(01)71961-6""","""10.1016/S0025-7753(01)71961-6_…","""10.3949/ccjm.66.10.615""",1999
"""10.1016/s0025-7753(01)71961-6""","""10.1016/S0025-7753(01)71961-6_…","""10.1016/0735-1097(91)90675-Y""",1991
…,…,…,…
"""10.1016/j.socscimed.2017.11.05…","""10.1016/j.socscimed.2017.11.05…","""10.1007/s12199-008-0037-x""",2008
"""10.1016/j.socscimed.2017.11.05…","""10.1016/j.socscimed.2017.11.05…","""10.1001/jama.280.19.1690""",1998
"""10.1016/j.socscimed.2017.11.05…","""10.1016/j.socscimed.2017.11.05…","""10.1111/j.1600-0447.2004.00388…",2004
"""10.1016/j.socscimed.2017.11.05…","""10.1016/j.socscimed.2017.11.05…","""10.1017/S0033291709990808""",2010
