In [None]:
import datetime
import duckdb
import polars as pl
import json
from src.lfe.impl.utils import not_na


# Reload imported modules automatically before each cell executes, so edits
# to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# (Re)load the `sql` IPython extension that provides the %sql magic.
%reload_ext sql

# SqlMagic display options. NOTE(review): presumably these make %sql return
# pandas DataFrames and silence the feedback / connection banner -- confirm
# against the installed SQL magic's documentation.
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# Open a DuckDB connection (no database path is passed) and register it with
# the %sql magic under the alias "duckdb". The same `conn` object is used
# directly from Python in the cells below.
conn = duckdb.connect()
%sql conn --alias duckdb

In [None]:
# Load the article records from "pubmed_meta.parquet" into a polars DataFrame.
#
# The SQL below:
#   * pulls selected nested fields of MedlineCitation / PubmedData up into
#     top-level columns (ids, keywords, author/investigator lists, titles,
#     abstracts, journal info, references, chemicals, grants and dates);
#   * keeps only rows whose MedlineCitation is not null;
#   * tags every row with the constant Type = 'Article';
#   * removes exact duplicate rows with `select distinct *`.
#
# NOTE(review): `set arrow_large_buffer_size=true` is issued before the query,
# presumably so large string/blob columns survive the Arrow conversion that
# `.pl()` performs -- confirm against the DuckDB documentation.
pubmed_meta_df = conn.query(
    """
set arrow_large_buffer_size=true;

with articles as (
    select 
        MedlineCitation.PMID as PMID,
        MedlineCitation.KeywordList as KeywordList,

        MedlineCitation.InvestigatorList as ArticleInvestigatorList,
        MedlineCitation.Article.AuthorList as ArticleAuthorList,

        MedlineCitation.Article.ArticleTitle as Title,
        MedlineCitation.Article.VernacularTitle as ArticleVernacularTitle,
        MedlineCitation.Article.Abstract.AbstractText as AbstractText,
        MedlineCitation.Article.Abstract.CopyrightInformation as CopyrightInformation,
        MedlineCitation.OtherAbstract as ArticleOtherAbstract,

        MedlineCitation.Article.Journal.ISOAbbreviation as ArticleJournalISO,
        MedlineCitation.MedlineJournalInfo.Country as ArticleJournalInfoCountry,

        PubmedData.ReferenceList as ArticlePubmedDataReferenceList,
        MedlineCitation.CitationSubset as ArticleCitationSubset,

        MedlineCitation.ChemicalList as ChemicalList,

        MedlineCitation.Article.GrantList as GrantList,

        MedlineCitation.Article.ArticleDate as ArticleDate,
        MedlineCitation.DateCompleted as ArticleDateCompleted,
        MedlineCitation.DateRevised as ArticleDateRevised,
        
        'Article' as Type
    from "pubmed_meta.parquet"
        where MedlineCitation is not null
)
    select distinct 
        *
    from articles
"""
).pl()

In [None]:
# Take the earliest date available in the record.
# This is assumed to be the date on which the article became available.
# TODO: fix date function. Revision history might not have been published https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html#:~:text=46.%20%3CHistory%3E,contains%20the%20%3CPubmedPubDate%3E.
def get_min_date(r):
    """Return the earliest date recorded on a row, or ``None`` if it has none.

    Args:
        r: Mapping for one row with the keys ``"ArticleDateCompleted"``,
            ``"ArticleDate"`` and ``"ArticleDateRevised"``. ``"ArticleDate"``
            is treated as an iterable of date objects, the other two as a
            single date object each. Individual dates may be ``None``.

    Returns:
        The smallest ``datetime.datetime`` produced by ``format_json_date``
        over all non-``None`` dates of the row, or ``None`` when the row
        carries no date at all. (Indexing the sorted list used to raise
        ``IndexError`` in that case, aborting the whole column computation.)
    """
    # `not_na` is a project helper; presumably it is falsy for null values,
    # in which case the row contributes no article dates -- TODO confirm.
    article_dates = r["ArticleDate"] if not_na(r["ArticleDate"]) else []

    # Book-level dates (BookPubDate, BookContributionDate, BookDateRevised)
    # are not part of the candidates yet.
    candidates = [
        r["ArticleDateCompleted"],
        *article_dates,
        r["ArticleDateRevised"],
    ]

    # min() with a default replaces sorted(...)[0]: same result when at least
    # one date exists, None instead of an IndexError when none does.
    return min(
        (format_json_date(d) for d in candidates if d is not None),
        default=None,
    )


def format_json_date(o):
    """Convert a ``{"Year", "Month", "Day"}`` mapping into a ``datetime``.

    A missing (``None``) day falls back to the 1st of the month; if the day
    is missing and the month is missing too, the result is January 1st of
    the given year. Present components are converted with ``int()``.

    Args:
        o: Mapping with the keys ``"Year"``, ``"Month"`` and ``"Day"``.

    Returns:
        A ``datetime.datetime`` at midnight for the resolved date.
    """
    has_day = o["Day"] is not None
    # The month is only inspected when the day is absent; a present day
    # always implies the month is read.
    has_month = has_day or o["Month"] is not None

    year = int(o["Year"])
    month = int(o["Month"]) if has_month else 1
    day = int(o["Day"]) if has_day else 1
    return datetime.datetime(year, month, day)


def coalesce_author_colums(r):
    """Merge the investigator and author lists of a row into one JSON array.

    Args:
        r: Mapping for one row with the keys ``"ArticleInvestigatorList"``
            and ``"ArticleAuthorList"``.

    Returns:
        A JSON string holding a single flat list: the entries of the
        investigator list followed by those of the author list. Entries that
        are themselves lists are unpacked one level. Columns rejected by
        ``not_na`` or with length zero are skipped, so the result may be
        ``"[]"``.
    """
    # Book author columns (BookDocAuthorList, BookAuthorList) are not merged yet.
    columns = (r["ArticleInvestigatorList"], r["ArticleAuthorList"])

    merged = []
    for entries in columns:
        # Skip columns that are null (per the project's `not_na`) or empty.
        if not not_na(entries) or len(entries) <= 0:
            continue
        for entry in entries:
            if isinstance(entry, list):
                # Nested list: splice its items in rather than nesting them.
                merged.extend(entry)
            else:
                merged.append(entry)
    return json.dumps(merged)


# Derived column "DateAvail": earliest of the completed / article / revised
# dates of each row, computed row-wise by get_min_date.
date_avail_expr = (
    pl.struct("ArticleDateCompleted", "ArticleDate", "ArticleDateRevised")
    .map_elements(get_min_date, return_dtype=pl.Datetime)
    .alias("DateAvail")
)

# Derived column "AuthorList": investigator and author lists merged row-wise
# into a JSON string, then decoded back into a typed column. The schema is
# inferred from all rows (infer_schema_length=None).
author_list_expr = (
    pl.struct("ArticleInvestigatorList", "ArticleAuthorList")
    .map_elements(coalesce_author_colums, return_dtype=pl.String)
    .str.json_decode(None, infer_schema_length=None)
    .alias("AuthorList")
)

# Attach both derived columns and persist the result as parquet.
pubmed_meta_df.with_columns(
    date_avail_expr,
    author_list_expr,
).write_parquet("raw_pubmed_meta_df.parquet")