In [1]:
import datasets
import polars as pl
from datetime import datetime

import pandas as pd

The datasets have been generated using this [fork](https://github.com/baberabb/cce-python). The main logic is,
for each registration and each matching renewal entry:

    1. check if date matches -> exact match
        2. if not then check if year matches
            3. if not then check if the normalized authors/title match


In [2]:
def _load_train_split(config_name):
    """Load one configuration of ``baber/pdbooks`` and return its "train" split via ``to_pandas()``."""
    return datasets.load_dataset("baber/pdbooks", config_name)["train"].to_pandas()


renewals_unmatched = _load_train_split("renewals_unmatched")
renewals_matched = _load_train_split("renewals_matched")
registrations_all = _load_train_split("registrations_all")
# The configuration name differs from the variable name for this one.
registrations_unmatched = _load_train_split("registrations_not_renewed")

We'll use polars, but any of these frames can be converted back to pandas with `.to_pandas()`.

In [3]:
# Convert the four pandas frames to polars in one pass (same order as they were loaded).
ren_unmatched, ren_matched, reg_all, reg_unmatched = (
    pl.from_pandas(pandas_frame)
    for pandas_frame in (
        renewals_unmatched,
        renewals_matched,
        registrations_all,
        registrations_unmatched,
    )
)

# Row counts of the two "leftover" tables.
print(f"Total renewals unmatched: {len(ren_unmatched)}")
print(f"Total registrations remaining: {len(reg_unmatched)}")

Total renewals unmatched: 264080
Total registrations remaining: 560350


That is a lot of unmatched renewals, but they include non-book entries as well. Subsetting to just the registration numbers we have in the reg dataset and filtering by date, we get ~8000 matched numbers against _all_ registrations and ~4000 against the remaining registrations (unmatched, not foreign, etc.). Looking at all registrations:

In [4]:
# Lazy query over the unmatched renewals, one row per (regnum, reg_date) pair:
#   * reg_date is parsed to a date; values that fail to parse become null
#     (strict=False) and are deliberately kept by the filter below,
#   * only registration dates from 1929-01-01 to 1964-01-01 (or null) are kept,
#   * only rows whose regnum also occurs in reg_all's `regnums` are kept
#     (semi join: no columns from reg_all are added),
#   * only regnums starting with "A" are kept.
#
# NOTE(review): `regnum` and `reg_date` are exploded one after the other, so an
# entry holding several regnums AND several reg_dates yields every combination
# of the two. That may be the source of the duplicate rows; if the two lists
# are parallel, exploding them together would avoid it -- confirm before changing.
ren_unmat = (
    ren_unmatched.lazy()
    .explode("regnum")
    .explode("reg_date")
    .with_columns(pl.col("reg_date").str.to_date(strict=False, exact=False))
    .filter(
        # Plain integer literals instead of the octal `0o1` spelling.
        pl.col("reg_date").is_between(datetime(1929, 1, 1), datetime(1964, 1, 1))
        | pl.col("reg_date").is_null()
    )
    .join(
        reg_all.lazy().explode("regnums"),
        how="semi",
        left_on="regnum",
        right_on="regnums",
    )
    .filter(pl.col("regnum").str.starts_with("A"))
)

The renewal entries have been "unrolled", so there's a separate row for each reg number in the entry. Also, I might have introduced a bug somewhere, as there are a lot of duplicates. We can still count entries by looking at the uuid, though.

In [5]:
# One row per uuid, counted by the year of the registration date,
# with the most frequent years first.
reg_year = pl.col("reg_date").dt.year()
(
    ren_unmat.unique("uuid")
    .group_by(reg_year)
    .len()
    .sort("len", descending=True)
    .collect()
)

reg_date,len
i32,u32
1953,1119
1952,568
1947,466
1946,383
1950,357
…,…
1958,87
1961,79
1960,69
1959,67


A lot of these entries aren't really books.

In [6]:
# One row per uuid, counted per author string, largest groups first.
entries_per_author = ren_unmat.unique("uuid").group_by("author").len()
entries_per_author.sort("len", descending=True).collect()

author,len
str,u32
,3113
"""""",288
"""King Features …",171
"""West Publishin…",168
"""New York Times…",136
…,…
"""CARPENTER, WIL…",1
"""BUCK, PEARL S.…",1
"""Patrick Kavana…",1
"""Case Kusby (Ku…",1


Some manual filtering. This reduces the set by roughly 2000 entries.

In [7]:
# Case-insensitive keyword patterns; a row is dropped when its full_text or
# its title matches.
# NOTE(review): the bare `nea` alternative in the title pattern also matches
# inside longer words (e.g. "linear") -- confirm that is intended.
full_text_hit = pl.col("full_text").str.contains(
    "(?i)(superman|batman|vogue|hopalong|tarzan|king features|donald|mickey|new york times|vernon law|king aroo)"
)
title_hit = pl.col("title").str.contains(
    "(?i)superman|batman|vogue|hopalong|tarzan|king features|donald|mickey|new york times|catalog|king aroo|nea"
)
# Registration numbers beginning with AF or AI are excluded as well.
af_or_ai_regnum = pl.col("regnum").str.contains("^(AF|AI)")

(
    ren_unmat.filter(~full_text_hit & ~title_hit)
    .filter(~af_or_ai_regnum)
    .unique("uuid")
    .collect()
)

uuid,regnum,reg_date,renewal_id,renewal_date,author,title,new_matter,see_also_renewal,see_also_registration,full_text,claimants,notes
str,str,date,str,str,str,str,str,list[null],list[null],str,str,str
"""269a88a8-db6f-…","""A75399""",1953-01-09,"""RE106224""",,,"""Motion and tim…","""""",[],[],"""""","""Ralph M. Barne…",
"""70da36c0-5dc3-…","""A87060""",1953-04-06,"""RE99253""",,,"""California rep…","""""",[],[],"""""","""West Publishin…",
"""06b8e2b2-a571-…","""A44""",1944-06-30,"""R510267""","""1971-08-12""","""West Pub. Co. …","""ABBOTT NEW YOR…","""""",[],[],"""ABBOTT NEW YOR…","""West Pub. Co. …",""""""
"""47951365-0f82-…","""A19159""",1947-11-15,"""R597997""","""1975-02-19""","""Walter Eli Yod…","""Junior hymns""","""""",[],[],"""R597997. Junio…","""Herald Press|P…",""""""
"""6285349f-1ad1-…","""A678282""",1950-11-07,"""R678282""","""1977-12-05""","""Walt Disney Pr…","""November 19, 1…","""""",[],[],"""R678282. Uncle…","""Walt Disney Pr…",""""""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""fc78301f-505e-…","""A73635""",1934-07-17,"""R288087""","""1961-12-28""","""CROCKETT, DANI…","""Indiana three …","""""annotations t…",[],[],"""CROCKETT, DANI…","""Bobbs-Merrill …",""""""
"""95666c4a-77ca-…","""A22431""",1930-04-24,"""R206830""","""1958-01-14""","""WARD, C. H.""","""Manual for M. …","""""",[],[],"""WARD, C. H. Ma…","""Scott, Foresma…",""""""
"""a3035462-18e9-…","""A846262""",1960-06-21,"""RE412584""",,,"""Once in a smal…","""""",[],[],"""""","""Barbara Heimer…",
"""74687694-ed51-…","""A168502""",1942-10-30,"""R491846""","""1970-10-02""","""Dodd, Mead & C…","""THE BEST PLAYS…","""""",[],[],"""THE BEST PLAYS…","""Dodd, Mead & C…",""""""


These are also in the registrations (~800), but most of them have been bulk registered at one time. We can filter for that:

In [8]:
# Unmatched registrations whose title contains one of the keywords
# (case-insensitive), restricted afterwards to rows with no `parent`.
title_keywords = "(?i)(superman|batman|vogue|hopalong|tarzan|king features|donald duck|mickey mouse|new york times|vernon law|king aroo)"
keyword_registrations = (
    reg_unmatched.lazy()
    .filter(pl.col("title").str.contains(title_keywords))
    .collect()
)
keyword_registrations.filter(pl.col("parent").is_null())

uuid,regnums,reg_dates,title,authors,publishers,disposition,year,group_title,group_uuid,notes,parent
str,list[str],list[str],str,list[str],list[str],str,str,str,str,list[str],str
"""F58A1371-7454-…","[""A180056""]","[""1943-06-09""]","""Style book of …","[""Garst (Robert E.)""]","[""New York""]","""Not renewed.""","""1944""","""""","""""",,
"""7FB177C1-6E18-…","[""A168596""]","[""1942-11-02""]","""Superman.""","[""Lowther (George)""]","[""Superman, inc.""]","""Not renewed.""","""1942""","""""","""""",,
"""97656587-6CFA-…","[""A8019""]","[""1929-05-03""]","""Vogue’s book o…","[""Vogue""]","[""Doubleday, Doran & company, inc."", ""Doubleday, Doran & co., inc.""]","""Not renewed.""","""1929""","""""","""""",,
"""A50C3D52-6CFA-…","[""A12694""]","[""1929-09-28""]","""Tarzan and the…","[""Burroughs, Edgar Rice"", ""Edgar Rice Burroughs"", ""A. W. Sperry""]","[""Metropolitan books"", ""Edgar Rice Burroughs, inc.""]","""Not renewed.""","""1929""","""""","""""",,
"""A63FF405-6CFA-…","[""A10771""]","[""1929-08-10""]","""The illustrate…","[""Burroughs, Edgar Rice"", ""Edgar Rice Burroughs"", ""Harold Foster""]","[""Grosset & Dunlap"", ""Edgar Rice Burroughs, inc.""]","""Not renewed.""","""1929""","""""","""""","[""no. 1, picturized from the novel “Tarzan of the apes”""]",
…,…,…,…,…,…,…,…,…,…,…,…
"""B01231AB-6D13-…","[""A602465""]","[""1962-12-12""]","""VERNON LAW BOO…","[""Frederick D. Lewis, JR."", ""West Pub. Co. & Vernon Law Book Co"", … ""VERNON'S TEXAS RULES OF CIVIL PROCEDURE""]","[""West Pub. Co. & Vernon Law Book Co.""]","""Not renewed.""","""1962""","""""","""""","[""in 29"", ""employers for hire""]",
"""DC51F471-6D13-…","[""A600557""]","[""1962-11-23""]","""Growing crisis…","[""GROWING CRISIS FOR THE CITIES (FILMSTRIP SCRIPT)""]","[""New York Times Co.""]","""Not renewed.""","""1962""","""""","""""",,
"""04848F74-6D14-…","[""A572608""]","[""1962-04-11""]","""SUPERMAN. Draw…","[""SUPERMAN""]","[""National Periodical Publications, Inc.Release week of""]","""Not renewed.""","""1962""","""""","""""",,
"""EDC17AE1-6E8D-…","[""A39742""]","[""1931-07-18""]","""The story of r…","[""Bunge, Martin Ludwig Detloff"", ""Martin L. Bunge""]","[""Fellowship publishing house"", ""Fellowship pub. house.""]","""Not renewed.""","""1931""","""""","""""",,


In [9]:
# Rows whose registration date falls in 1953.
registered_in_1953 = pl.col("reg_date").dt.year() == pl.lit(1953)
ren_unmat.filter(registered_in_1953).collect()

uuid,regnum,reg_date,renewal_id,renewal_date,author,title,new_matter,see_also_renewal,see_also_registration,full_text,claimants,notes
str,str,date,str,str,str,str,str,list[null],list[null],str,str,str
"""7eb13df9-7847-…","""A419543""",1953-09-29,"""RE7894""",,,"""Father-Mother …","""""",[],[],"""""","""The Christian …",
"""8537a638-d799-…","""A78252""",1953-02-02,"""RE102259""",,,"""Music for life…","""""",[],[],"""""","""""",
"""64909d6d-db61-…","""A82932""",1953-03-05,"""RE86864""",,,"""Iowa Code anno…","""""",[],[],"""""","""West Publishin…",
"""58e59e17-0e5e-…","""A80360""",1953-01-20,"""RE83002""",,,"""Advance Califo…","""""",[],[],"""""","""Bancroft-Whitn…",
"""02e0848c-3d81-…","""A76559""",1953-01-19,"""RE87335""",,,"""Oklahoma statu…","""""",[],[],"""""","""West Publishin…",
…,…,…,…,…,…,…,…,…,…,…,…,…
"""716ab701-01d1-…","""AF41374""",1953-12-23,"""RE99660""",,,"""Cinco farsas b…","""""",[],[],"""""","""Isabel Garcia …",
"""d046715c-f8ee-…","""A820441""",1953-12-25,"""RE110007""",,,"""Polkas on para…","""""",[],[],"""""","""Alfred Music C…",
"""fc461720-1a6d-…","""A126785""",1953-12-30,"""RE102903""",,,"""Pinocchio lear…","""""",[],[],"""""","""Walt Disney Pr…",
"""3265a9b7-6bee-…","""A12583""",1953-06-01,,"""""","""Oskar Ernst Be…","""Vermaningen""","""""",[],[],"""R636967. Verma…",,""""""
