In [1]:
import datetime
import sqlite3
import time

import pandas as pd
import requests
import sqlite_utils
import vabb
from lxml import etree
from stdnum import isbn
from tqdm.auto import tqdm

In [3]:
# Connection to the SQLite file that holds the source `number` table
# (the path is relative to the notebook's working directory).
con = sqlite3.connect(r"vabb11.sqlite")

In [4]:
# Load the (cloi, number_nr) pairs of every row in `number` whose
# `number_ty` contains the text "isbn".
df = pd.read_sql(
    """
    select cloi, number_nr
    from number
    where number_ty like '%isbn%'
    """,
    con,
)

In [5]:
def format_isbn(num):
    """Normalise an ISBN string, or return None when it is not accepted.

    `num` is first checked with `vabb.is_strictly_valid_isbn`; numbers that
    fail the check yield None. Accepted numbers are passed through
    python-stdnum's `isbn.compact` with `convert=True`, which strips
    separators and converts the number to ISBN-13.
    """
    if vabb.is_strictly_valid_isbn(num):
        return isbn.compact(num, convert=True)
    # Rejected numbers are silently dropped; callers filter out the None.
    return None

In [6]:
# Add an `isbn` column holding format_isbn(number_nr); rows whose number is
# rejected by format_isbn get None.
df["isbn"] = df.number_nr.apply(format_isbn)

In [7]:
# Distinct normalised ISBNs, in order of first appearance, skipping the rows
# for which no valid ISBN could be derived.
isbns = df["isbn"].dropna().unique()

In [8]:
# Number of distinct ISBNs that will be looked up.
len(isbns)

46641

In [9]:
def retrieve_isbn_data(num: str, timeout: float = 30.0):
    """Query the Unicat SRU endpoint for records matching an ISBN.

    Parameters
    ----------
    num : str
        The ISBN to search for. It is inserted verbatim into the query
        (``isbn=<num>``).
    timeout : float, optional
        Seconds to wait for the server before giving up. Defaults to 30.

    Returns
    -------
    requests.Response
        The raw response, with its text encoding forced to UTF-8. The HTTP
        status is NOT checked here; call ``raise_for_status()`` on the result.

    Raises
    ------
    requests.RequestException
        On connection problems or when `timeout` expires.
    """
    # TODO check format of `num`
    base_url = "https://www.unicat.be/sru"
    params = {
        "version": "1.1",
        "operation": "searchRetrieve",
        "query": f"isbn={num}",
    }

    # requests never times out unless told to, so one stalled connection
    # would otherwise block a long-running crawl forever.
    r = requests.get(base_url, params=params, timeout=timeout)

    # Fix encoding, incorrectly assumed to be iso-8859-1.
    # See https://www.ietf.org/rfc/rfc2376.txt, section 6.4:
    # For text/xml without charset, "MIME and XML processors must assume the
    # charset is "us-ascii"". So the behaviour of requests doesn't follow the
    # RFC, it seems.
    r.encoding = "utf-8"

    return r

In [88]:
class Book:
    """Parsed XML of one SRU lookup response, with XPath-based accessors.

    The XML is parsed once in the constructor and kept in `self.doc`; every
    accessor below is a namespaced XPath query against that tree.
    """

    def __init__(self, xml_string):
        # Prefix -> namespace URI mapping handed to every XPath query.
        self.ns = {
            "slim": "http://www.loc.gov/MARC21/slim",
            "srw": "http://www.loc.gov/zing/srw/",
            "xcql": "http://www.loc.gov/zing/cql/xcql/",
        }
        self.doc = etree.fromstring(xml_string)

    def xpath(self, xpath_str, element=None):
        """Run `xpath_str` against `element` (default: the whole document)."""
        context = self.doc if element is None else element
        return context.xpath(xpath_str, namespaces=self.ns)

    def isbns(self):
        """Return the set of whitespace-separated tokens found in 020$a.

        Tokens of all records in the response are merged into one set. When
        the response holds no 020$a text at all, the set contains only the
        ISBN that was searched for.
        """
        subfield_texts = self.xpath(
            '//slim:datafield[@tag="020"]/slim:subfield[@code="a"]/text()'
        )
        # Concatenate results of multiple records, if present.
        tokens = {token for text in subfield_texts for token in text.split()}
        if tokens:
            return tokens
        # ISBN not in Unicat. We can only give back the ISBN as it was looked up.
        return {self.looked_up_isbn}

    @property
    def looked_up_isbn(self):
        """The search term echoed back in the response (first match)."""
        terms = self.xpath("//xcql:searchClause/xcql:term/text()")
        return terms[0]

    @property
    def raw_xml(self):
        """The parsed document serialised back to XML with `etree.tostring`."""
        return etree.tostring(self.doc)

    def holdings(self):
        """Yield a `(location, uri)` string pair for every 852 datafield.

        The location is the first 852$a text and the uri the first 852$u
        text. This is a generator, so the check below only runs once
        iteration starts.

        Raises
        ------
        ValueError
            If the response contains more than one record.
        """
        # TODO figure out how to handle cases with more than one record!!
        # XML shouldn't contain more than one record
        if self.number_of_records > 1:
            raise ValueError(
                f"Found multiple records for one ISBN, XML follows:\n{self.raw_xml}"
            )
        for field in self.xpath('//slim:datafield[@tag="852"]'):
            location = self.xpath('slim:subfield[@code="a"]/text()', field)[0]
            uri = self.xpath('slim:subfield[@code="u"]/text()', field)[0]

            yield str(location), str(uri)

    @property
    def number_of_records(self):
        """Number of `slim:record` elements actually present in the response."""
        # The element srw:numberOfRecords reports 2 when there's 1, so is unreliable
        records = self.xpath('//slim:record')
        return len(records)

In [11]:
def new_work_id(db):
    """Return the next unused work id: max(work_id) + 1 over `manifestation`.

    `db` must offer a `query(sql)` method that yields dict-like rows.
    Returns 1 when the table has no work ids yet (the aggregate is NULL).
    """
    rows = db.query("select max(work_id) + 1 work_id from manifestation")
    candidate = next(iter(rows))["work_id"]
    if candidate is None:
        return 1
    return candidate

In [12]:
# Database that the lookup loop writes its results to
# (tables: manifestation, unicat_work, failure).
db = sqlite_utils.Database("unicat.db")

In [13]:
# `manifestation` links each ISBN to a work id.
if "manifestation" in db.table_names():
    manifestation = db["manifestation"]
else:
    # In principle, isbn could be PK. But in practice,
    # we've found ISBNs associated with different works.
    # Hence, no PK is set.
    manifestation = db["manifestation"].create({"isbn": str, "work_id": int})

# `unicat_work` stores one lookup result per work id.
unicat_work = db.table("unicat_work", pk="id")

In [22]:
# Look up every ISBN that is not yet known and store the result.
# The loop is resumable: ISBNs already present in `manifestation` are skipped.
pause_seconds = 1.5  # give the server a rest :-)

for num in tqdm(isbns):
    if manifestation.count_where("isbn = ?", [num]) > 0:
        # This ISBN is already in the manifestation table, implying that the work is already known.
        continue

    try:
        r = retrieve_isbn_data(num)
        r.raise_for_status()
        book = Book(r.text)
    except (requests.RequestException, etree.XMLSyntaxError) as e:
        # Record the failure and move on to the next ISBN. An error response
        # must not be parsed and stored as if it were a lookup result.
        db["failure"].insert(
            {
                "isbn": num,
                "error": str(e),
            }
        )
        time.sleep(pause_seconds)
        continue

    work_id = new_work_id(db)

    # Store all ISBNs (ISBN-13 only) and work ID in manifestation
    book_isbns = {format_isbn(found) for found in book.isbns()} - {None}
    # Always record the ISBN that was searched for. The skip check above and
    # new_work_id() both rely on every processed ISBN having a row here;
    # without one, a re-run repeats the lookup and the next work id can
    # collide with the one stored in unicat_work below.
    book_isbns.add(num)
    to_insert = [{"isbn": book_num, "work_id": work_id} for book_num in book_isbns]
    manifestation.insert_all(to_insert)

    # Store lookup result in unicat_work (raw_result stays NULL when the
    # response contains no record).
    raw_result = book.raw_xml if book.number_of_records > 0 else None
    unicat_work.insert(
        {
            "id": work_id,
            "url": r.url,
            "checked_on": datetime.datetime.now(),
            "raw_result": raw_result,
        }
    )

    time.sleep(pause_seconds)

  0%|          | 0/16641 [00:00<?, ?it/s]

In [14]:
# Number of stored lookups for which at least one record came back
# (raw_result is left NULL when the response held no record).
unicat_work.count_where("raw_result is not null")

25528

In [15]:
# Ratio of lookups with a result to the number of distinct ISBNs, computed
# from the live values so it cannot drift from the data.
unicat_work.count_where("raw_result is not null") / len(isbns)

0.5473296027100619

In [118]:
# Collect every stored response that contains more than one slim:record.
books = set()

for row in tqdm(unicat_work.rows_where("raw_result is not null")):
    try:
        book = Book(row["raw_result"])
    except Exception:
        # Skip rows whose stored XML cannot be parsed. `Exception` (not a bare
        # except) is caught so that Ctrl-C / kernel interrupt still stops the scan.
        print("Error for ", row["id"])
        continue
    if book.number_of_records > 1:
        books.add(book)


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Error for  29418



In [120]:
# (responses holding more than one record, responses holding any record)
len(books), unicat_work.count_where("raw_result is not null")

(3189, 25528)