In [37]:
import pandas as pd
import os
import sqlite3 
import sqlite_utils
from lxml import etree
import unicat
from tqdm.auto import tqdm

In [84]:
# Switch to the project directory so the relative database paths used in the cells below
# ("unicat_test.db", "new_unicat.db") resolve there. The location can be overridden with the
# LIBCITATIONS_DIR environment variable; when it is unset, the original hard-coded path is used.
os.chdir(os.environ.get("LIBCITATIONS_DIR", "C:\\Users\\EVandewalle\\Documents\\Libcitations_2"))

## Get vabb_isbn, unicat_work, manifestation and failure from the unicat.db

The original UniCat database contains the ISBNs from VABB and the results of the UniCat API search.

In [57]:
# Open a connection to the source database that holds the tables read in the next cells.
source_db_file = r"unicat_test.db"
con = sqlite3.connect(source_db_file)

In [58]:
# Read the id, url and raw_result columns of the "unicat_work" table into a DataFrame.
unicat_work = pd.read_sql(
    sql="SELECT id, url, raw_result FROM unicat_work",
    con=con,
)

In [59]:
# Read the number_nr and isbn columns of the "vabb_isbn" table into a DataFrame.
vabb_isbn = pd.read_sql(
    sql="SELECT number_nr, isbn FROM vabb_isbn",
    con=con,
)

In [60]:
# Read the isbn and error columns of the "failure" table into a DataFrame.
failure = pd.read_sql(
    sql="SELECT isbn, error FROM failure",
    con=con,
)

In [105]:
# Read the isbn and work_id columns of the "manifestation" table into a DataFrame.
manifestation = pd.read_sql(
    sql="SELECT isbn, work_id FROM manifestation",
    con=con,
)

## Create new database

We create a new database to store all the information from the UniCat search.

In [53]:
# Open "new_unicat.db" through sqlite-utils; `db` is used below to create and fill tables.
new_db_file = "new_unicat.db"
db = sqlite_utils.Database(new_db_file)

In [54]:
# List the tables in the new database; at this point no table has been written yet.
db.table_names()

[]

In [55]:
# Second, plain sqlite3 connection to the same file; the pandas to_sql/read_sql calls below use it.
pandas_db_file = "new_unicat.db"
db_con = sqlite3.connect(pandas_db_file)

### Add the data to the new database

In [62]:
# Make "id" the index so that to_sql writes it as the leading "id" column of the table.
unicat_work = unicat_work.set_index(keys="id")

In [63]:
# Write the works to the "unicat_work" table of the new database; the cell shows the row count.
unicat_work.to_sql(name="unicat_work", con=db_con)

44229

In [64]:
# Make "number_nr" the index so that to_sql writes it as the leading column of the table.
vabb_isbn = vabb_isbn.set_index(keys="number_nr")

In [65]:
# Write the VABB ISBNs to the "vabb_isbn" table of the new database; the cell shows the row count.
vabb_isbn.to_sql(name="vabb_isbn", con=db_con)

96191

In [66]:
# Make "isbn" the index so that to_sql writes it as the leading column of the table.
failure = failure.set_index(keys="isbn")

In [67]:
# Write the failed look-ups to the "failure" table of the new database; the cell shows the row count.
failure.to_sql(name="failure", con=db_con)

1

In [68]:
# Confirm that the three tables written with to_sql are now present in the new database.
db.table_names()

['unicat_work', 'vabb_isbn', 'failure']

### Get the ISBN that was used for data retrieval

In [69]:
# From here on `unicat_work` refers to the sqlite-utils table object for "unicat_work"
# (it previously held the pandas DataFrame); the loops below query it with rows_where().
unicat_work = db.table("unicat_work",pk="id")

In [70]:
# Handle for the "isbn_search" table; the table (work_id, isbn) is created only when it
# does not exist yet, so re-running this cell does not fail on an existing table.
isbn_search_columns = {"work_id": int, "isbn": str}
isbn_search = db["isbn_search"]
if "isbn_search" not in db.table_names():
    isbn_search = isbn_search.create(isbn_search_columns)

In [71]:
# Check that "isbn_search" now appears among the tables.
db.table_names()

['unicat_work', 'vabb_isbn', 'failure', 'isbn_search']

Next, for each work we determine which ISBN was used for data retrieval and store it in the "isbn_search" table.

In [72]:
# For every work that has a raw API result, store the value returned by
# `unicat.Book(...).isbn_look_up()` together with the work id in the "isbn_search" table.
# The records are produced lazily and handed to a single insert_all() call, so sqlite-utils
# can write them in batches instead of receiving one single-row insert_all() call per work.
# NOTE(review): rows are appended; re-running this cell adds the same rows again.
isbn_records = (
    {"work_id": row["id"], "isbn": unicat.Book(row["raw_result"]).isbn_look_up()}
    for row in tqdm(unicat_work.rows_where("raw_result is not null"))
)
# The trailing ';' keeps the table object returned by insert_all() out of the cell output.
isbn_search.insert_all(isbn_records);

0it [00:00, ?it/s]

In [73]:
# List the tables again after filling "isbn_search"; no new table is expected here.
db.table_names()

['unicat_work', 'vabb_isbn', 'failure', 'isbn_search']

## Add holding information

In [74]:
# Handle for the "holding" table; the table (work_id, location, uri) is created only when
# it does not exist yet.
holding_columns = {"work_id": int, "location": str, "uri": str}
holding = db["holding"]
if "holding" not in db.table_names():
    holding = holding.create(holding_columns)

In [75]:
# Check that "holding" now appears among the tables.
db.table_names()

['unicat_work', 'vabb_isbn', 'failure', 'isbn_search', 'holding']

In [76]:
# Retrieve the holding information from the results of the API search and store it in the
# "holding" table: one row per (location, uri) pair yielded by `holdings()` for each work
# that has a raw result.
# All records are streamed through a single insert_all() call so sqlite-utils can batch the
# writes, rather than calling insert_all() once per work.
# NOTE(review): rows are appended; re-running this cell adds the same rows again.
holding_records = (
    {"work_id": row["id"], "location": location, "uri": uri}
    for row in tqdm(unicat_work.rows_where("raw_result is not null"))
    for location, uri in unicat.Book(row["raw_result"]).holdings()
)
# The trailing ';' keeps the table object returned by insert_all() out of the cell output.
holding.insert_all(holding_records);

0it [00:00, ?it/s]

In [77]:
# Load every (work_id, location) pair of the "holding" table into a DataFrame.
holding_query = """
    SELECT work_id, location
    FROM holding
    """
df = pd.read_sql(holding_query, db_con)

In [78]:
# Count the distinct holding locations per work; the result is a two-column frame
# with "work_id" and "holdingcount".
holdingcounts = (
    df.groupby("work_id")["location"]
    .nunique()
    .rename("holdingcount")
    .reset_index()
)

In [81]:
# Store the per-work holding counts in the "holdingcount" table, with work_id written
# as the index column; the cell shows the number of rows written.
holdingcounts = holdingcounts.set_index("work_id")
holdingcounts.to_sql(name="holdingcount", con=db_con)

25524

In [82]:
# Check that "holdingcount" now appears among the tables.
db.table_names()

['unicat_work',
 'vabb_isbn',
 'failure',
 'isbn_search',
 'holding',
 'holdingcount']

## Add the manifestation table

In [114]:
# Copy the manifestation data (isbn -> work_id) into the new database, with isbn written
# as the index column; the cell shows the number of rows written.
manifestation = manifestation.set_index("isbn")
manifestation.to_sql(name="manifestation", con=db_con)

62198