In [2]:
import pandas as pd
import sqlite3 
import sqlite_utils

# 8. Cultuurconnect - Creating Cultuurconnect SQL

### Importing the data

In [7]:
# Load every FRABL associated with each ISBN; the isbn column is read as text.
all_frabls = pd.read_csv(
    "cleaning/data/all_frabls.csv",
    dtype={"isbn": "str"},
    index_col=0,
)

In [8]:
# ISBNs flagged as irregular titles; these are excluded further down.
irregular_titles = pd.read_csv(
    "data/irregularities/title_check.csv",
    dtype={"isbn": "str"},
)
irregular_titles.shape

(56, 1)

The files with the holding and availability information contain data that we cannot share publicly.

In [11]:
# Holding counts per FRABL; the isbn column is read as text.
holding = pd.read_csv(
    "cleaning/data/cultuurconnect_holding_information.csv",
    dtype={"isbn": str},
    index_col=0,
)

In [12]:
# Cultuurconnect metadata per FRABL.
metadata = pd.read_csv(
    "cleaning/data/cultuurconnect_details_information.csv",
    index_col=0,
)

In [13]:
# Work ids. The irregular titles have already been removed from this file.
works = pd.read_csv(
    "cleaning/data/work_id_20221219.csv",
    index_col=0,
    dtype={"isbn": "str"},
)
# Independent copy holding only the work id and its ISBN field.
works_isbn = works.loc[:, ["work_id", "isbn"]].copy()

In [14]:
# ISBN level: split the ";"-separated isbn field and emit one row per ISBN.
# .assign() returns a new frame, so no column is overwritten in place.
works_isbn = works_isbn.assign(isbn=works_isbn.isbn.str.split(";")).explode("isbn")

# FRABL level: same approach, starting again from the original works frame,
# giving one row per (work_id, frabl) pair.
works_frabl = (
    works[["work_id", "frabl"]]
    .assign(frabl=lambda frame: frame.frabl.str.split(";"))
    .explode("frabl")
)

In [15]:
# Keep only the FRABLs that also appear in the holding information.
has_holding = works_frabl.frabl.isin(holding.frabl)
works_frabl = works_frabl.loc[has_holding].copy()

## Cleaning

#### isbns

In [16]:
# (rows, columns) of all_frabls before the irregular titles are filtered out
all_frabls.shape

(46642, 2)

In [17]:
# Drop every row whose ISBN is listed among the irregular titles.
is_irregular = all_frabls.isbn.isin(irregular_titles.isbn)
isbns = all_frabls.loc[~is_irregular].copy()

In [18]:
# (rows, columns) left after removing the irregular titles
isbns.shape

(46587, 2)

In [19]:
# Index the isbns frame by ISBN (set_index already returns a new frame).
isbns = isbns.set_index("isbn")

#### holding and holdingcount

In [20]:
# Index the holding frame by FRABL (set_index already returns a new frame).
holding = holding.set_index("frabl")

In [21]:
# (rows, columns) of holding before restricting it to the FRABLs in works_frabl
holding.shape

(9271, 7)

In [22]:
# Keep only holdings whose FRABL occurs in works_frabl; the works file
# no longer contains the irregular titles, so this removes them here too.
in_works = holding.index.isin(works_frabl.frabl)
holding = holding.loc[in_works].copy()

In [23]:
# (rows, columns) of holding after restricting it to the FRABLs in works_frabl
holding.shape

(8025, 7)

In [24]:
# Snapshot of the library list and library count per FRABL, taken before
# the list is split into one row per library.
# NOTE(review): counts is not used again in the visible cells - confirm
# whether it should also be written to the database.
counts = holding.loc[:, ["library_list", "library_count"]].copy()

In [25]:
# Discard FRABLs that have no library list at all.
holding = holding.dropna(subset=["library_list"])

In [26]:
# Turn the ";"-separated library string into a Python list per row.
holding = holding.assign(library_list=holding["library_list"].str.split(";"))

In [27]:
# One row per (FRABL, library) pair.
holding = holding.explode(column="library_list")

In [28]:
# After exploding, each cell holds a single library, so rename the column.
holding = holding.rename(columns={"library_list": "library"})

In [29]:
# (rows, columns) of holding after exploding to one row per FRABL/library pair
holding.shape

(110083, 7)

In [31]:
# Keep only the library column (the FRABL stays available as the index).
holding = holding.loc[:, ["library"]].copy()

#### metadata

In [32]:
# Index the metadata frame by FRABL (set_index already returns a new frame).
metadata = metadata.set_index("frabl")

In [33]:
# Translate the Dutch values of the "type" column to English.
type_translations = {"NonFictie": "NonFiction", "Fictie": "Fiction"}
for dutch_label, english_label in type_translations.items():
    metadata.loc[metadata["type"] == dutch_label, "type"] = english_label

In [34]:
# Remove the raw XML response so that no information we are not allowed
# to share ends up in the database.
metadata = metadata.drop(columns=["xml_response"])

In [35]:
# Keep only metadata whose FRABL occurs in works_frabl; the works file
# no longer contains the irregular titles, so this removes them here too.
in_works = metadata.index.isin(works_frabl.frabl)
metadata = metadata.loc[in_works].copy()

#### works

In [38]:
# Index the works_frabl frame by FRABL (set_index already returns a new frame).
works_frabl = works_frabl.set_index("frabl")

In [39]:
# Index the works_isbn frame by ISBN (set_index already returns a new frame).
works_isbn = works_isbn.set_index("isbn")

## Creating an SQLite database

In [43]:
# Create a new, empty database file.
# recreate=True deletes an existing cultuurconnect.db first, so the notebook
# can be re-run from the top: without it the later to_sql() calls fail
# because their tables already exist from the previous run.
db = sqlite_utils.Database("cultuurconnect.db", recreate=True)

In [44]:
# database has no tables yet, so this is expected to show an empty list
db.table_names()

[]

In [45]:
# make a plain sqlite3 connection with the database; this is a second handle
# on the same file as `db`, and it is the one passed to DataFrame.to_sql below
db_con = sqlite3.connect("cultuurconnect.db")

In [46]:
# Write the isbns frame (indexed by ISBN) to the table "isbns".
isbns.to_sql(name="isbns", con=db_con)

46587

In [47]:
# the table isbns has been added; the sqlite_utils handle sees what was
# written through the separate sqlite3 connection
db.table_names()

['isbns']

In [48]:
# Write the holding frame (one row per FRABL/library pair) to the table "holding".
holding.to_sql(name="holding", con=db_con)

110083

In [49]:
# Write the metadata frame (indexed by FRABL) to the table "metadata".
metadata.to_sql(name="metadata", con=db_con)

8025

In [50]:
# the tables holding and metadata have also been added
db.table_names()

['isbns', 'holding', 'metadata']

In [51]:
# Write the work id per FRABL to the table "works_frabl".
works_frabl.to_sql(name="works_frabl", con=db_con)

8025

In [52]:
# Write the work id per ISBN to the table "works_isbn".
works_isbn.to_sql(name="works_isbn", con=db_con)

8641

In [53]:
# final check: list every table now present in the database
# NOTE(review): db_con is never closed in the visible cells - consider
# calling db_con.close() once all tables have been written
db.table_names()

['isbns', 'holding', 'metadata', 'works_frabl', 'works_isbn']