In [5]:
import pandas as pd
import sqlite3

# Accessing the data
The data gathered through the API search of ISBNs from the VABB-SHW database was stored in an sqlite database.

In [20]:
# Connect to the SQLite database that holds the UniCat API results.
# The path is written with forward slashes so it resolves on Windows, macOS
# and Linux alike; on non-Windows systems a backslash is an ordinary
# file-name character, so a backslash path would not point into ../data.
con = sqlite3.connect("../data/unicat.db")

In [19]:
# Load id, url and raw_result for every row of the unicat_work table.
# Per the original note: a url is present wherever UniCat returned a result.
unicat_work = pd.read_sql(
    sql="SELECT id, url, raw_result FROM unicat_work",
    con=con,
)

In [25]:
# Load the manifestation table, which maps each ISBN to a work_id.
manifestation = pd.read_sql(
    sql="SELECT isbn, work_id FROM manifestation",
    con=con,
)

# Resolving double cases

During the initial retrieval of records from UniCat, it was found that many ISBNs returned multiple records. In order to find out whether these records belong to the same work, we followed the procedure listed below. The result is two files: one with the cases that we think belong to the same work, and one with the cases that we think do not belong to the same work.

# Import files

In [105]:
# Load the spreadsheet of ISBNs for which UniCat returned multiple records.
df = pd.read_excel(
    io="UniCat/unicat_isbns-with-multiple-records_2021-09-30.xlsx",
)

In [107]:
# The spreadsheet stores the ISBN in a column called "index"; give it a
# meaningful name before it is used for filtering.
df = df.rename(columns={"index": "isbn"})

In [108]:
# Keep an untouched copy of the full table: df itself is narrowed down
# step by step below, and the full table is needed again when saving.
df_orig = df.copy(deep=True)

## Make list of records considered the same record

In [57]:
# ISBNs judged to belong to the same work. Starts empty; step 1 below
# assigns the first batch to this name.
same_record = []

### 1. Same title and same publication year

In [58]:
# Step 1: rows flagged with both the same title and the same publication
# year are treated as the same work.
# `.eq(True)` keeps the original `== True` semantics (missing values count
# as "not the same"). No reset_index() is needed because only the isbn
# values are collected, not the row labels.
step1_mask = df.same_title.eq(True) & df.same_pubyear.eq(True)
same_record = df.loc[step1_mask, "isbn"].tolist()

In [59]:
# Number of ISBNs classified as the same work by step 1.
len(same_record)

2250

In [61]:
# Size of the candidate table; df has not been filtered at this point.
df.shape

(3189, 10)

### 2. Same title (ignoring subtitle) and same publication year

In [62]:
# Drop the rows already classified in step 1 so that step 2 only looks at
# ISBNs that are still unresolved.
still_open = ~df["isbn"].isin(same_record)
df = df.loc[still_open].copy()

In [63]:
# Preview: how many of the remaining rows match on title-without-subtitle
# and on publication year?
step2_preview = df.same_title_no_subtitle.eq(True) & df.same_pubyear.eq(True)
len(df.loc[step2_preview, "isbn"])

289

In [64]:
# Step 2: rows whose titles match once the subtitle is ignored, and whose
# publication years match, are treated as the same work.
step2_mask = df.same_title_no_subtitle.eq(True) & df.same_pubyear.eq(True)
step2_isbns = df.loc[step2_mask, "isbn"].tolist()
# Only append ISBNs that are not listed yet. df still contains these rows
# after this cell runs, so without the guard a re-run of the cell would
# append the same ISBNs a second time and inflate len(same_record).
already_listed = set(same_record)
same_record += [isbn for isbn in step2_isbns if isbn not in already_listed]

In [65]:
# Running total of same-work ISBNs after steps 1 and 2.
len(same_record)

2539

### 3. Q-ratio above 60 and same publication year

In [66]:
# Remove everything classified in steps 1 and 2; step 3 works on the rest.
still_open = ~df["isbn"].isin(same_record)
df = df.loc[still_open].copy()

In [67]:
# Rows still unresolved after removing the step 1 and step 2 matches.
df.shape

(650, 10)

In [70]:
# Preview: how many remaining rows have a title Q-ratio above 60 together
# with a matching publication year?
step3_preview = df.title_qratio.gt(60) & df.same_pubyear.eq(True)
len(df.loc[step3_preview, "isbn"])

237

In [71]:
# Step 3: rows whose title Q-ratio exceeds 60 and whose publication years
# match are treated as the same work.
# NOTE(review): title_qratio is presumably a fuzzy string-similarity score
# on a 0-100 scale; confirm where the column is computed.
step3_mask = df.title_qratio.gt(60) & df.same_pubyear.eq(True)
step3_isbns = df.loc[step3_mask, "isbn"].tolist()
# Only append ISBNs that are not listed yet. df still contains these rows
# after this cell runs, so without the guard a re-run of the cell would
# append the same ISBNs a second time and inflate len(same_record).
already_listed = set(same_record)
same_record += [isbn for isbn in step3_isbns if isbn not in already_listed]

In [72]:
# Running total of same-work ISBNs after steps 1 to 3.
len(same_record)

2776

### 4. Same title, different publication year

In [73]:
# Note: df has not been re-filtered since the step 3 ISBNs were added to
# same_record, so this shape still includes those rows.
df.shape

(650, 10)

In [74]:
# Remove everything classified so far (steps 1 to 3) before applying step 4.
still_open = ~df["isbn"].isin(same_record)
df = df.loc[still_open].copy()

In [78]:
# Preview: how many remaining rows have an identical title but a different
# publication year?
step4_preview = df.same_title.eq(True) & df.same_pubyear.eq(False)
len(df.loc[step4_preview, "isbn"])

171

In [79]:
# Step 4: rows with an identical title but a different publication year
# are also treated as the same work.
step4_mask = df.same_title.eq(True) & df.same_pubyear.eq(False)
step4_isbns = df.loc[step4_mask, "isbn"].tolist()
# Only append ISBNs that are not listed yet. df still contains these rows
# after this cell runs, so without the guard a re-run of the cell would
# append the same ISBNs a second time and inflate len(same_record).
already_listed = set(same_record)
same_record += [isbn for isbn in step4_isbns if isbn not in already_listed]

In [80]:
# Running total of same-work ISBNs after all four automatic steps.
len(same_record)

2947

## Left over

In [81]:
# Whatever is not in same_record after the four automatic rules is the
# left-over set.
still_open = ~df["isbn"].isin(same_record)
df = df.loc[still_open].copy()

In [83]:
# Rows that none of the four automatic rules matched.
df.shape

(242, 10)

## Add manual check

In [85]:
# Load the spreadsheet with the manual check of the left-over ISBNs.
manual = pd.read_excel(io="UniCat/leftover_ISBNs.xlsx")

  warn(msg)


In [89]:
# As with the main table, the ISBN lives in a column called "index";
# rename it so both tables share the same key name.
manual = manual.rename(columns={"index": "isbn"})

In [92]:
# Add the ISBNs that the manual check marked as belonging to the same work
# (rows where the "Same Work" column is True).
manual_same_mask = manual["Same Work"].eq(True)
manual_isbns = manual.loc[manual_same_mask, "isbn"].tolist()
# Only append ISBNs that are not listed yet, so that re-running this cell
# does not add the manually confirmed ISBNs twice.
already_listed = set(same_record)
same_record += [isbn for isbn in manual_isbns if isbn not in already_listed]

In [93]:
# Total of same-work ISBNs once the manually confirmed cases are included.
len(same_record)

3055

In [94]:
# Remove the manually confirmed ISBNs as well; what is left in df are the
# cases considered not to be the same work.
still_open = ~df["isbn"].isin(same_record)
df = df.loc[still_open].copy()

## Save the files

In [111]:
# Select, from the untouched original table, every row whose ISBN ended up
# in same_record.
is_same_work = df_orig["isbn"].isin(same_record)
same_records = df_orig.loc[is_same_work].copy()

In [112]:
# Rows of the original table whose ISBN is in same_record.
same_records.shape

(3055, 10)

In [113]:
# Write the same-work cases to disk. The DataFrame index is written as the
# first, unnamed column (pandas default).
same_records.to_csv(path_or_buf="UniCat/same_records_check.csv")

In [109]:
# Final number of ISBNs in same_record.
len(same_record)

3055

In [101]:
# Write the remaining rows of df (the cases not added to same_record) to
# disk. The DataFrame index is written as the first, unnamed column
# (pandas default).
df.to_csv(path_or_buf="UniCat/not_same_records_check.csv")