# Step 1: Getting frabls from Cultuurconnect

In [1]:
import sqlite3
import vabb
import fos_classification
import pandas as pd
import time
import requests
import lxml.etree as ET
import numpy as np
import json
import importlib



Load the `all_books` dataset, which holds the ISBNs of all books in VABB (see 'cleaning-all books').

In [2]:
all_books=pd.read_csv("data/all_books.csv", index_col=0, dtype={"isbn":str}) 

Collect all ISBNs from `all_books` as a pandas Series and drop the duplicates.

In [3]:
isbns = all_books["isbn"].drop_duplicates()

In [4]:
# Wrap the unique ISBNs in a one-column frame ('isbn') and renumber the rows 0..n-1.
isbns = pd.DataFrame(isbns).reset_index(drop=True)

In [5]:
isbns.shape

(46642, 1)

The Cultuurconnect key:

In [7]:
# authorization = ... #this is the key from Cultuurconnect

### First, go from the list of isbns to a list of frabl keys

In [14]:
import json
import tqdm
def get_response(isbns, cache_path='intermediate_data.json', timeout=30):
    """Fetch the resolver XML response for every ISBN, using a JSON file as a resumable cache.

    ISBNs that are already keys in the cache file are skipped, so an interrupted
    run can simply be restarted. The API key is read from the global
    ``authorization``, which must be defined before calling.

    Parameters
    ----------
    isbns : pandas.DataFrame
        Frame with a column ``isbn``; each value is sent as the ``id`` query parameter.
    cache_path : str, optional
        JSON file mapping ISBN -> pretty-printed XML response. It is created on
        the first successful request if it does not exist yet.
    timeout : float, optional
        Seconds to wait for the server before the request counts as failed.

    Returns
    -------
    data : dict
        Mapping of ISBN to the XML response serialised as a unicode string
        (cached entries plus the newly fetched ones).
    isbns_error : list
        ISBNs for which the request, the XML parsing or the cache write raised.
    """
    import os  # local import: only needed for the atomic cache replace below

    url_resolver = "http://zbb.staging.aquabrowser.be/api/v1/resolver/isbn/"
    isbns_error = []

    # Start from an empty cache on the very first run instead of crashing.
    try:
        with open(cache_path, 'r') as f:
            data = json.load(f)
    except FileNotFoundError:
        data = {}

    tmp_path = f'{cache_path}.tmp'
    for i in tqdm.tqdm(isbns.isbn):
        if i in data:
            continue
        param_dict_resolver = {"id": i, "authorization": authorization}
        try:
            # Without a timeout a stalled connection would block the crawl forever.
            r_resolver = requests.get(url_resolver, params=param_dict_resolver, timeout=timeout)
            data[i] = ET.tostring(ET.fromstring(r_resolver.content), pretty_print=True, encoding="unicode")
            # Checkpoint after every request. Write to a temp file and swap it in,
            # so an interrupt in the middle of the dump cannot corrupt the cache.
            with open(tmp_path, 'w') as f:
                json.dump(data, f)
            os.replace(tmp_path, cache_path)
            time.sleep(3)  # pause between successful requests
        except Exception as e:
            print(f'{e} for isbn {i}')
            isbns_error.append(i)
            time.sleep(60)  # back off for a minute after a failure
    return data, isbns_error
  

In [19]:
# Keep both return values and show only their sizes; a bare call would echo
# the whole response dict as cell output and discard the list of failed ISBNs.
responses, isbns_error = get_response(isbns)
len(responses), len(isbns_error)

In [5]:
with open('intermediate_data.json', 'r') as f:
        result_so_far=json.load(f)

In [6]:
df_intermediate=pd.DataFrame.from_dict(result_so_far, orient='index',columns=["xml_responses"])

In [7]:
df_intermediate.reset_index(inplace=True)

In [8]:
df_intermediate.rename(columns={'index':'isbn'}, inplace=True)

Saving the isbns with the xml strings in a csv file.

In [9]:
df_intermediate.to_csv("data/isbns_with_responses.csv",sep=',')

In [10]:
df_intermediate.shape

(46642, 2)

In [11]:
df_intermediate.head()

Unnamed: 0,isbn,xml_responses
0,9781850654551,"<aquabrowser version=""1"" time-taken=""1049"">\n ..."
1,9781405136143,"<aquabrowser version=""1"" time-taken=""825"">\n ..."
2,9781405136150,"<aquabrowser version=""1"" time-taken=""1015"">\n ..."
3,9780415422888,"<aquabrowser version=""1"" time-taken=""944"">\n ..."
4,9782747556927,"<aquabrowser version=""1"" time-taken=""950"">\n ..."


In [12]:
isbns_frabls = pd.read_csv("data/isbns_with_responses.csv", dtype={'isbn':'str'}, index_col="Unnamed: 0")

In [13]:
isbns_frabls.head()

Unnamed: 0,isbn,xml_responses
0,9781850654551,"<aquabrowser version=""1"" time-taken=""1049"">\n ..."
1,9781405136143,"<aquabrowser version=""1"" time-taken=""825"">\n ..."
2,9781405136150,"<aquabrowser version=""1"" time-taken=""1015"">\n ..."
3,9780415422888,"<aquabrowser version=""1"" time-taken=""944"">\n ..."
4,9782747556927,"<aquabrowser version=""1"" time-taken=""950"">\n ..."


In [14]:
isbns_frabls.shape

(46642, 2)

Parse each XML string into an lxml `ElementTree` object so that it can be searched.

In [15]:
isbns_frabls["tree"]=isbns_frabls.xml_responses.apply(lambda x:ET.ElementTree(ET.fromstring(x)))

In [16]:
isbns_frabls.head()

Unnamed: 0,isbn,xml_responses,tree
0,9781850654551,"<aquabrowser version=""1"" time-taken=""1049"">\n ...",<lxml.etree._ElementTree object at 0x0000014F1...
1,9781405136143,"<aquabrowser version=""1"" time-taken=""825"">\n ...",<lxml.etree._ElementTree object at 0x0000014F1...
2,9781405136150,"<aquabrowser version=""1"" time-taken=""1015"">\n ...",<lxml.etree._ElementTree object at 0x0000014F1...
3,9780415422888,"<aquabrowser version=""1"" time-taken=""944"">\n ...",<lxml.etree._ElementTree object at 0x0000014F1...
4,9782747556927,"<aquabrowser version=""1"" time-taken=""950"">\n ...",<lxml.etree._ElementTree object at 0x0000014F1...


In [17]:
def get_frabl(tree):
    """Collect the ``frabl`` attribute of every ``itemid`` element below the root.

    Parameters
    ----------
    tree : ElementTree or Element
        Parsed XML response to search.

    Returns
    -------
    list of str
        The ``frabl`` values in document order. ``itemid`` elements without a
        ``frabl`` attribute are skipped; duplicates are kept.
    """
    # Use the relative path './/itemid': the absolute '//itemid' makes
    # ElementTree.iterfind emit a FutureWarning and is rejected on plain Elements.
    return [
        item.attrib['frabl']
        for item in tree.iterfind(".//itemid")
        if 'frabl' in item.attrib
    ]

Extract the frabl keys from the xml text and add them in a new column called 'frabls'

In [18]:
isbns_frabls["frabls"]=isbns_frabls["tree"].apply(lambda x: get_frabl(x))

In [19]:
# Remove duplicate frabls while keeping first-seen order; list(set(x)) would give an
# order that can differ between interpreter sessions, making the saved CSV irreproducible.
isbns_frabls["frabls"]=isbns_frabls["frabls"].apply(lambda x: list(dict.fromkeys(x)))

In [20]:
isbns_frabls["frabls"]=isbns_frabls["frabls"].apply(lambda x: ";".join(x) if isinstance(x,list) else x)

Save the isbns, the xml responses and the frabls extracted from the xmls to a csv file.

In [21]:
isbns_frabls[["isbn","xml_responses","frabls"]].to_csv("data/all_frabls.csv", sep=",")

In [24]:
frabls=pd.read_csv("data/all_frabls.csv", dtype={'isbn':'str'}, index_col='Unnamed: 0')

In [25]:
frabls.head()

Unnamed: 0,isbn,xml_responses,frabls
0,9781850654551,"<aquabrowser version=""1"" time-taken=""1049"">\n ...",
1,9781405136143,"<aquabrowser version=""1"" time-taken=""825"">\n ...",
2,9781405136150,"<aquabrowser version=""1"" time-taken=""1015"">\n ...",
3,9780415422888,"<aquabrowser version=""1"" time-taken=""944"">\n ...",
4,9782747556927,"<aquabrowser version=""1"" time-taken=""950"">\n ...",


In [26]:
frabls[frabls["frabls"].notna()]

Unnamed: 0,isbn,xml_responses,frabls
6,9789055445585,"<aquabrowser version=""1"" time-taken=""875"">\n ...",1B9BDF18AF1ACA0
13,9789030171836,"<aquabrowser version=""1"" time-taken=""520"">\n ...",1A0D8B40D9F1ACA0
14,9789030175308,"<aquabrowser version=""1"" time-taken=""459"">\n ...",3B24393C81F1ACA0
15,9789033446573,"<aquabrowser version=""1"" time-taken=""503"">\n ...",492B6CB854F1ACA0
25,9789062158126,"<aquabrowser version=""1"" time-taken=""381"">\n ...",337B1BCD3F1ACA0
...,...,...,...
46625,9789054666028,"<aquabrowser version=""1"" time-taken=""484"">\n ...",2E276BDA7DF1ACA0
46626,9789020963441,"<aquabrowser version=""1"" time-taken=""420"">\n ...",48B05643EBF1ACA0
46631,9789038204130,"<aquabrowser version=""1"" time-taken=""467"">\n ...",9AEAE6867F1ACA0
46633,9780627023460,"<aquabrowser version=""1"" time-taken=""421"">\n ...",1EC0E59EF9F1AC3A
