In [151]:
import pandas as pd

# Creating a work-id

## Import data

In [153]:
# All FRABL keys gathered through the API search of the Cultuurconnect database.
# The first CSV column becomes the index and ISBNs are kept as strings.
all_frabls = pd.read_csv(
    "cleaning/data/all_frabls.csv",
    index_col=0,
    dtype={"isbn": "str"},
)

all_frabls.shape

(46642, 2)

In [154]:
# Availability information, used to filter out FRABL keys that represent no holdings:
# keep only rows held by at least one library and only the frabl / library_count columns.
holdings = (
    pd.read_csv(
        "cleaning/data/cultuurconnect_holding_information.csv",
        index_col=0,
        dtype={"isbn": str},
    )
    .query("library_count > 0")
    .filter(items=["frabl", "library_count"])
)

holdings.shape

(8376, 2)

In [155]:
# Titles flagged as irregular by a title check (56 rows)
irregular_titles = pd.read_csv(
    "data/irregularities/title_check.csv",
    dtype={"isbn": "str"},
)
irregular_titles.shape

(56, 1)

In [156]:
# Manually matched similar titles that should be grouped together.
# NOTE(review): the original note mentions 47 titles while the loaded frame has 48 rows — confirm.
title_matches = pd.read_csv(
    "data validation/data/manual_matches.csv",
    index_col=0,
    dtype={"isbn": str, "work_id": str, "index_number": str},
)
title_matches.shape

(48, 8)

# Create work-id

In [157]:
# Work on a copy that keeps only complete rows (any row with a missing value is dropped)
df = all_frabls.dropna().copy()

In [158]:
# Placeholder column for the work id; the assignment loop is meant to fill it in
df["work_id"] = None

In [159]:
# Give every row of df a work id. A row reuses the id of an earlier row when it
# shares one of its FRABL keys or its ISBN with that row; otherwise it gets the
# next unused id. Ids are handed out in row order, starting at 1.
#
# The ids are collected in a list and written to df in one assignment after the
# loop: writing to the `row` object yielded by DataFrame.iterrows() is not
# guaranteed to reach the frame (the row can be a copy).
#
# NOTE(review): when a row's ISBN is already registered under one work and one of
# its FRABL keys under another, the two works are NOT merged; the row simply takes
# the FRABL's work id. Confirm that this is intended.
works_by_isbn = {}
works_by_frabl = {}
last_work_id = 1
assigned_work_ids = []

for isbn, frabl_field in zip(df["isbn"], df["frabl"]):
    frabls = frabl_field.split(";")

    # Start from the ISBN match (None if unseen); a FRABL match takes precedence
    work_ID = works_by_isbn.get(isbn)
    for frabl in frabls:
        if frabl in works_by_frabl:
            work_ID = works_by_frabl[frabl]
            break

    # Neither the ISBN nor any FRABL key was seen before: open a new work
    if work_ID is None:
        work_ID = last_work_id
        last_work_id += 1

    # Register keys that are not yet known; existing registrations are kept
    works_by_isbn.setdefault(isbn, work_ID)
    for frabl in frabls:
        works_by_frabl.setdefault(frabl, work_ID)

    assigned_work_ids.append(work_ID)

df["work_id"] = assigned_work_ids

In [10]:
# Save the row-level frame (df, including its work_id column) to disk
df.to_csv("cleaning/data/work_id_isbn_20221216.csv")

In [11]:
# Collapse to one row per work: the frabl strings and the ISBNs of all rows
# belonging to a work are each joined with ";"
df_grouped = (
    df
    .groupby("work_id")
    .agg({"frabl": ";".join, "isbn": ";".join})
)

In [12]:
# Save the one-row-per-work table (work_id is the index and becomes the first CSV column)
df_grouped.to_csv("cleaning/data/work_id_20221216.csv")

## Group the title matches

In [160]:
# Reload the one-row-per-work table from disk. Column 1 of the file (frabl) is used as
# the index; work_id is read as a string, matching the string work_id in title_matches.
work_id = pd.read_csv("cleaning/data/work_id_20221216.csv", index_col=1, dtype={"work_id":str})

In [161]:
# One row per manual match group: the ISBNs and work ids of all titles in the
# group are each joined with ";"
title_matches_grouped = (
    title_matches
    .filter(items=["isbn", "match", "work_id"])
    .groupby(["match"])
    .agg(";".join)
)

In [162]:
# Number the match groups consecutively, starting at 9000.
# NOTE(review): assumes the existing work ids stay below 9000 (the highest one in the
# displayed output is 8705) — verify whenever the input data changes.
title_matches_grouped["new_work_id"] = list(range(9000, 9000 + len(title_matches_grouped)))

In [163]:
# Lookup table from an existing work id to the new id of its match group:
# the ";"-joined work ids of each group are split and exploded to one row per work id.
# Titles of one group can share the same work id, which would yield identical
# (work_id, new_work_id) rows; these are dropped so that a later merge on work_id
# does not multiply the rows of that work.
new_work_id = (
    title_matches_grouped
    .filter(items=["work_id", "new_work_id"])
    .assign(
        new_work_id=lambda t: t["new_work_id"].astype(str),
        work_id=lambda t: t["work_id"].str.split(";"),
    )
    .explode("work_id")
    .drop_duplicates()
)

In [164]:
# Turn the index back into a regular column
work_id = work_id.reset_index()

In [165]:
# Attach the new group id to every work that has one; works without a match keep
# all their rows (left merge) and get "" instead of a missing value
work_id_merged = pd.merge(work_id, new_work_id, on="work_id", how="left").fillna("")

In [166]:
# Inspect the merged frame; new_work_id is "" for works without a manual title match
work_id_merged

Unnamed: 0,frabl,work_id,isbn,new_work_id
0,1B9BDF18AF1ACA0,1,9789055445585,
1,1A0D8B40D9F1ACA0,2,9789030171836,
2,3B24393C81F1ACA0,3,9789030175308,
3,492B6CB854F1ACA0,4,9789033446573,
4,337B1BCD3F1ACA0,5,9789062158126,
...,...,...,...,...
8701,2E276BDA7DF1ACA0,8702,9789054666028,
8702,48B05643EBF1ACA0,8703,9789020963441,
8703,9AEAE6867F1ACA0,8704,9789038204130,
8704,1EC0E59EF9F1AC3A,8705,9780627023460,


In [167]:
# Works that belong to a manual match group take over the group's new id.
# The assignment goes through .loc with a boolean mask so that it is applied to the
# frame itself; chained indexing (frame.column.iloc[i] = ...) assigns via an
# intermediate Series and is not guaranteed to write back.
has_new_id = work_id_merged["new_work_id"] != ""
work_id_merged.loc[has_new_id, "work_id"] = work_id_merged.loc[has_new_id, "new_work_id"]

In [168]:
# Spot-check the first three rows that carry a new group id
work_id_merged[work_id_merged.new_work_id != ""].head(3)

Unnamed: 0,frabl,work_id,isbn,new_work_id
877,6A6328F4EDF1ACA0,9000,9789021453385,9000
1303,655B209712F1ACA0,9000,9789029563895,9000
1480,13D6C89FDCF1ACA0,9011,9782804428341,9011


In [169]:
# Collapse rows that now share a work id into one row, joining their ISBNs and
# FRABL keys with ";"
work_id_merged = (
    work_id_merged
    .filter(items=["work_id", "isbn", "frabl"])
    .groupby("work_id")
    .agg(";".join)
    .reset_index()
)

In [170]:
# Number of works (rows) after regrouping on the updated work ids
work_id_merged.shape

(8680, 3)

## Filter out irregular titles

In [None]:
# TODO: what is the difference with the following step?
# It goes further while IGNORING the grouping --> to fix

In [171]:
# Continue on a copy so that work_id_merged itself is left unchanged
work_id_irr = work_id_merged.copy()

In [172]:
def _unique_joined(values):
    """Join ';'-separated strings and drop repeated keys, keeping first-seen order.

    A dict is used instead of a set so that the order of the keys is the same on
    every run (set iteration order of strings differs between Python processes).
    """
    return ";".join(dict.fromkeys(";".join(values).split(";")))


# Remove the ISBNs listed in irregular_titles: split the joined ISBNs to one row per
# ISBN, drop the irregular ones, then rebuild one row per work with de-duplicated,
# ";"-joined isbn and frabl strings. Works whose ISBNs are all irregular disappear.
work_id_irr = (
    work_id_irr
    .assign(isbn=work_id_irr.isbn.str.split(";"))
    .explode("isbn")
    .query("isbn not in @irregular_titles.isbn")
    .groupby("work_id")
    .agg(_unique_joined)
)


In [173]:
# Inspect the result; work_id is now the index (it is a string, hence the 1, 10, 100, ... ordering)
work_id_irr

Unnamed: 0_level_0,isbn,frabl
work_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,9789055445585,1B9BDF18AF1ACA0
10,9782804414054;9782804414450;9782804430597,61EF4E53C9F1ACA0
100,9789085060499,355738819AF0C0A0;355738819AF1ACA0
1000,9789044115390,5B8A7026E3F1AC31
1001,9789076704722,37E7373D40F1ACA0
...,...,...
995,9789029073233,39DD244F85F1ACA0
996,9789038206349,B07B1BD27F1ACA0
997,9789045549453;9789045536477;9789045508283,1515FBA2BCF1ACA0
998,9789033455407,56F0277FA9F1ACA0


## Select only the FRABLS that have a holding

In [174]:
# selecting the FRABL keys that have a holding in Cultuurconnect (meaning that they are held in at least one library)
frabls = all_frabls[all_frabls.frabl.notna()]
frabls = (frabls.assign(frabl = frabls.frabl.str.split(";"))
          .explode("frabl"))
          
frabls_with_holding = frabls.query("frabl in @holdings.frabl")
frabls_with_holding.shape

(8962, 2)

In [175]:
# Quite a few of those FRABLs appear more than once
frabls_with_holding[frabls_with_holding.frabl.duplicated()].shape[0]

586

In [176]:
# Also the ISBN keys appear more than once
frabls_with_holding[frabls_with_holding.isbn.duplicated()].shape[0]

287

In [177]:
# Change work_id_irr so it can be used to take out the FRABLs without a holding.
# Maybe better to literally use the sum of holdings, because a work_id may be linked to more than one FRABL,
# and one of these may have a holding while the others do not.

In [178]:
# Start the holdings selection from a separate copy of the cleaned work table
work_id2 = work_id_irr.copy(deep=True)
work_id2.shape

(8629, 2)

In [179]:
# split frabl and explode on frabl
work_id2 = (work_id2.assign(frabl = work_id2.frabl.str.split(";"))
            .explode("frabl"))
work_id2.shape

(9249, 2)

In [180]:
# Turn the index back into a regular column
work_id2 = work_id2.reset_index()

In [184]:
# merge with holding
works_holdings = work_id2.merge(holdings, on="frabl", how="left")

In [186]:
# calculate holding per work_id
works_holdings = (works_holdings
                  .groupby("work_id")
                  .agg({"library_count": lambda x: sum(x),
                       "frabl": lambda x: ";".join(x),
                       "isbn": lambda x: ";".join(x)})
                  .reset_index())

In [187]:
# Treat a missing library count as zero holdings
works_holdings = works_holdings.fillna({"library_count": 0})

In [188]:
# remove work_id without any holdings
works_holdings = works_holdings[works_holdings.library_count > 0]

In [189]:
# remove library count column
works_holdings.drop(columns="library_count", inplace=True)

In [191]:
# export the new work_id
works_holdings.to_csv("cleaning/data/work_id_20221219.csv")