In [1]:
import pandas as pd
import os

In [2]:
os.chdir("C:\\Users\\EVandewalle\\Documents\\Libcitations_2")

In [3]:
import fos_classification
import vabb

## Import data

VABB

In [90]:
# VABB dataset used in the project (=VABB 11)
VABB = pd.read_csv("data/cleaning/VABB_books_isbn_level.csv", dtype={"isbn":"str"})

In [5]:
# isbn and cloi information for all VABB books
books = pd.read_csv("data/original_data/all_books.csv", index_col=0, dtype={'isbn':'str'})

In [6]:
# VABB cognitive classification from the vabb package. This is the classification on cloi-level (the identifier used in VABB)
cog_class = vabb.load_data("11-fos-classification")

OCLC

In [7]:
# original data from OCLC
OCLC = pd.read_csv("data/original_data/isxn_subjects.csv")

In [8]:
# original data subjects from OCLC
subjects = pd.read_csv("data/original_data/subjects.csv")

In [9]:
# manual mapping adapted by myself for OCLC
DDC = pd.read_excel("data/mappings_classification/DDC_FOS_EV.xlsx", dtype={"fos_code":"str"})

In [10]:
# manual mapping adapted by myself for OCLC
LCC = pd.read_excel("data/mappings_classification/classification_check/LCC_FOS.xlsx", dtype={"fos_code":"str"})

SISO

In [11]:
# original data from Cultuurconnect mapped by me to FOS
SISO = pd.read_csv("data/mappings_classification/siso_mapped.csv", index_col=0, dtype={'isbn':'str'})

## Cleaning
### Change cognitive classification VABB to ISBN level

In [101]:
books.fillna("", inplace=True)

In [159]:
VABB_class = fos_classification.merge_classification_isbn(books, cog_class)

In [160]:
VABB_class = VABB_class.rename(columns={"fos_code":"fos_dedup","original_code":"fos_code"}).copy()

In [162]:
VABB_class = VABB_class.drop(columns="fos_dedup").copy()

In [166]:
VABB_class = VABB_class.drop_duplicates().copy()

In [167]:
VABB_class.shape

(34303, 3)

In [168]:
def change_column_name(df, name):
    list_columns = df.columns
    for column in list_columns:
        if column != "isbn":
            new_name = name + "_" + column
            df = df.rename(columns ={column:new_name})
    return df

In [169]:
VABB_class = change_column_name(VABB_class, "VABB")

In [170]:
VABB_class

Unnamed: 0,isbn,VABB_proportion,VABB_fos_code
0,9780312226596,1.0,5.4
1,9789042908123,1.0,6.3.1
11,9789053509289,1.0,5.2
16,9789041112347,1.0,5.5
18,9783772027512,1.0,6.2.2
...,...,...,...
96186,9781590331002,1.0,6.2.2
96187,9783893237128,1.0,5.4
96188,9780958488099,1.0,5.9
96189,9780799337051,1.0,6.2.2


In [171]:
len(VABB_class.isbn.unique())

34303

### Clean DDC

In [18]:
#only retain codes for records with isbn in our dataset
OCLC = OCLC[OCLC.isxn.isin(VABB.isbn)]

#only retain columns interesting for further analysis
OCLC = OCLC[["isxn","source","subject"]]

#rename the isxn columns as isbn (there are now only isbn's in it)
OCLC.rename(columns={"isxn":"isbn"}, inplace=True)

DDC.fillna("", inplace=True)

#Put dots inbetween the numbers of the FOS code
DDC.fos_code = DDC.fos_code.map(lambda x: ".".join(x))

In [19]:
#merge the subjects table with the DDC_LCC table
subjects_merged = OCLC.merge(subjects, how="left", left_on="subject", right_on="id").drop(columns=["id"])

#for now, only retain rows that give a DDC subject classification
DDC_subjects = subjects_merged[subjects_merged.subject_type == "DDC"].copy()

#split the subject code because only the first 3 numbers (before the dot) are mapped to FOS codes
DDC_subjects["split_subject"] = DDC_subjects.subject_code.apply(lambda x: x.split(".")[0])

#Now merge the DDC_subjects to the FOS mapping
DDC_merged = DDC_subjects.merge(DDC, how="left", left_on="split_subject", right_on="orig_code")
DDC_merged.drop(columns=["subject_label","subject","source","orig_code", "subject_type", "split_subject","remarks","fos_description"], inplace=True)

In [20]:
def deduplicate_classification(classification):
    """Requires a dataframe with columns isbn and fos_code.
    Returns a dataframe with classifications, but each row represents one ISBN and there are no duplicate ISBNs."""
    classification = classification.fillna("")
    classification = classification.groupby('isbn').agg(lambda x: ";".join(x)).reset_index()
    
    classification.fos_code = classification.fos_code.apply(lambda x: x.split(";"))
    classification['proportion'] = classification.fos_code.apply(lambda x: 1/len(x) if len(x)>0 else 0)
    classification.fos_code = classification.fos_code.apply(lambda x: ';'.join(x))
    
    return classification

In [21]:
DDC_merged = deduplicate_classification(DDC_merged)

In [22]:
DDC_merged = change_column_name(DDC_merged, "DDC")

In [23]:
DDC_merged["DDC_fos_code"] = DDC_merged.DDC_fos_code.replace(["2.1.1"],"2.11")

In [24]:
DDC_merged[DDC_merged.DDC_fos_code == ""]

Unnamed: 0,isbn,DDC_subject_code,DDC_fos_code,DDC_proportion
414,9780198789161,050.904,,1.0
482,9780199211159,050.904,,1.0
731,9780199659586,050.904,,1.0
3838,9780993293238,040,,1.0
3910,9781107038233,031.09,,1.0
...,...,...,...,...
27001,9789401442053,030,,1.0
27158,9789460040399,E,,1.0
27970,9789501245653,E,,1.0
28052,9789612472801,032,,1.0


### Clean LCC 

In [25]:
#for now, only retain rows that give a LCC subject classification
LCC_subjects = subjects_merged[subjects_merged.subject_type == "LCC"].drop(columns=["subject_label"])

#Now merge the LCC_subjects to the FOS mapping
LCC_merged = LCC_subjects.merge(LCC, how="left",left_on="subject_code", right_on="orig_code")

LCC_merged.subject = LCC_merged.subject.astype(str)

In [26]:
LCC_merged = deduplicate_classification(LCC_merged)

In [27]:
LCC_merged = LCC_merged[["isbn","subject_code","fos_code", "proportion"]].copy()

In [28]:
LCC_merged = change_column_name(LCC_merged, "LCC")

In [29]:
LCC_merged = LCC_merged[LCC_merged.LCC_fos_code != ""]

In [30]:
LCC_merged["LCC_fos_code"] = LCC_merged["LCC_fos_code"].map(lambda x: ".".join(x))

In [31]:
LCC_merged

Unnamed: 0,isbn,LCC_subject_code,LCC_fos_code,LCC_proportion
2,9780028649511,GV709,5.9,1.0
3,9780028655949,LB15,5.3,1.0
4,9780028657042,BL240.3,6.3.2,1.0
5,9780028657806,B41,6.3.1,1.0
12,9780028658117,JA61,5.6,1.0
...,...,...,...,...
30215,9791094898321,PQ2672.E25,6.2.2,1.0
30216,9791095254010,Z116,5.8,1.0
30217,9791095457299,PQ2605.O15,6.2.2,1.0
30218,9791095546009,P98,6.2.1,1.0


## Clean SISO

In [131]:
SISO = SISO.rename(columns={"fos_code_specifieker":"SISO_fos_code"}).copy()

## For what publications is a manual check needed?

In [32]:
no_classifications = pd.read_csv("classifications/check missing/no_classification.csv", index_col=0, dtype={"isbn":str})

## Removing conference papers

In [33]:
no_classifications

Unnamed: 0,isbn,VABB_btitle,VABB_lg,VABB_pubyear
36,9780077130381,et al.;International management accounting an...,eng,2010
211,9780156073882,"European accounting guide / David, Alexander [...",eng,2001;2003
1309,9780314913746,International Privacy Guide edition:1,eng,2009
1311,9780323015738,2002 Yearbook of Sports Medicine,eng,2003
1387,9780368090547,Muscat. City of Gates,eng,2018
...,...,...,...,...
47264,9791096909018,Diversity of cultural expressions in the digit...,eng,2016
47265,9791096909025,Diversidade de expressões culturais na era dig...,por,2016
47268,9791097361099,Les esprits animaux (16e - 21e siècles) : litt...,eng,2018
47271,9791195130504,"Sea, Sea Names and Mediterranean Peace: Procee...",eng,2013


In [37]:
VABB_type = VABB[["isbn","VABB_VABB-publicatietype"]]

In [41]:
no_classifications = no_classifications.merge(VABB_type, on="isbn")

In [43]:
no_classifications = no_classifications[no_classifications["VABB_VABB-publicatietype"] != "VABB-5"]

In [46]:
no_classifications.shape

(7278, 5)

In [48]:
no_classifications.to_csv("data/mappings_classification/to_do_manual_no_conferences.csv")

## A first batch of manuals

In [72]:
manual = pd.read_excel("data/mappings_classification/Manual_complete_23-03-2022.xlsx", index_col=0, dtype={"isbn":str})

In [73]:
manual = manual[["isbn","VABB_btitle","Manual_fos_code"]]
manual.shape

(7278, 3)

In [74]:
manual = manual[manual.Manual_fos_code.notna()]

In [75]:
manual.shape

(7277, 3)

## Creating the entire classification dataset

In [53]:
DDC_merged.shape

(28451, 4)

In [54]:
LCC_merged.shape

(25090, 4)

In [146]:
len(LCC_merged.isbn.unique())

25090

In [55]:
manual.shape

(7277, 3)

In [147]:
len(manual.isbn.unique())

7277

In [56]:
VABB_class.shape

(38146, 4)

In [149]:
len(VABB_class.isbn.unique())

34303

In [57]:
SISO.shape

(8569, 3)

In [63]:
isbns = VABB[["isbn"]]

Unnamed: 0,VABB_fos_code,VABB_fos_code.1
0,False,False
1,False,False
11,False,False
16,False,False
18,False,False
...,...,...
96186,False,False
96187,False,False
96188,False,False
96189,False,False


In [172]:
VABB_fos = isbns.merge(VABB_class, how="left", on="isbn")

In [173]:
VABB_DDC_fos = VABB_fos.merge(DDC_merged, how="left", on="isbn")

In [174]:
VABB_DDC_LCC_fos = VABB_DDC_fos.merge(LCC_merged, how="left", on="isbn")

In [175]:
VABB_DDC_LCC_SISO_fos = VABB_DDC_LCC_fos.merge(SISO, how="left", on="isbn")

In [176]:
manual = manual[["isbn", "Manual_fos_code"]]

In [177]:
df = VABB_DDC_LCC_SISO_fos.merge(manual, how="left", on="isbn")

In [178]:
df = df.fillna("").copy()

In [179]:
df.columns

Index(['isbn', 'VABB_proportion', 'VABB_fos_code', 'DDC_subject_code',
       'DDC_fos_code', 'DDC_proportion', 'LCC_subject_code', 'LCC_fos_code',
       'LCC_proportion', 'siso_code', 'SISO_fos_code', 'Manual_fos_code'],
      dtype='object')

In [180]:
df["classification"] = ""
df["classification_source"] = ""

In [181]:
for i in range(len(df)):
    if df.VABB_fos_code.iloc[i] != "":
        df.classification.iloc[i]=df.VABB_fos_code.iloc[i] #first add cognitive classification
        df.classification_source.iloc[i]="Cognitive classification VABB"
    elif df.DDC_fos_code.iloc[i] != "":
        df.classification.iloc[i] = df.DDC_fos_code.iloc[i]
        df.classification_source.iloc[i] = "DDC"
    elif df.LCC_fos_code.iloc[i] != "":
        df.classification.iloc[i] = df.LCC_fos_code.iloc[i]
        df.classification_source.iloc[i]="LCC"
    elif df.SISO_fos_code.iloc[i] != "":
        df.classification.iloc[i] = df.SISO_fos_code.iloc[i]
        df.classification_source.iloc[i]="SISO"
    elif df.Manual_fos_code.iloc[i] != "":
        df.classification.iloc[i]=df.Manual_fos_code.iloc[i]
        df.classification_source.iloc[i]="Manual"

In [264]:
all_classifications = df[["isbn","classification","classification_source"]].copy()

## More missing information

In [207]:
missing = all_classifications[all_classifications.classification == ""]

In [208]:
missing = missing.merge(VABB_type, how="left", on="isbn")

In [209]:
# what publications were missed? These need to be classified manually
# earlier, the conference papers were excluded
missing = missing[missing["VABB_VABB-publicatietype"]!="VABB-5"]

In [210]:
missing = missing.drop(columns=["classification","classification_source","VABB_VABB-publicatietype"])

In [212]:
missing = missing.merge(VABB, how="left", on="isbn")

In [217]:
missing = missing.drop(columns=["VABB_jtitle","VABB_ptitle","VABB_lg", "VABB_infogpstd"])

In [218]:
missing.to_excel("data/mappings_classification/remaining_manual.xlsx")

In [260]:
manual2 = pd.read_excel("data/mappings_classification/remaining_manual20221125.xlsx", index_col=0, dtype={"isbn":"str"})

In [261]:
manual2 = manual2[["isbn","manual fos"]].copy()

## Add second batch of manual classifications

In [266]:
all_classifications = all_classifications.merge(manual2, how="left", on="isbn")

In [267]:
all_classifications = all_classifications.fillna("")

for i in range(len(all_classifications)):
    if (all_classifications.classification.iloc[i] == ""):
        all_classifications.classification.iloc[i]=all_classifications["manual fos"].iloc[i]

In [268]:
for i in range(len(all_classifications)):
    if all_classifications["manual fos"].iloc[i] != "":
        all_classifications.classification_source.iloc[i] == "Manual"

In [269]:
all_classifications.drop(columns=["manual fos"])

Unnamed: 0,isbn,classification,classification_source
0,9780000000002,1.5;2.2;1.2,Cognitive classification VABB
1,9780000001948,5.7,Cognitive classification VABB
2,9780028649511,5.9,Cognitive classification VABB
3,9780028655949,5.3,Cognitive classification VABB
4,9780028657042,6.3.2,Cognitive classification VABB
...,...,...,...
46636,9791097361099,,
46637,9791188296200,5.4,Cognitive classification VABB
46638,9791195038947,6.4,Cognitive classification VABB
46639,9791195130504,,


## Exporting the classification dataset

In [270]:
all_classifications.to_csv("data/mappings_classification/all_classifications20221125.csv")