This is a notebook to extract the book data related scraped from Internet Archive

In [2]:
#install the internet archive library
!pip install internetarchive

Defaulting to user installation because normal site-packages is not writeable
Collecting internetarchive
  Downloading internetarchive-5.7.1-py3-none-any.whl.metadata (5.7 kB)
Downloading internetarchive-5.7.1-py3-none-any.whl (111 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.6/111.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: internetarchive
[0mSuccessfully installed internetarchive-5.7.1


In [None]:
#make inquiry to 
from internetarchive import search_items
import pandas as pd

# core filters: texts + English
query = (
    'mediatype:(texts) AND language:(eng) '
    'AND (subject:(fiction) OR description:(novel))'
)

fields = [
    "identifier",
    "title",
    "creator",
    "date",
    "publisher",
    "subject",
    "description",
    "language",
    "collection",
    "isbn"
]

search = search_items(query, fields=fields, params={"rows": 2000})

rows = [r for r in search]
df = pd.DataFrame(rows)
df.head()


Unnamed: 0,collection,creator,date,description,identifier,isbn,language,publisher,subject,title
0,"[internetarchivebooks, printdisabled, inlibrary]","Taylor, Max",1990-01-01T00:00:00Z,"[122 pages : 19 cm, Determined to raise his sc...",shortcircuit0000tayl,"[0843127147, 9780843127140]",eng,"Los Angeles, Calif. : Price Stern Sloan","[Androids -- Fiction, Science -- Exhibitions -...",Short circuit
1,"[internetarchivebooks, inlibrary, printdisabled]","Mac a'Bháird, Natasha, author",2009-01-01T00:00:00Z,"[32 pages : 27 cm, ""There's great excitement w...",olannasbigday0000maca,"[9781847171719, 1847171710]",eng,Dublin : The O'Brien Press,"[Nigerians -- Ireland -- Juvenile fiction, Sai...",Olanna's big day
2,"[internetarchivebooks, printdisabled, inlibrar...","Ryan, John, 1921-2009",1991-01-01T00:00:00Z,43 pages : 22 cm,captainpugwashhu0000ryan,"[0951707108, 9780951707104, 0951707116, 978095...",eng,"Rye, East Sussex [England] : Gungarden Books","[Pugwash, Captain (Fictitious character) -- Ju...",Captain Pugwash and the huge reward : a tale o...
3,"[opensource, community]",Myself?,2022-02-24T00:00:00Z,My library from Goodreads,goodreads_library_export_202202,,eng,,"[Cookbooks, fiction]",goodreads_library_export
4,"[internetarchivebooks, inlibrary, printdisabled]","Freethy, Barbara, author",2016-01-01T00:00:00Z,"[383 pages (large print) ; 23 cm, ""Dr. Katheri...",lightninglingers0000free,"[9781628999204, 1628999209]",eng,"Thorndike, Maine : Center Point Large Print","[Large type books, Missing persons -- Fiction]",Lightning lingers


In [6]:
df.to_csv("ia_books_metadata.csv", index=False)

In [None]:
import numpy as np
import math

bad_collection_keywords = ["magazine", "periodical", "journal", "newspaper"]

def is_not_book(collection_value):
    # handle missing values
    if collection_value is None:
        return False
    if isinstance(collection_value, float) and math.isnan(collection_value):
        return False

    # Normalize to a list of strings
    if isinstance(collection_value, str):
        cols = [collection_value]
    elif isinstance(collection_value, (list, tuple, np.ndarray)):
        cols = list(collection_value)
    else:
        # unknown type → don't flag it as non-book
        return False

    cols_lower = [str(c).lower() for c in cols if c is not None]

    # check ONLY collection labels for “magazine”, “journal”, etc.
    return any(
        bad in c
        for c in cols_lower
        for bad in bad_collection_keywords
    )

# apply it
df["is_non_booky_collection"] = df["collection"].apply(is_not_book)
df_books_only = df[~df["is_non_booky_collection"]].copy()


In [7]:
df_books_only.head()

Unnamed: 0,collection,creator,date,description,identifier,isbn,language,publisher,subject,title,is_non_booky_collection
0,"[internetarchivebooks, printdisabled, inlibrary]","Taylor, Max",1990-01-01T00:00:00Z,"[122 pages : 19 cm, Determined to raise his sc...",shortcircuit0000tayl,"[0843127147, 9780843127140]",eng,"Los Angeles, Calif. : Price Stern Sloan","[Androids -- Fiction, Science -- Exhibitions -...",Short circuit,False
1,"[internetarchivebooks, inlibrary, printdisabled]","Mac a'Bháird, Natasha, author",2009-01-01T00:00:00Z,"[32 pages : 27 cm, ""There's great excitement w...",olannasbigday0000maca,"[9781847171719, 1847171710]",eng,Dublin : The O'Brien Press,"[Nigerians -- Ireland -- Juvenile fiction, Sai...",Olanna's big day,False
2,"[internetarchivebooks, printdisabled, inlibrar...","Ryan, John, 1921-2009",1991-01-01T00:00:00Z,43 pages : 22 cm,captainpugwashhu0000ryan,"[0951707108, 9780951707104, 0951707116, 978095...",eng,"Rye, East Sussex [England] : Gungarden Books","[Pugwash, Captain (Fictitious character) -- Ju...",Captain Pugwash and the huge reward : a tale o...,False
3,"[opensource, community]",Myself?,2022-02-24T00:00:00Z,My library from Goodreads,goodreads_library_export_202202,,eng,,"[Cookbooks, fiction]",goodreads_library_export,False
4,"[internetarchivebooks, inlibrary, printdisabled]","Freethy, Barbara, author",2016-01-01T00:00:00Z,"[383 pages (large print) ; 23 cm, ""Dr. Katheri...",lightninglingers0000free,"[9781628999204, 1628999209]",eng,"Thorndike, Maine : Center Point Large Print","[Large type books, Missing persons -- Fiction]",Lightning lingers,False


In [8]:
#dataset is large so checking the values manually is encouraged to confirm that keywords are not present within the 'collection' column
df_books_only.to_csv("ia_books_only_metadata.csv", index=False)

Next we're going to remove unecessary columns and clean the formatting in order to preare to merge

In [None]:
#first, dropping irrelevant columns
cols_to_keep = ["identifier", "title", "subject", "description", "isbn"]

df_books_clean = df_books_only[cols_to_keep].copy()

#verify the columns are dropped
df_books_clean.head()

Unnamed: 0,identifier,title,subject,description,isbn
0,shortcircuit0000tayl,Short circuit,"[Androids -- Fiction, Science -- Exhibitions -...","[122 pages : 19 cm, Determined to raise his sc...","[0843127147, 9780843127140]"
1,olannasbigday0000maca,Olanna's big day,"[Nigerians -- Ireland -- Juvenile fiction, Sai...","[32 pages : 27 cm, ""There's great excitement w...","[9781847171719, 1847171710]"
2,captainpugwashhu0000ryan,Captain Pugwash and the huge reward : a tale o...,"[Pugwash, Captain (Fictitious character) -- Ju...",43 pages : 22 cm,"[0951707108, 9780951707104, 0951707116, 978095..."
3,goodreads_library_export_202202,goodreads_library_export,"[Cookbooks, fiction]",My library from Goodreads,
4,lightninglingers0000free,Lightning lingers,"[Large type books, Missing persons -- Fiction]","[383 pages (large print) ; 23 cm, ""Dr. Katheri...","[9781628999204, 1628999209]"


In [None]:
#second, clean the formatting for the subject column
import re
import numpy as np
import pandas as pd

import re
import numpy as np
import pandas as pd

def clean_subject(subject):
    # gather everything as a list of strings
    if isinstance(subject, (list, tuple, np.ndarray)):
        raw_strings = [str(x) for x in subject if x is not None]
    elif subject is None or (isinstance(subject, float) and pd.isna(subject)):
        return []
    else:
        raw_strings = [str(subject)]

    # split each string on commas/semicolons, strip outer brackets if present
    parts = []
    for s in raw_strings:
        s = s.strip()
        if s.startswith("[") and s.endswith("]"):
            s = s[1:-1]
        parts.extend(re.split(r"[;,]", s))

    # clean each piece
    tags = []
    for piece in parts:
        t = piece.strip()
        if not t:
            continue
        t = re.sub(r"\s+", " ", t)
        tags.append(t)

    return tags

df_books_clean["subject_list"] = df_books_clean["subject"].apply(clean_subject)

In [None]:
#function to split subject headings into form and topics
def split_heading(heading: str):
    parts = [p.strip() for p in heading.split("--")]
    if not parts:
        return None, []
    form = parts[-1].lower()       # e.g. "juvenile fiction"
    topics = [p.lower() for p in parts[:-1]]  # e.g. ["crocodiles"]
    return form, topics

forms = []
topics = []
for t in df_books_clean["subject_list"].iloc[0]:  # example row
    form, tps = split_heading(t)
    if form:
        forms.append(form)
    topics.extend(tps)


In [None]:
#confirm the subject column is now a list of strings
df_books_clean["subject_list"].iloc[0], type(df_books_clean["subject_list"].iloc[0])

(['Androids -- Fiction',
  'Science -- Exhibitions -- Fiction',
  'Robots -- Fiction',
  'Androids',
  'Science -- Exhibitions'],
 list)

In [15]:
# Look at a few rows side by side
df_books_clean[["subject", "subject_list"]].head(10)

Unnamed: 0,subject,subject_list
0,"[Androids -- Fiction, Science -- Exhibitions -...","[Androids -- Fiction, Science -- Exhibitions -..."
1,"[Nigerians -- Ireland -- Juvenile fiction, Sai...","[Nigerians -- Ireland -- Juvenile fiction, Sai..."
2,"[Pugwash, Captain (Fictitious character) -- Ju...","[Pugwash, Captain (Fictitious character) -- Ju..."
3,"[Cookbooks, fiction]","[Cookbooks, fiction]"
4,"[Large type books, Missing persons -- Fiction]","[Large type books, Missing persons -- Fiction]"
5,"[Spouses -- Fiction, Husbands -- Crimes agains...","[Spouses -- Fiction, Husbands -- Crimes agains..."
6,"[Immigrants -- Juvenile fiction, Bullying -- J...","[Immigrants -- Juvenile fiction, Bullying -- J..."
7,"[Dublin (Ireland) -- Fiction, Ireland -- Dublin]","[Dublin (Ireland) -- Fiction, Ireland -- Dublin]"
8,[English language -- Alphabet -- Pictorial wor...,[English language -- Alphabet -- Pictorial wor...
9,English fiction,[English fiction]


In [16]:
df_books_clean.head(5)

Unnamed: 0,identifier,title,subject,description,isbn,subject_list
0,shortcircuit0000tayl,Short circuit,"[Androids -- Fiction, Science -- Exhibitions -...","[122 pages : 19 cm, Determined to raise his sc...","[0843127147, 9780843127140]","[Androids -- Fiction, Science -- Exhibitions -..."
1,olannasbigday0000maca,Olanna's big day,"[Nigerians -- Ireland -- Juvenile fiction, Sai...","[32 pages : 27 cm, ""There's great excitement w...","[9781847171719, 1847171710]","[Nigerians -- Ireland -- Juvenile fiction, Sai..."
2,captainpugwashhu0000ryan,Captain Pugwash and the huge reward : a tale o...,"[Pugwash, Captain (Fictitious character) -- Ju...",43 pages : 22 cm,"[0951707108, 9780951707104, 0951707116, 978095...","[Pugwash, Captain (Fictitious character) -- Ju..."
3,goodreads_library_export_202202,goodreads_library_export,"[Cookbooks, fiction]",My library from Goodreads,,"[Cookbooks, fiction]"
4,lightninglingers0000free,Lightning lingers,"[Large type books, Missing persons -- Fiction]","[383 pages (large print) ; 23 cm, ""Dr. Katheri...","[9781628999204, 1628999209]","[Large type books, Missing persons -- Fiction]"


In [None]:
#third, clean the description column to extract blurbs
import re
import numpy as np
import pandas as pd

physical_patterns = [
    r"\b\d+\s*(pages?|p\.)\b",   # "122 pages", "301 p."
    r"\bpages?\b",               # bare "pages"
    r"\bcm\b",                   # "22 cm"
    r"large print",
    r"unnumbered pages",
    r"online resource",
    r"\bvolumes?\b",
    r"\bcm\b",
]

boilerplate_patterns = [
    r"^originally published",    # "Originally published: ..."
    r"^novel$",                  # just "Novel"
    r"^ages?\s+\d",              # "Ages 5-7"
]

def looks_physical(text: str) -> bool:
    t = text.lower()
    return any(re.search(pat, t) for pat in physical_patterns)

def looks_boilerplate(text: str) -> bool:
    t = text.lower().strip()
    return any(re.search(pat, t) for pat in boilerplate_patterns)

def normalize_description(desc):
    # Handle missing
    if desc is None or (isinstance(desc, float) and pd.isna(desc)):
        return ""

    # 1) Normalize into a list of chunks
    if isinstance(desc, (list, tuple, np.ndarray)):
        chunks = [str(x).strip() for x in desc if x is not None]
    else:
        s = str(desc).strip()
        if s.startswith("[") and s.endswith("]"):
            s = s[1:-1]
        chunks = re.split(r"\s*[;,]\s*", s)

    # 2) drop physical + boilerplate chunks
    narrative_chunks = [
        c for c in chunks
        if c
        and not looks_physical(c)
        and not looks_boilerplate(c)
    ]

    # 3) if nothing narrative left, treat as "no blurb"
    if not narrative_chunks:
        return ""

    # 4) join narrative pieces into one blurb
    return " ".join(narrative_chunks).strip()

# apply
df_books_clean["blurb"] = df_books_clean["description"].apply(normalize_description)


In [None]:
#confirm changes
df_books_clean.loc[0:10, ["description", "blurb"]]

Unnamed: 0,description,blurb
0,"[122 pages : 19 cm, Determined to raise his sc...","Determined to raise his science grade, Tim Wat..."
1,"[32 pages : 27 cm, ""There's great excitement w...",
2,43 pages : 22 cm,
3,My library from Goodreads,My library from Goodreads
4,"[383 pages (large print) ; 23 cm, ""Dr. Katheri...","""Dr. Katherine Barrett turns to her former hig..."
5,"[408 pages ; 24 cm, ""A taut and absorbing thri...","""A taut and absorbing thriller about a murdere..."
6,"[1 online resource, Bullied relentlessly, Hira...","Bullied relentlessly, Hiram Goldfarb, a Jewish..."
7,"[362 pages ; 24 cm, Ellie seems to have it all...",Ellie seems to have it all - a loving and secu...
8,"[12 unnumbered pages : 15 cm, 3+, Board book]",3+ Board book
9,136p,136p


In [19]:
df_books_clean.head()

Unnamed: 0,identifier,title,subject,description,isbn,subject_list,blurb
0,shortcircuit0000tayl,Short circuit,"[Androids -- Fiction, Science -- Exhibitions -...","[122 pages : 19 cm, Determined to raise his sc...","[0843127147, 9780843127140]","[Androids -- Fiction, Science -- Exhibitions -...","Determined to raise his science grade, Tim Wat..."
1,olannasbigday0000maca,Olanna's big day,"[Nigerians -- Ireland -- Juvenile fiction, Sai...","[32 pages : 27 cm, ""There's great excitement w...","[9781847171719, 1847171710]","[Nigerians -- Ireland -- Juvenile fiction, Sai...",
2,captainpugwashhu0000ryan,Captain Pugwash and the huge reward : a tale o...,"[Pugwash, Captain (Fictitious character) -- Ju...",43 pages : 22 cm,"[0951707108, 9780951707104, 0951707116, 978095...","[Pugwash, Captain (Fictitious character) -- Ju...",
3,goodreads_library_export_202202,goodreads_library_export,"[Cookbooks, fiction]",My library from Goodreads,,"[Cookbooks, fiction]",My library from Goodreads
4,lightninglingers0000free,Lightning lingers,"[Large type books, Missing persons -- Fiction]","[383 pages (large print) ; 23 cm, ""Dr. Katheri...","[9781628999204, 1628999209]","[Large type books, Missing persons -- Fiction]","""Dr. Katherine Barrett turns to her former hig..."


In [20]:
#third, clean the formatting for the description column

#define the unecessary details/tags
physical_patterns = [
    r"\b\d+\s*(pages?|p\.)\b",   # "122 pages", "301 p."
    r"\bpages?\b",               # bare "pages"
    r"\bcm\b",                   # "22 cm"
    r"large print",
    r"unnumbered pages",
    r"online resource",
    r"\bvolumes?\b",
    r"p\.\s*\(large print\)",
]

def looks_physical(text: str) -> bool:
    t = text.lower()
    for pat in physical_patterns:
        if re.search(pat, t):
            return True
    return False


In [21]:
def normalize_description(desc):
    # Handle missing
    if desc is None or (isinstance(desc, float) and pd.isna(desc)):
        return ""

    # 1) Normalize into a list of chunks
    if isinstance(desc, (list, tuple, np.ndarray)):
        chunks = [str(x).strip() for x in desc if x is not None]
    else:
        s = str(desc).strip()
        # some descriptions have brackets but aren't real lists
        if s.startswith("[") and s.endswith("]"):
            s = s[1:-1]
        # split on commas or semicolons
        chunks = re.split(r"\s*[;,]\s*", s)

    # 2) Drop physical-description chunks
    narrative_chunks = [c for c in chunks if c and not looks_physical(c)]

    # 3) If nothing narrative left, fall back to original (joined)
    if not narrative_chunks:
        return " ".join(chunks).strip()

    # 4) Join narrative pieces into one blurb
    return " ".join(narrative_chunks).strip()

# Apply to your dataframe
df_books_clean["blurb"] = df_books_clean["description"].apply(normalize_description)


In [22]:
df_books_clean[["description", "blurb"]].head(10)

Unnamed: 0,description,blurb
0,"[122 pages : 19 cm, Determined to raise his sc...","Determined to raise his science grade, Tim Wat..."
1,"[32 pages : 27 cm, ""There's great excitement w...","32 pages : 27 cm ""There's great excitement whe..."
2,43 pages : 22 cm,43 pages : 22 cm
3,My library from Goodreads,My library from Goodreads
4,"[383 pages (large print) ; 23 cm, ""Dr. Katheri...","""Dr. Katherine Barrett turns to her former hig..."
5,"[408 pages ; 24 cm, ""A taut and absorbing thri...","""A taut and absorbing thriller about a murdere..."
6,"[1 online resource, Bullied relentlessly, Hira...","Bullied relentlessly, Hiram Goldfarb, a Jewish..."
7,"[362 pages ; 24 cm, Ellie seems to have it all...",Ellie seems to have it all - a loving and secu...
8,"[12 unnumbered pages : 15 cm, 3+, Board book]",3+ Board book
9,136p,136p


In [23]:
#second round cleaning of blurb column to get rid of leftover formatting
import re
import numpy as np

def clean_blurb(text):
    if pd.isna(text):
        return np.nan
    
    s = str(text).strip()
    if not s:
        return np.nan

    # normalize whitespace
    s = s.replace("\n", " ")
    s = re.sub(r"\s+", " ", s)

    # remove leading/trailing quotes
    s = re.sub(r"^['\"]+|['\"]+$", "", s)

    # 1) very short lines that are basically just page counts / size
    if len(s) < 40 and re.search(r"\bp\.|\bpages?\b|\bcm\b", s.lower()):
        return np.nan

    # 2) trim some common catalog-y tails
    s = re.sub(r"Includes bibliographical references.*$", "", s, flags=re.I)
    s = re.sub(r"--\s*back cover.*$", "", s, flags=re.I)

    s = s.strip(" ;,.-")
    return s if s else np.nan

df_books_clean["blurb"] = df_books_clean["blurb"].apply(clean_blurb)


In [31]:
#remove entries where isbn=NAN
import numpy as np
import pandas as pd

def remove_missing_isbn(df, col="isbn"):
    """
    Return a copy of df with rows removed where the ISBN column
    is NaN or an empty string.
    """
    # keep rows where isbn is not NaN and not just ""
    mask = df[col].notna() & (df[col].astype(str).str.strip() != "")
    return df[mask].copy()

df_books_clean = remove_missing_isbn(df_books_clean)

In [None]:
#confirm empty isbn rows are removed
df_books_clean.head()

Unnamed: 0,identifier,title,subject,description,isbn,subject_list,blurb
0,shortcircuit0000tayl,Short circuit,"[Androids -- Fiction, Science -- Exhibitions -...","[122 pages : 19 cm, Determined to raise his sc...","[0843127147, 9780843127140]","[Androids -- Fiction, Science -- Exhibitions -...","Determined to raise his science grade, Tim Wat..."
1,olannasbigday0000maca,Olanna's big day,"[Nigerians -- Ireland -- Juvenile fiction, Sai...","[32 pages : 27 cm, ""There's great excitement w...","[9781847171719, 1847171710]","[Nigerians -- Ireland -- Juvenile fiction, Sai...","32 pages : 27 cm ""There's great excitement whe..."
2,captainpugwashhu0000ryan,Captain Pugwash and the huge reward : a tale o...,"[Pugwash, Captain (Fictitious character) -- Ju...",43 pages : 22 cm,"[0951707108, 9780951707104, 0951707116, 978095...","[Pugwash, Captain (Fictitious character) -- Ju...",
4,lightninglingers0000free,Lightning lingers,"[Large type books, Missing persons -- Fiction]","[383 pages (large print) ; 23 cm, ""Dr. Katheri...","[9781628999204, 1628999209]","[Large type books, Missing persons -- Fiction]",Dr. Katherine Barrett turns to her former high...
5,lovinghusbandnov0000kent,The loving husband : a novel,"[Spouses -- Fiction, Husbands -- Crimes agains...","[408 pages ; 24 cm, ""A taut and absorbing thri...","[9780374194123, 0374194122]","[Spouses -- Fiction, Husbands -- Crimes agains...",A taut and absorbing thriller about a murdered...


In [51]:
#fourth, choose a prinary isbn
import re
import pandas as pd

def normalize_isbn(isbn):
    if pd.isna(isbn):
        return pd.NA
    s = str(isbn)
    # remove anything that's not a digit or X (for ISBN-10)
    s = re.sub(r"[^0-9Xx]", "", s)
    s = s.upper()
    return s or pd.NA

def make_isbn_list(cell):
    """
    Take a raw isbn cell (possibly:
      - a Python list like [0843..., 9780...]
      - or a string like "0843..., 9780..."
      - or NaN),
    and return a list of normalized ISBN strings (10 or 13 digits).
    """
    # case 1: already a list from your dataframe
    if isinstance(cell, list):
        parts = cell

    # case 2: missing
    elif pd.isna(cell):
        return []

    # case 3: a string / other scalar
    else:
        parts = str(cell).split(",")

    out = []
    for p in parts:
        nx = normalize_isbn(p)
        if pd.notna(nx):
            out.append(nx)
    return out


def choose_primary_isbn(isbn_list):
    """
    Prefer an ISBN-13 if present, otherwise fall back to the first one.
    """
    if not isbn_list:
        return pd.NA
    # 1) prefer 13-digit ISBNs
    for s in isbn_list:
        if len(s) == 13:
            return s
    # 2) if no 13-digit, just use the first one (likely ISBN-10)
    return isbn_list[0]

# apply to your dataframe
df_blurbs = df_books_clean.copy()

df_blurbs["isbn_list"] = df_blurbs["isbn"].apply(make_isbn_list)
df_blurbs["primary_isbn"] = df_blurbs["isbn_list"].apply(choose_primary_isbn)



In [53]:
#confirm formatting
df_blurbs[["primary_isbn", "title", "blurb", "subject_list"]].head(10)

Unnamed: 0,primary_isbn,title,blurb,subject_list
0,9780843127140,Short circuit,"Determined to raise his science grade, Tim Wat...","[Androids -- Fiction, Science -- Exhibitions -..."
1,9781847171719,Olanna's big day,"32 pages : 27 cm ""There's great excitement whe...","[Nigerians -- Ireland -- Juvenile fiction, Sai..."
2,9780951707104,Captain Pugwash and the huge reward : a tale o...,,"[Pugwash, Captain (Fictitious character) -- Ju..."
4,9781628999204,Lightning lingers,Dr. Katherine Barrett turns to her former high...,"[Large type books, Missing persons -- Fiction]"
5,9780374194123,The loving husband : a novel,A taut and absorbing thriller about a murdered...,"[Spouses -- Fiction, Husbands -- Crimes agains..."
6,9781467724128,Lightning's run,"Bullied relentlessly, Hiram Goldfarb, a Jewish...","[Immigrants -- Juvenile fiction, Bullying -- J..."
7,9781444743661,The letter,Ellie seems to have it all - a loving and secu...,"[Dublin (Ireland) -- Fiction, Ireland -- Dublin]"
8,9781849589604,Letters,3+ Board book,[English language -- Alphabet -- Pictorial wor...
9,950620114,Journal of a coffin dodger,136p,[English fiction]
10,1590580230,The laughing hangman,,"[Bracewell, Nicholas (Fictitious character) --..."


In [64]:
#create the sheet of the final cleaned dataset
cols_to_keep = ["primary_isbn", "title", "subject_list", "blurb"]
df_books_final = df_blurbs[cols_to_keep].copy()
df_books_final.head()

Unnamed: 0,primary_isbn,title,subject_list,blurb
0,9780843127140,Short circuit,"[Androids -- Fiction, Science -- Exhibitions -...","Determined to raise his science grade, Tim Wat..."
1,9781847171719,Olanna's big day,"[Nigerians -- Ireland -- Juvenile fiction, Sai...","32 pages : 27 cm ""There's great excitement whe..."
2,9780951707104,Captain Pugwash and the huge reward : a tale o...,"[Pugwash, Captain (Fictitious character) -- Ju...",
4,9781628999204,Lightning lingers,"[Large type books, Missing persons -- Fiction]",Dr. Katherine Barrett turns to her former high...
5,9780374194123,The loving husband : a novel,"[Spouses -- Fiction, Husbands -- Crimes agains...",A taut and absorbing thriller about a murdered...


In [65]:
df_books_final.to_csv("ia_clean_dataset.csv", index=False)