In [1]:
import pandas as pd
import time
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import os



## Load CSV File

In [None]:
# Load the books CSV; the file is decoded as latin-1.
df = pd.read_csv(
    "../data/RC_books.csv",
    encoding="latin-1",
)

FileNotFoundError: [Errno 2] No such file or directory: '../RC_books.csv'

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36358 entries, 0 to 36357
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Acc. Date           36179 non-null  object 
 1   Acc. No.            36358 non-null  int64  
 2   Title               36358 non-null  object 
 3   ISBN                36358 non-null  object 
 4   Author/Editor       36295 non-null  object 
 5   Ed./Vol.            5779 non-null   object 
 6   Place & Publisher   36358 non-null  object 
 7   Year                36188 non-null  float64
 8   Page(s)             36357 non-null  object 
 9   Class No./Book No.  36358 non-null  object 
 10  Unnamed: 10         23 non-null     object 
 11  Unnamed: 11         20 non-null     object 
 12  Unnamed: 12         13 non-null     object 
 13  Unnamed: 13         13 non-null     object 
 14  Unnamed: 14         11 non-null     object 
 15  Unnamed: 15         10 non-null     object 
 16  Unna

## Using Google API to fetch the description

In [None]:


# =========================
# CONFIG
# =========================
INPUT_CSV = "../RC_books.csv"        # adjust if needed
OUTPUT_CSV = "books_with_descriptions_safe.csv"
ISBN_COL = "ISBN"              # MUST exist
SLEEP_TIME = 0.2                     # seconds to pause between requests
MAX_RETRIES = 3                      # attempts per ISBN before giving up
SAVE_EVERY = 100                     # save progress every 100 rows
MAX_ROWS = 10                        # only process the first N rows; None = whole file

# =========================
# LOAD DATA (encoding-safe)
# =========================
df = pd.read_csv(
    INPUT_CSV,
    encoding="latin1",
    low_memory=False
)

# Fail early with a clear message rather than a bare KeyError inside the loop.
if ISBN_COL not in df.columns:
    raise KeyError(f"Column {ISBN_COL!r} not found in {INPUT_CSV}")

# Slicing with [:None] keeps every row, so MAX_ROWS = None means "full run".
df = df[:MAX_ROWS].copy()
# Ensure column exists
if "book_description" not in df.columns:
    df["book_description"] = None

# =========================
# GOOGLE BOOKS FETCH
# =========================
def fetch_description(isbn):
    """Look up a book description on Google Books by ISBN.

    Parameters
    ----------
    isbn : scalar
        ISBN to search for. Values that ``pd.isna`` reports as missing and
        blank/whitespace-only strings are skipped without a network call.

    Returns
    -------
    str or None
        The ``description`` field of the first returned volume, or ``None``
        when the ISBN is missing/blank, the response has no ``items``, the
        volume has no description, or every attempt fails.

    Notes
    -----
    Makes up to ``MAX_RETRIES`` attempts and pauses ``SLEEP_TIME`` seconds
    between them. This is a best-effort lookup: request and parsing errors
    are swallowed and simply count as a failed attempt.
    """
    if pd.isna(isbn):
        return None

    isbn = str(isbn).strip()
    if not isbn:
        # Nothing to search for; avoid a pointless request.
        return None

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            # Pass the query via `params` so requests URL-encodes the ISBN
            # (stray spaces, '&', '#', ... would otherwise corrupt the query).
            r = requests.get(
                "https://www.googleapis.com/books/v1/volumes",
                params={"q": f"isbn:{isbn}"},
                timeout=5,
            )
            if r.status_code == 200:
                data = r.json()
                if "items" in data:
                    return data["items"][0]["volumeInfo"].get("description")
        except Exception:
            # Deliberately best-effort: treat any failure as a missed attempt.
            pass

        # Only pause when another attempt follows.
        if attempt < MAX_RETRIES:
            time.sleep(SLEEP_TIME)

    return None

# =========================
# MAIN LOOP (SEQUENTIAL)
# =========================
start_time = time.time()

for idx, isbn in enumerate(df[ISBN_COL]):
    # Only rows without a description are fetched, so a re-run resumes
    # where the previous one stopped.
    if pd.isna(df.at[idx, "book_description"]):
        df.at[idx, "book_description"] = fetch_description(isbn)

        # Pause before the next request.
        time.sleep(SLEEP_TIME)

        # Checkpoint to disk whenever the row number hits a multiple of SAVE_EVERY.
        rows_done = idx + 1
        if rows_done % SAVE_EVERY == 0:
            df.to_csv(OUTPUT_CSV, index=False)
            print(f"Saved progress at row {rows_done}")

# Write the final state regardless of where the last checkpoint fell.
df.to_csv(OUTPUT_CSV, index=False)

elapsed = (time.time() - start_time) / 3600  # seconds -> hours
print(f"Completed safely in {elapsed:.2f} hours ✅")


Saved progress at row 100
Saved progress at row 200
Saved progress at row 300
Saved progress at row 400
Saved progress at row 500
Saved progress at row 600
Saved progress at row 700
Saved progress at row 800
Saved progress at row 900
Saved progress at row 1000
Saved progress at row 1100
Saved progress at row 1200
Saved progress at row 1300
Saved progress at row 1400
Saved progress at row 1500
Saved progress at row 1600
Saved progress at row 1700
Saved progress at row 1800
Saved progress at row 1900
Saved progress at row 2000
Saved progress at row 2100
Saved progress at row 2200
Saved progress at row 2300
Saved progress at row 2400
Saved progress at row 2500
Saved progress at row 2600
Saved progress at row 2700
Saved progress at row 2800
Saved progress at row 2900
Saved progress at row 3000
Saved progress at row 3100
Saved progress at row 3200
Saved progress at row 3300
Saved progress at row 3400
Saved progress at row 3500
Saved progress at row 3600
Saved progress at row 3700
Saved prog

## Using OpenLibrary API to fetch the Description

In [None]:
# Input is the file written by the first Google Books pass; results go to a new file.
INPUT_CSV = "books_with_descriptions_safe.csv"
OUTPUT_CSV = "books_with_descriptions_full.csv"
ISBN_COL = "ISBN"

# INPUT_CSV was produced by DataFrame.to_csv, which writes UTF-8 by default.
# Reading it back as latin1 would turn every non-ASCII character into mojibake.
df = pd.read_csv(INPUT_CSV, encoding="utf-8", low_memory=False)

def fetch_openlibrary(isbn):
    """Look up descriptive text for a book on Open Library by ISBN.

    Parameters
    ----------
    isbn : scalar
        ISBN to look up. Values that ``pd.isna`` reports as missing and
        blank/whitespace-only strings are skipped without a network call.

    Returns
    -------
    object or None
        The record's ``notes`` field if truthy, otherwise its ``subtitle``
        field; ``None`` when the ISBN is missing/blank, the response has no
        entry for it, neither field is present, or the request fails.

    Notes
    -----
    Single attempt, best-effort: request and parsing errors are swallowed.
    """
    if pd.isna(isbn):
        return None

    isbn = str(isbn).strip()
    if not isbn:
        # Nothing to look up; avoid a pointless request.
        return None

    key = f"ISBN:{isbn}"
    try:
        # Pass the query via `params` so requests URL-encodes the ISBN.
        r = requests.get(
            "https://openlibrary.org/api/books",
            params={"bibkeys": key, "format": "json", "jscmd": "data"},
            timeout=5,
        )
        data = r.json()

        if key in data:
            return data[key].get("notes") or data[key].get("subtitle")
    except Exception:
        # Best-effort lookup. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit stop the long-running loop.
        pass
    return None

# `count` tracks how many lookups were attempted (whether or not they returned text).
count = 0
for idx, row in df.iterrows():
    # Rows that already have a description are left untouched.
    if not pd.isna(row["book_description"]):
        continue

    df.at[idx, "book_description"] = fetch_openlibrary(row[ISBN_COL])
    count += 1
    time.sleep(0.2)

    # Checkpoint to disk after every 200 lookups.
    if count % 200 != 0:
        continue
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"Filled {count} rows from Open Library")

# Final write so the last partial batch is persisted too.
df.to_csv(OUTPUT_CSV, index=False)
print("Open Library fallback completed ✅")


Filled 200 rows from Open Library
Filled 400 rows from Open Library
Filled 600 rows from Open Library
Filled 800 rows from Open Library
Filled 1000 rows from Open Library
Filled 1200 rows from Open Library
Filled 1400 rows from Open Library
Filled 1600 rows from Open Library
Filled 1800 rows from Open Library
Filled 2000 rows from Open Library
Filled 2200 rows from Open Library
Filled 2400 rows from Open Library
Filled 2600 rows from Open Library
Filled 2800 rows from Open Library
Filled 3000 rows from Open Library
Filled 3200 rows from Open Library
Filled 3400 rows from Open Library
Filled 3600 rows from Open Library
Filled 3800 rows from Open Library
Filled 4000 rows from Open Library
Filled 4200 rows from Open Library
Filled 4400 rows from Open Library
Filled 4600 rows from Open Library
Filled 4800 rows from Open Library
Filled 5000 rows from Open Library
Filled 5200 rows from Open Library
Filled 5400 rows from Open Library
Filled 5600 rows from Open Library
Filled 5800 rows from Op

## Using the Google Books API again to fetch the remaining descriptions

In [None]:

# =========================
# CONFIG
# =========================
INPUT_CSV = "books_with_descriptions_full.csv"
OUTPUT_CSV = "books_google_books.csv"
ISBN_COL = "ISBN"

SLEEP_TIME = 0.2      # seconds to pause between requests
MAX_RETRIES = 3       # attempts per ISBN before giving up
SAVE_EVERY = 200      # checkpoint interval, in rows

# =========================
# LOAD DATA
# =========================
# INPUT_CSV was produced by DataFrame.to_csv, which writes UTF-8 by default.
# Reading it back as latin1 would turn every non-ASCII character into mojibake.
df = pd.read_csv(INPUT_CSV, encoding="utf-8", low_memory=False)

if "book_description" not in df.columns:
    df["book_description"] = None

# Create the provenance column up front so the output schema does not depend
# on whether any row happens to get filled.
if "description_source" not in df.columns:
    df["description_source"] = None

# =========================
# GOOGLE BOOKS FETCH
# =========================
def fetch_google_books(isbn):
    """Look up a book description on Google Books by ISBN.

    Parameters
    ----------
    isbn : scalar
        ISBN to search for. Values that ``pd.isna`` reports as missing and
        blank/whitespace-only strings are skipped without a network call.

    Returns
    -------
    str or None
        The ``description`` field of the first returned volume, or ``None``
        when the ISBN is missing/blank, the response has no ``items``, the
        volume has no description, or every attempt fails.

    Notes
    -----
    Makes up to ``MAX_RETRIES`` attempts and pauses ``SLEEP_TIME`` seconds
    between them. This is a best-effort lookup: request and parsing errors
    are swallowed and simply count as a failed attempt.
    """
    if pd.isna(isbn):
        return None

    isbn = str(isbn).strip()
    if not isbn:
        # Nothing to search for; avoid a pointless request.
        return None

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            # Pass the query via `params` so requests URL-encodes the ISBN.
            r = requests.get(
                "https://www.googleapis.com/books/v1/volumes",
                params={"q": f"isbn:{isbn}"},
                timeout=5,
            )
            if r.status_code == 200:
                data = r.json()
                if "items" in data:
                    return data["items"][0]["volumeInfo"].get("description")
        except Exception:
            # Best-effort lookup. Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit stop the long-running loop.
            pass

        # Only pause when another attempt follows.
        if attempt < MAX_RETRIES:
            time.sleep(SLEEP_TIME)

    return None

# =========================
# MAIN LOOP
# =========================
for idx, isbn in enumerate(df[ISBN_COL]):
    # Only rows still lacking a description are looked up.
    if pd.isna(df.at[idx, "book_description"]):
        desc = fetch_google_books(isbn)
        # Record the text and where it came from only when something usable came back.
        if desc:
            df.at[idx, "book_description"] = desc
            df.at[idx, "description_source"] = "google_books"

        time.sleep(SLEEP_TIME)

        # Checkpoint to disk whenever the row number hits a multiple of SAVE_EVERY.
        rows_done = idx + 1
        if rows_done % SAVE_EVERY == 0:
            df.to_csv(OUTPUT_CSV, index=False)
            print(f"Saved at row {rows_done}")

# Final write so rows after the last checkpoint are persisted too.
df.to_csv(OUTPUT_CSV, index=False)
print("Google Books step completed ✅")


Saved at row 200
Saved at row 800
Saved at row 1400
Saved at row 1800
Saved at row 2000
Saved at row 4400
Saved at row 5200
Saved at row 5400
Saved at row 5800
Saved at row 6400
Saved at row 6800
Saved at row 7000
Saved at row 7200
Saved at row 7400
Saved at row 7800
Saved at row 8400
Saved at row 9400
Saved at row 10000
Saved at row 10400
Saved at row 10800
Saved at row 11200
Saved at row 12000
Saved at row 12200
Saved at row 12800
Saved at row 13000
Saved at row 13800
Saved at row 14000
Saved at row 14400
Saved at row 14600
Saved at row 15000
Saved at row 15200
Saved at row 15600
Saved at row 16000
Saved at row 16800
Saved at row 17200
Saved at row 17400
Saved at row 17600
Saved at row 17800
Saved at row 18400
Saved at row 18800
Saved at row 19000
Saved at row 19200
Saved at row 20000
Saved at row 20200
Saved at row 20400
Saved at row 21400
Saved at row 21600
Saved at row 21800
Saved at row 22400
Saved at row 22800
Saved at row 23000
Saved at row 24400
Saved at row 24600
Saved at row

# * Ingestion Step is completed. *
## Sources : Open Library API, Google Books API
#
## Workflow : We loaded the raw book data, but the description was missing from the given file, so we used the Open Library API and the Google Books API to fetch each book's description using its unique ISBN.

## Next Goal : Transformation