In [2]:
import pandas as pd
import numpy as np
books=pd.read_csv("../data/books_cleaned.csv")

In [9]:
pd.set_option('display.max_rows', 20)
books["categories"].value_counts().reset_index().head(20)

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
5,Religion,117
6,Philosophy,117
7,Comics & Graphic Novels,116
8,Drama,86
9,Juvenile Nonfiction,57


In [None]:
category_mapping = {
    'Fiction': 'Fiction',
    'Juvenile Fiction': 'Children\'s Fiction',
    'Children\'s Fiction': 'Children\'s Fiction',
    'Biography & Autobiography': 'Biography',
    'Autobiography': 'Biography',
    'History': 'History',
    'Literary Criticism': 'Literary Studies',
    'Philosophy': 'Philosophy',
    'Religion': 'Religion',
    'Comics & Graphic Novels': 'Comics/Graphic Novels',
    'Comics': 'Comics/Graphic Novels',
    'Graphic Novels': 'Comics/Graphic Novels',
    'Drama': 'Drama',
    'Juvenile Nonfiction': 'Children\'s Nonfiction',
    'Children\'s Nonfiction': 'Children\'s Nonfiction',
    'Science': 'Science',
    'Science & Nature': 'Science',
    'Poetry': 'Poetry',
    'Health & Fitness': 'Health',
    'Self-Help': 'Self-Help',
    'Cooking': 'Cooking',
    'Education': 'Education',
    'Computers': 'Technology',
    'Technology': 'Technology',
    'Mathematics': 'Mathematics',
    'Business & Economics': 'Business',
    'Art': 'Art',
    'Music': 'Music',
    'Travel': 'Travel',
    'Social Science': 'Social Science',
    'True Crime': 'Crime',
    'Crime': 'Crime',
    'Mystery': 'Mystery',
    'Fantasy': 'Fantasy',
    'Adventure': 'Adventure',
    'Horror': 'Horror',
    'Romance': 'Romance',
}
books["mapped_category"] = books["categories"].map(category_mapping)
books[~(books["mapped_category"]).isna()]

In [15]:
from transformers import pipeline

# Step 1: Define your categories
candidate_labels = [
    "Fiction", "Nonfiction", "Children's Fiction", "Biography", "Science", "Philosophy",
    "Poetry", "History", "Technology", "Health", "Business", "Crime", "Mystery", "Fantasy",
    "Adventure", "Horror", "Romance", "Education", "Self-Help", "Comics/Graphic Novels",
    "Social Science", "Art", "Music", "Travel"
]

# Step 2: Load zero-shot classifier pipeline
pipe = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)

Device set to use cpu


In [None]:
max_index=np

In [16]:
def classify_book_category(sequence,categories):
    predictions=pipe(sequence,categories)
    max_index=np.argmax(predictions["scores"])
    max_label=predictions["labels"][max_index]
    return max_label

In [17]:
classify_book_category("A thrilling mystery novel set in a small town.", candidate_labels)

'Mystery'

In [26]:
from tqdm import tqdm
actual_cats = []
predicted_cats = []


# Optional: limit categories you want to test
categories_to_test = ['Fiction', 'Nonfiction']

# Step 5: Evaluation loop with tqdm
for category in categories_to_test:
    samples = books.loc[books["mapped_category"] == category, "description"].dropna().reset_index(drop=True)
    for i in tqdm(range(min(300, len(samples))), desc=f"Classifying {category}"):
        desc = samples[i]
        predicted = classify_book_category(desc, categories_to_test)  # Only classify into Fiction vs Nonfiction here
        predicted_cats.append(predicted)
        actual_cats.append(category)

# Step 6: Create results DataFrame and show accuracy
results_df = pd.DataFrame({
    'actual': actual_cats,
    'predicted': predicted_cats
})

accuracy = (results_df["actual"] == results_df["predicted"]).mean()
print(f"\nAccuracy: {accuracy:.2%}")

Classifying Fiction: 100%|██████████| 300/300 [08:13<00:00,  1.64s/it]
Classifying Nonfiction: 0it [00:00, ?it/s]


Accuracy: 70.33%





In [None]:
pd.set_option('display.max_rows', 20)
results_df.head(20)


In [28]:
isbns = []
predicted_cats = []

missing_cats = books.loc[books["mapped_category"].isna(), ["isbn13", "description"]].reset_index(drop=True)[:500]
for i in tqdm(range(0, len(missing_cats))):
    sequence = missing_cats["description"][i]
    predicted_cats += [classify_book_category(sequence,candidate_labels[:15])]
    isbns += [missing_cats["isbn13"][i]]



100%|██████████| 500/500 [8:42:28<00:00, 62.70s/it]      


In [29]:
missing_predicted_df = pd.DataFrame({
    "isbn13": isbns,
    "predicted_category": predicted_cats
})

In [32]:
missing_predicted_df["predicted_category"].value_counts().reset_index().head(20)

Unnamed: 0,predicted_category,count
0,Mystery,87
1,History,83
2,Adventure,81
3,Fiction,70
4,Nonfiction,44
5,Biography,27
6,Crime,21
7,Health,20
8,Science,16
9,Fantasy,14


In [30]:
books=pd.merge(books,missing_predicted_df, on="isbn13", how="left")
books["mapped_category"] = np.where(books["mapped_category"].isna(),books["predicted_category"],books["mapped_category"])

In [33]:
books=books.drop(columns=["predicted_category"])

In [None]:
books[books["title"].str.contains("Harry Potter", case=False, na=False)]


In [41]:
books.to_csv("books_classified.csv", index=False)