In [1]:

import pandas as pd

# Load the book dataset from book_cleaned.csv in the working directory.
books = pd.read_csv("book_cleaned.csv")

In [2]:
# Tabulate how many books carry each raw category label, as a two-column frame.
category_counts = books["categories"].value_counts()
category_counts.reset_index()

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
...,...,...
474,Aged women,1
475,Imperialism,1
476,Human-animal relationships,1
477,Amish,1


In [3]:
# Count distinct full rows among the books whose raw category is "Juvenile Fiction".
is_juvenile_fiction = books["categories"] == "Juvenile Fiction"
books[is_juvenile_fiction].value_counts()

isbn13         isbn10      title                                                          authors                              categories        thumbnail                                                                                                        description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  published_year  average_rating  num_pages  ratings_count  title and subti

In [4]:
# Lookup from a raw category label to one of four simplified categories.
# Raw labels are grouped by the simplified category they collapse into.
category_mapping = {
    **dict.fromkeys(
        ["Fiction", "Comics & Graphic Novels", "Drama", "Poetry"],
        "Fiction",
    ),
    **dict.fromkeys(
        [
            "Biography & Autobiography",
            "History",
            "Literary Criticism",
            "Philosophy",
            "Religion",
            "Science",
        ],
        "Nonfiction",
    ),
    "Juvenile Fiction": "Children's Fiction",
    "Juvenile Nonfiction": "Children's Nonfiction",
}
# Translate each raw category through the lookup; labels absent from the
# lookup come out as missing values in the new column.
simplified = books["categories"].map(category_mapping)
books["simple_categories"] = simplified

In [5]:
# Show only the rows that received a simplified category from the lookup.
has_simple_category = books["simple_categories"].notna()
books[has_simple_category]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title and subtitle,tagged_description,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883: A NOVEL THAT READERS and critic...,Fiction
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736: A memorable, mesmerizing heroin...",Fiction
8,9780006482079,0006482074,Warhost of Vastmark,Janny Wurts,Fiction,http://books.google.com/books/content?id=uOL0f...,"Tricked once more by his wily half-brother, Ly...",1995.0,4.03,522.0,2966.0,Warhost of Vastmark,9780006482079: Tricked once more by his wily h...,Fiction
30,9780006646006,000664600X,Ocean Star Express,Mark Haddon;Peter Sutton,Juvenile Fiction,http://books.google.com/books/content?id=I2QZA...,Joe and his parents are enjoying a summer holi...,2002.0,3.50,32.0,1.0,Ocean Star Express,9780006646006: Joe and his parents are enjoyin...,Children's Fiction
46,9780007121014,0007121016,Taken at the Flood,Agatha Christie,Fiction,http://books.google.com/books/content?id=3gWlx...,A Few Weeks After Marrying An Attractive Young...,2002.0,3.71,352.0,8852.0,Taken at the Flood,9780007121014: A Few Weeks After Marrying An A...,Fiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5178,9781933648279,1933648279,Night Has a Thousand Eyes,Cornell Woolrich,Fiction,http://books.google.com/books/content?id=3Gk6s...,"""Cornell Woolrich's novels define the essence ...",2007.0,3.77,344.0,680.0,Night Has a Thousand Eyes,"9781933648279: ""Cornell Woolrich's novels defi...",Fiction
5188,9784770028969,4770028962,Coin Locker Babies,村上龍,Fiction,http://books.google.com/books/content?id=87DJw...,Rescued from the lockers in which they were le...,2002.0,3.75,393.0,5560.0,Coin Locker Babies,9784770028969: Rescued from the lockers in whi...,Fiction
5189,9788122200850,8122200850,"Cry, the Peacock",Anita Desai,Fiction,http://books.google.com/books/content?id=_QKwV...,This book is the story of a young girl obsesse...,1980.0,3.22,218.0,134.0,"Cry, the Peacock",9788122200850: This book is the story of a you...,Fiction
5195,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535: This collection of the timeless...,Nonfiction


In [6]:
import torch
from transformers import pipeline

# Candidate labels handed to the zero-shot classifier.
fiction_categories = ["Fiction", "Nonfiction"]

# Zero-shot classification pipeline. Use the first CUDA GPU when one is
# available; otherwise fall back to the CPU (device=-1) so the notebook still
# runs on machines without a GPU instead of failing at pipeline creation.
pipe = pipeline("zero-shot-classification",
                model="facebook/bart-large-mnli",
                device=0 if torch.cuda.is_available() else -1)

Device set to use cuda:0


In [7]:
import numpy as np


def generate_prediction(sequence, categories):
    """Return the candidate label that the classifier scores highest.

    Args:
        sequence: Text passed to the module-level ``pipe`` classifier.
        categories: Candidate labels passed to ``pipe`` alongside the text.

    Returns:
        The entry of the result's ``"labels"`` list found at the position of
        the largest value in its ``"scores"`` list.
    """
    result = pipe(sequence, categories)
    scores, labels = result["scores"], result["labels"]
    return labels[np.argmax(scores)]

In [8]:

from tqdm import tqdm

# Evaluate the zero-shot classifier on books already labelled "Fiction".
# Both lists are filled in lock-step: one prediction and one true label per book.
actual_cats = []
prediction_cats = []

# Select the descriptions once, before the loop. Re-filtering and re-indexing
# the whole frame on every iteration is loop-invariant work, and head(300)
# simply yields fewer rows (instead of raising KeyError) when fewer than 300 match.
fiction_descriptions = books.loc[books["simple_categories"] == "Fiction", "description"].head(300)
for sequence in tqdm(fiction_descriptions):
    prediction_cats.append(generate_prediction(sequence, fiction_categories))
    actual_cats.append("Fiction")

  3%|▎         | 10/300 [00:01<00:34,  8.45it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 300/300 [00:23<00:00, 12.63it/s]


In [None]:
# Evaluate the zero-shot classifier on books already labelled "Nonfiction",
# appending to the same prediction/actual lists.
# Select the descriptions once, before the loop. Re-filtering and re-indexing
# the whole frame on every iteration is loop-invariant work, and head(300)
# simply yields fewer rows (instead of raising KeyError) when fewer than 300 match.
nonfiction_descriptions = books.loc[books["simple_categories"] == "Nonfiction", "description"].head(300)
for sequence in tqdm(nonfiction_descriptions):
    prediction_cats.append(generate_prediction(sequence, fiction_categories))
    actual_cats.append("Nonfiction")

In [None]:
# Pair every predicted label with its true label, one row per evaluated book.
prediction_df = pd.DataFrame(
    data={
        "predicted": prediction_cats,
        "actual": actual_cats,
    }
)

In [None]:
# Display the predicted-vs-actual table for a visual check.
prediction_df

In [None]:
# Flag each row with 1 when the prediction equals the true label, else 0.
labels_match = prediction_df["predicted"] == prediction_df["actual"]
prediction_df["correct_prediction"] = labels_match.astype(int)

In [None]:
# Display the table again, now including the correct_prediction flag column.
prediction_df

In [None]:
# Accuracy: number of correct predictions divided by the number of evaluated rows.
n_correct = prediction_df["correct_prediction"].sum()
n_correct / len(prediction_df)

In [None]:
# Accumulators for the books that still need a category: their ISBNs and the
# labels the classifier assigns to them.
isbns, predicted_cats = [], []

# Rows with no simplified category, reduced to the identifier and the
# description text, and re-indexed from zero.
needs_category = books["simple_categories"].isna()
missing_cats = books.loc[needs_category, ["isbn13", "description"]].reset_index(drop=True)

In [None]:
# Display the books that still lack a simplified category.
missing_cats

In [None]:
# Classify every book that has no simplified category yet, recording the
# predicted label and the matching ISBN in parallel lists.
descriptions = missing_cats["description"]
isbn_values = missing_cats["isbn13"]
for row in tqdm(range(len(missing_cats))):
    predicted_cats.append(generate_prediction(descriptions[row], fiction_categories))
    isbns.append(isbn_values[row])

In [None]:
# Combine the collected ISBNs and predicted labels into a two-column frame.
missing_predicted_cats = pd.DataFrame(
    data={
        "isbn13": isbns,
        "predicted": predicted_cats,
    }
)

In [None]:
# Left-join the predictions onto the books, matching rows on isbn13; books
# without a prediction get a missing value in the new "predicted" column.
books = books.merge(missing_predicted_cats, how="left", on="isbn13")

In [None]:
# Display the frame after the merge, which adds the "predicted" column.
books

In [None]:
# Keep the existing simple_categories value where there is one; where it is
# missing, fill in the label stored in the "predicted" column.
books["simple_categories"] = np.where(books["simple_categories"].isna(), books["predicted"], books['simple_categories'])
# DataFrame.drop returns a new frame rather than modifying in place, so the
# result must be assigned back. Otherwise the helper "predicted" column stays
# in `books` and is written out with the rest of the data.
books = books.drop(columns=["predicted"])
books

In [None]:
# Show the books whose raw category, compared case-insensitively, is one of
# these specific genre names.
genre_names = [
    "romance",
    "science fiction",
    "scifi",
    "fantasy",
    "horror",
    "mystery",
    "thriller",
    "comedy",
    "crime",
    "historical",
]
lowercase_categories = books["categories"].str.lower()
books[lowercase_categories.isin(genre_names)]

In [None]:
# Write the categorised books to disk without the row index.
output_path = "books_with_categories.csv"
books.to_csv(output_path, index=False)