In [15]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import pipeline

In [2]:
# Read the cleaned books table from a CSV in the working directory.
BOOKS_CSV = 'books_cleaned.csv'
books = pd.read_csv(BOOKS_CSV)

In [3]:
# Number of books per raw category label, most frequent first.
category_counts = books['categories'].value_counts()
category_counts

categories
Fiction                       2111
Juvenile Fiction               390
Biography & Autobiography      311
History                        207
Literary Criticism             124
                              ... 
Human-animal relationships       1
Imperialism                      1
Aged women                       1
Humorous stories                 1
Butlers                          1
Name: count, Length: 479, dtype: int64

In [4]:
# Categories that label more than 50 books.
# The index and the count column are named explicitly so the 'count' column
# used by the query exists regardless of how the installed pandas version
# names the value_counts() result.
(
    books['categories']
    .value_counts()
    .rename_axis('categories')
    .reset_index(name='count')
    .query('count > 50')
)

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
5,Religion,117
6,Philosophy,117
7,Comics & Graphic Novels,116
8,Drama,86
9,Juvenile Nonfiction,57


In [5]:
# Inspect the rows labelled 'Juvenile Nonfiction' to see what that category holds.
books.loc[books['categories'].eq('Juvenile Nonfiction')]

Unnamed: 0.1,Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,missing_description,age_of_book,words_in_description,title_and_subtitle,tagged_description
107,138,9780060277406,0060277408,The Secret Garden Cookbook,Recipes Inspired by Frances Hodgson Burnett's ...,Amy Cotler,Juvenile Nonfiction,http://books.google.com/books/content?id=c7E_H...,Frances Hodgson Burnett's The Secret Garden de...,1999.0,4.28,128.0,142.0,0,25.0,180,The Secret Garden Cookbook: Recipes Inspired b...,9780060277406 Frances Hodgson Burnett's The Se...
108,139,9780060278427,0060278420,Laura's Album,A Remembrance Scrapbook of Laura Ingalls Wilder,William Anderson,Juvenile Nonfiction,http://books.google.com/books/content?id=_zTkq...,Though best known as the author of the Little ...,1998.0,4.3,80.0,713.0,0,26.0,134,Laura's Album: A Remembrance Scrapbook of Laur...,9780060278427 Though best known as the author ...
228,266,9780060782139,0060782137,Time For Kids: Butterflies!,,Editors of TIME For Kids,Juvenile Nonfiction,http://books.google.com/books/content?id=OdZxn...,"Butterflies There are 20,000 different kinds o...",2006.0,4.0,32.0,20.0,0,18.0,48,Time For Kids: Butterflies!,"9780060782139 Butterflies There are 20,000 dif..."
267,306,9780060882600,0060882603,The Annotated Charlotte's Web,,E. B. White,Juvenile Nonfiction,http://books.google.com/books/content?id=vaYYH...,"Charlotte's Web, one of America's best-loved c...",2006.0,4.16,320.0,41.0,0,18.0,79,The Annotated Charlotte's Web,"9780060882600 Charlotte's Web, one of America'..."
434,512,9780064462044,0064462048,My Little House Crafts Book,18 Projects from Laura Ingalls Wilder's,Carolyn Strom Collins,Juvenile Nonfiction,http://books.google.com/books/content?id=lTzrs...,Make the same pioneer crafts that Laura did! I...,1998.0,4.05,64.0,56.0,0,26.0,145,My Little House Crafts Book: 18 Projects from ...,9780064462044 Make the same pioneer crafts tha...
435,513,9780064462341,006446234X,Pioneer Girl,The Story of Laura Ingalls Wilder,William Anderson,Juvenile Nonfiction,http://books.google.com/books/content?id=Sj4UD...,The pioneer spirit lives on... Readers around ...,2000.0,4.15,32.0,414.0,0,24.0,264,Pioneer Girl: The Story of Laura Ingalls Wilder,9780064462341 The pioneer spirit lives on... R...
439,517,9780066236179,0066236177,A Light in the Attic Book and CD,,Shel Silverstein,Juvenile Nonfiction,http://books.google.com/books/content?id=FJfQs...,Last night while I lay thinking here Some What...,2001.0,4.34,176.0,590.0,0,23.0,169,A Light in the Attic Book and CD,9780066236179 Last night while I lay thinking ...
812,988,9780142302279,0142302279,Dirty Beasts,,Roald Dahl,Juvenile Nonfiction,,Poems tell the stories of a smart pig who outw...,2002.0,4.02,32.0,3953.0,0,22.0,33,Dirty Beasts,9780142302279 Poems tell the stories of a smar...
821,999,9780142407226,0142407224,The Tough Guide to Fantasyland,,Diana Wynne Jones,Juvenile Nonfiction,http://books.google.com/books/content?id=v5jxA...,A unique guide to fantasy literature helps rea...,2006.0,3.94,234.0,3897.0,0,18.0,38,The Tough Guide to Fantasyland,9780142407226 A unique guide to fantasy litera...
823,1001,9780142407929,0142407925,The BFG,A Set of Plays,Roald Dahl;David Wood;Jane Walmsley,Juvenile Nonfiction,http://books.google.com/books/content?id=aDf-O...,"With notes on staging, props, and costumes, a ...",2007.0,4.26,128.0,452.0,0,17.0,27,The BFG: A Set of Plays,"9780142407929 With notes on staging, props, an..."


In [6]:
# Simplified labels; each is shared by one or more raw categories below.
FICTION = "Fiction"
NONFICTION = "Nonfiction"
CHILDRENS_FICTION = "Children's Fiction"
CHILDRENS_NONFICTION = "Children's Nonfiction"

# Raw `categories` value -> simplified label.
category_mapping = {
    'Fiction': FICTION,
    'Juvenile Fiction': CHILDRENS_FICTION,
    'Biography & Autobiography': NONFICTION,
    'History': NONFICTION,
    'Literary Criticism': NONFICTION,
    'Philosophy': NONFICTION,
    'Religion': NONFICTION,
    'Comics & Graphic Novels': FICTION,
    'Drama': FICTION,
    'Juvenile Nonfiction': CHILDRENS_NONFICTION,
    'Science': NONFICTION,
    'Poetry': FICTION,
}

# Raw categories with no entry in the mapping become NaN in the new column.
simple_categories = books['categories'].map(category_mapping)
books['simple_categories'] = simple_categories

In [7]:
# Display the frame with the added simple_categories column; rows whose raw
# category is not a key of category_mapping have no value in that column.
books

Unnamed: 0.1,Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,missing_description,age_of_book,words_in_description,title_and_subtitle,tagged_description,simple_categories
0,0,9780002005883,0002005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,0,20.0,199,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction
1,1,9780002261982,0002261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,0,24.0,205,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,
2,3,9780006178736,0006178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,0,31.0,57,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction
3,4,9780006280897,0006280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,0,22.0,45,The Four Loves,9780006280897 Lewis' work on the nature of lov...,
4,5,9780006280934,0006280935,The Problem of Pain,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,0,22.0,75,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,6802,9788172235222,8172235224,Mistaken Identity,,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,0,21.0,288,Mistaken Identity,9788172235222 On A Train Journey Home To North...,
5193,6803,9788173031014,8173031010,Journey to the East,,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,0,22.0,63,Journey to the East,9788173031014 This book tells the tale of a ma...,
5194,6804,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,0,21.0,117,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623 Wisdom to Create a Life of Passi...,
5195,6805,9788185300535,8185300534,I Am that,Talks with Sri Nisargadatta Maharaj,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,0,25.0,174,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...,Nonfiction


In [8]:
# Keep only the rows that received a simplified category.
books[books['simple_categories'].notna()]

Unnamed: 0.1,Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,missing_description,age_of_book,words_in_description,title_and_subtitle,tagged_description,simple_categories
0,0,9780002005883,0002005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,0,20.0,199,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction
2,3,9780006178736,0006178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,0,31.0,57,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction
8,12,9780006482079,0006482074,Warhost of Vastmark,,Janny Wurts,Fiction,http://books.google.com/books/content?id=uOL0f...,"Tricked once more by his wily half-brother, Ly...",1995.0,4.03,522.0,2966.0,0,29.0,136,Warhost of Vastmark,9780006482079 Tricked once more by his wily ha...,Fiction
30,35,9780006646006,000664600X,Ocean Star Express,,Mark Haddon;Peter Sutton,Juvenile Fiction,http://books.google.com/books/content?id=I2QZA...,Joe and his parents are enjoying a summer holi...,2002.0,3.50,32.0,1.0,0,22.0,129,Ocean Star Express,9780006646006 Joe and his parents are enjoying...,Children's Fiction
46,54,9780007121014,0007121016,Taken at the Flood,,Agatha Christie,Fiction,http://books.google.com/books/content?id=3gWlx...,A Few Weeks After Marrying An Attractive Young...,2002.0,3.71,352.0,8852.0,0,22.0,100,Taken at the Flood,9780007121014 A Few Weeks After Marrying An At...,Fiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5178,6778,9781933648279,1933648279,Night Has a Thousand Eyes,,Cornell Woolrich,Fiction,http://books.google.com/books/content?id=3Gk6s...,"""Cornell Woolrich's novels define the essence ...",2007.0,3.77,344.0,680.0,0,17.0,100,Night Has a Thousand Eyes,"9781933648279 ""Cornell Woolrich's novels defin...",Fiction
5188,6797,9784770028969,4770028962,Coin Locker Babies,,村上龍,Fiction,http://books.google.com/books/content?id=87DJw...,Rescued from the lockers in which they were le...,2002.0,3.75,393.0,5560.0,0,22.0,41,Coin Locker Babies,9784770028969 Rescued from the lockers in whic...,Fiction
5189,6799,9788122200850,8122200850,"Cry, the Peacock",,Anita Desai,Fiction,http://books.google.com/books/content?id=_QKwV...,This book is the story of a young girl obsesse...,1980.0,3.22,218.0,134.0,0,44.0,33,"Cry, the Peacock",9788122200850 This book is the story of a youn...,Fiction
5195,6805,9788185300535,8185300534,I Am that,Talks with Sri Nisargadatta Maharaj,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,0,25.0,174,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...,Nonfiction


In [10]:
# Candidate labels offered to the zero-shot classifier.
fiction_categories = ['Fiction', 'Nonfiction']

# Zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint.
pipe = pipeline(
    task='zero-shot-classification',
    model='facebook/bart-large-mnli',
)




Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBartForSequenceClassification: ['model.decoder.version', 'model.encoder.version']
- This IS expected if you are initializing TFBartForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBartForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBartForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForSequenceClassification for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use 0


In [13]:
# Take the first description among the books mapped to 'Fiction' as a sample input.
fiction_mask = books['simple_categories'] == 'Fiction'
fiction_texts = books.loc[fiction_mask, 'description'].reset_index(drop=True)
sequence = fiction_texts[0]

In [14]:
# Classify the sample description against the two candidate labels.
pipe(sequence, candidate_labels=fiction_categories)

{'sequence': 'A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst

In [17]:
# Run the classifier once and reuse the result: every call re-runs model
# inference, and the original invoked it twice on the same input.
prediction = pipe(sequence, fiction_categories)

# Index of the highest score, then the label stored at that position.
max_index = np.argmax(prediction['scores'])
max_label = prediction['labels'][max_index]
max_label

'Fiction'

In [19]:
def generate_predictions(sequence, categories, classifier=None):
    """Return the highest-scoring candidate label for a single text.

    Parameters
    ----------
    sequence
        The text to classify. It is passed to the classifier unchanged.
    categories
        Candidate labels handed to the classifier.
    classifier : callable, optional
        Invoked as ``classifier(sequence, categories)`` and expected to
        return a mapping with parallel ``'labels'`` and ``'scores'``
        entries. When omitted, the notebook-level ``pipe`` is used.

    Returns
    -------
    The element of ``'labels'`` whose score is largest (the first such
    element if several scores tie).
    """
    if classifier is None:
        # Resolved at call time so that re-creating `pipe` in another cell
        # is picked up without redefining this function.
        classifier = pipe

    predictions = classifier(sequence, categories)
    max_index = int(np.argmax(predictions['scores']))

    return predictions['labels'][max_index]

In [20]:
actual_cats = []
predicted_cats = []

# Filter once, outside the loop: the original rebuilt the boolean mask and a
# re-indexed Series on every iteration just to read a single element.
# head(300) takes the first 300 'Fiction' descriptions (fewer if the subset
# is smaller, instead of failing on a missing position).
fiction_descriptions = books.loc[
    books['simple_categories'] == 'Fiction', 'description'
].head(300)

for sequence in tqdm(fiction_descriptions):
    predicted_cats.append(generate_predictions(sequence, fiction_categories))
    actual_cats.append('Fiction')

100%|██████████| 300/300 [27:41<00:00,  5.54s/it]


In [21]:
# Appends to the `actual_cats` / `predicted_cats` lists created in the Fiction
# cell; running this cell again without resetting them adds duplicate entries.
# The filter is evaluated once here rather than on every loop iteration, and
# head(300) takes the first 300 'Nonfiction' descriptions (fewer if the subset
# is smaller, instead of failing on a missing position).
nonfiction_descriptions = books.loc[
    books['simple_categories'] == 'Nonfiction', 'description'
].head(300)

for sequence in tqdm(nonfiction_descriptions):
    predicted_cats.append(generate_predictions(sequence, fiction_categories))
    actual_cats.append('Nonfiction')

100%|██████████| 300/300 [23:18<00:00,  4.66s/it]
