In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the cleaned book metadata and preview the first rows.
books = pd.read_csv(filepath_or_buffer="cleaned_books_dataset.csv")
books.head(5)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tag_description
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883_A NOVEL THAT READERS and critics...
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web_A Novel,9780002261982_A new 'Christie for Christmas' -...
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736_A memorable, mesmerizing heroine..."
3,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897_Lewis' work on the nature of lov...
4,9780006280934,6280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934_""In The Problem of Pain, C.S. Le..."


In [4]:
# Count the books in each raw category and keep only categories with more than 50 books.
category_counts = books['categories'].value_counts().reset_index()
category_counts[category_counts['count'] > 50]

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
5,Religion,117
6,Philosophy,117
7,Comics & Graphic Novels,116
8,Drama,86
9,Juvenile Nonfiction,57


In [5]:
# pd.set_option("display.max_colwidth", None)  # uncomment to show full, untruncated column text

In [6]:
# Inspect the books whose raw category is "Juvenile Fiction".
books.loc[books['categories'] == "Juvenile Fiction", ['title', 'categories', 'description']]

Unnamed: 0,title,categories,description
30,Ocean Star Express,Juvenile Fiction,Joe and his parents are enjoying a summer holi...
79,The voyage of the Dawn Treader,Juvenile Fiction,"The ""Dawn Treader"" is the first ship Narnia ha..."
85,Where the Red Fern Grows,Juvenile Fiction,A young boy living in the Ozarks achieves his ...
86,Poppy's Return,Juvenile Fiction,"There's trouble at Gray House, the girlhood ho..."
87,Diary of a Spider,Juvenile Fiction,This is the diary ... of a spider. But don't b...
...,...,...,...
4845,Fablehaven,Juvenile Fiction,When Kendra and Seth go to stay at their grand...
4930,Sherlock Holmes and the Case of the Hound of t...,Juvenile Fiction,Sherlock Holmes and Dr. Watson travel to the b...
4942,The Drift House,Juvenile Fiction,Sent to stay with their uncle in a ship-like h...
5010,Attack of the Jaguar,Juvenile Fiction,This training manual for operatives of Xtreme ...


In [7]:
# Inspect the first few books whose raw category is "Juvenile Nonfiction".
books.loc[books['categories'] == "Juvenile Nonfiction", ['title', 'categories', 'description']].head()

Unnamed: 0,title,categories,description
107,The Secret Garden Cookbook,Juvenile Nonfiction,Frances Hodgson Burnett's The Secret Garden de...
108,Laura's Album,Juvenile Nonfiction,Though best known as the author of the Little ...
228,Time For Kids: Butterflies!,Juvenile Nonfiction,"Butterflies There are 20,000 different kinds o..."
267,The Annotated Charlotte's Web,Juvenile Nonfiction,"Charlotte's Web, one of America's best-loved c..."
434,My Little House Crafts Book,Juvenile Nonfiction,Make the same pioneer crafts that Laura did! I...


In [8]:

# Truncate long text columns to 50 characters when frames are displayed.
pd.set_option("display.max_colwidth", 50)

# Raw categories grouped by the coarse bucket they collapse into.
_raw_categories_by_bucket = {
    "Fiction": ['Fiction', 'Comics & Graphic Novels', 'Drama', 'Poetry'],
    "Children's Fiction": ['Juvenile Fiction'],
    "Nonfiction": [
        'Biography & Autobiography',
        'History',
        'Literary Criticism',
        'Philosophy',
        'Religion',
        'Science',
    ],
    "Children's Nonfiction": ['Juvenile Nonfiction'],
}

# Flatten to the raw -> simple lookup used by Series.map; raw categories that
# are not listed end up as NaN in `simple_categories`.
category_mapping = {
    raw_category: bucket
    for bucket, raw_categories in _raw_categories_by_bucket.items()
    for raw_category in raw_categories
}
books['simple_categories'] = books['categories'].map(category_mapping)


In [9]:
# With this simple mapping alone, ~3700 of the books receive a simple category.
books['simple_categories'].notna().sum()

np.int64(3743)

# Using Hugging Face to perform zero-shot classification

In [10]:
from transformers import pipeline
import torch

# Use a tiny model for a quick check
# SMALL_MODEL = "finiteautomata/bertweet-base-sentiment-analysis" 
LARGE_MODEL = "facebook/bart-large-mnli"
fiction_categories = ["Fiction", "Nonfiction"]

# Run on the first CUDA GPU when one is available, otherwise on the CPU (-1).
# A hard-coded device=0 is not portable to machines without a GPU.
DEVICE = 0 if torch.cuda.is_available() else -1

pipe = pipeline("zero-shot-classification", 
                model=LARGE_MODEL,
                device=DEVICE,
                local_files_only=False) # allow downloading the model if it is not cached locally

print("Pipeline initialized successfully!")

# Smoke test: classify one trivial sentence to confirm the pipeline runs end to end.
result = pipe("This is a simple test sentence.", fiction_categories)
print(result)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


Pipeline initialized successfully!
{'sequence': 'This is a simple test sentence.', 'labels': ['Nonfiction', 'Fiction'], 'scores': [0.7708202004432678, 0.2291797697544098]}


In [11]:
# Let's check how the classifier is working on one known example per class.
# Each description is classified once and the result reused for both labels
# and scores, instead of running the model twice per print.
for label in ("Fiction", "Nonfiction"):
    seq = books.loc[books['simple_categories'] == label, "description"].iloc[0]
    prediction = pipe(seq, fiction_categories)
    print(prediction['labels'], prediction['scores'])


['Fiction', 'Nonfiction'] [0.8438265323638916, 0.1561734527349472]
['Nonfiction', 'Fiction'] [0.8751672506332397, 0.12483272701501846]


In [12]:
# Walk through one prediction by hand: take the first Nonfiction description,
# classify it, and pick the label with the highest score.
nonfiction_descriptions = books.loc[books['simple_categories'] == "Nonfiction", "description"]
seq = nonfiction_descriptions.iloc[0]
result = pipe(seq, fiction_categories)
labels, scores = result['labels'], result['scores']
max_index = np.argmax(scores)
category = labels[max_index]
print(category)

Nonfiction


In [13]:
def generate_predictions(sequence, categories, classifier=None):
    """Return the candidate label that the classifier scores highest for a text.

    Parameters
    ----------
    sequence : str
        Text to classify (here, a book description).
    categories : list of str
        Candidate labels handed to the classifier.
    classifier : callable, optional
        Called as ``classifier(sequence, categories)`` and expected to return
        a mapping with parallel ``'labels'`` and ``'scores'`` entries. When
        omitted, the notebook-level ``pipe`` is used.

    Returns
    -------
    str
        The entry of ``'labels'`` at the position of the largest score.
    """
    if classifier is None:
        # Resolved at call time so the default keeps using the notebook's pipeline.
        classifier = pipe

    predictions = classifier(sequence, categories)
    # Locate the best score explicitly rather than assuming the labels are sorted.
    max_index = np.argmax(predictions['scores'])
    max_label = predictions['labels'][max_index]

    return max_label


# Let's perform classification and evaluate the classifier

In [14]:
# We check the performance for the Fiction category

predicted_categories = []
actual_categories = []

# Evaluate on the first 300 descriptions already labelled "Fiction".
# Iterating over the selection directly avoids re-filtering the frame and
# rebuilding the full list on every iteration.
fiction_descriptions = books.loc[books['simple_categories'] == "Fiction", 'description'].iloc[:300]
for description in fiction_descriptions:
    predicted_categories.append(generate_predictions(description, fiction_categories))
    actual_categories.append("Fiction")

In [15]:
# We check the performance for the Nonfiction category

# Evaluate on the first 300 descriptions already labelled "Nonfiction",
# appending to the lists started in the Fiction evaluation cell.
# Iterating over the selection directly avoids re-filtering the frame and
# rebuilding the full list on every iteration.
nonfiction_descriptions = books.loc[books['simple_categories'] == "Nonfiction", 'description'].iloc[:300]
for description in nonfiction_descriptions:
    predicted_categories.append(generate_predictions(description, fiction_categories))
    actual_categories.append("Nonfiction")

In [16]:
# Put predictions next to the known labels so they can be compared row by row.
evaluation_columns = dict(
    predicted_cats=predicted_categories,
    actual_cats=actual_categories,
)
df = pd.DataFrame(evaluation_columns)
df.head()

Unnamed: 0,predicted_cats,actual_cats
0,Fiction,Fiction
1,Fiction,Fiction
2,Fiction,Fiction
3,Nonfiction,Fiction
4,Fiction,Fiction


In [17]:
# Number of evaluated descriptions (Fiction and Nonfiction samples combined).
len(predicted_categories)

600

In [18]:
# Accuracy in percent: share of rows where the prediction equals the known label.
n_correct = (df['predicted_cats'] == df['actual_cats']).sum()
n_total = df.shape[0]
print("classifier_accuracy:", n_correct / n_total * 100)

classifier_accuracy: 77.83333333333333


We decide that this accuracy is good enough for the time being, and we will continue to use this classifier to classify all the books we have.

# Perform classification for all the books

We are going to perform classification for the books to which we did not manually assign a simple_category.

In [19]:
# Count the books that still have no simple category after the manual mapping.
books['simple_categories'].isna().sum()
    

np.int64(1454)

In [20]:
# Collect identifier and description of every book that still lacks a simple category.
is_unlabelled = books['simple_categories'].isna()
missing_simple_cats = books.loc[is_unlabelled, ['isbn13', 'description']].reset_index(drop=True)
missing_simple_cats.head()

Unnamed: 0,isbn13,description
0,9780002261982,A new 'Christie for Christmas' -- a full-lengt...
1,9780006280897,Lewis' work on the nature of love divides love...
2,9780006280934,"""In The Problem of Pain, C.S. Lewis, one of th..."
3,9780006380832,Until Vasco da Gama discovered the sea-route t...
4,9780006470229,A new-cover reissue of the fourth book in the ...


In [21]:
# Classify each unlabelled description as Fiction or Nonfiction, one at a time.
predicted_cats = []
for des in missing_simple_cats['description']:
    predicted_cats.append(generate_predictions(des, fiction_categories))
print(len(predicted_cats))

1454


In [22]:
# Attach the predicted label to each unlabelled book.
missing_simple_cats = missing_simple_cats.assign(predicted_cats=predicted_cats)
missing_simple_cats.head()

Unnamed: 0,isbn13,description,predicted_cats
0,9780002261982,A new 'Christie for Christmas' -- a full-lengt...,Fiction
1,9780006280897,Lewis' work on the nature of love divides love...,Nonfiction
2,9780006280934,"""In The Problem of Pain, C.S. Lewis, one of th...",Nonfiction
3,9780006380832,Until Vasco da Gama discovered the sea-route t...,Nonfiction
4,9780006470229,A new-cover reissue of the fourth book in the ...,Fiction


In [23]:
# Bring the predictions back into the full table, matching on isbn13;
# books that were not classified get NaN in `predicted_cats`.
isbn_to_prediction = missing_simple_cats[['isbn13', 'predicted_cats']]
books = books.merge(isbn_to_prediction, how='left', on="isbn13")

In [24]:
# Keep the manually mapped category where one exists, otherwise use the model's
# prediction, then remove the helper column.
has_manual_category = books['simple_categories'].notna()
books['simple_categories'] = books['simple_categories'].where(has_manual_category, books['predicted_cats'])
books = books.drop(columns='predicted_cats')

In [25]:
# Preview the table with the completed `simple_categories` column.
books.head()

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tag_description,simple_categories
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883_A NOVEL THAT READERS and critics...,Fiction
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web_A Novel,9780002261982_A new 'Christie for Christmas' -...,Fiction
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736_A memorable, mesmerizing heroine...",Fiction
3,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897_Lewis' work on the nature of lov...,Nonfiction
4,9780006280934,6280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934_""In The Problem of Pain, C.S. Le...",Nonfiction


In [26]:
# Save the enriched dataset. index=False keeps the row index out of the file,
# so reading it back does not produce a stray "Unnamed: 0" column.
books.to_csv("books_with_categories.csv", index=False)

One might ask why we do not use the classifier to create more categories. The reason is that we need enough labelled examples to make sure the model generates the correct classes. For fiction and nonfiction we could evaluate the model against known labels to see how it performs, but for other classes we would need more examples or data. So it really depends on the data we have.

# Sentiment analysis (fine-tuning)

We are going to classify our texts into seven different categories: anger, disgust, fear, joy, sadness, surprise, and neutral. The LLM can predict the emotion from the description.

Since this is a time-consuming and complicated task, we will use a fine-tuned model from Hugging Face to do it for us. See the next Jupyter notebook for the rest.