In [1]:
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
_
import pickle
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# Run the companion preprocessing notebook before anything else in this notebook.
# NOTE(review): clean_text, preprocess_text, tdm_converter and display_metrics are
# called further down but never defined here - presumably this %run supplies them; confirm.
%run 00_preprocessing_fuctions.ipynb

In [3]:
# CSV locations used by this notebook, keyed by a short dataset name.
FILE_PATHS = dict(
    processed='testing_datasets/processed_data.csv',
    compiled='testing_datasets/compiled_books.csv',
)

In [4]:
def load_model(file_name: str, models_dir: str = "models") -> object:
    """
    Load a previously pickled object from the models directory.

    Args:
        file_name: Name of the pickle file, e.g. 'classifier.pkl'.
        models_dir: Directory that contains the file. Defaults to 'models',
            the location that used to be hard-coded.

    Returns:
        The object stored in the pickle file.

    Raises:
        FileNotFoundError: If the file does not exist under ``models_dir``.
    """
    # SECURITY: unpickling can execute arbitrary code, so only load files
    # that were produced by a trusted source.
    with open(f"{models_dir}/{file_name}", 'rb') as f:
        return pickle.load(f)

_
# Restore the three saved artefacts, loaded in the same order as before.
le, clf, vectorizer = (
    load_model('label_encoder.pkl'),  # label encoder
    load_model('classifier.pkl'),     # classifier
    load_model('vectorizer.pkl'),     # TF-IDF vectorizer
)

### Testing
Evaluate the saved model on the compiled test dataset, and also on individual book blurbs taken from the internet.

In [5]:
# Load the dataset registered under the 'compiled' key of FILE_PATHS.
df = pd.read_csv(filepath_or_buffer=FILE_PATHS['compiled'])
_
# Print the frame's dimensions, then preview its first five rows.
print(df.shape, '\n')
df.head(n=5)

(654, 5) 



Unnamed: 0,title,author,genre,text,id
0,The Wind in the Willows,Kenneth Grahame,Fantasy,"The tales of Ratty, Mole, Badger and Toad. Whe...",1
1,Bridge to Terabithia,Katherine Paterson,Fantasy,The life of a ten-year-old boy in rural Virgin...,2
2,Through the Looking Glass,Lewis Carroll,Fantasy,"In this sequel to Alice in Wonderland, Alice c...",3
3,"The Hobbit, Or, There and Back Again",John Ronald Reuel Tolkien,Fantasy,"Tolkien's ""The Hobbit,"" which first appeared o...",4
4,Throne of Glass,Sarah J. Maas,Fantasy,"A hugely commercial, fabulously addictive fant...",5


In [6]:
# Preprocess text
# Pick the raw-text column explicitly: prefer 'original_text', otherwise use 'text'.
# This replaces a bare `except:` fallback, which also hid genuine failures raised
# inside clean_text (and would even swallow a KeyboardInterrupt).
text_column = 'original_text' if 'original_text' in df.columns else 'text'

# Clean and preprocess every blurb, then assign the whole column in one step
# instead of filling it cell by cell with df.loc inside an iterrows loop.
df['filtered_text'] = [
    preprocess_text(clean_text(raw_text))
    for raw_text in tqdm(df[text_column], total=len(df))
]

df.sample(5)

  0%|          | 0/654 [00:00<?, ?it/s]

Unnamed: 0,title,author,genre,text,id,filtered_text
241,The War of the Worlds,H. G. Wells,Science Fiction,"The War of the Worlds (1898), by H. G. Wells, ...",242,war world well early describes invasion englan...
533,The Silence That Speaks,Andrea Kane,Thriller,Former Navy SEAL Marc Deveraux teams up with F...,534,former navy seal marc deveraux team forensic i...
641,Son of Rosemary,Ira Levin,Thriller,Son of Rosemary opens at the dawn of the new m...,642,son rosemary open dawn millennium human hope s...
343,Modern Biology & Visions of Humanity,European Group on Life Sciences,Science Fiction,How do the most recent developments in the lif...,344,recent development life affect understanding h...
339,"The ""Thaw"" in Bulgarian Literature",Atanas Slavov,Science Fiction,A critical review of post-World War II Bulgari...,340,critical review post world war bulgarian liter...


In [7]:
# Encode target labels: pass the 'genre' column through the loaded label encoder
# and store the result in a new 'genres_encoded' column.
# NOTE(review): presumably `le` is a fitted scikit-learn LabelEncoder, in which case a
# genre it has not seen before would raise here - confirm.
df['genres_encoded'] = le.transform(df['genre'])

In [8]:
# Features are the preprocessed blurbs; targets are the encoded genre labels.
X = df['filtered_text']
y = df['genres_encoded']
_
# Print both shapes, one per line.
print(f"X shape: {X.shape}", f"y shape: {y.shape}", sep="\n")

X shape: (654,)
y shape: (654,)


In [9]:
# Run predictions
# Keep the converted features under their own name instead of overwriting X, so this
# cell can be re-run without feeding tdm_converter its own output.
X_tdm = tdm_converter(vectorizer, X)
y_pred = clf.predict(X_tdm)
y_probas = clf.predict_proba(X_tdm)

# Display results, labelled with the classifier's class name
display_metrics(clf.__class__.__name__, y, y_pred, y_probas)

Model: ComplementNB
Accuracy score: 0.7155963302752294
Recall score: 0.7346340614862944
Precision score: 0.7234624040223439
f1 score: 0.7121217604080508
Log loss 0.989741009292532
ROC AUC: 0.9201729099434937


In [10]:
# Put the encoded true labels next to the classifier's predictions for inspection.
results = pd.DataFrame(
    data={'True': y, 'Pred': y_pred},
    columns=['True', 'Pred'],
)
_
results.sample(7)

Unnamed: 0,True,Pred
199,2,2
573,3,3
458,1,1
603,3,3
351,2,1
623,3,3
204,2,2


### Individual testing

In [24]:
# Draw one random book from the dataset.
row = df.sample(1)

# Choose the blurb column by checking which one exists ('text' first, then
# 'original_text') instead of relying on a bare `except:`, which silently
# swallowed unrelated errors as well.
# NOTE(review): the bulk preprocessing loop prefers 'original_text' first - confirm
# which column should win when a dataset contains both.
sample_text_column = 'text' if 'text' in row.columns else 'original_text'
text, genre_label = row[sample_text_column].item(), row['genre'].item()

text

"It's been thirty years since Antonie's and Melanie's mother died. But when a visit to the sea at Noirmoutier Island triggers painful memories of their haunting childhood--and Melanie lies in the hospital recovering from a near fatal accident--Antoine must confront his past and also his troubled relationships with his own children."

In [25]:
# # Optional: uncomment this cell to classify a custom book synopsis (e.g. from Goodreads) instead of the sampled row
# text = """
# Six days ago, astronaut Mark Watney became one of the first people to walk on Mars.

# Now, he’s sure he’ll be the first person to die there.

# After a dust storm nearly kills him and forces his crew to evacuate while thinking him dead, Mark finds himself stranded and completely alone with no way to even signal Earth that he’s alive—and even if he could get word out, his supplies would be gone long before a rescue could arrive.

# Chances are, though, he won’t have time to starve to death. The damaged machinery, unforgiving environment, or plain-old “human error” are much more likely to kill him first.

# But Mark isn’t ready to give up yet. Drawing on his ingenuity, his engineering skills — and a relentless, dogged refusal to quit — he steadfastly confronts one seemingly insurmountable obstacle after the next. Will his resourcefulness be enough to overcome the impossible odds against him?
# """
# genre_label = 'Science Fiction'
# _
# text

In [26]:
# Preprocess text: clean the blurb, then pass the cleaned result through preprocess_text.
text = preprocess_text(clean_text(text))
_
text

'thirty year since antonie melanie mother died visit sea noirmoutier island trigger painful memory haunting childhood melanie lie hospital recovering near fatal accident antoine must confront past also troubled relationship child'

In [27]:
# Run predictions on the single blurb (wrapped in a list, as a one-item batch).
text_tdm = tdm_converter(vectorizer, [text])
prediction = clf.predict(text_tdm)
probas = clf.predict_proba(text_tdm)
_
# Encoded label -> genre name,
# e.g. {0: 'Fantasy', 1: 'Romance', 2: 'Science Fiction', 3: 'Thriller'}
label_keys = dict(enumerate(le.classes_))
print(f"true: {genre_label}\npred: {label_keys[prediction[0]]}")
probas

true: Thriller
pred: Thriller


array([[0.15323036, 0.30065117, 0.20462533, 0.34149314]])