In [2]:
import re
import numpy as np
_
# NLP
import nltk
from nltk.corpus import stopwords
from unicodedata import normalize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
_
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, log_loss

A bunch of helper functions I'll use throughout the project. I thought it would be easier to keep them all in one notebook.

In [6]:
# Domain-specific words (book-blurb boilerplate, format names, award names, ...)
# that are filtered out in addition to NLTK's English stop words.
# Every entry is followed by a comma: adjacent string literals without one are
# silently concatenated by Python (e.g. 'librarian' 'print' -> 'librarianprint').
CUSTOM_STOP_WORDS = frozenset({
    'edition', 'isbn', 'alternate', 'cover', 'one', 'story', 'book', 'novel',
    'page', 'pages', 'new', 'latest', 'written', 'hyped', 'tiktok',
    'note', 'bestseller', 'newest', 'list', 'print', 'ever', 'tale', 'author',
    'reader', 'work', 'literary', 'epic', 'fantasy',
    'scifi', 'romance', 'fiction', 'nonfiction', 'thriller', 'paperback',
    'hardback', 'contemporary', 'science', 'character',
    'series', 'turner', 'classic', 'plot', 'synopsis', 'blurb', 'collection',
    'short', 'writer', 'writing', 'reprint', 'librarian',
    'chapter', 'published', 'paper', 'anticipated',
    # NOTE: a multi-word entry never matches a single token; kept so the
    # returned set stays a superset of what callers previously received.
    'science fiction',
    'beautiful', 'type', 'stories', 'genre', 'write', 'award',
    'winning', 'debut', 'masterpiece', 'youtube', 'goodreads', 'prize',
    'booker', 'pulitzer', 'hugo', 'longlist', 'shortlist', 'acclaim',
    # Correct spellings added; the original misspelled entries are retained.
    'critically', 'adaptation', 'critcally', 'adapation',
    'tv', 'show', 'motion', 'picture', 'netflix', 'streaming', 'stream',
    'best', 'woman', 'man', 'family', 'know',
    'time', 'winner', 'thats', 'youve', 'youre',
})


def get_stop_words() -> set:
    """
    Compile stop words to remove from text.

    Returns
    -------
    set
        NLTK's English stop words combined with ``CUSTOM_STOP_WORDS``.
        A new set is built on every call, so callers may mutate the result.
    """
    return set(stopwords.words('english')) | CUSTOM_STOP_WORDS


def clean_text(text: str, get_word_count=False) -> str | int:
    """
    Remove special symbols, punctuation, and specific patterns in text.
    """
    if get_word_count:
        # Returns word count for text
        text = re.sub(r'[^\w ]+', ' ', text)  # Remove special symbols
        text = re.sub("\s+", ' ', text)  # Remove additional whitespaces
        return len(text.split(' '))
    
    stop_words = get_stop_words()
    text = text.lower()
    text = text.strip()    
    text = " ". join([word for word in text.split(' ') if word not in stop_words])  # Remove stop words before processing individual words
    text = re.sub('<.*?>', '', text)  # Remove HTML tags
    text = normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')  # Remove accented characters
    text = re.sub(r"[0-9]", '', text)  # Removes numbers
    text = re.sub(r'[^\w ]+', ' ', text)  # Remove special symbols
    text = re.sub("\s+", ' ', text)  # Remove additional whitespaces
    text = re.sub(r'[_]', '', text)  # Remove underscores
    return text
    

def preprocess_text(text: str) -> str:
    """
    Tokenize, lemmatize, and remove stop words from text.

    The tokenize -> lemmatize -> filter -> re-join sequence is applied twice:
    the string produced by the first pass is fed through the same steps again.
    Tokens whose lemma has two characters or fewer, or whose lemma is in the
    stop-word set, are dropped. The surviving lemmas are joined with single
    spaces.
    """
    excluded = get_stop_words()
    lemmatizer = WordNetLemmatizer()

    def _single_pass(raw: str) -> str:
        # One full round of tokenizing, lemmatizing and filtering.
        kept = []
        for token in word_tokenize(raw):
            lemma = lemmatizer.lemmatize(token)
            if len(lemma) > 2 and lemma not in excluded:
                kept.append(lemma)
        return " ".join(kept)

    return _single_pass(_single_pass(text))


def tdm_converter(vect: object, docs: list) -> np.array:
    """
    Convert documents into a dense text document matrix.

    ``vect`` must provide ``transform(docs)``; the object it returns must
    provide ``toarray()``, whose result is handed back to the caller.
    """
    return vect.transform(docs).toarray()


def display_metrics(model_name: str, y_true: list, y_hat: list, y_hat_probas: list):
    """
    Print evaluation metrics for a model's predictions.

    Accuracy, then macro-averaged recall, precision and F1 are computed from
    ``y_true`` and ``y_hat`` and printed one per line. Log loss and
    one-vs-rest ROC AUC are then computed from ``y_hat_probas`` inside a
    single ``try`` block: if either raises, the exception message is printed
    instead and any remaining probability-based metric is skipped.
    """
    print(f"Model: {model_name}")

    accuracy = accuracy_score(y_true, y_hat)
    print(f"Accuracy score: {accuracy}")

    recall = recall_score(y_true, y_hat, average='macro')
    print(f"Recall score: {recall}")

    precision = precision_score(y_true, y_hat, average='macro')
    print(f"Precision score: {precision}")

    f1 = f1_score(y_true, y_hat, average='macro')
    print(f"f1 score: {f1}")

    # Probability-based metrics are best-effort: a failure is reported, not raised.
    try:
        loss = log_loss(y_true, y_hat_probas, labels=y_true)
        print(f"Log loss {loss}")
        auc = roc_auc_score(y_true, y_hat_probas, multi_class='ovr')
        print(f"ROC AUC: {auc}")
    except Exception as err:
        print(f"Error: {err}")