In [7]:
import pandas as pd
import regex as re
import numpy as np
import logging

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import mplcyberpunk
# ML Modeling
from sklearn.metrics import precision_recall_fscore_support,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import RocCurveDisplay,roc_curve
from sklearn.preprocessing import normalize
# Saving and importing trained models
import pickle


# Load the raw book dataset from the project's data folder.
df = pd.read_csv(
    '../data/data.csv',
    delimiter=',',
    encoding='utf-8',
)


In [8]:
# Quick review of the data before developing the model: 4,657 books (rows) and 4 columns.
df.shape

(4657, 4)

In [10]:
# Preview the book titles (explicit column indexing instead of attribute access).
df['title']

0                       Drowned Wednesday
1                           The Lost Hero
2               The Eyes of the Overworld
3                         Magic's Promise
4                          Taran Wanderer
                      ...                
4652                              Hounded
4653    Charlie and the Chocolate Factory
4654                           Red Rising
4655                            Frostbite
4656                             Radiance
Name: title, Length: 4657, dtype: object

In [9]:
# List the column names available in the dataset.
df.columns

Index(['index', 'title', 'genre', 'summary'], dtype='object')

In [12]:
# Processing the data for classification

# Combine summary + title into a single text field (summary first, then title, space-separated)
df['sum_title'] = df['summary'] + ' '+ df['title']

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4657 entries, 0 to 4656
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   index      4657 non-null   int64 
 1   title      4657 non-null   object
 2   genre      4657 non-null   object
 3   summary    4657 non-null   object
 4   sum_title  4657 non-null   object
dtypes: int64(1), object(4)
memory usage: 182.0+ KB


In [19]:
#Now we beging training our model
def _reciprocal_rank(true_genre_labels: list, machine_predicted_genre_labels: list):
  #determining recirprical rank at cutoff
#now wer creaate parameters
        # `true_genre_labels` (List): # List of the actual book genre labels
        # `machine_predicted_genre_labels` (List): # List of book genre labels predicted by the LR algorithm
    
    #The return value will be reciprical rank
    
 # add index to list ONLY if machine predicted label is true 
    tp_pos_list = [(idx + 1) for idx, r in enumerate(machine_predicted_genre_labels) if r in true_genre_labels]

    recip_rank = 0
    if len(tp_pos_list) > 0:
        # finds fist corectly predicted item
        first_pos_list = tp_pos_list[0]
        
    
        recip_rank = 1 / float(first_pos_list)

    return recip_rank

def compute_mrr_at_k(eval_news_category_items:list):
    """Compute the Mean Reciprocal Rank (MRR) over all evaluation items.

    Args:
        eval_news_category_items: Sequence of ``[true_labels, predicted_labels]``
            pairs, where ``predicted_labels`` is ordered from most to least
            likely (the structure produced by ``collect_preds``).

    Returns:
        The average reciprocal rank across all items as a float, or ``0.0``
        when there are no items to evaluate.
    """
    item_count = len(eval_news_category_items)

    # Guard against an empty evaluation set: previously this raised
    # UnboundLocalError because the mean was only assigned inside the loop.
    if item_count == 0:
        return 0.0

    rr_total = 0
    for item in eval_news_category_items:
        actual_label = item[0]
        pred_label_list = item[1]

        # Accumulate this row's reciprocal rank.
        rr_total += _reciprocal_rank(actual_label, pred_label_list)

    # Compute the mean once, after every row has been scored.
    return rr_total / float(item_count)

def collect_preds(Y_test, Y_preds):
    """Pair each row's true genre with the genres predicted for that row.

    Args:
        Y_test: Indexable collection of true genre labels, one per test row.
        Y_preds: Iterable of predicted-label lists, in the same row order.

    Returns:
        A list of ``[[true_label], predicted_labels]`` pairs, one per entry
        in ``Y_preds``.
    """
    paired_items = []
    for position, predicted_labels in enumerate(Y_preds):
        # The true label is wrapped in a list so it has the same shape as the predictions.
        paired_items.append([[Y_test[position]], predicted_labels])
    return paired_items
             
def compute_accuracy(eval_book_genre_items:list):
    """Compute the fraction of items with at least one correct prediction.

    Args:
        eval_book_genre_items: Sequence of ``[true_labels, predicted_labels]``
            pairs (the structure produced by ``collect_preds``).

    Returns:
        The share of items for which any true label appears among the
        predicted labels, as a float in ``[0, 1]``. Returns ``0.0`` when there
        are no items (previously this raised ZeroDivisionError).
    """
    total_items = len(eval_book_genre_items)
    if total_items == 0:
        return 0.0

    correct_book_cat = 0
    for book_genre_cat in eval_book_genre_items:
        true_gen = book_genre_cat[0]
        # A set makes each membership test cheap.
        machine_gen = set(book_genre_cat[1])

        # An item counts at most once, no matter how many labels match.
        if any(book_cat in machine_gen for book_cat in true_gen):
            correct_book_cat += 1

    return correct_book_cat / float(total_items)

In [20]:
# Configure root logging so INFO-level progress messages are shown with a timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

def extract_features(df, field, training_data, testing_data, type='binary'):
    """Vectorize one text column of the train/test splits.

    The vectorizer is fitted on the training split only and then applied to
    the test split, so no vocabulary information leaks from the test data.

    Args:
        df: Full DataFrame. Unused; kept so existing callers keep working.
        field: Name of the text column to vectorize.
        training_data: DataFrame holding the training rows.
        testing_data: DataFrame holding the test rows.
        type: Feature representation, matched by substring in this order:
            ``'binary'`` (term presence), ``'counts'`` (raw term counts),
            ``'tfidf'`` (TF-IDF weights). The name shadows the builtin but is
            kept for backward compatibility.

    Returns:
        Tuple ``(train_feature_set, test_feature_set, fitted_vectorizer)``.

    Raises:
        ValueError: If ``type`` matches none of the supported representations
            (previously any unknown value silently fell through to TF-IDF
            because the last branch tested a constant string).
    """
    # Every representation uses max_df=0.95, exactly as before.
    if 'binary' in type:
        vectorizer = CountVectorizer(binary=True, max_df=0.95)
    elif 'counts' in type:
        vectorizer = CountVectorizer(binary=False, max_df=0.95)
    elif 'tfidf' in type:
        vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)
    else:
        raise ValueError(
            "Unsupported feature type %r; expected 'binary', 'counts' or 'tfidf'" % (type,)
        )

    # logging.info() requires a message; the original call had none and raised TypeError.
    logging.info("Extracting '%s' features from field '%s'...", type, field)

    # fit_transform learns the vocabulary and vectorizes the training data in
    # one pass (the original fitted, discarded the result, then transformed again).
    train_feature_set = vectorizer.fit_transform(training_data[field].values)
    test_feature_set = vectorizer.transform(testing_data[field].values)

    return train_feature_set, test_feature_set, vectorizer

def get_top_b_predictions(model, X_test, k, threshold=False):
    """Return the k most probable class labels for every row of ``X_test``.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        X_test: Feature matrix to predict on.
        k: Number of top labels to return per row; must be at least 1.
        threshold: ``False`` (default) or ``None`` disables filtering. A number
            is treated as a minimum probability: among the top-k labels only
            those with probability >= ``threshold`` are kept, so a row may
            receive fewer than k labels (possibly none).

    Returns:
        A list with one entry per row; each entry is a list of class labels
        ordered from most to least probable.

    Raises:
        ValueError: If ``k`` is smaller than 1, or ``threshold`` is ``True``
            (which carries no cutoff value).
    """
    # k == 0 would make the slice below ([-0:]) return every class instead of none.
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    if threshold is True:
        raise ValueError("threshold must be False (disabled) or a numeric probability cutoff")

    probs = np.asarray(model.predict_proba(X_test))

    # argsort is ascending: take the last k columns, then flip for best-first order.
    best_n = np.argsort(probs, axis=1)[:, -k:][:, ::-1]

    apply_cutoff = threshold is not False and threshold is not None

    preds = []
    for row_probs, row_best in zip(probs, best_n):
        if apply_cutoff:
            # Drop top-k candidates whose probability is below the cutoff.
            row_best = [cat for cat in row_best if row_probs[cat] >= threshold]
        preds.append([model.classes_[predicted_cat] for predicted_cat in row_best])

    return preds


# Backward-compatible alias: other code in this notebook refers to this helper
# as `get_top_k_predictions`.
get_top_k_predictions = get_top_b_predictions
   
def train_model(df, field="summary", feature_rep="binary", top_k=3):
    """Train and evaluate a logistic-regression genre classifier.

    Splits ``df`` into train and test sets, vectorizes ``field``, fits the
    model, and scores its top-k predictions on the test set.

    Args:
        df: DataFrame containing a ``'genre'`` column and the text column ``field``.
        field: Name of the text column used as model input.
        feature_rep: Feature representation passed to ``extract_features``
            (``'binary'``, ``'counts'`` or ``'tfidf'``).
        top_k: Number of top predicted genres to keep per test row.

    Returns:
        Tuple ``(model, feature_transformer, accuracy, mrr_at_k, X_train,
        X_test, Y_test, Y_train, preds, eval_items)``.
    """
    # Fixed seed keeps the train/test split reproducible across runs.
    x_training_data, x_testing_data = train_test_split(df, random_state=2000)

    # Target labels for each split.
    Y_train = x_training_data['genre'].values
    Y_test = x_testing_data['genre'].values

    # Vectorize the text column (fitted on the training split only).
    X_train, X_test, feature_transformer = extract_features(
        df,
        field,
        x_training_data,
        x_testing_data,
        type=feature_rep
    )

    # Configure and fit the classifier.
    logging.info("Training a Logistic Regression Model. This may take a few minutes. ...")
    scikit_log_reg = LogisticRegression(
        verbose=0,
        solver='liblinear',
        random_state=0,
        C=5,
        penalty='l2',
        max_iter=1000
    )
    model = scikit_log_reg.fit(X_train, Y_train)

    # Top-k predictions per test row. The helper in this notebook is defined as
    # `get_top_b_predictions`; the original call used an undefined name.
    preds = get_top_b_predictions(model, X_test, top_k)

    # Pair each true label with its predictions for scoring.
    eval_items = collect_preds(Y_test, preds)

    # Evaluate on the held-out test data.
    logging.info("Starting evaluation...")
    simple_mean_avg_correct_prediction_accuracy = compute_accuracy(eval_items)
    mean_recip_rank_at_k = compute_mrr_at_k(eval_items)

    logging.info("Done training and evaluation.")

    return (
        model,
        feature_transformer,
        simple_mean_avg_correct_prediction_accuracy,
        mean_recip_rank_at_k,
        X_train,
        X_test,
        Y_test,
        Y_train,
        preds,
        eval_items,
    )