#### This notebook includes just the model preprocessing, training, and evaluation

To see the entire evolution of the project, view "recommender.ipynb" as most this code was copied over from that

In [None]:
# Misc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Model + Evaluation
from sklearn.neighbors import NearestNeighbors

import re

In [2]:
book_data = pd.read_csv('data/books_data.csv')
books_rating_data = pd.read_csv('data/Books_rating.csv')
brd = books_rating_data.dropna(subset=('User_id', 'Title'))
bd = book_data.dropna(subset=('Title'))

In [3]:
# Preprocessing methods
def preprocess_categories(row):
    # If nan
    if isinstance(row, float):
        return ''
    m = re.match(r"\['(.*?)'\]", row)
    if m:
        return m.group(1)
    return ''

def preprocess(data : pd.DataFrame):
    merged_book_data = data[['User_id', 'Title', 'categories', 'review/score', 'description', 'authors']]
    merged_book_data.loc[:,'categories'] = merged_book_data['categories'].apply(preprocess_categories)
    merged_book_data['description'].fillna('', inplace=True)
    merged_book_data['authors'].fillna('', inplace=True)
    merged_book_data['combined_text_features'] = (
        merged_book_data['description'] + ' ' +
        merged_book_data['categories'] + ' ' +
        merged_book_data['authors']).str.strip()

    grouped_by_title = merged_book_data.groupby('Title')

    # Average/Normalize Ratings of books
    merged_book_data['average rating'] = grouped_by_title['review/score'].transform(lambda x : round(x.mean(), 2))
    merged_book_data['average rating'] = MinMaxScaler((0, 5)).fit_transform(merged_book_data[['average rating']])
    merged_book_data['Title'] = merged_book_data['Title'].astype('category')

    #merged_book_data.drop_duplicates(subset=('Title'), inplace = True)
    merged_book_data.drop(['categories', 'authors', 'description'], axis=1, inplace=True)

    merged_book_data.reset_index(drop=True, inplace=True)
    return merged_book_data

In [4]:
merged = bd.merge(brd, how='left', on='Title')
X_train, X_test = train_test_split(merged, random_state=1)
X_train_preprocessed = preprocess(X_train)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_book_data['description'].fillna('', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_book_data['description'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df

In [5]:
# Compute similarity of text features
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(X_train_preprocessed['combined_text_features'])

Testing and Evaluation will only occur on users within the training set. This is because I don't know how to compute the F1 score if I don't have the user data already in the train set. If my recommender system was a collaborative-filtering RS, then I could use related/similar user recommendations from the training set to provide recommendations for an unknown user. Although this recommendation system is content-based, thus only the data from the user and their reviews are used

In [6]:
train_users = X_train_preprocessed['User_id'].unique()

In [7]:
knn = NearestNeighbors(metric='cosine')
knn.fit(tfidf_matrix)

In [None]:
# Model methods
def recommend_books_knn(title, books_data, knn_model, tfidf_matrix, n_recommendations=5):
    idx = books_data[books_data['Title'] == title].index[0]
    book_vector = tfidf_matrix[idx]
    _, indices = knn_model.kneighbors(book_vector, n_neighbors=n_recommendations+1)
    
    recommended_indices = indices[0][1:]
    recommendations = books_data.iloc[recommended_indices]
    return recommendations


def recommend_books_knn_userid(user_id, books_data, knn_model, tfidf_matrix, n_recommendations=5):
    user_books = books_data[books_data['User_id'] == user_id]['Title'].unique()
    if len(user_books) == 0:
        print('User had no books')
        return pd.DataFrame()
    
    user_indices = books_data[books_data['Title'].isin(user_books)].index
    user_vectors = tfidf_matrix[user_indices]
    user_profile_vector = user_vectors.mean(axis=0)

    _, indices = knn_model.kneighbors(np.asarray(user_profile_vector), n_neighbors=len(user_indices) + 2 * n_recommendations + 1)
    recommended_indices = []
    recommended_books = []
    for i in indices[0]:
        if len(recommended_indices) == n_recommendations:
            break
        if books_data.iloc[i]['Title'] not in user_books \
        and books_data.iloc[i]['Title'] not in recommended_books:
            recommended_books.append(books_data.iloc[i]['Title'])
            recommended_indices.append(i)
    return books_data.iloc[recommended_indices]

In [9]:
example_user_id = X_train_preprocessed['User_id'].iloc[500]
matrix = recommend_books_knn_userid(example_user_id, X_train_preprocessed, knn, tfidf_matrix)
print("Recommending books based on: ", example_user_id)
matrix.drop(['User_id', 'review/score', 'combined_text_features'], axis=1)   # Drop features for readability

Recommending books based on:  A3GGKPTXHARNGL


Unnamed: 0,Title,average rating
48285,Batman: Mitefall,2.5
888257,The Caves Of Steel,5.0
1134172,Gladiators (Pageant of history series),3.5
817111,Memoirs of a Highland Lady (Canongate Classic),5.0
1568376,An introduction to Viking mythology,3.75


In [10]:
# Model evaluation

def precision_k(recommended, relevant, k):
    rec = recommended[:k]
    rel = set(rec) & set(relevant)
    return len(rel) / k

def recall_k(recommended, relevant, k):
    rec = recommended[:k]
    rel = set(rec) & set(relevant)
    return len(rel) / len(relevant) if relevant else 0

def mean_reciprocal_rank(recommended, relevant):
    for rank, rec in enumerate(recommended, start=1):
        if rec in relevant:
            return 1 / rank
    return 0

def evaluate_recommender(test, books_data, knn_model, tfidf_matrix, n_recommendations=5):
    precision_scores = []
    recall_scores = []
    mrr_scores = []
    
    for user_id in test['User_id'].unique():
        user_test_books = test[test['User_id'] == user_id]['Title'].tolist()
        
        recommended_books = recommend_books_knn(
            user_id=user_id,
            books_data=books_data,
            knn_model=knn_model,
            tfidf_matrix=tfidf_matrix,
            n_recommendations=n_recommendations
        )['Title'].tolist()
        
        if not recommended_books:
            continue
        
        precision_scores.append(precision_k(recommended_books, user_test_books, k=n_recommendations))
        recall_scores.append(recall_k(recommended_books, user_test_books, k=n_recommendations))
        mrr_scores.append(mean_reciprocal_rank(recommended_books, user_test_books))

    
    return {
        "Precision": np.average(precision_scores),
        "Recall": np.average(recall_scores),
        "MRR": np.average(mrr_scores)
    }


In [None]:
# Testing


Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount,Id,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
1315533,Love In The Time Of Cholera,Florentino Ariza has never forgotten his first...,"['Gabriel García Márquez', 'Edith Grossman']",,http://books.google.com/books?id=GIhnPwAACAAJ&...,,1989,http://books.google.com/books?id=GIhnPwAACAAJ&...,['Colombia'],195.0,B000GRQ542,,A28BV10NZBSBDU,"Mansura Minhas ""Mansura Minhas""",2/4,4.0,1.216944e+09,Love in the time of Cholera,"I was left awed, shocked, disgusted, amused, i..."
120949,Great Expectations,Penguin Classics e-books give you the best pos...,['Charles Dickens'],http://books.google.com/books/content?id=CV2ZQ...,http://books.google.nl/books?id=CV2ZQtY3G7kC&p...,Penguin UK,2003-01-30,https://play.google.com/store/books/details?id...,['Fiction'],2.0,0681994940,,A3SGHQU56YHC2Y,David Savage,20/22,5.0,9.516096e+08,A book everone should read,I chose to write a review of Great Expectation...
1518904,Adventures of Huckleberry Finn,"Referring to ""Adventures of Huckleberry Finn, ...",['Mark Twain'],http://books.google.com/books/content?id=mWHcD...,http://books.google.nl/books?id=mWHcDAAAQBAJ&p...,Courier Corporation,1994-05-26,http://books.google.nl/books?id=mWHcDAAAQBAJ&d...,['Fiction'],,0531002039,,A1U1SGQ7ZBQDCD,"R.J. ""tree climber""",6/8,5.0,1.294186e+09,One of the best classics ever!,I am a sixteen year old girl and I say that Hu...
1132751,The Wicked Day,Born of an incestuous relationship between Kin...,['Mary Stewart'],http://books.google.com/books/content?id=Zhquw...,http://books.google.com/books?id=ZhquwTP5ITcC&...,Harper Collins,2003-05-06,http://books.google.com/books?id=ZhquwTP5ITcC&...,['Fiction'],7.0,B000OVAI48,,AYFZ6RAXGZTMV,Michael D Ward,4/4,4.0,9.526464e+08,Not as good as the Merlin trilogy but still ve...,Though not as good as the Merlin Trilogy this ...
904909,Cases and Materials on the Law of Torts,This casebook is designed for the professor wh...,"['Harry Shulman', 'Oscar S. Gray']",http://books.google.com/books/content?id=kaJPA...,http://books.google.com/books?id=kaJPAQAAIAAJ&...,,2010,http://books.google.com/books?id=kaJPAQAAIAAJ&...,['Law'],,B000OU9TEY,,A1AVOTFFYNGFH6,the WATCHer,1/2,1.0,1.286237e+09,Waste of Time and money,"This book is the worst. That said, i did not h..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2390038,The French Lieutenant's Woman (Turtleback Scho...,,,,,,,,,,0613172426,,A1F0O78HKGBRKO,"David K. Hill ""beecnul8r""",4/9,2.0,1.291766e+09,Usual Fowles,John Fowles has a real problem with writing en...
531633,There's a Mouse about the House,Children will enjoy the fun and surprises in s...,['Richard Fowler'],,http://books.google.com/books?id=2yVytgEACAAJ&...,,1983,http://books.google.com/books?id=2yVytgEACAAJ&...,,,0881101540,,A1RSK3CBRHLM7H,Bud Krieger,0/0,5.0,1.361923e+09,Really? You actually need to read a review to ...,This book should be a gift to every new parent...
639077,Jane Eyre: Complete and Unabridged (Puffin Cla...,"Published on 16 October 1847, Charlotte Brontë...",['Charlote Brontë'],http://books.google.com/books/content?id=NVFnD...,http://books.google.com/books?id=NVFnDwAAQBAJ&...,Om Books International,2018-07-05,https://play.google.com/store/books/details?id...,['Fiction'],,0140351310,,A1D2C0WDCSHUWZ,"E. A Solinas ""ea_solinas""",0/1,5.0,1.229126e+09,"It's Jane Eyre, sir",It's hard to imagine a better gothic romance t...
822158,The Lord of the Rings: The Fellowship of the R...,,,http://books.google.com/books/content?id=UNhZj...,http://books.google.com/books?id=UNhZjwEACAAJ&...,,2002,http://books.google.com/books?id=UNhZjwEACAAJ&...,,,1559350334,,A31V30IKS5EA03,Claire Abeltin(jabeltin@home.com),5/23,1.0,9.261216e+08,Took me years...,I used to trust the opinions of others. Not an...
