This is to make everything object-oriented

In [1]:
# Misc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Model + Evaluation
from sklearn.neighbors import NearestNeighbors

import re
from tqdm import tqdm

In [2]:
class RecommenderSystem:

    def __init__(self):
        self.train_data = pd.DataFrame()
        self.review_scaler = MinMaxScaler((0,5))
        self.tfidf = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = None
        self.knn = NearestNeighbors(metric='cosine')
        self.unique_users = []


    def __preprocess_categories__(self, row):
        # If nan
        if isinstance(row, float):
            return ''
        m = re.match(r"\['(.*?)'\]", row)
        if m:
            return m.group(1)
        return ''

    # For training
    def fit_transform(self, X : pd.DataFrame):
        X_tmp = X.dropna(subset=('User_id', 'Title'))
        merged_book_data = X_tmp[['User_id', 'Title', 'categories', 'review/score', 'description', 'authors']]
        merged_book_data.loc[:,'categories'] = merged_book_data['categories'].apply(self.__preprocess_categories__)
        merged_book_data['description'].fillna('', inplace=True)
        merged_book_data['authors'].fillna('', inplace=True)
        merged_book_data['combined_text_features'] = (
            merged_book_data['description'] + ' ' +
            merged_book_data['categories'] + ' ' +
            merged_book_data['authors']).str.strip()

        grouped_by_title = merged_book_data.groupby('Title')

        # Average/Normalize Ratings of books
        merged_book_data['average rating'] = grouped_by_title['review/score'].transform(lambda x : round(x.mean(), 2))
        merged_book_data['average rating'] = self.review_scaler.fit_transform(merged_book_data[['average rating']])
        merged_book_data['Title'] = merged_book_data['Title'].astype('category')

        #merged_book_data.drop_duplicates(subset=('Title'), inplace = True)
        merged_book_data.drop(['categories', 'authors', 'description'], axis=1, inplace=True)

        merged_book_data.reset_index(drop=True, inplace=True)
        self.tfidf_matrix = self.tfidf.fit_transform(merged_book_data['combined_text_features'])
        self.knn.fit(self.tfidf_matrix)
        self.unique_users = merged_book_data['User_id'].unique()
        self.train_data = merged_book_data.copy()
        return merged_book_data

    # For testing
    def transform(self, X):
        X_tmp = X.dropna(subset=('User_id', 'Title'))
        merged_book_data = X_tmp[['User_id', 'Title', 'categories', 'review/score', 'description', 'authors']]
        merged_book_data.loc[:,'categories'] = merged_book_data['categories'].apply(self.__preprocess_categories__)
        merged_book_data['description'].fillna('', inplace=True)
        merged_book_data['authors'].fillna('', inplace=True)
        merged_book_data['combined_text_features'] = (
            merged_book_data['description'] + ' ' +
            merged_book_data['categories'] + ' ' +
            merged_book_data['authors']).str.strip()

        grouped_by_title = merged_book_data.groupby('Title')

        # Average/Normalize Ratings of books
        merged_book_data['average rating'] = grouped_by_title['review/score'].transform(lambda x : round(x.mean(), 2))
        merged_book_data['average rating'] = self.review_scaler.transform(merged_book_data[['average rating']])
        merged_book_data['Title'] = merged_book_data['Title'].astype('category')

        #merged_book_data.drop_duplicates(subset=('Title'), inplace = True)
        merged_book_data.drop(['categories', 'authors', 'description'], axis=1, inplace=True)

        merged_book_data.reset_index(drop=True, inplace=True)
        return merged_book_data

    def recommend(self, user_id, n_recommendations=5):
        books_data = self.train_data
        user_books = books_data[books_data['User_id'] == user_id]['Title'].unique()
        if len(user_books) == 0:
            print('User had no books')
            return pd.DataFrame()
        
        user_indices = books_data[books_data['Title'].isin(user_books)].index
        user_vectors = self.tfidf_matrix[user_indices]
        user_profile_vector = user_vectors.mean(axis=0)

        _, indices = self.knn.kneighbors(np.asarray(user_profile_vector), n_neighbors=len(user_indices) + 2 * n_recommendations + 1)
        recommended_indices = []
        recommended_books = []
        for i in indices[0]:
            if len(recommended_indices) == n_recommendations:
                break
            if books_data.iloc[i]['Title'] not in user_books \
            and books_data.iloc[i]['Title'] not in recommended_books:
                recommended_books.append(books_data.iloc[i]['Title'])
                recommended_indices.append(i)
        return books_data.iloc[recommended_indices]

    # Model evaluation
    def precision_k(self, recommended, relevant, k):
        rec = recommended[:k]
        rel = set(rec) & set(relevant)
        return len(rel) / k

    def recall_k(self, recommended, relevant, k):
        rec = recommended[:k]
        rel = set(rec) & set(relevant)
        return len(rel) / len(relevant) if relevant else 0

    def mean_reciprocal_rank(self, recommended, relevant):
        for rank, rec in enumerate(recommended, start=1):
            if rec in relevant:
                return 1 / rank
        return 0

    def evaluate_recommender(self, X, n_recommendations=5):
        precision_scores = []
        recall_scores = []
        mrr_scores = []
        
        for user_id in tqdm(X['User_id'].unique()):
            user_test_books = X[X['User_id'] == user_id]['Title'].tolist()
            
            recommended_books = self.recommend(
                user_id=user_id,
                n_recommendations=n_recommendations
            )['Title'].tolist()
            
            if not recommended_books:
                continue
            
            precision_scores.append(self.precision_k(recommended_books, user_test_books, k=n_recommendations))
            recall_scores.append(self.recall_k(recommended_books, user_test_books, k=n_recommendations))
            mrr_scores.append(self.mean_reciprocal_rank(recommended_books, user_test_books))

        
        return {
            "Precision": np.average(precision_scores),
            "Recall": np.average(recall_scores),
            "MRR": np.average(mrr_scores)
        }


In [3]:
book_data = pd.read_csv('data/books_data.csv')
books_rating_data = pd.read_csv('data/Books_rating.csv')

bd_X_train, bd_X_test = train_test_split(book_data)
brd_X_train, brd_X_test = train_test_split(books_rating_data)

X_train = bd_X_train.merge(brd_X_train, how='left', on='Title')
X_test = bd_X_test.merge(brd_X_test, how='left', on='Title')

In [4]:
# Fitting
recommender = RecommenderSystem()
X_train_processed = recommender.fit_transform(X_train)
X_train_processed[:5]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_book_data['description'].fillna('', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_book_data['description'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df

Unnamed: 0,User_id,Title,review/score,combined_text_features,average rating
0,A1MZB6UBKOVSK0,Little wars: A game for boys from twelve years...,3.0,Little Wars is a set of rules for playing with...,3.125
1,AUTBHG6070SL4,Little wars: A game for boys from twelve years...,4.0,Little Wars is a set of rules for playing with...,3.125
2,A2N1VOV3N4ZR22,Little wars: A game for boys from twelve years...,4.0,Little Wars is a set of rules for playing with...,3.125
3,A7H4LNXXJ3DM6,Little wars: A game for boys from twelve years...,4.0,Little Wars is a set of rules for playing with...,3.125
4,AHA1U1QCT41HC,Little wars: A game for boys from twelve years...,5.0,Little Wars is a set of rules for playing with...,3.125


In [5]:
# Example recommendation
sample_user_id = X_train_processed['User_id'].iloc[343]
# First view what the user currently has read.
X_train_processed[X_train_processed['User_id'] == sample_user_id]

Unnamed: 0,User_id,Title,review/score,combined_text_features,average rating
343,A1DAOL3NMHEPLS,Earth abides,5.0,"Returning from a field trip, Isherwood William...",3.7
3731,A1DAOL3NMHEPLS,Doctor from Lhasa,1.0,,2.8125
13480,A1DAOL3NMHEPLS,Gladiator at Law,5.0,CAUTION! You are about to enter a world... whe...,4.375
101987,A1DAOL3NMHEPLS,The other side of the sky,5.0,"""Amie Kaufman and Meagan Spooner prove they ar...",4.6875
113914,A1DAOL3NMHEPLS,ACME Catalog: Quality is Our #1 Dream,4.0,With such offerings as jet-powered pogo sticks...,4.5625
129933,A1DAOL3NMHEPLS,Little Scarlet: An Easy Rawlins Mystery,4.0,When a man who fled the 1965 Watts riots is su...,4.5375
191999,A1DAOL3NMHEPLS,The Darwin Awards: Evolution in Action,3.0,The hilarious New York Times bestselling pheno...,3.3
310658,A1DAOL3NMHEPLS,This Perfect Day,5.0,"By the author of Rosemary‘s Baby, a horrifying...",4.525
775139,A1DAOL3NMHEPLS,"Stupidest Angel, The LP",5.0,"Dennis Wilson, Beach Boys drummer, 60's pin-up...",4.1625
793298,A1DAOL3NMHEPLS,"Nightmare At 20,000 Feet (Turtleback School & ...",4.0,In één avond verdwijnen vier jonge kinderen op...,4.1625


In [6]:
# Then view recommendations
recommender.recommend(sample_user_id)

Unnamed: 0,User_id,Title,review/score,combined_text_features,average rating
421762,A3ULNP89GTG7P,Jack the Ripper: End of a Legend,1.0,I've got no time to tell you how I came to be ...,0.0
636726,A2IZP47QL229P3,Jack the Ripper: Crime Scene Investigation,1.0,Over 100 years have elapsed since what is beli...,0.4125
538497,A3NER4ZESH9JSN,Jack the Ripper: First American Serial Killer,3.0,Stewart Evans is a policeman whose hobby is co...,3.75
1065445,A3H4RXHJLYDCG6,The Mammoth Book of Jack the Ripper (Mammoth B...,5.0,Updated and expanded edition of the fullest ev...,4.1125
1305801,A2TJYYHTC1M7HH,The American Murders of Jack the Ripper,1.0,"For the first time, the American murders of Ja...",0.0


In [None]:
# Evaluation on the training set first
recommender.evaluate_recommender(X_train_processed)

  0%|          | 148/663854 [05:10<267:27:44,  1.45s/it] 

In [None]:
# Evaluation using testing data
X_test_processed = recommender.transform(X_test)
recommender.evaluate_recommender(X_test_processed)