In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import lil_matrix
from sklearn.model_selection import train_test_split

# Ignore futurewarnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the review-level table and the book metadata table.
# index_col=0 makes the first CSV column the DataFrame index.
# NOTE(review): the preview below shows isbn values without leading zeros (e.g. 195153448),
# while later cells look books up by zero-padded strings such as "0439064864" — confirm the isbn dtype.
df = pd.read_csv('../data/PostBooksEDA.csv', index_col=0)
df_books = pd.read_csv('../data/Books_data.csv', index_col=0)

In [3]:
df.head()

Unnamed: 0,user_id,age,isbn,rating,book_title,book_author,year_of_publication,publisher,language,category,continent,author_frequency,author_implicit_encoded,author_explicit_encoded,author_avg_all_reviews,publisher_frequency,publisher_implicit_encoded,publisher_explicit_encoded,publisher_avg_all_reviews
0,2,18.0,195153448,0,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press,en,science & technology,North America,2,0.5,7.0,3.5,2542,0.564123,7.75,3.378049
1,8,34.7439,2005018,5,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,en,unknown_category,North America,13,0.461538,7.428571,4.0,45,0.533333,7.333333,3.422222
2,11400,49.0,2005018,0,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,en,unknown_category,North America,13,0.461538,7.428571,4.0,45,0.533333,7.333333,3.422222
4,41385,34.7439,2005018,0,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,en,unknown_category,North America,13,0.461538,7.428571,4.0,45,0.533333,7.333333,3.422222
5,67544,30.0,2005018,8,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,en,unknown_category,North America,13,0.461538,7.428571,4.0,45,0.533333,7.333333,3.422222


## Content Based Recommendations

These recommendation engines are built around the idea that if a user likes some item (or a particular basket of items), they will also like similar items, where similarity is judged from the item content/description. If I watched the Avengers, I would probably want to watch other superhero-themed movies. Let's look at a few of the books in our data:

In [4]:
# Number of review rows per title. Despite the df_ prefix this is a Series
# (title -> count); value_counts() sorts it in descending order.
df_titles = df['book_title'].value_counts()
df_titles.head(10)

book_title
Wild Animus                                        2365
The Lovely Bones: A Novel                          1202
The Da Vinci Code                                   868
A Painted House                                     793
The Nanny Diaries: A Novel                          787
Bridget Jones's Diary                               772
The Secret Life of Bees                             740
Divine Secrets of the Ya-Ya Sisterhood: A Novel     714
The Red Tent (Bestselling Backlist)                 684
Angels & Demons                                     654
Name: count, dtype: int64

Because each row represents a single review, the same book title appears in many rows. We therefore group the dataframe by book_title, so that each title occurs exactly once before we apply the vectorizer.

In [5]:
# Keep only the rows with a positive rating
df_filtered = df.loc[df['rating'].gt(0)]

# One row per title: number of (positive) ratings and their mean
unique_titles = (
    df_filtered
    .groupby('book_title')['rating']
    .agg(review_count='count', avg_review_score='mean')
    .reset_index()
)

In [6]:
unique_titles

Unnamed: 0,book_title,review_count,avg_review_score
0,A Light in the Storm: The Civil War Diary of ...,1,9.000000
1,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.000000
2,Dark Justice,1,10.000000
3,Earth Prayers From around the World: 365 Pray...,7,7.142857
4,Final Fantasy Anthology: Official Strategy Gu...,2,10.000000
...,...,...,...
129832,Ã?Â?bernachten mit Stil. Die schÃ?Â¶nsten Coun...,1,8.000000
129833,Ã?Â?rger mit Produkt X. Roman.,3,7.000000
129834,Ã?Â?sterlich leben.,1,7.000000
129835,Ã?Â?stlich der Berge.,1,8.000000


In [7]:
# TF-IDF over the raw title strings: English stop words are removed and a term
# must appear in at least two titles (min_df=2) to enter the vocabulary.
vectorizer = TfidfVectorizer(stop_words = "english", min_df=2)
# One row per title in unique_titles, one column per vocabulary term
TF_IDF_matrix = vectorizer.fit_transform(unique_titles['book_title'])

In [8]:
TF_IDF_matrix.shape

(129837, 27840)

In [9]:
TF_IDF_matrix

<129837x27840 sparse matrix of type '<class 'numpy.float64'>'
	with 522584 stored elements in Compressed Sparse Row format>

In [10]:
unique_titles[unique_titles['book_title'].str.contains('Harry Potter', na=False)].head()

Unnamed: 0,book_title,review_count,avg_review_score
1364,A Closer Look at Harry Potter: Bending and Sha...,1,4.0
11289,Beacham's Sourcebook For Teaching Young Adult ...,2,9.0
39538,Garri Potter i uznik Azkabana (Harry Potter an...,1,10.0
43411,Harri Potter maen yr Athronydd (Harry Potter a...,1,9.0
43424,Harrius Potter et Philosophi Lapis (Harry Pott...,1,8.0


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Boolean row masks that pick out the TF-IDF vector of each of the two titles
chamber_mask = (unique_titles['book_title'] == 'Harry Potter and the Chamber of Secrets (Book 2)').values
azkaban_mask = (unique_titles['book_title'] == 'Harry Potter and the Prisoner of Azkaban (Book 3)').values

book_1 = TF_IDF_matrix[chamber_mask, :]
book_2 = TF_IDF_matrix[azkaban_mask, :]

# cosine_similarity returns a 2D array (1x1 here), so the number itself has to be indexed out
print("Similarity:", cosine_similarity(book_1, book_2))

Similarity: [[0.46690432]]


Not only can we use the sklearn.metrics.pairwise.cosine_similarity function to compute the similarity between two vectors; we can also pass the entire TF-IDF matrix into the function as a single argument, and it will compute the similarity between each row and every other row, giving back a square matrix where the entry at $(i, j)$ is the similarity between book $i$ and book $j$ (like a correlation matrix for features).

In [12]:
# Pairwise cosine similarity between all title vectors (rows of the TF-IDF matrix);
# dense_output=False keeps the result as a sparse matrix.
similarities = cosine_similarity(TF_IDF_matrix, dense_output=False)

In [13]:
# Check the shape
# rows and columns should be equal, and match the number of books we started with (rows)
similarities.shape

(129837, 129837)

Now that we can directly compare two books, we can make recommendations of the form: if you like book $a$ then you will also like books $b$, $c$, $d$, etc.

We can do this by picking a candidate book, taking its row in the similarity matrix, and then finding the books whose similarities are highest:

In [14]:
# Test with a sample book
unique_titles[unique_titles['book_title'] == 'Harry Potter and the Chamber of Secrets (Book 2)']

Unnamed: 0,book_title,review_count,avg_review_score
43453,Harry Potter and the Chamber of Secrets (Book 2),313,8.84345


In [15]:
# Index label(s) of the chosen title. unique_titles has a default integer index after
# reset_index(), so the label doubles as the row position in the similarity matrix.
unique_title_index = unique_titles[unique_titles['book_title'] == 'Harry Potter and the Chamber of Secrets (Book 2)'].index

# Create a dataframe pairing every book title with its similarity to the chosen book
# (the selected sparse row is densified and squeezed to a 1-D array)
sim_df = pd.DataFrame({'book': unique_titles ['book_title'],
                       'similarity': np.array(similarities[unique_title_index, :].todense()).squeeze()})

In [16]:
# Return the 10 least similar books (ascending sort; use ascending=False for the most similar)
sim_df.sort_values(by='similarity', ascending=True).head(10)

Unnamed: 0,book,similarity
0,A Light in the Storm: The Civil War Diary of ...,0.0
84792,Shadow Watch (Tom Clancy's Power Plays (Paperb...,0.0
84791,Shadow Warriors: Inside the Special Forces,0.0
84789,Shadow Walker,0.0
84788,Shadow Steed,0.0
84786,Shadow Spinner (Jean Karl Books (Paperback)),0.0
84785,Shadow Song,0.0
84784,Shadow Ranch: A Novel,0.0
84783,Shadow Queen,0.0
84781,Shadow Puppets (Ender),0.0


#### Testing

In [17]:
from recommendation import content_recommender

In [18]:
# Test the recommender
# NOTE(review): vote_threshold presumably drops titles with fewer reviews than the
# threshold — confirm against recommendation.py
similar_books = content_recommender("Harry Potter and the Chamber of Secrets (Book 2)", unique_titles, similarities, vote_threshold=10)
similar_books.head(10)

Unnamed: 0,book,similarity,Number of reviews,Avg Rating
43453,Harry Potter and the Chamber of Secrets (Book 2),1.0,313,8.84345
43456,Harry Potter and the Chamber of Secrets Postca...,0.871738,23,9.869565
96651,The Chamber,0.587331,199,7.417085
43481,Harry Potter and the Sorcerer's Stone (Harry P...,0.57532,297,8.915825
43460,Harry Potter and the Goblet of Fire (Book 4),0.524694,233,9.154506
43464,Harry Potter and the Order of the Phoenix (Boo...,0.505216,201,9.034826
43478,Harry Potter and the Sorcerer's Stone (Book 1),0.500895,168,9.077381
43471,Harry Potter and the Prisoner of Azkaban (Book 3),0.466904,263,9.076046
83838,Secrets,0.393631,29,7.068966
43495,Harry Potter y la piedra filosofal,0.373569,13,8.615385


## Collaborative Based Recommendations Filtering - Item-Based Filtering 

Collaborative filtering also relies on similarity between items, as well as similarity between users.

Unlike content-based systems, a collaborative system looks at an item as a collection of ratings. Every item has ratings by some users, and if two items get very similar ratings from users, the items themselves are similar (notice this system is not at all aware of the items' content).

Similarly, we can define users to be similar if they rate items similarly.

In [None]:
# Number of distinct ISBNs in the review table
df['isbn'].nunique()

In [None]:
# Number of distinct users in the review table
df['user_id'].nunique()

Matrix size would be 86405 * 264011

We are going to select the columns that we are going to build the matrix on

In [None]:
# Only the three columns needed to build the book x user rating matrix
df_temp = df[['isbn', 'rating', 'user_id']]

In [None]:
df_temp

In [None]:
df_temp.shape

First we are going to get the number of ratings for each book

In [None]:
# Count the rating rows per ISBN (all rows of df are counted, including rating == 0)
books_rated = (
    df.groupby('isbn')['rating']
    .count()
    .rename('total_rating_count')
    .reset_index()
)

In [None]:
books_rated

We are going to merge this new dataframe to the df_temp to later on work with only the books with a certain threshold of number of reviews, in this case books with more than 5 ratings.

In [None]:
# Attach each book's total rating count to every one of its review rows
final = df_temp.merge(books_rated, how='left', on='isbn')
# Keep books with more than 5 ratings; the same threshold is used for
# Books_data.csv, which holds the metadata of the books
final = final.loc[final['total_rating_count'] > 5]
final.shape

In [None]:
final.head()

In [None]:
# Prepare the rating matrix
#   rows    : book ISBNs (index='isbn')
#   columns : user ids   (columns='user_id')
#   values  : ratings; pivot_table's default aggregation (mean) is applied when the
#             same (isbn, user_id) pair occurs more than once
# Pairs with no rating come out as NaN.
matrix = pd.pivot_table(data=final, index='isbn', columns='user_id', values='rating')
matrix.head()

In [None]:
matrix.shape

In [None]:
# We are going to fill the null values with 0
# NOTE(review): after this step "user never rated the book" and "user gave rating 0"
# are indistinguishable in the matrix — confirm that is intended.
# In-place so that no second copy of the (potentially very large) frame is bound.
matrix.fillna(0,inplace=True)
matrix.head()

- The matrix provides the human-readable labels (book ISBNs as rows and user IDs as columns).
- The csr_data provides the compact numerical format needed for computational efficiency in algorithms like KNN.

In [None]:
from scipy.sparse import csr_matrix
# Sparse copy of the ratings (rows = ISBNs, columns = user ids) for the nearest-neighbour model
csr_data = csr_matrix(matrix.values)
# Turn the 'isbn' index into a regular first column so later cells can read it by name.
# NOTE(review): this mutates `matrix` in place, so the cell is not idempotent — on a second
# run the 'isbn' column would be part of matrix.values. Run it once per kernel session.
matrix.reset_index(inplace=True)

#### Saving the matrix and csr_data

In [None]:
print(matrix.columns)

In [None]:
from scipy.sparse import save_npz
import json

# csr_data carries no labels, so the row labels (ISBNs) and column labels (user ids)
# of the matrix are saved next to it as JSON.
# Rows of csr_data correspond to the 'isbn' column, in the same order
book_isbn = matrix['isbn'].tolist()

# Columns of csr_data correspond to user IDs
user_ids = matrix.columns[1:].tolist()  # Skip the leading 'isbn' column added by reset_index()

# Save book ISBNs and user IDs as JSON files
with open('../data/book_isbn.json', 'w') as f:
    json.dump(book_isbn, f)

with open('../data/user_ids.json', 'w') as f:
    json.dump(user_ids, f)

# Save the csr_matrix to a file
save_npz('../data/csr_data.npz', csr_data)

### Model Creation Nearest Neighbors

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Define hyperparameters to tune
param_grid = {
    'n_neighbors': [10, 15, 20],
    'metric': ['cosine', 'euclidean', 'manhattan'],
    'algorithm': ['auto', 'brute']
}

# Initialize the NearestNeighbors model
knn = NearestNeighbors()

# Use GridSearchCV
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)

# Fit with the csr data
grid_search.fit(csr_data)

# Get the best parameters
print("Best Parameters:", grid_search.best_params_)

In [None]:
# Save the model to a file
from joblib import dump
best_model = grid_search.best_estimator_
dump(best_model, '../models/model.joblib')

## Testing

In [None]:
import json
from recommendation import recommend
from scipy.sparse import load_npz
from joblib import load

# Load the fitted nearest-neighbour model saved earlier in this notebook
optimized_knn = load('../models/model.joblib')

# Load the csr_matrix from the file
csr_data = load_npz('../data/csr_data.npz')

# Load the ISBN labels of the csr_data rows
with open('../data/book_isbn.json', 'r') as f:
    book_isbns = json.load(f)

# Reload the book metadata dataframe
# NOTE(review): read without index_col=0 here, unlike the load at the top of the
# notebook — confirm which layout recommend() expects.
df_books = pd.read_csv('../data/Books_data.csv')


In [None]:
recommendations = recommend(
    book_isbn="0439064864",  # Harry Potter and the Chamber of Secrets (Book 2)
    knn_model=optimized_knn,
    csr_data=csr_data,
    book_isbns=book_isbns,  # List of ISBNs corresponding to the rows in csr_data
    dataframe=df_books,  # Original DataFrame with book metadata
    n_neighbors=10
)

recommendations

In [None]:
recommendations = recommend(
    book_isbn="0451525078",  # NOTE(review): comment was copy-pasted as "Harry Potter and the Chamber of Secrets (Book 2)", which is ISBN 0439064864 in the cell above — confirm this book's title
    knn_model=optimized_knn,
    csr_data=csr_data,
    book_isbns=book_isbns,  # List of ISBNs corresponding to the rows in csr_data
    dataframe=df_books,  # Original DataFrame with book metadata
    n_neighbors=10
)

recommendations

In [None]:
recommendations = recommend(
    book_isbn="0553564684",  # Star Wars: Tales from the Mos Eisley Cantina
    knn_model=optimized_knn,
    csr_data=csr_data,
    book_isbns=book_isbns,  # List of ISBNs corresponding to the rows in csr_data
    dataframe=df_books,  # Original DataFrame with book metadata
    n_neighbors=15
)

recommendations

## Evaluation Using Precision and Recall

In [None]:
import json
from scipy.sparse import load_npz, csr_matrix
from joblib import load
from sklearn.model_selection import train_test_split
import numpy as np

# Load the fitted nearest-neighbour model from the file
optimized_knn = load('../models/model.joblib')

# Load the csr_matrix from the file
csr_data = load_npz('../data/csr_data.npz')

# Load the row labels (book ISBNs) and column labels (user IDs) of csr_data
with open('../data/book_isbn.json', 'r') as f:
    book_isbns = json.load(f)

with open('../data/user_ids.json', 'r') as f:
    user_ids = json.load(f)


In [None]:
print(f"Number of columns in csr_data: {csr_data.shape[1]}")
print(f"Number of rows in csr_data: {csr_data.shape[0]}")
print(f"Length of user_ids: {len(user_ids)}")
print(f"Length of book_isbns: {len(book_isbns)}")

In [None]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

# Convert book_isbns to a NumPy array so it can be indexed with arrays of row indices
book_isbns = np.array(book_isbns)

# Convert the sparse matrix to dense format for splitting
# NOTE(review): toarray() materialises every (book, user) cell — check that the shape
# printed in the previous cell fits in memory.
csr_dense = csr_data.toarray()

# Split the rows (books) into training and testing sets (80/20 split).
# train_test_split shuffles the rows by default, so row i of train_data / test_data is
# no longer row i of csr_data, and book_isbns (which follows the csr_data row order)
# does not line up with either subset.
train_data, test_data = train_test_split(csr_dense, test_size=0.2, random_state=42)

# Convert training data back to sparse format
csr_train = csr_matrix(train_data)

# Helper function for Precision@k
def precision_at_k(recommended_items, relevant_items, k):
    """Return the share of the k recommendation slots filled by a relevant item.

    Only the first ``k`` entries of ``recommended_items`` are considered, and
    repeated items count once. The denominator is always ``k``, even when fewer
    than ``k`` recommendations are supplied.
    """
    top_k = set(recommended_items[:k])
    hits = top_k & set(relevant_items)
    return len(hits) / k

# Helper function for Recall@k
def recall_at_k(recommended_items, relevant_items, k):
    """Compute Recall@k: the share of relevant items found in the top-k recommendations.

    Parameters:
    - recommended_items: Ordered, sliceable sequence of recommended items (list or array).
    - relevant_items: Collection of relevant items (set, list or NumPy array).
    - k: Number of top recommendations to consider.

    Returns:
    - Number of distinct relevant items among the first ``k`` recommendations divided
      by the number of distinct relevant items; 0.0 when there are no relevant items.
    """
    # Deduplicate first: the emptiness check then works for any collection type
    # (truth-testing a NumPy array directly raises), and repeated relevant items
    # cannot inflate the denominator.
    relevant = set(relevant_items)
    if not relevant:
        return 0.0
    hits = relevant.intersection(recommended_items[:k])
    return len(hits) / len(relevant)

# Function to evaluate the recommendation system
def evaluate_recommender(knn_model, csr_train, test_data, book_isbns, k=10, interactions=None):
    """
    Evaluate the recommendation system using Precision@k and Recall@k.

    For every held-out book (row of ``test_data``) the model is asked for the ``k``
    nearest books to that book's own rating vector. A book counts as relevant when
    it has a non-zero rating from at least one user who also rated the held-out book.

    Parameters:
    - knn_model: Fitted kNN model exposing ``kneighbors``. The row indices it returns
      must follow the order of ``book_isbns``.
    - csr_train: Sparse matrix for training data. Accepted for backward compatibility
      but not read: its rows are a shuffled subset of the books, so row ``i`` is not
      the book held in row ``i`` of ``test_data``.
    - test_data: Dense matrix for testing data (one row per book, one column per user).
    - book_isbns: NumPy array of book ISBNs (indexed with arrays of row indices).
    - k: Number of top recommendations to consider.
    - interactions: Book x user rating matrix in ``book_isbns`` row order, used to
      look up relevant books. Defaults to the notebook-level ``csr_dense``.

    Returns:
    - mean_precision: Average Precision@k across all evaluated books.
    - mean_recall: Average Recall@k across all evaluated books.
      Both are 0.0 when no test row contains an interaction.
    """
    # Local import keeps the function usable regardless of which cells ran before it
    from scipy.sparse import csr_matrix

    if interactions is None:
        interactions = csr_dense  # notebook-level dense matrix built before the split

    precision_scores = []
    recall_scores = []

    for test_row in test_data:  # Iterate over held-out books (rows)
        test_row = np.asarray(test_row).ravel()

        # Users who interacted with this book (non-zero entries of its row)
        relevant_users = np.flatnonzero(test_row)
        if relevant_users.size == 0:
            continue  # Skip books with no test interactions

        # Query with the held-out book's own vector. Indexing csr_train by the
        # position of the test row would fetch an unrelated book, because the two
        # subsets come from a shuffled split and do not share row positions.
        query = csr_matrix(test_row.reshape(1, -1))
        distances, indices = knn_model.kneighbors(query, n_neighbors=k)
        recommended_books = book_isbns[indices.flatten()]  # Map indices to book ISBNs

        # Books that share at least one user with the current book.
        # NOTE(review): if knn_model was fitted on data that contains this book, the
        # book is returned as its own nearest neighbour and is also "relevant" here,
        # which inflates both metrics — confirm whether it should be excluded.
        shared = np.asarray(interactions[:, relevant_users].sum(axis=1)).ravel()
        relevant_books = set(book_isbns[np.flatnonzero(shared)])

        # Compute Precision@k and Recall@k
        precision_scores.append(precision_at_k(recommended_books, relevant_books, k))
        recall_scores.append(recall_at_k(recommended_books, relevant_books, k))

    # Nothing to average: avoid np.mean([]) returning NaN with a runtime warning
    if not precision_scores:
        return 0.0, 0.0

    # Calculate average Precision@k and Recall@k
    mean_precision = np.mean(precision_scores)
    mean_recall = np.mean(recall_scores)

    return mean_precision, mean_recall

# Evaluate the loaded kNN model on the held-out book rows, scoring the 10 nearest neighbours of each
mean_precision, mean_recall = evaluate_recommender(optimized_knn, csr_train, test_data, book_isbns, k=10)

# Print evaluation metrics (averages over the evaluated test rows)
print(f"Mean Precision@k: {mean_precision}")
print(f"Mean Recall@k: {mean_recall}")


## Evaluation 2

In [None]:
import json
from scipy.sparse import load_npz, csr_matrix
from joblib import load
from sklearn.model_selection import train_test_split
import numpy as np
import surprise

# Load the fitted nearest-neighbour model from the file
optimized_knn = load('../models/model.joblib')

# Load the csr_matrix from the file
csr_data = load_npz('../data/csr_data.npz')

# Load the row labels (book ISBNs) and column labels (user IDs) of csr_data
with open('../data/book_isbn.json', 'r') as f:
    book_isbns = json.load(f)

with open('../data/user_ids.json', 'r') as f:
    user_ids = json.load(f)


In [None]:
from surprise import accuracy
# NOTE(review): this rebinds the name train_test_split, shadowing the scikit-learn function
# imported under the same name in earlier cells. Re-running an earlier cell that calls
# train_test_split after this one will pick up the surprise version instead.
from surprise.model_selection import train_test_split
