In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import lil_matrix
from sklearn.model_selection import train_test_split

# Ignore futurewarnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../data/PostBooksEDA.csv', index_col=0)
df_encoded = pd.read_csv('../data/PostEncodedBooksEDA.csv', index_col=0)

In [3]:
df.head()

Unnamed: 0,user_id,age,isbn,rating,book_title,book_author,year_of_publication,publisher,language,category,continent,author_frequency,author_implicit_encoded,author_explicit_encoded,author_avg_all_reviews,publisher_frequency,publisher_implicit_encoded,publisher_explicit_encoded,publisher_avg_all_reviews,language_grouped
0,2,18.0,195153448,0,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press,en,science & technology,North America,2,0.5,7.0,3.5,2542,0.564123,7.75,3.378049,en
1,8,34.7439,2005018,5,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,en,unknown_category,North America,13,0.461538,7.428571,4.0,45,0.533333,7.333333,3.422222,en
2,11400,49.0,2005018,0,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,en,unknown_category,North America,13,0.461538,7.428571,4.0,45,0.533333,7.333333,3.422222,en
4,41385,34.7439,2005018,0,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,en,unknown_category,North America,13,0.461538,7.428571,4.0,45,0.533333,7.333333,3.422222,en
5,67544,30.0,2005018,8,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,en,unknown_category,North America,13,0.461538,7.428571,4.0,45,0.533333,7.333333,3.422222,en


## Content Based Recommendations

These recommendation engines are built around the idea that if a user likes some item (or a particular basket of items), then they will also like similar items, based on the item content/description. For example, if I watched The Avengers, I would probably want to watch other superhero-themed movies. Let's look at the most frequently reviewed book titles in our data:

In [4]:
df_titles = df['book_title'].value_counts()
df_titles.head(10)

book_title
Wild Animus                                        2365
The Lovely Bones: A Novel                          1202
The Da Vinci Code                                   868
A Painted House                                     793
The Nanny Diaries: A Novel                          787
Bridget Jones's Diary                               772
The Secret Life of Bees                             740
Divine Secrets of the Ya-Ya Sisterhood: A Novel     714
The Red Tent (Bestselling Backlist)                 684
Angels & Demons                                     654
Name: count, dtype: int64

Because each row represents a single review, the same book title appears on many rows. We therefore group the dataframe by book_title so that the vectorizer sees each title exactly once.

In [5]:
# Keep only the review rows whose rating is greater than 0
has_rating = df['rating'] > 0
df_filtered = df[has_rating]

# Collapse to one row per title, with its number of ratings and mean rating
title_stats = df_filtered.groupby('book_title').agg(
    review_count=('rating', 'count'),
    avg_review_score=('rating', 'mean'),
)
unique_titles = title_stats.reset_index()

In [6]:
vectorizer = TfidfVectorizer(stop_words = "english", min_df=2)
TF_IDF_matrix = vectorizer.fit_transform(unique_titles['book_title'])

In [7]:
TF_IDF_matrix.shape

(129837, 27840)

In [8]:
TF_IDF_matrix

<129837x27840 sparse matrix of type '<class 'numpy.float64'>'
	with 522584 stored elements in Compressed Sparse Row format>

In [9]:
unique_titles[unique_titles['book_title'].str.contains('Harry Potter', na=False)]

Unnamed: 0,book_title,review_count,avg_review_score
1364,A Closer Look at Harry Potter: Bending and Sha...,1,4.000000
11289,Beacham's Sourcebook For Teaching Young Adult ...,2,9.000000
39538,Garri Potter i uznik Azkabana (Harry Potter an...,1,10.000000
43411,Harri Potter maen yr Athronydd (Harry Potter a...,1,9.000000
43424,Harrius Potter et Philosophi Lapis (Harry Pott...,1,8.000000
...,...,...,...
111011,The Science of Harry Potter: How Magic Really ...,3,8.333333
112199,The Sorcerer's Companion: A Guide to the Magic...,10,7.900000
120929,Ultimate Unofficial Guide to the Mysteries of ...,6,8.000000
124240,We Love Harry Potter!,3,7.333333


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Boolean row masks selecting each title's TF-IDF vector
title_column = unique_titles['book_title']
mask_chamber = (title_column == 'Harry Potter and the Chamber of Secrets (Book 2)').values
mask_azkaban = (title_column == 'Harry Potter and the Prisoner of Azkaban (Book 3)').values

book_1 = TF_IDF_matrix[mask_chamber, ]
book_2 = TF_IDF_matrix[mask_azkaban, ]

# cosine_similarity returns a 2D (1x1) array here, so the printed value is
# nested; index into it to obtain the plain number.
pair_similarity = cosine_similarity(book_1, book_2)
print("Similarity:", pair_similarity)

Similarity: [[0.46690432]]


Not only can we use the sklearn.metrics.pairwise.cosine_similarity function to compute the similarity between two vectors; we can also pass the entire TF-IDF matrix into the function as a single argument, and it will compute the similarity between each row and every other row, giving back a square matrix where the entry at $(i, j)$ is the similarity between book $i$ and book $j$ (like a correlation matrix for features).

In [11]:
similarities = cosine_similarity(TF_IDF_matrix, dense_output=False)

In [12]:
# Check the shape
# rows and columns should be equal, and the number of books we started with (rows)
similarities.shape

(129837, 129837)

Now that we can directly compare two books, we can make recommendations of the form: if you like book $a$ then you will also like books $b$, $c$, $d$, etc.

We can do this by picking a candidate book, taking its row in the similarity matrix, and then finding the entries where the similarities are highest:

In [13]:
# Test with a sample book
unique_titles[unique_titles['book_title'] == 'Harry Potter and the Chamber of Secrets (Book 2)']

Unnamed: 0,book_title,review_count,avg_review_score
43453,Harry Potter and the Chamber of Secrets (Book 2),313,8.84345


In [14]:
# Find the index label of the query title in unique_titles
is_query_title = unique_titles['book_title'] == 'Harry Potter and the Chamber of Secrets (Book 2)'
unique_title_index = unique_titles[is_query_title].index

# Take that title's row of the similarity matrix and flatten it to 1-D
query_similarities = np.array(similarities[unique_title_index, :].todense()).squeeze()

# Pair every book title with its similarity to the query title
sim_df = pd.DataFrame({'book': unique_titles['book_title'],
                       'similarity': query_similarities})

In [15]:
# Return the top 10 least similar books
sim_df.sort_values(by='similarity', ascending=True).head(10)

Unnamed: 0,book,similarity
0,A Light in the Storm: The Civil War Diary of ...,0.0
84792,Shadow Watch (Tom Clancy's Power Plays (Paperb...,0.0
84791,Shadow Warriors: Inside the Special Forces,0.0
84789,Shadow Walker,0.0
84788,Shadow Steed,0.0
84786,Shadow Spinner (Jean Karl Books (Paperback)),0.0
84785,Shadow Song,0.0
84784,Shadow Ranch: A Novel,0.0
84783,Shadow Queen,0.0
84781,Shadow Puppets (Ender),0.0


In [16]:
def content_recommender(title, books, similarities, vote_threshold=10, top_n=10):
    """Return the books whose titles are most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``books['book_title']``.
    books : pandas.DataFrame
        One row per book with the columns ``book_title``, ``review_count``
        and ``avg_review_score``. The row at position *i* must correspond to
        row *i* of ``similarities``.
    similarities : scipy sparse matrix
        Pairwise similarity matrix with one row and one column per book.
    vote_threshold : int, default 10
        Only books with strictly more than this many reviews are returned.
    top_n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``book``, ``similarity``, ``Number of reviews`` and
        ``Avg Rating``, sorted by descending similarity. The queried book
        itself is part of the result when it passes the review filter.

    Raises
    ------
    ValueError
        If ``title`` does not occur in ``books['book_title']``.
    """
    # Resolve the title to a row *position* rather than an index label, so the
    # lookup stays correct even when `books` does not carry a default RangeIndex.
    matches = np.flatnonzero((books['book_title'] == title).to_numpy())
    if matches.size == 0:
        raise ValueError(f"Title not found in books: {title!r}")
    # If the title occurs more than once, the first occurrence is used.
    book_pos = int(matches[0])

    # Indexing with a one-element list keeps the slice 2-D (1 x n) for every
    # scipy sparse container; ravel() then yields the 1-D similarity vector.
    book_similarities = similarities[[book_pos], :].toarray().ravel()

    sim_df = pd.DataFrame(
        {'book': books['book_title'],
         'similarity': book_similarities,
         'Number of reviews': books['review_count'],
         'Avg Rating': books['avg_review_score']
        })

    # Keep sufficiently reviewed books, most similar first
    popular = sim_df[sim_df['Number of reviews'] > vote_threshold]
    top_books = popular.sort_values(by='similarity', ascending=False).head(top_n)

    return top_books

In [17]:
# Test the recommender
similar_movies = content_recommender("Harry Potter and the Chamber of Secrets (Book 2)", unique_titles, similarities, vote_threshold=10)
similar_movies.head(10)

Unnamed: 0,book,similarity,Number of reviews,Avg Rating
43453,Harry Potter and the Chamber of Secrets (Book 2),1.0,313,8.84345
43456,Harry Potter and the Chamber of Secrets Postca...,0.871738,23,9.869565
96651,The Chamber,0.587331,199,7.417085
43481,Harry Potter and the Sorcerer's Stone (Harry P...,0.57532,297,8.915825
43460,Harry Potter and the Goblet of Fire (Book 4),0.524694,233,9.154506
43464,Harry Potter and the Order of the Phoenix (Boo...,0.505216,201,9.034826
43478,Harry Potter and the Sorcerer's Stone (Book 1),0.500895,168,9.077381
43471,Harry Potter and the Prisoner of Azkaban (Book 3),0.466904,263,9.076046
83838,Secrets,0.393631,29,7.068966
43495,Harry Potter y la piedra filosofal,0.373569,13,8.615385


## Collaborative Filtering Recommendations - Item-Based Filtering

Collaborative filtering also relies on similarity between items, as well as similarity between users.

Unlike content-based systems, a collaborative system looks at an item as a collection of ratings. Every item has ratings by some users, and if two items get very similar ratings from users, the items themselves are similar (notice this system is not at all aware of the items' content).

Similarly, we can define users to be similar if they rate items similarly.

In [18]:
len(df['isbn'].value_counts())

264011

In [19]:
len(df['user_id'].value_counts())

86405

Matrix size would be 86405 * 264011

We are going to select the columns that we are going to build the matrix on

In [20]:
df_temp = df[['isbn', 'rating', 'user_id']]

In [21]:
df_temp

Unnamed: 0,isbn,rating,user_id
0,0195153448,0,2
1,0002005018,5,8
2,0002005018,0,11400
4,0002005018,0,41385
5,0002005018,8,67544
...,...,...,...
1031170,0743203763,0,278851
1031171,0767907566,5,278851
1031172,0884159221,7,278851
1031173,0912333022,7,278851


In [22]:
df_temp.shape

(978859, 3)

First we are going to get the number of ratings for each book

In [23]:
# Count the rating rows per ISBN and expose them as a two-column frame
# (isbn, total_rating_count)
rating_counts = df.groupby('isbn')['rating'].count()
books_rated = rating_counts.to_frame(name='total_rating_count').reset_index()

In [24]:
books_rated

Unnamed: 0,isbn,total_rating_count
0,0000913154,1
1,0001010565,2
2,0001046438,1
3,0001046713,1
4,000104687X,1
...,...,...
264006,B000234N76,1
264007,B000234NC6,1
264008,B00029DGGO,1
264009,B0002JV9PY,1


We are going to merge this new dataframe into df_temp so that we can later work only with the books above a rating-count threshold; in this case, books with more than 5 ratings.

In [25]:
# Attach each book's total rating count to its rating rows
ratings_with_counts = df_temp.merge(books_rated, how='inner', on='isbn')

# Keep only the books that have more than 5 ratings
final = ratings_with_counts[ratings_with_counts['total_rating_count'] > 5]
final.shape

(588399, 4)

In [26]:
final.head()

Unnamed: 0,isbn,rating,user_id,total_rating_count
1,2005018,5,8,12
2,2005018,0,11400,12
3,2005018,0,41385,12
4,2005018,8,67544,12
5,2005018,0,85526,12


In [27]:
# Build the item-user rating matrix:
#   rows    : books (ISBN)
#   columns : users (user_id)
#   values  : rating; pivot_table's default aggregation (mean) is applied when
#             an (isbn, user_id) pair occurs more than once
# Pairs without a rating are left as NaN at this point.

matrix = pd.pivot_table(data=final, index='isbn', columns='user_id', values='rating')
matrix.head()

user_id,8,9,10,14,16,17,20,23,26,32,...,278832,278836,278838,278843,278844,278846,278849,278851,278852,278854
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005018,5.0,,,,,,,,,,...,,,,,,,,,,
2243962,,,,,,,,,,,...,,,,,,,,,,
2244098,,,,,,,,,,,...,,,,,,,,,,
2251760,,,,,,,,,,,...,,,,,,,,,,
2255081,,,,,,,,,,,...,,,,,,,,,,


In [28]:
matrix.shape

(30902, 66780)

In [29]:
# Replace the NaNs left by the pivot (no rating for that book/user pair) with 0.
# NOTE(review): the source data also contains literal 0 ratings, so after this
# step "not rated" and "rated 0" are indistinguishable — confirm this is intended.
matrix.fillna(0,inplace=True)
matrix.head()

user_id,8,9,10,14,16,17,20,23,26,32,...,278832,278836,278838,278843,278844,278846,278849,278851,278852,278854
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005018,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2243962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2244098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2251760,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2255081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- The matrix keeps the human-readable labels (ISBNs as rows and user IDs as columns).
- The csr_data provides the sparse numerical format needed for computational efficiency in algorithms like KNN.

In [30]:
from scipy.sparse import csr_matrix
# Sparse copy of the ratings, taken while `matrix` still holds only the user columns.
csr_data = csr_matrix(matrix.values)
# Moves 'isbn' out of the index into the first column, in place.
# NOTE(review): this makes the cell order-sensitive — once it has run, `matrix.values`
# also contains the 'isbn' column, so re-running the cell would not rebuild the same csr_data.
matrix.reset_index(inplace=True)

#### Saving the matrix and csr_data

In [31]:
print(matrix.columns)

Index(['isbn',      8,      9,     10,     14,     16,     17,     20,     23,
           26,
       ...
       278832, 278836, 278838, 278843, 278844, 278846, 278849, 278851, 278852,
       278854],
      dtype='object', name='user_id', length=66781)


In [32]:
from scipy.sparse import save_npz
import json

# Persist the row/column labels next to the sparse matrix so that matrix
# positions can be mapped back to ISBNs and user ids after reloading.
# Row i of csr_data corresponds to book_isbn[i]
book_isbn = matrix['isbn'].tolist()

# Column j of csr_data corresponds to user_ids[j]
user_ids = matrix.columns[1:].tolist()  # Skip the leading 'isbn' column

# Save the ISBNs and user IDs as JSON files
with open('../data/book_isbn.json', 'w') as f:
    json.dump(book_isbn, f)

with open('../data/user_ids.json', 'w') as f:
    json.dump(user_ids, f)

# Save the csr_matrix to a file
save_npz('../data/csr_data.npz', csr_data)

### Model Creation Nearest Neighbors

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Define hyperparameters to tune
param_grid = {
    'n_neighbors': [10, 15, 20],
    'metric': ['cosine', 'euclidean', 'manhattan'],
    'algorithm': ['auto', 'brute']
}

# Initialize the NearestNeighbors model
knn = NearestNeighbors()

# Use GridSearchCV
# NOTE(review): NearestNeighbors is an unsupervised estimator and no target is
# passed to fit() below, so 'neg_mean_squared_error' presumably cannot be
# computed for any candidate (warnings are silenced at the top of the notebook).
# The reported best parameters are exactly the first combination of the grid —
# inspect grid_search.cv_results_['mean_test_score'] to confirm the scores are
# not all NaN before trusting this selection.
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)

# Fit with the csr data
grid_search.fit(csr_data)

# Get the best parameters
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters: {'algorithm': 'auto', 'metric': 'cosine', 'n_neighbors': 10}


In [34]:
# Save the model to a file
from joblib import dump
best_model = grid_search.best_estimator_
dump(best_model, '../models/model.joblib')

['../models/model.joblib']

## Testing

In [35]:
import json
from recommendation import recommend
from scipy.sparse import load_npz
from joblib import load

# Load the model from the file
optimized_knn = load('../models/model.joblib')

# Load the csr_matrix from the file
csr_data = load_npz('../data/csr_data.npz')

# Load book titles IDs
with open('../data/book_isbn.json', 'r') as f:
    book_isbns = json.load(f)



In [36]:
recommendations = recommend(
    book_isbn="0439064864",  # Harry Potter and the Chamber of Secrets (Book 2)
    knn_model=optimized_knn,
    csr_data=csr_data,
    book_isbns=book_isbns,  # List of ISBNs corresponding to the rows in csr_data
    dataframe=df,  # Original DataFrame with book metadata
    n_neighbors=10
)

recommendations

Unnamed: 0,ISBN,Relevance,Title,Author,year_of_publication,publisher,language,category,Average Rating,Number of Reviews,Total Interactions
2,0439064864,0.0,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,1999.0,Scholastic,en,fiction,8.94,121,161
3,0439136350,0.362,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999.0,Scholastic,en,fiction,9.08,136,189
7,0590353403,0.377,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998.0,Scholastic,en,fiction,8.99,115,161
4,0439139597,0.449,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000.0,Scholastic,en,fiction,9.31,131,187
6,043935806X,0.667,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003.0,Scholastic,en,fiction,9.03,197,319
5,0439139600,0.842,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2002.0,Scholastic Paperbacks,en,fiction,8.96,102,181
9,0873529758,0.854,MLA Handbook for Writers of Research Papers (5...,Joseph Gibaldi,1999.0,Modern Language Association,en,unknown_category,9.0,5,6
1,0394800303,0.862,Dr. Seuss's A B C (I Can Read It All by Myself...,DR SEUSS,1960.0,Random House Books for Young Readers,na,unknown_category,9.4,5,16
8,0786927062,0.865,"Dragons of a Lost Star (The War of Souls, Volu...",Margaret Weis,2002.0,Wizards of the Coast,na,unknown_category,9.5,4,8
0,0380770334,0.872,Miss Zukas and the Stroke of Death (Miss Zukas...,Jo Dereske,1995.0,Avon,na,unknown_category,8.0,2,7


In [37]:
recommendations = recommend(
    book_isbn="0451525078",  # Don Quixote of LA Mancha
    knn_model=optimized_knn,
    csr_data=csr_data,
    book_isbns=book_isbns,  # List of ISBNs corresponding to the rows in csr_data
    dataframe=df,  # Original DataFrame with book metadata
    n_neighbors=10
)

recommendations

Unnamed: 0,ISBN,Relevance,Title,Author,year_of_publication,publisher,language,category,Average Rating,Number of Reviews,Total Interactions
5,451525078,0.0,Don Quixote of LA Mancha,Miguel de Cervantes Saavedra,1990.0,Signet Classics,en,fiction,6.33,3,9
3,451205235,0.434,Cold Flat Junction,Martha Grimes,2002.0,New American Library,na,unknown_category,7.25,4,9
7,452283558,0.445,The Captain's Wife: A Novel,Douglas Kelley,2002.0,Plume Books,na,unknown_category,7.5,2,7
2,425171957,0.471,Who Killed Blanche Dubois? (Whodunnit),Carole Bugge,1999.0,Berkley Publishing Group,na,unknown_category,7.33,3,7
0,312963211,0.511,A Stiff Risotto (A Heaven Lee Culinary Mystery),Lou Jane Temple,1997.0,St. Martin's Paperbacks,en,fiction,6.0,2,12
8,553213377,0.7,The Three Musketeers (Bantam Classics),ALEXANDRE DUMAS,1984.0,Bantam Classics,en,fiction,8.36,11,21
6,451525264,0.705,Les Miserables a New Unabridged Translation (S...,Victor Hugo,1987.0,Signet Book,en,fiction,8.45,11,38
9,965645363,0.719,Zarafa: A Giraffe's True Story from Deep in Af...,Michael Allin,1994.0,Walker Co,na,unknown_category,8.33,3,6
4,451523385,0.737,Wuthering Heights (Signet Classic),Emily Bronte,1993.0,Signet Book,en,unknown_category,7.5,14,32
1,399149783,0.739,Monkeewrench,P. J. Tracy,2003.0,Putnam Publishing Group,en,fiction,8.17,12,25


In [38]:
recommendations = recommend(
    book_isbn="0553564684",  # Star Wars: Tales from the Mos Eisley Cantina
    knn_model=optimized_knn,
    csr_data=csr_data,
    book_isbns=book_isbns,  # List of ISBNs corresponding to the rows in csr_data
    dataframe=df,  # Original DataFrame with book metadata
    n_neighbors=15
)

recommendations

Unnamed: 0,ISBN,Relevance,Title,Author,year_of_publication,publisher,language,category,Average Rating,Number of Reviews,Total Interactions
8,0553564684,0.0,Star Wars: Tales from the Mos Eisley Cantina (...,Kevin J. Anderson,1995.0,Bantam,en,fiction,7.57,7,25
5,0553297996,0.544,Dark Apprentice (Star Wars: The Jedi Academy T...,Kevin J. Anderson,1994.0,Bantam,na,unknown_category,6.75,8,28
14,088184389X,0.557,Dr. Bloodmoney,Philip K. Dick,1988.0,Pub Group West,na,unknown_category,8.0,1,6
10,0553568728,0.597,Star Wars: The Truce at Bakura (Star Wars (Ran...,Kathy Tyers,1994.0,Bantam,en,fiction,7.62,8,23
9,0553564927,0.615,The Last Command (Star Wars: The Thrawn Trilog...,Timothy Zahn,1998.0,Bantam,en,fiction,8.22,9,32
12,0553572938,0.615,Star Wars: Children of the Jedi (Star Wars (Ra...,Barbara Hambly,1996.0,Bantam,na,unknown_category,6.0,4,9
6,055329802X,0.629,Champions of the Force (Star Wars: The Jedi Ac...,Kevin J. Anderson,1994.0,Bantam,en,fiction,6.21,14,28
1,0310277728,0.637,Late Great Planet Earth,Hal Lindsey,1970.0,Zondervan,en,religion & spirituality,8.5,2,6
0,0140350489,0.656,Dracula (Puffin Classics),Bram Stoker,1986.0,Putnam Pub Group,na,unknown_category,8.0,1,6
2,0380792893,0.669,Tortoise Soup (Rachel Porter Mysteries),Jessica Speart,1998.0,Avon,en,fiction,8.5,2,6


## Evaluation

In [39]:
import json
from scipy.sparse import load_npz, csr_matrix
from joblib import load
from sklearn.model_selection import train_test_split
import numpy as np

# Load the model from the file
optimized_knn = load('../models/model.joblib')

# Load the csr_matrix from the file
csr_data = load_npz('../data/csr_data.npz')

# Load book titles and Users IDs
with open('../data/book_isbn.json', 'r') as f:
    book_isbns = json.load(f)

with open('../data/user_ids.json', 'r') as f:
    user_ids = json.load(f)


In [40]:
print(f"Number of columns in csr_data: {csr_data.shape[1]}")
print(f"Number of rows in csr_data: {csr_data.shape[0]}")
print(f"Length of user_ids: {len(user_ids)}")
print(f"Length of book_isbns: {len(book_isbns)}")

Number of columns in csr_data: 66780
Number of rows in csr_data: 30902
Length of user_ids: 66780
Length of book_isbns: 30902


In [46]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

# Convert book_isbns to a NumPy array so it can be indexed with arrays of positions
book_isbns = np.array(book_isbns)

# Convert the sparse matrix to dense format for splitting
# NOTE(review): with the shapes printed above (30902 x 66780) this materializes
# roughly 2 billion cells in memory. The evaluation code below reads `csr_dense`
# and iterates over dense test rows, so it cannot simply be dropped.
csr_dense = csr_data.toarray()

# Split data into training and testing sets (80/20 split)
# NOTE(review): the split is over matrix rows, i.e. whole books, so each book
# lands either in train_data or in test_data and the row positions of both
# parts no longer line up with book_isbns — confirm this is the intended split.
train_data, test_data = train_test_split(csr_dense, test_size=0.2, random_state=42)

# Convert training data back to sparse format
csr_train = csr_matrix(train_data)

# Helper function for Precision@k
def precision_at_k(recommended_items, relevant_items, k):
    """Compute Precision@k.

    Parameters:
    - recommended_items: Ordered sequence of recommended items (best first).
    - relevant_items: Iterable of items considered relevant.
    - k: Number of top recommendations to consider.

    Returns:
    - Fraction of the k slots filled with a relevant item: distinct relevant
      items among the first k recommendations, divided by k. Returns 0.0 when
      k is not positive (instead of dividing by zero).
    """
    if k <= 0:
        return 0.0
    top_k = recommended_items[:k]
    # Set intersection counts every relevant item at most once
    hits = set(top_k).intersection(relevant_items)
    return len(hits) / k

# Helper function for Recall@k
def recall_at_k(recommended_items, relevant_items, k):
    """Compute Recall@k.

    Parameters:
    - recommended_items: Ordered sequence of recommended items (best first).
    - relevant_items: Iterable of items considered relevant (set, list or
      array); duplicates are counted once.
    - k: Number of top recommendations to consider.

    Returns:
    - Fraction of the distinct relevant items found among the first k
      recommendations. Returns 0.0 when there are no relevant items or k is
      not positive.
    """
    # Materialize once as a set: works for arrays (whose truth value is
    # ambiguous) and keeps duplicates from inflating the denominator.
    relevant = set(relevant_items)
    if not relevant or k <= 0:
        return 0.0
    hits = relevant.intersection(recommended_items[:k])
    return len(hits) / len(relevant)

# Function to evaluate the recommendation system
def evaluate_recommender(knn_model, csr_train, test_data, book_isbns, k=10, interactions=None):
    """
    Evaluate the recommendation system using Precision@k and Recall@k.

    For every row of `test_data` that has at least one non-zero entry, the
    k nearest neighbours of the *same-position* row of `csr_train` are looked
    up and compared with the set of books that share at least one user with
    the test row.

    Parameters:
    - knn_model: Fitted kNN model exposing `kneighbors`.
    - csr_train: Sparse matrix for training data.
    - test_data: Dense matrix for testing data (one row per book).
    - book_isbns: Book ISBNs; converted to a NumPy array so it can be indexed
      with arrays of row positions.
    - k: Number of top recommendations to consider.
    - interactions: Book x user matrix used to find the books that share users
      with a test row. Its rows must be ordered like `book_isbns`. When None,
      the notebook-level `csr_dense` is used, which is what this function
      previously read implicitly.

    Returns:
    - mean_precision: Average Precision@k across all evaluated books.
    - mean_recall: Average Recall@k across all evaluated books.
      Both are NaN when no test row had any interaction.
    """
    if interactions is None:
        # Backward-compatible fallback to the notebook global
        interactions = csr_dense

    # Fancy indexing below needs an array, not a plain list
    book_isbns = np.asarray(book_isbns)

    precision_scores = []
    recall_scores = []

    for book_idx, test_row in enumerate(test_data):  # Iterate over books (rows)
        # Users who interacted with this book (non-zero entries in the test row)
        relevant_users = np.flatnonzero(test_row)

        if relevant_users.size == 0:
            continue  # Skip books with no test interactions

        # NOTE(review): `book_idx` is a position within `test_data`, yet it is
        # used to pick the query row from `csr_train`. If train and test come
        # from a row-wise split these are different books — confirm the intent.
        _distances, indices = knn_model.kneighbors(csr_train[book_idx], n_neighbors=k)
        recommended_books = book_isbns[indices.flatten()]  # Map indices to book ISBNs

        # Books with a non-zero rating sum over the relevant users, i.e. books
        # that share at least one of those users (assumes ratings are
        # non-negative — TODO confirm).
        shared_rating_sum = interactions[:, relevant_users].sum(axis=1)
        relevant_books = set(book_isbns[np.flatnonzero(shared_rating_sum)])

        # Compute Precision@k and Recall@k
        precision_scores.append(precision_at_k(recommended_books, relevant_books, k))
        recall_scores.append(recall_at_k(recommended_books, relevant_books, k))

    if not precision_scores:
        # Nothing to average; report "undefined" without a NumPy warning
        return float('nan'), float('nan')

    # Calculate average Precision@k and Recall@k
    mean_precision = np.mean(precision_scores)
    mean_recall = np.mean(recall_scores)

    return mean_precision, mean_recall

# Evaluate the kNN model
mean_precision, mean_recall = evaluate_recommender(optimized_knn, csr_train, test_data, book_isbns, k=10)

# Print evaluation metrics
print(f"Mean Precision@k: {mean_precision}")
print(f"Mean Recall@k: {mean_recall}")


Mean Precision@k: 0.012980199695379927
Mean Recall@k: 0.0002715457791659188


## Evaluation 2

In [None]:
import json
from scipy.sparse import load_npz, csr_matrix
from joblib import load
from sklearn.model_selection import train_test_split
import numpy as np

# Load the model from the file
optimized_knn = load('../models/model.joblib')

# Load the csr_matrix from the file
csr_data = load_npz('../data/csr_data.npz')

# Load book titles and Users IDs
with open('../data/book_isbn.json', 'r') as f:
    book_isbns = json.load(f)

with open('../data/user_ids.json', 'r') as f:
    user_ids = json.load(f)


In [47]:
from surprise import accuracy
from surprise.model_selection import train_test_split

ModuleNotFoundError: No module named 'surprise'

In [48]:
!pip install surprise

Collecting surprise
  Using cached surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Using cached surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml): started
  Building wheel for scikit-surprise (pyproject.toml): finished with status 'error'
Failed to build scikit-surprise


  error: subprocess-exited-with-error
  
  Building wheel for scikit-surprise (pyproject.toml) did not run successfully.
  exit code: 1
  
  [115 lines of output]
  running bdist_wheel
  running build
  running build_py
  creating build\lib.win-amd64-cpython-38\surprise
  copying surprise\accuracy.py -> build\lib.win-amd64-cpython-38\surprise
  copying surprise\builtin_datasets.py -> build\lib.win-amd64-cpython-38\surprise
  copying surprise\dataset.py -> build\lib.win-amd64-cpython-38\surprise
  copying surprise\dump.py -> build\lib.win-amd64-cpython-38\surprise
  copying surprise\reader.py -> build\lib.win-amd64-cpython-38\surprise
  copying surprise\trainset.py -> build\lib.win-amd64-cpython-38\surprise
  copying surprise\utils.py -> build\lib.win-amd64-cpython-38\surprise
  copying surprise\__init__.py -> build\lib.win-amd64-cpython-38\surprise
  copying surprise\__main__.py -> build\lib.win-amd64-cpython-38\surprise
  creating build\lib.win-amd64-cpython-38\surprise\model_selectio