# Importing all necessary packages

In [38]:
# Data Processing
import pandas as pd
# Compresed Sparse row
from scipy.sparse import csr_matrix
# Nearest neighbours
from sklearn.neighbors import NearestNeighbors

# Importing relevant data

"items_info.dat" contains data field seperated by "\t" but some of the fields also contain "\t" within the text of the file. This causes errors when the file is read directly. The next cell is trying to sort that error.

In [39]:
# This code block creates a new .dat file that contains only the first 6 columns and the last column (the remaining columns are not necessary for the project). 
with open('book_crossing/book_crossing/items_info.dat', 'r', encoding='utf-8') as infile, \
     open('book_crossing/book_crossing/items_info_clean.dat', 'w', encoding='utf-8') as outfile:
    for line in infile:
        parts = line.strip().split('\t')
        if len(parts) >= 6:
            selected_parts = parts[:6] + [parts[-1]]  # First 6 fields + last field
            outfile.write('\t'.join(selected_parts) + '\n')

In [40]:
ratings = pd.read_csv('book_crossing/book_crossing/book_ratings.dat', delimiter = '\t')
history = pd.read_csv('book_crossing/book_crossing/book_history.dat', delimiter = '\t') # Books and readers history.
items = pd.read_csv('book_crossing/book_crossing/items_info_clean.dat', delimiter = '\t', on_bad_lines='skip')  # list of books and Ids (Primary id key)
users = pd.read_csv('book_crossing/book_crossing/users_info.dat', delimiter = '\t') # list of users

# Analysis

## Analysis of ratings dataset

In [41]:
# Total number of readers
n_readers = ratings['user'].nunique()
# Total number of books
n_books = ratings['item'].nunique()

## Book Average ratings

In [42]:
avg_ratings = ratings.groupby('item')['rating'].mean()
avg_ratings_dict = dict(zip(avg_ratings.index, round(avg_ratings, 2)))

## Book titles

In [43]:
book_titles = dict(zip(items['Book_ID'], items['Book-Title']))
inv_book_titles = dict(zip(items['Book-Title'], items['Book_ID']))

In [44]:
# What percent of books accessed were rated
percent_rated = len(ratings) / len(history) * 100
print(f'{round(percent_rated, 2)}% of the books accessed were rated.')
# What user accessed the most books
user_access = history.groupby('user')['item'].count().sort_values(ascending=False)
print(f'The user who accessed the most books is user {user_access.index[0]} with {user_access.max()} books accessed.')
# What user rated the most books
user_ratings = ratings.groupby('user')['item'].count().sort_values(ascending=False)
print(f'The user who rated the most books is user {user_ratings.index[0]} with {user_ratings.max()} books rated.')
# What was the most accessed books
book_access = history.value_counts('item')
print(f'The most accessed book was {book_titles[book_access.index[0]]}, accessed {book_access.max()} times.')
# What was the most rated books?
book_ratings = ratings.value_counts('item')
print(f'The most rated book was {book_titles[book_ratings.index[0]]}, rated {book_ratings.max()} times.')

22.98% of the books accessed were rated.
The user who accessed the most books is user 1614 with 2088 books accessed.
The user who rated the most books is user 1003 with 1092 books rated.
The most accessed book was Airframe, accessed 718 times.
The most rated book was Impossible Vacation, rated 160 times.


## Books with rating

In [45]:
rated_books = items[items['Book_ID'].isin(list(ratings['item'].unique()))]
book_titles = dict(zip(rated_books['Book_ID'], rated_books['Book-Title']))
inv_book_titles = dict(zip(rated_books['Book-Title'], rated_books['Book_ID']))

# Creating Sparse Matrix

In [46]:
# def sparse_matrix(df, user_id_name, item_id_name, rating_column_name):
#     """
#     This function helps to create a sparse matrix (a matrix largely populated by zeroes) from your ratings dataset.
    
#     Parameters:
#     df: Pandas DataFrame
#     user_id_name (str): Column name of the user ID.
#     item_id_name (str): Column name of the item ID.
#     rating_column_name (str): Column name of the ratings.
    
#     Returns:
#     - matrix: the resulting sparse matrix
#     - user_map: a dictionary mapping original user IDs to matrix row indices
#     - item_map: a dictionary mapping original item IDs to matrix column indices
#     - inv_user_map: inverse mapping from row indices back to original user IDs
#     - inv_item_map: inverse mapping from column indices back to original item IDs
#     """
    
#     # Stores the number of unique users and items in the dataset.
#     # This is used in determining the shape of the sparse matrix.
#     n_users = df[user_id_name].nunique()
#     n_items = df[item_id_name].nunique()

#     # Creates a map of the user/item IDs to new sequential indices.
#     user_map = dict(zip(df[user_id_name].unique(), list(range(n_users))))
#     item_map = dict(zip(df[item_id_name].unique(), list(range(n_items))))

#     # Creates the inverse map of user/item IDs for referencing purposes.
#     inv_user_map = dict(zip(list(range(n_users)), df[user_id_name].unique()))
#     inv_item_map = dict(zip(list(range(n_items)), df[item_id_name].unique()))

#     # Applies the new IDs to create index lists.
#     user_index = [user_map[i] for i in df[user_id_name]]
#     item_index = [item_map[i] for i in df[item_id_name]]

#     # Creates the sparse matrix using Compressed Sparse Row (csr) format.
#     item_matrix = csr_matrix((df[rating_column_name], (item_index, user_index)), shape=(n_items, n_users))
#     user_matrix = csr_matrix((df[rating_column_name], (user_index, item_index)), shape=(n_users, n_items))

#     return item_matrix, user_matrix, user_map, item_map, inv_user_map, inv_item_map


In [47]:

def sparse_matrix(df, user_id_name, item_id_name, rating_column_name):
    """
    Create a sparse user-item matrix from a dataframe.
    Returns the item and user matrices, mapping dictionaries, and inverse mappings.
    """

    # Factorize ensures a consistent mapping
    df[user_id_name], user_idx = pd.factorize(df[user_id_name])
    df[item_id_name], item_idx = pd.factorize(df[item_id_name])

    user_map = dict(zip(user_idx, range(len(user_idx))))
    item_map = dict(zip(item_idx, range(len(item_idx))))
    inv_user_map = dict(enumerate(user_idx))
    inv_item_map = dict(enumerate(item_idx))

    user_index = df[user_id_name]
    item_index = df[item_id_name]
    ratings = df[rating_column_name].astype(float)

    item_matrix = csr_matrix((ratings, (item_index, user_index)))
    user_matrix = csr_matrix((ratings, (user_index, item_index)))

    return item_matrix, user_matrix, user_map, item_map, inv_user_map, inv_item_map

In [None]:
# item_matrix, user_matrix, user_map, item_map, inv_user_map, inv_item_map = sparse_matrix(ratings, 'user', 'item', 'rating')

# User similarity function

In [49]:
def similar_users_suggestions(user_item_list, user_rating_list, df, user_id, n_similar_users, metric='cosine'):
    """
    Generates personalized book recommendations by identifying similar users using KNN.
    """
    # Step 1: Create a synthetic new user ID (one greater than current max)
    new_user_id = df[user_id].max() + 1
    df = df.copy()

    # Step 2: Convert book titles to internal book IDs
    user_item_id = [inv_book_titles[i] for i in user_item_list]

    # Step 3: Add the new user and their ratings to the DataFrame
    for index, item_id in enumerate(user_item_id):
        df.loc[len(df)] = {'user': new_user_id, 'item': item_id, 'rating': user_rating_list[index]}

    # Step 4: Rebuild sparse matrix with new user included
    item_matrix, user_matrix, user_map, item_map, inv_user_map, inv_item_map = sparse_matrix(
        df, 'user', 'item', 'rating'
    )

    # Step 5: Get the matrix row index for the new user
    new_user_matrix_index = user_matrix.shape[0] -1
    user_vector = user_matrix[new_user_matrix_index]

    # Step 6: Fit KNN and find neighbors
    k = n_similar_users + 1  # +1 to include the new user in neighbors
    KNN = NearestNeighbors(n_neighbors=k, algorithm='brute', metric=metric)
    KNN.fit(user_matrix)
    neighbours = KNN.kneighbors(user_vector, return_distance=False)

    # Step 7: Map back to user IDs, skipping the new user itself
    neighbouring_user_ids = [inv_user_map[n] for n in neighbours.flatten() if inv_user_map[n] != new_user_id]

    # Step 8: Pick top-rated books from similar users
    book_suggestions = []
    for uid in neighbouring_user_ids:
        user_books = df[df['user'] == uid]
        if not user_books.empty:
            best_book = user_books.sort_values(by='rating', ascending=False)['item'].head(1)
            book_id = int(best_book.values[0])
            book_suggestions.append(book_titles[book_id])

    return sorted(book_suggestions)



In [None]:
# # User similarity function
# def similar_users_suggestions(user_item_list, user_rating_list, df, user_id, n_similar_users, metric='cosine'):
#     """
#     Generates personalized book recommendations by identifying similar users using KNN.
#     """
#     # Step 1: Create a synthetic new user ID (one greater than current max)
#     new_user_id = df[user_id].max() + 1
#     df = df.copy()

#     # Step 2: Convert book titles to internal book IDs
#     user_item_id = [inv_book_titles[i] for i in user_item_list]

#     # Step 3: Add the new user and their ratings to the DataFrame
#     for index, item_id in enumerate(user_item_id):
#         df.loc[len(df)] = {'user': new_user_id, 'item': item_id, 'rating': user_rating_list[index]}

#     # Step 4: Rebuild sparse matrix with new user included
#     item_matrix, user_matrix, user_map, item_map, inv_user_map, inv_item_map = sparse_matrix(
#         df, 'user', 'item', 'rating'
#     )

#     # Step 5: Get the matrix row index for the new user
#     new_user_matrix_index = user_matrix.shape[0] -1
#     user_vector = user_matrix[new_user_matrix_index]

#     # Step 6: Fit KNN and find neighbors
#     k = n_similar_users + 1  # +1 to include the new user in neighbors
#     KNN = NearestNeighbors(n_neighbors=k, algorithm='brute', metric=metric)
#     KNN.fit(user_matrix)
#     neighbours = KNN.kneighbors(user_vector, return_distance=False)

#     # Step 7: Map back to user IDs, skipping the new user itself
#     neighbouring_user_ids = [inv_user_map[n] for n in neighbours.flatten() if inv_user_map[n] != new_user_id]

#     # Step 8: Pick top-rated books from similar users
#     book_suggestions = []
#     for uid in neighbouring_user_ids:
#         user_books = df[df['user'] == uid]
#         if not user_books.empty:
#             best_book = user_books.sort_values(by='rating', ascending=False)['item'].head(1)
#             book_id = int(best_book.values[0])
#             book_suggestions.append(book_titles[book_id])

#     return book_suggestions

In [54]:
user_item_list = ['The Mists of Avalon', 'Love Always Remembers: A Book of Poems', 'Great Expectations (Dover Thrift Editions)']
user_rating_list = [4, 6, 3]
books = similar_users_suggestions(user_item_list, user_rating_list, ratings, 'user', 4)
books

['How to Get Your Child to Love Reading: For Ravenous and Reluctant Readers Alike',
 "Left Behind: A Novel of the Earth's Last Days (Left Behind No. 1)",
 "The Children's Zoo",
 'Writing for Story']

# Item similarity function

In [32]:
def similar_books(book, df, k, metric='cosine'):
    """
    Finds k books similar to the given book based on item-item collaborative filtering.

    Args:
        book (str): The title of the book for which to find similar books.
        df (pd.DataFrame): Ratings DataFrame with columns ['user', 'item', 'rating'].
        k (int): Number of similar books to return.
        metric (str, optional): Distance metric for KNN. Default is 'cosine'.

    Returns:
        List[str]: List of book titles similar to the input book.
    
    Notes:
        - `book_titles` and `inv_book_titles` must be defined globally.
    """
    # Convert book title to item ID
    book_id = inv_book_titles[book]

    # Build item-user matrix
    item_matrix, user_matrix, user_map, item_map, inv_user_map, inv_item_map = sparse_matrix(
        df, 'user', 'item', 'rating'
    )

    # Get matrix row index for the book
    book_loc = item_map[book_id]
    book_vector = item_matrix[book_loc]

    # Fit KNN
    knn = NearestNeighbors(n_neighbors=k + 1, algorithm='brute', metric=metric)
    knn.fit(item_matrix)

    # Find neighbors
    neighbours = knn.kneighbors(book_vector.reshape(1, -1), return_distance=False).flatten()

    # Skip the first neighbor (it will be the book itself)
    similar_ids = [inv_item_map[idx] for idx in neighbours[1:]]

    # Map item IDs back to book titles
    return [book_titles[item_id] for item_id in similar_ids]


In [33]:
books = similar_books('To Kill a Mockingbird', ratings, 5)
books

['Berlin Noir: March Violets/the Pale Criminal/a German Requiem/3 Novels in 1 Volume (Penguin Crime/Mystery)',
 'The Great Train Robbery',
 'Miss You Like Crazy',
 'See No Evil (Loving Dangerously) (Harlequin Superromance, No 722)',
 'Outer Banks']