In [2]:
import random
import numpy as np
import pandas as pd
from pandas import DataFrame
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from surprise import Dataset, Reader, KNNWithMeans

from entities import UserBook, Book, User

pd.set_option('future.no_silent_downcasting', True) # Avoid downcasting warnings

# Constants
ENGINE = create_engine("sqlite:///../../bookshelf.db", echo=True)
HYPERPLANES = 8

# ** Inner helper functions
# TODO: Must be implemented
def overall_rate(user_book : UserBook) -> float:
    return random.randint(1, 5)

In [56]:
def load_rating_frame() -> DataFrame:    
    with Session(ENGINE) as session:
        user_books = session.query(UserBook).all() # Get all user_books

        rating_frame = DataFrame(columns=['book_id', 'user_id', 'overall_rating']) # Create utility matrix

        for i, user_book in enumerate(user_books):
            rating_frame.loc[i] = (user_book.bookId, user_book.userId, overall_rate(user_book))
    return rating_frame

rating_frame = load_rating_frame()

2024-08-21 17:16:58,587 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-08-21 17:16:58,589 INFO sqlalchemy.engine.Engine SELECT "UserBooks".id AS "UserBooks_id", "UserBooks"."readRatio" AS "UserBooks_readRatio", "UserBooks".rating AS "UserBooks_rating", "UserBooks".comment AS "UserBooks_comment", "UserBooks".shared AS "UserBooks_shared", "UserBooks"."bookId" AS "UserBooks_bookId", "UserBooks"."userId" AS "UserBooks_userId" 
FROM "UserBooks"
2024-08-21 17:16:58,590 INFO sqlalchemy.engine.Engine [cached since 1224s ago] ()
2024-08-21 17:19:51,692 INFO sqlalchemy.engine.Engine ROLLBACK


In [47]:
def build_utility_matrix() -> DataFrame:    
    with Session(ENGINE) as session:
        users = session.query(User).all() # Get all users
        books = session.query(Book).all() # Get all books
        
        user_books = session.query(UserBook).all() # Get all user_books

        utility_matrix = DataFrame(index=[book.id for book in books], columns=[user.id for user in users]) # Create utility matrix

        for user_book in user_books:
            utility_matrix.loc[user_book.bookId, user_book.userId] = overall_rate(user_book) # Populate utility matrix

    utility_matrix = utility_matrix.fillna(0)
    return utility_matrix

utility_matrix = build_utility_matrix()

2024-08-21 17:15:43,381 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-08-21 17:15:43,381 INFO sqlalchemy.engine.Engine SELECT "Users".id AS "Users_id", "Users".name AS "Users_name", "Users".email AS "Users_email" 
FROM "Users"
2024-08-21 17:15:43,382 INFO sqlalchemy.engine.Engine [cached since 1149s ago] ()
2024-08-21 17:15:43,388 INFO sqlalchemy.engine.Engine SELECT "Books".id AS "Books_id", "Books".name AS "Books_name", "Books".author AS "Books_author", "Books".language AS "Books_language", "Books".year AS "Books_year" 
FROM "Books"
2024-08-21 17:15:43,389 INFO sqlalchemy.engine.Engine [cached since 1149s ago] ()
2024-08-21 17:15:43,432 INFO sqlalchemy.engine.Engine SELECT "UserBooks".id AS "UserBooks_id", "UserBooks"."readRatio" AS "UserBooks_readRatio", "UserBooks".rating AS "UserBooks_rating", "UserBooks".comment AS "UserBooks_comment", "UserBooks".shared AS "UserBooks_shared", "UserBooks"."bookId" AS "UserBooks_bookId", "UserBooks"."userId" AS "UserBooks_userId" 
FROM "User

In [57]:
utility_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
1,0,0,0,0,5,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,4
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,3,0,0,0
3,0,3,0,0,0,0,4,2,0,0,...,0,0,0,0,1,4,0,0,0,0
4,0,0,0,3,0,0,0,0,0,4,...,0,4,3,0,0,0,0,0,0,0
5,0,0,0,5,0,0,0,0,0,0,...,0,5,0,0,0,0,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74148,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74150,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
def build_lsh(utility_matrix : DataFrame) -> dict[str, list[int]]:
    lsh_dict : dict[str, list[int]]= {}
    book_lsh : dict[int, str] = {}

    hyperplanes = HYPERPLANES
    dimension = utility_matrix.shape[1] # Take vector dimensions

    plane_norms = np.random.rand(hyperplanes, dimension) - .5

    for index, row in utility_matrix.iterrows():
        row_centered = row - row.mean() # Center the row to zero mean.
        row_vector = row_centered.values # Take the row vector

        row_dot = np.dot(row_vector, plane_norms.T) # Compute the dot product with each plane.
        row_dot = (row_dot > 0).astype(int) # Convert the result to binary.
        hash_str = ''.join(row_dot.astype(str)) # Convert to string

        # Save in dictionary
        if hash_str in lsh_dict:
            lsh_dict[hash_str].append(index)
        else:
            lsh_dict[hash_str] = [index] 
        
        book_lsh[index] = hash_str

    return lsh_dict, book_lsh

lsh_dict, book_lsh = build_lsh(utility_matrix)

In [58]:
book_lsh

{1: '11010110',
 2: '10010110',
 3: '01000011',
 4: '11011100',
 5: '11100101',
 6: '11100101',
 7: '00110110',
 8: '01000101',
 9: '00000000',
 10: '01000011',
 11: '00111010',
 12: '10110110',
 13: '10010001',
 14: '01111111',
 15: '01101010',
 16: '01001011',
 17: '10110111',
 18: '00110101',
 19: '00011101',
 20: '00100100',
 21: '01000100',
 22: '00000101',
 23: '00110100',
 24: '00000000',
 25: '00000000',
 26: '00111110',
 27: '00000000',
 28: '10000100',
 30: '00000000',
 31: '11100111',
 32: '00010111',
 33: '11110100',
 34: '00000000',
 35: '00001101',
 36: '00110100',
 41: '00001111',
 42: '01101100',
 43: '00101000',
 44: '00000000',
 45: '10000100',
 46: '00100100',
 47: '00110100',
 48: '00000000',
 51: '01110101',
 54: '01001100',
 55: '11111101',
 57: '00000000',
 58: '00000000',
 59: '01001101',
 60: '11111111',
 61: '10110101',
 62: '00111010',
 64: '00000000',
 68: '00000000',
 70: '00000000',
 71: '01011100',
 72: '00000000',
 73: '11101010',
 74: '10111001',
 76: '

In [51]:
lsh_dict

{'11010110': [1, 98, 40580],
 '10010110': [2, 14859, 31516],
 '01000011': [3, 10, 1732, 11030, 12513, 20038],
 '11011100': [4, 100, 963, 20781],
 '11100101': [5, 6, 5827, 23079, 25063, 26494, 35690, 40986, 67092],
 '00110110': [7, 1947, 3090, 18857, 26739, 30201, 32415, 57333, 74111],
 '01000101': [8, 847, 2600, 2946, 8438, 17405, 23042, 24571, 35139, 61168],
 '00000000': [9,
  24,
  25,
  27,
  30,
  34,
  44,
  48,
  57,
  58,
  64,
  68,
  70,
  72,
  79,
  81,
  83,
  91,
  95,
  97,
  101,
  106,
  118,
  122,
  123,
  124,
  125,
  134,
  142,
  144,
  148,
  151,
  153,
  154,
  157,
  165,
  166,
  169,
  173,
  200,
  207,
  208,
  211,
  213,
  216,
  218,
  221,
  223,
  224,
  225,
  232,
  238,
  242,
  247,
  262,
  263,
  269,
  274,
  309,
  316,
  325,
  341,
  351,
  367,
  368,
  370,
  391,
  409,
  432,
  445,
  447,
  456,
  467,
  468,
  470,
  471,
  479,
  494,
  501,
  506,
  507,
  525,
  526,
  535,
  537,
  540,
  543,
  559,
  564,
  573,
  575,
  601,
  6

In [111]:
user_id = 10
def get_closest_books(user_id : int):
    with Session(ENGINE) as session:
        # Select book-ids from user's user_books
        user_books = session.query(UserBook).filter(UserBook.userId == user_id).all()
    rated_books = [user_book.bookId for user_book in user_books]

    # Sort by overall rating
    rated_books = sorted(rated_books, key=lambda x: utility_matrix.loc[x, user_id])

    # Take top ten rated_books
    if len(rated_books) > 10:
        rated_books = rated_books[0:10]

    target_books : set[int] = set() 
    for book in rated_books:
        hash_str = book_lsh[book]
        close_books = lsh_dict[hash_str]

        if len(close_books) > 10:
            close_books = random.choices(close_books, k=10)
        
        for close_book in close_books:
            if close_book not in target_books and close_book not in rated_books:
                target_books.add(close_book)
        
    return target_books

closest_ones = get_closest_books(user_id)


2024-08-21 17:41:39,186 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-08-21 17:41:39,187 INFO sqlalchemy.engine.Engine SELECT "UserBooks".id AS "UserBooks_id", "UserBooks"."readRatio" AS "UserBooks_readRatio", "UserBooks".rating AS "UserBooks_rating", "UserBooks".comment AS "UserBooks_comment", "UserBooks".shared AS "UserBooks_shared", "UserBooks"."bookId" AS "UserBooks_bookId", "UserBooks"."userId" AS "UserBooks_userId" 
FROM "UserBooks" 
WHERE "UserBooks"."userId" = ?
2024-08-21 17:41:39,188 INFO sqlalchemy.engine.Engine [cached since 2746s ago] (10,)
2024-08-21 17:41:39,192 INFO sqlalchemy.engine.Engine ROLLBACK


In [60]:
rating_frame

Unnamed: 0,book_id,user_id,overall_rating
0,23079,0,4
1,27348,0,4
2,56327,0,2
3,5116,0,3
4,768,0,5
...,...,...,...
125038,23997,499,2
125039,5333,499,5
125040,65903,499,1
125041,140,499,4


In [61]:
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(rating_frame[['book_id', 'user_id', 'overall_rating']], reader)

In [62]:
sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}

algo = KNNWithMeans(sim_options=sim_options)

trainingSet = dataset.build_full_trainset()

In [63]:
algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7eb5245f3790>

In [96]:
prediction = algo.predict(3, 140)

In [97]:
prediction.est

3.3680811107943622

In [112]:
# closest_ones = sorted(closest_ones, key=lambda x: (algo.predict(user_id, x)).est)

pairs = []
for book in closest_ones:
    pairs.append((algo.predict(user_id, book).est, book))

s = sorted(pairs, key=lambda x: -x[0])
s

[(3.2008827384284593, 6),
 (3.117810474329362, 45),
 (3.0942400899009592, 10),
 (2.996449221467815, 3328),
 (2.996449221467815, 73985),
 (2.996449221467815, 73987),
 (2.996449221467815, 16389),
 (2.996449221467815, 19337),
 (2.996449221467815, 657),
 (2.996449221467815, 67092),
 (2.996449221467815, 43541),
 (2.996449221467815, 11030),
 (2.996449221467815, 28054),
 (2.996449221467815, 40986),
 (2.996449221467815, 55201),
 (2.996449221467815, 10150),
 (2.996449221467815, 23079),
 (2.996449221467815, 42157),
 (2.996449221467815, 49965),
 (2.996449221467815, 10800),
 (2.996449221467815, 14264),
 (2.996449221467815, 18241),
 (2.996449221467815, 5827),
 (2.996449221467815, 1732),
 (2.996449221467815, 17989),
 (2.996449221467815, 20038),
 (2.996449221467815, 3913),
 (2.996449221467815, 8525),
 (2.996449221467815, 50637),
 (2.996449221467815, 12753),
 (2.996449221467815, 32596),
 (2.996449221467815, 2009),
 (2.996449221467815, 73951),
 (2.996449221467815, 12513),
 (2.996449221467815, 2274),
 (