In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
users = pd.read_csv('../../data/Users.csv', delimiter=';')
books = pd.read_csv('../../data/Books.csv', delimiter=';', dtype={'ISBN': str, 'Title': str, 'Author': str, 'Year': np.int16, 'Publisher': str})
ratings = pd.read_csv('../../data/Ratings.csv', delimiter=';', dtype={'User-ID': np.int32, 'ISBN': str, 'Rating': np.int8})

  users = pd.read_csv('../../data/Users.csv', delimiter=';')


In [23]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [24]:
books.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company


In [25]:
books.ISBN.nunique()

271378

In [26]:
books[books.duplicated(subset='ISBN')]

Unnamed: 0,ISBN,Title,Author,Year,Publisher
111813,486404242,War in Kind: And Other Poems (Dover Thrift Edi...,Stephen Crane,1998,Dover Publications


In [27]:
books[books.ISBN == '0486404242']

Unnamed: 0,ISBN,Title,Author,Year,Publisher
111658,486404242,War in Kind: And Other Poems (Dover Thrift Edi...,Stephen Crane,1998,Dover Publications
111813,486404242,War in Kind: And Other Poems (Dover Thrift Edi...,Stephen Crane,1998,Dover Publications


In [28]:
books.drop_duplicates(subset='ISBN', inplace=True)
books = books.reset_index()

isbn_mapping = {category: idx for idx, category in enumerate(books['ISBN'])}


In [33]:
ratings['ISBN_i'] = ratings['ISBN'].map(isbn_mapping)

unknown_isbns = ratings[ratings['ISBN_i'].isna()]
unknown_isbns.head()

Unnamed: 0,User-ID,ISBN,Rating,ISBN_i
6,276736,3257224281,8,
7,276737,0600570967,6,
9,276745,342310538,10,
25,276748,3442437407,0,
26,276751,033390804X,0,


In [35]:
unknown_isbns.shape

(118605, 4)

In [36]:
print(f"Dropping unknown ISBNs, ratings.shape = {ratings.shape}")
ratings.dropna(subset=['ISBN_i'], inplace=True)
print(f"Dropping unknown ISBNs, ratings.shape = {ratings.shape}")

Dropping unknown ISBNs, ratings.shape = (1149780, 4)
Dropping unknown ISBNs, ratings.shape = (1031175, 4)


In [38]:
from scipy.sparse import csr_matrix
ratings['ISBN_i'] = ratings['ISBN_i'].astype(np.int32)
user_item_matrix = csr_matrix((ratings['Rating'], (ratings['User-ID'], ratings['ISBN_i'])), dtype=np.float64)

In [39]:
normalized_matrix = user_item_matrix.copy()

for i in range(normalized_matrix.shape[0]):
    # Extract the row
    row = normalized_matrix.getrow(i)
    
    # Calculate the mean of non-zero elements in the row
    non_zero_elements = row.data
    if non_zero_elements.size > 0:
        mean_non_zero = non_zero_elements.mean()
        
        # Subtract the mean from the non-zero elements
        row.data -= mean_non_zero
    
    # Update the normalized_matrix row
    normalized_matrix.data[normalized_matrix.indptr[i]:normalized_matrix.indptr[i+1]] = row.data


In [40]:
k = 0
for i in range(normalized_matrix.shape[0]):
    row = normalized_matrix.getrow(i)
    if row.data.size > 0:
        print(row.data)
        k+=1
    if k == 10:
        break

[0.]
[ 2.70588235 -2.29411765 -2.29411765 -2.29411765 -2.29411765 -2.29411765
 -2.29411765 -2.29411765  2.70588235 -2.29411765 -2.29411765  2.70588235
  2.70588235 -2.29411765  3.70588235  3.70588235  4.70588235]
[-2.  4. -2.]
[0.]
[0.]
[ 1.  1.  2. -4.]
[ 4.5 -4.5]
[-3. -3.  0. -3.  4.  3.  2.]
[0.]
[0.]


In [15]:
# from sklearn.metrics.pairwise import cosine_similarity

# # Calculate cosine similarity between rows
# similarity_matrix = cosine_similarity(normalized_matrix)
# similarity_matrix.shape


In [16]:
# k = 30
# nearest_neighbors_similarities = np.sort(-similarity_matrix, axis=1)[:, 1:k+1]

# print("Similarity scores to 30 nearest neighbors for each row:")
# print(-nearest_neighbors_similarities[0])

In [41]:
from sklearn.neighbors import NearestNeighbors

nbrs = NearestNeighbors(n_neighbors=30, metric='cosine', algorithm='auto').fit(normalized_matrix)
distances, indices = nbrs.kneighbors(normalized_matrix)

In [42]:
distances.shape

(278855, 30)

In [43]:
filtered_distances = distances[(distances > 0) & (distances < 1)]
print(filtered_distances.shape)

(851778,)


In [129]:
def recommend(user_id, ratings, books, n_neighbors=10, n_recomm=5):
    # Get the row corresponding to the user
    user_row = ratings.getrow(user_id)
    print(f"user {user_id} rated {len(user_row.indices)} books")

    # Find the nearest neighbors
    distances, indices = nbrs.kneighbors(user_row, n_neighbors)
    #print(f"Indices of nearest neighbors ({user_id}): {indices}")

    # Get the ratings of the neighbors
    neighbor_ratings = normalized_matrix[indices[0]].toarray()
    weighted_rows = neighbor_ratings * distances[0][:, np.newaxis]

    result_vector = weighted_rows.sum(axis=0)
    result_vector = result_vector / distances.sum()

    result_vector.ravel()[user_row.indices] = 0

    sorted_indices = np.argsort(result_vector)
    top_indices = sorted_indices[-n_recomm:][::-1]

    print("Indexes of top 5 elements:", top_indices)

    return books.iloc[top_indices]



user 98391 rated 5779 books
Indexes of top 5 elements: [  5503  21287 157131   3737  20861]
              ISBN                                              Title  \
5503    0874776945  The Artist's Way : A Spiritual Path to Higher ...   
21287   0446606189                    Cat & Mouse (Alex Cross Novels)   
157131  0895551926  Martyrs of the Coliseum With Historical Record...   
3737    0446365505                                    Pleading Guilty   
20861   038072362X                                   A Superior Death   

                 Author  Year                      Publisher  
5503      Julia Cameron  1992              Jeremy P. Tarcher  
21287   James Patterson  1998                   Warner Books  
157131    A. J. Oreilly  1987                Tan Books & Pub  
3737        Scott Turow  1994                   Warner Books  
20861       Nevada Barr  2002  Harper Mass Market Paperbacks  


In [130]:
recommend(ratings.sample(n=1)['User-ID'].astype(np.int32).values[0], normalized_matrix, books, n_neighbors=30, n_recomm=5)

user 241980 rated 314 books
Indexes of top 5 elements: [10928 13283 61813   706 79358]


Unnamed: 0,ISBN,Title,Author,Year,Publisher
10928,446608815,Pop Goes the Weasel,James Patterson,2000,Warner Vision
13283,671642561,Fallen Hearts,V.C. Andrews,1988,Pocket
61813,61031011,Have a Nice Day!: A Tale of Blood and Sweatsocks,Mick Foley,2000,ReganBooks
706,446672211,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,1998,Warner Books
79358,743428617,Midnight Flight (Broken Wings),V.C. Andrews,2003,Pocket Star


In [3]:
import sys
sys.path.append('../..')

import src.recommenders.user_collaborative_filtering as ubcf

model = ubcf.UserRecommender()
model.fit(users=users, items=books, ratings=ratings)

In [4]:
model.predict(users=[ratings.sample(n=1)['User-ID'].astype(np.int32).values[0]], items=books)

[255846]