In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import concurrent.futures
from joblib import Parallel, delayed
import multiprocessing
from sklearn.cluster import DBSCAN
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import seaborn as sns

In [None]:
to_read = pd.read_csv('to_read.csv')
book_tags = pd.read_csv('book_tags.csv')
tags = pd.read_csv('tags.csv')

In [39]:
# Load the dataset
books = pd.read_csv('books.csv')
ratings = pd.read_csv('ratings.csv')


In [None]:
# Check the first few rows to understand the structure
print(books.head())
print(ratings.head())

In [40]:
# Check for missing values
print(books.isnull().sum())
print(ratings.isnull().sum())

book_id                         0
goodreads_book_id               0
best_book_id                    0
work_id                         0
books_count                     0
isbn                          700
isbn13                        585
authors                         0
original_publication_year      21
original_title                585
title                           0
language_code                1084
average_rating                  0
ratings_count                   0
work_ratings_count              0
work_text_reviews_count         0
ratings_1                       0
ratings_2                       0
ratings_3                       0
ratings_4                       0
ratings_5                       0
image_url                       0
small_image_url                 0
dtype: int64
user_id    0
book_id    0
rating     0
dtype: int64


In [41]:
# Handle missing values
books.fillna(0, inplace=True)

In [8]:
interaction_matrix = ratings.pivot(index='user_id', columns='book_id', values='rating').fillna(0)

book_id  1      2      3      4      5      6      7      8      9      10     \
user_id                                                                         
1          0.0    0.0    0.0    5.0    0.0    0.0    0.0    0.0    0.0    4.0   
2          0.0    5.0    0.0    0.0    5.0    0.0    0.0    4.0    0.0    5.0   
3          0.0    0.0    0.0    3.0    0.0    0.0    0.0    0.0    0.0    0.0   
4          0.0    5.0    0.0    4.0    4.0    0.0    4.0    4.0    0.0    5.0   
5          0.0    0.0    0.0    0.0    0.0    4.0    0.0    0.0    0.0    0.0   
...        ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
53420      4.0    5.0    3.0    0.0    2.0    0.0    0.0    0.0    4.0    3.0   
53421      4.0    5.0    0.0    5.0    4.0    0.0    4.0    0.0    5.0    0.0   
53422      4.0    5.0    0.0    0.0    0.0    0.0    5.0    0.0    0.0    5.0   
53423      4.0    5.0    0.0    5.0    0.0    0.0    5.0    4.0    0.0    0.0   
53424      4.0    5.0    4.0

In [10]:
print(interaction_matrix)


book_id  1      2      3      4      5      6      7      8      9      10     \
user_id                                                                         
1          0.0    0.0    0.0    5.0    0.0    0.0    0.0    0.0    0.0    4.0   
2          0.0    5.0    0.0    0.0    5.0    0.0    0.0    4.0    0.0    5.0   
3          0.0    0.0    0.0    3.0    0.0    0.0    0.0    0.0    0.0    0.0   
4          0.0    5.0    0.0    4.0    4.0    0.0    4.0    4.0    0.0    5.0   
5          0.0    0.0    0.0    0.0    0.0    4.0    0.0    0.0    0.0    0.0   
...        ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
53420      4.0    5.0    3.0    0.0    2.0    0.0    0.0    0.0    4.0    3.0   
53421      4.0    5.0    0.0    5.0    4.0    0.0    4.0    0.0    5.0    0.0   
53422      4.0    5.0    0.0    0.0    0.0    0.0    5.0    0.0    0.0    5.0   
53423      4.0    5.0    0.0    5.0    0.0    0.0    5.0    4.0    0.0    0.0   
53424      4.0    5.0    4.0

In [12]:
# Count the number of books rated by each user
user_book_counts = (interaction_matrix > 0).sum(axis=1)
# Filter users who have rated 4 or more books
filtered_interaction_matrix = interaction_matrix[user_book_counts >= 4]

print(f"Original shape: {interaction_matrix.shape}")
print(f"Filtered shape: {filtered_interaction_matrix.shape}")


Original shape: (53424, 10000)
Filtered shape: (53424, 10000)


In [20]:
user_book_counts.max()


200

In [21]:
from scipy.sparse import csr_matrix

# Convert the filtered interaction matrix to a sparse matrix
sparse_interaction_matrix = csr_matrix(filtered_interaction_matrix.values)

In [23]:
sparse_interaction_matrix

<53424x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 5976479 stored elements in Compressed Sparse Row format>

In [24]:
from scipy.sparse.linalg import svds
import numpy as np

# Assuming 'sparse_interaction_matrix' is your user-item sparse matrix
# Choose the number of factors
num_factors = 50  # This is a hyperparameter you can tune

# Perform matrix factorization
u, s, vt = svds(sparse_interaction_matrix, k=num_factors)

# Diagonalize s
s_diag_matrix = np.diag(s)

# Reconstruct the predictions matrix
predicted_ratings = np.dot(np.dot(u, s_diag_matrix), vt)


In [43]:
# Convert to DataFrame for easier handling
predicted_ratings_df = pd.DataFrame(predicted_ratings, index=filtered_interaction_matrix.index, columns=filtered_interaction_matrix.columns)

# Function to recommend books for a user
def recommend_books(user_id, num_recommendations=5):
    # Get user's predictions
    user_row = predicted_ratings_df.loc[user_id]

    # Sort predictions
    sorted_user_predictions = user_row.sort_values(ascending=False)

    # Get user data and merge in the book information
    user_data = ratings[ratings.user_id == user_id]
    user_full = (user_data.merge(books, how='left', left_on='book_id', right_on='book_id').
                 sort_values(['rating'], ascending=False)
                 )

    print("User {0} has already rated {1} books.".format(user_id, user_full.shape[0]))
    print("Recommending the highest {0} predicted ratings books not already rated.".format(num_recommendations))

    # Recommend the highest predicted rating books that the user hasn't seen yet.
    recommendations = (books[~books['book_id'].isin(user_full['book_id'])].
                       merge(pd.DataFrame(sorted_user_predictions).reset_index(), how='left',
                             left_on='book_id',
                             right_on='book_id').
                       rename(columns={user_id: 'Predictions'}).
                       sort_values('Predictions', ascending=False).
                       iloc[:num_recommendations, :-1]
                       )

    return recommendations

# Example: Recommend 5 books for user with ID 1
recommendations_for_user = recommend_books(user_id=1, num_recommendations=5)
print(recommendations_for_user)


User 1 has already rated 117 books.
Recommending the highest 5 predicted ratings books not already rated.
     book_id  goodreads_book_id  best_book_id  work_id  books_count  \
41        63               6185          6185  1565818         2498   
142      195            2728527       2728527  2754161          116   
223      291            3591262       3591262  3633533           74   
80       118               7763          7763  1955658          126   
121      172           15823480      15823480  2507928         1492   

          isbn        isbn13  \
41   393978893  9.780394e+12   
142  385340990  9.780385e+12   
223  375414495  9.780375e+12   
80   143038095  9.780143e+12   
121  345803922  9.780346e+12   

                                               authors  \
41                       Emily Brontë, Richard J. Dunn   
142                    Mary Ann Shaffer, Annie Barrows   
223                                   Abraham Verghese   
80                                        

In [33]:
print(type(books), type(ratings), type(predicted_ratings_df))


<class 'NoneType'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>


In [46]:
from sklearn.cluster import KMeans

# Number of clusters - this is a hyperparameter you can tune
num_clusters = 10

# We'll use the reduced dimensionality matrix from the SVD step
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(sparse_interaction_matrix.T)  # Transpose to cluster books

# Add cluster information to the books DataFrame
books['cluster'] = clusters

  super()._check_params_vs_input(X, default_n_init=10)


In [47]:
clusters

array([7, 8, 6, ..., 1, 1, 1])

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
interaction_matrix_scaled = scaler.fit_transform(interaction_matrix)
interaction_matrix = pd.DataFrame(interaction_matrix_scaled, index=interaction_matrix.index, columns=interaction_matrix.columns)


In [None]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(interaction_matrix, test_size=0.2, random_state=42)
