In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import concurrent.futures
from joblib import Parallel, delayed
import multiprocessing
from sklearn.cluster import DBSCAN
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import seaborn as sns

In [None]:
to_read = pd.read_csv('to_read.csv')
book_tags = pd.read_csv('book_tags.csv')
tags = pd.read_csv('tags.csv')

In [39]:
# Load the dataset
books = pd.read_csv('books.csv')
ratings = pd.read_csv('ratings.csv')


In [None]:
# Check the first few rows to understand the structure
print(books.head())
print(ratings.head())

In [40]:
# Check for missing values
print(books.isnull().sum())
print(ratings.isnull().sum())

book_id                         0
goodreads_book_id               0
best_book_id                    0
work_id                         0
books_count                     0
isbn                          700
isbn13                        585
authors                         0
original_publication_year      21
original_title                585
title                           0
language_code                1084
average_rating                  0
ratings_count                   0
work_ratings_count              0
work_text_reviews_count         0
ratings_1                       0
ratings_2                       0
ratings_3                       0
ratings_4                       0
ratings_5                       0
image_url                       0
small_image_url                 0
dtype: int64
user_id    0
book_id    0
rating     0
dtype: int64


In [41]:
# Handle missing values
books.fillna(0, inplace=True)

In [8]:
interaction_matrix = ratings.pivot(index='user_id', columns='book_id', values='rating').fillna(0)

book_id  1      2      3      4      5      6      7      8      9      10     \
user_id                                                                         
1          0.0    0.0    0.0    5.0    0.0    0.0    0.0    0.0    0.0    4.0   
2          0.0    5.0    0.0    0.0    5.0    0.0    0.0    4.0    0.0    5.0   
3          0.0    0.0    0.0    3.0    0.0    0.0    0.0    0.0    0.0    0.0   
4          0.0    5.0    0.0    4.0    4.0    0.0    4.0    4.0    0.0    5.0   
5          0.0    0.0    0.0    0.0    0.0    4.0    0.0    0.0    0.0    0.0   
...        ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
53420      4.0    5.0    3.0    0.0    2.0    0.0    0.0    0.0    4.0    3.0   
53421      4.0    5.0    0.0    5.0    4.0    0.0    4.0    0.0    5.0    0.0   
53422      4.0    5.0    0.0    0.0    0.0    0.0    5.0    0.0    0.0    5.0   
53423      4.0    5.0    0.0    5.0    0.0    0.0    5.0    4.0    0.0    0.0   
53424      4.0    5.0    4.0

In [10]:
print(interaction_matrix)


book_id  1      2      3      4      5      6      7      8      9      10     \
user_id                                                                         
1          0.0    0.0    0.0    5.0    0.0    0.0    0.0    0.0    0.0    4.0   
2          0.0    5.0    0.0    0.0    5.0    0.0    0.0    4.0    0.0    5.0   
3          0.0    0.0    0.0    3.0    0.0    0.0    0.0    0.0    0.0    0.0   
4          0.0    5.0    0.0    4.0    4.0    0.0    4.0    4.0    0.0    5.0   
5          0.0    0.0    0.0    0.0    0.0    4.0    0.0    0.0    0.0    0.0   
...        ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
53420      4.0    5.0    3.0    0.0    2.0    0.0    0.0    0.0    4.0    3.0   
53421      4.0    5.0    0.0    5.0    4.0    0.0    4.0    0.0    5.0    0.0   
53422      4.0    5.0    0.0    0.0    0.0    0.0    5.0    0.0    0.0    5.0   
53423      4.0    5.0    0.0    5.0    0.0    0.0    5.0    4.0    0.0    0.0   
53424      4.0    5.0    4.0

In [12]:
# Count the number of books rated by each user
user_book_counts = (interaction_matrix > 0).sum(axis=1)
# Filter users who have rated 4 or more books
filtered_interaction_matrix = interaction_matrix[user_book_counts >= 4]

print(f"Original shape: {interaction_matrix.shape}")
print(f"Filtered shape: {filtered_interaction_matrix.shape}")


Original shape: (53424, 10000)
Filtered shape: (53424, 10000)


In [20]:
user_book_counts.max()


200

In [21]:
from scipy.sparse import csr_matrix

# Convert the filtered interaction matrix to a sparse matrix
sparse_interaction_matrix = csr_matrix(filtered_interaction_matrix.values)

In [23]:
sparse_interaction_matrix

<53424x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 5976479 stored elements in Compressed Sparse Row format>

In [24]:
from scipy.sparse.linalg import svds
import numpy as np

# Assuming 'sparse_interaction_matrix' is your user-item sparse matrix
# Choose the number of factors
num_factors = 50  # This is a hyperparameter you can tune

# Perform matrix factorization
u, s, vt = svds(sparse_interaction_matrix, k=num_factors)

# Diagonalize s
s_diag_matrix = np.diag(s)

# Reconstruct the predictions matrix
predicted_ratings = np.dot(np.dot(u, s_diag_matrix), vt)


In [43]:
# Convert to DataFrame for easier handling
predicted_ratings_df = pd.DataFrame(predicted_ratings, index=filtered_interaction_matrix.index, columns=filtered_interaction_matrix.columns)

# Function to recommend books for a user
def recommend_books(user_id, num_recommendations=5):
    # Get user's predictions
    user_row = predicted_ratings_df.loc[user_id]

    # Sort predictions
    sorted_user_predictions = user_row.sort_values(ascending=False)

    # Get user data and merge in the book information
    user_data = ratings[ratings.user_id == user_id]
    user_full = (user_data.merge(books, how='left', left_on='book_id', right_on='book_id').
                 sort_values(['rating'], ascending=False)
                 )

    print("User {0} has already rated {1} books.".format(user_id, user_full.shape[0]))
    print("Recommending the highest {0} predicted ratings books not already rated.".format(num_recommendations))

    # Recommend the highest predicted rating books that the user hasn't seen yet.
    recommendations = (books[~books['book_id'].isin(user_full['book_id'])].
                       merge(pd.DataFrame(sorted_user_predictions).reset_index(), how='left',
                             left_on='book_id',
                             right_on='book_id').
                       rename(columns={user_id: 'Predictions'}).
                       sort_values('Predictions', ascending=False).
                       iloc[:num_recommendations, :-1]
                       )

    return recommendations

# Example: Recommend 5 books for user with ID 1
recommendations_for_user = recommend_books(user_id=1, num_recommendations=5)
print(recommendations_for_user)


User 1 has already rated 117 books.
Recommending the highest 5 predicted ratings books not already rated.
     book_id  goodreads_book_id  best_book_id  work_id  books_count  \
41        63               6185          6185  1565818         2498   
142      195            2728527       2728527  2754161          116   
223      291            3591262       3591262  3633533           74   
80       118               7763          7763  1955658          126   
121      172           15823480      15823480  2507928         1492   

          isbn        isbn13  \
41   393978893  9.780394e+12   
142  385340990  9.780385e+12   
223  375414495  9.780375e+12   
80   143038095  9.780143e+12   
121  345803922  9.780346e+12   

                                               authors  \
41                       Emily Brontë, Richard J. Dunn   
142                    Mary Ann Shaffer, Annie Barrows   
223                                   Abraham Verghese   
80                                        

In [33]:
print(type(books), type(ratings), type(predicted_ratings_df))


<class 'NoneType'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>


In [51]:
from sklearn.cluster import KMeans

# Number of clusters - this is a hyperparameter you can tune
num_clusters = 10

# We'll use the reduced dimensionality matrix from the SVD step
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(sparse_interaction_matrix.T)  # Transpose to cluster books

# Add cluster information to the books DataFrame
books['cluster'] = clusters

In [52]:
clusters

array([7, 8, 6, ..., 1, 1, 1])

In [53]:
def recommend_books_cluster(user_id, num_recommendations=5):
    # Identify user's preferred clusters
    user_data = ratings[ratings.user_id == user_id]
    user_book_clusters = books[books['book_id'].isin(user_data['book_id'])]['cluster']
    preferred_clusters = user_book_clusters.value_counts().index.tolist()

    # Generate recommendations within these clusters
    recommendations = []
    for cluster in preferred_clusters:
        cluster_books = books[books['cluster'] == cluster]
        user_unrated_books = cluster_books[~cluster_books['book_id'].isin(user_data['book_id'])]
        sorted_predictions = predicted_ratings_df.loc[user_id][user_unrated_books['book_id']].sort_values(ascending=False)
        top_books = sorted_predictions.head(num_recommendations).index.tolist()
        recommendations.extend(top_books)
        if len(recommendations) >= num_recommendations:
            break

    return books[books['book_id'].isin(recommendations)].head(num_recommendations)


In [54]:
recommend_books_cluster(5678, 7)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,cluster
1422,1423,10619,10619,833191,123,340640146,9780341000000.0,Stephen King,1995.0,Rose Madder,...,76251,1714,2065,8052,23483,22705,19946,https://images.gr-assets.com/books/1375870513m...,https://images.gr-assets.com/books/1375870513s...,1
1693,1694,40317,40317,1943980,43,312349491,9780312000000.0,Janet Evanovich,2007.0,Lean Mean Thirteen,...,77820,2214,309,2120,16987,30069,28335,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...,1
1820,1821,2292384,2292384,3173189,39,312349513,9780312000000.0,Janet Evanovich,2008.0,Fearless Fourteen,...,75011,3147,435,2834,17632,28086,26024,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...,1
1958,1959,6048530,6048530,6224145,36,312383282,9780312000000.0,Janet Evanovich,2009.0,Finger Lickin' Fifteen,...,70872,3795,578,3242,17002,25909,24141,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...,1
2093,2094,9583508,11732634,14470450,37,307932230,9780308000000.0,"Janet Evanovich, Lorelei King",2011.0,Smokin' Seventeen,...,64640,4838,589,3246,16376,23607,20822,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...,1
2137,2138,7023554,7023554,7270473,37,739377728,9780739000000.0,Janet Evanovich,2009.0,Sizzling Sixteen,...,63664,3784,585,3334,16294,22899,20552,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...,1
2263,2264,11746513,11746513,16696160,33,345527712,9780346000000.0,Janet Evanovich,2011.0,Explosive Eighteen,...,59894,4637,503,3147,15891,21582,18771,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...,1


In [70]:
unique_user_ratings = ratings[ratings["user_id"] == 5678].book_id
books[books["book_id"].isin(unique_user_ratings)]


Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,cluster
1,2,3,3,4640799,491,439554934,9.780440e+12,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,8
3,4,2657,2657,3275794,487,61120081,9.780061e+12,Harper Lee,1960.0,To Kill a Mockingbird,...,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,8
8,9,960,960,3338963,311,1416524797,9.781417e+12,Dan Brown,2000.0,Angels & Demons,...,2078754,25112,77841,145740,458429,716569,680175,https://images.gr-assets.com/books/1303390735m...,https://images.gr-assets.com/books/1303390735s...,3
17,18,5,5,2402163,376,043965548X,9.780440e+12,"J.K. Rowling, Mary GrandPré, Rufus Beck",1999.0,Harry Potter and the Prisoner of Azkaban,...,1969375,36099,6716,20413,166129,509447,1266670,https://images.gr-assets.com/books/1499277281m...,https://images.gr-assets.com/books/1499277281s...,6
20,21,2,2,2809203,307,439358078,9.780439e+12,"J.K. Rowling, Mary GrandPré",2003.0,Harry Potter and the Order of the Phoenix,...,1840548,28685,9528,31577,180210,494427,1124806,https://images.gr-assets.com/books/1387141547m...,https://images.gr-assets.com/books/1387141547s...,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7046,7047,32453,32453,1049922,62,747236380,9.780747e+12,"Leigh Nichols, Dean Koontz",1984.0,Twilight,...,15878,274,191,1017,4824,5482,4364,https://images.gr-assets.com/books/1327999566m...,https://images.gr-assets.com/books/1327999566s...,1
7863,7864,9549,9549,858558,27,553574132,9.780554e+12,Steve Perry,1996.0,,...,13688,382,185,927,3880,5001,3695,https://images.gr-assets.com/books/1361481442m...,https://images.gr-assets.com/books/1361481442s...,1
8221,8222,77786,77786,209567,22,553100351,9.780553e+12,Timothy Zahn,1998.0,Star Wars: Vision of the Future,...,12908,203,60,414,2790,5081,4563,https://images.gr-assets.com/books/1328289436m...,https://images.gr-assets.com/books/1328289436s...,1
8782,8783,760110,760110,863900,25,553297988,9.780553e+12,Kevin J. Anderson,1994.0,Jedi Search (Star Wars: The Jedi Academy Trilo...,...,13060,233,291,1110,4449,4390,2820,https://images.gr-assets.com/books/1327958212m...,https://images.gr-assets.com/books/1327958212s...,1


In [105]:
state_mapping = {
    'highly_positive_rater': 0,
    'positive_rater': 1,
    'mixed_rater': 2,
    'critical_rater': 3
}

In [104]:
def determine_state_from_ratings(user_ratings):
    average_rating = user_ratings['rating'].mean()

    if average_rating >= 4.0:
        return 'highly_positive_rater'
    elif average_rating >= 3.0:
        return 'positive_rater'
    elif average_rating >= 2.0:
        return 'mixed_rater'
    else:
        return 'critical_rater'


In [112]:
def get_initial_state(user_id, ratings_df):
    if user_id not in ratings_df['user_id'].unique():
        # New user or insufficient data, assign a default state
        return np.random.randint(0, 10)
    
    # For existing users, analyze their rating patterns to determine the initial state
    user_ratings = ratings_df[ratings_df['user_id'] == user_id]
    # Implement logic to determine state from user_ratings
    initial_state = state_mapping[determine_state_from_ratings(user_ratings)]
    return initial_state


In [111]:
def calculate_reward(rating):
    # Example: linear reward based on rating
    return rating - 3  # Assuming ratings are from 1 to 5


In [113]:
def take_action(state, action, user_id, ratings_df, book_list):
    # Convert action to book ID
    recommended_book_id = book_list[action]
    user_rating = ratings_df[(ratings_df['user_id'] == user_id) & (ratings_df['book_id'] == recommended_book_id)]

    if not user_rating.empty:
        reward = calculate_reward(user_rating.iloc[0]['rating'])
        done = True
    else:
        reward = 0
        done = False

    # Update the state based on the user's latest interactions
    # For simplicity, you might keep the state unchanged or implement a logic to change it
    new_state = state  # or update_state_based_on_rating(user_id, recommended_book_id, ratings_df)

    return new_state, reward, done

In [115]:
import numpy as np

# Assuming a finite number of states and actions
book_list = ratings['book_id'].unique().tolist()

max_actions = 1000  # Set a limit to the number of unique books
unique_books = ratings['book_id'].unique()
if len(unique_books) > max_actions:
    unique_books = np.random.choice(unique_books, max_actions, replace=False)
book_to_index = {book_id: idx for idx, book_id in enumerate(unique_books)}

num_states = 4  # Adjust based on your state definitions
num_actions = len(book_list)  # Number of unique books
q_table = np.zeros((num_states, num_actions))

# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.6  # Discount factor
epsilon = 0.1  # Exploration rate

# Function to choose an action
def choose_action(state):
    if np.random.uniform(0, 1) < epsilon:
        action = np.random.choice(num_actions)
    else:
        action = np.argmax(q_table[state])
    return action

for episode in range(20):
    state = get_initial_state(5678, ratings)
    done = False

    while not done:
        action = choose_action(state)
        new_state, reward, done = take_action(action, state, 5678, ratings, book_list)
                
        # Debugging print statements
        print(f"Current State: {state}, Action: {action}, New State: {new_state}")

        # Update Q-table
        q_table[state, action] = q_table[state, action] + alpha * (reward + gamma * np.max(q_table[new_state, :]) - q_table[state, action])
        state = new_state

# Use the trained Q-table for recommendations


Current State: 1, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New State: 0
Current State: 0, Action: 0, New S

IndexError: index 644 is out of bounds for axis 0 with size 4

In [102]:
print(f"New state: {new_state}")

New state: 7446


In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
interaction_matrix_scaled = scaler.fit_transform(interaction_matrix)
interaction_matrix = pd.DataFrame(interaction_matrix_scaled, index=interaction_matrix.index, columns=interaction_matrix.columns)


In [None]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(interaction_matrix, test_size=0.2, random_state=42)
