In [5]:
# Reinstall lightfm with gcc-14 compiled to use all threads
!CC=gcc-14 pip install --no-binary lightfm lightfm --force-reinstall

Collecting lightfm
  Using cached lightfm-1.17-cp311-cp311-macosx_14_0_arm64.whl
Collecting numpy (from lightfm)
  Using cached numpy-2.1.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting scipy>=0.17.0 (from lightfm)
  Using cached scipy-1.14.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting requests (from lightfm)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting scikit-learn (from lightfm)
  Using cached scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting charset-normalizer<4,>=2 (from requests->lightfm)
  Using cached charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (33 kB)
Collecting idna<4,>=2.5 (from requests->lightfm)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests->lightfm)
  Using cached urllib3-2.2.3-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests->lightfm)
  Using cached certifi-2024.8.3

In [6]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score
from lightfm.data import Dataset

In [1]:
# Load the dataset.
# %run executes import_data_.py and makes its top-level variables available in this
# notebook. According to the log it prints, it loads several DataFrames, including
# `books_users_ratings`, which the cells below depend on.
# NOTE(review): this cell has to be executed before any cell that touches
# `books_users_ratings`; check the cell order when re-running from a fresh kernel.
# Alternative (disabled): read the cleaned ratings CSV directly.
# all_cleaned = pd.read_csv('../data/all_cleaned.csv', usecols=['user_id', 'isbn', 'book_rating'])
%run import_data_.py

Goodreads dataset loaded successfully as books_goodreads
Pandas dataframes (books_goodreads, books_big, book, users, ratings) loaded successfully
Columns in DataFrames 'users' and 'ratings' renamed
You can use the DataFrames 'books' or 'books_big' - they are exactly the same (big) dataset
loading books_ratings and books_users_ratings
Ready to go!


In [7]:
# Normalise the identifier columns to plain strings before handing them to LightFM.
# user_id is cast to int first and only then to str
# (presumably to drop a trailing ".0" if the ids were loaded as floats - confirm).
books_users_ratings['user_id'] = books_users_ratings['user_id'].astype(int).astype(str)

# The remaining columns only need a direct string cast.
for column_name in ('isbn', 'year_of_publication'):
    books_users_ratings[column_name] = books_users_ratings[column_name].astype(str)

In [10]:
# Register every distinct user id and ISBN with a fresh LightFM Dataset so that
# each one is assigned an internal integer index.
dataset = Dataset()
dataset.fit(
    users=books_users_ratings['user_id'].unique(),
    items=books_users_ratings['isbn'].unique(),
)

# Feed one (user_id, isbn, individual_rating) triple per row into the Dataset.
# `interactions` marks which (user, item) pairs exist; `weights` carries the ratings.
rating_triples = books_users_ratings[
    ['user_id', 'isbn', 'individual_rating']
].itertuples(index=False, name=None)
interactions, weights = dataset.build_interactions(rating_triples)

# mapping() yields (user ids, user features, item ids, item features);
# only the two id -> internal index dictionaries are needed here.
user_mapping, _, item_mapping, _ = dataset.mapping()


In [11]:
# Define the LightFM model with the WARP (Weighted Approximate-Rank Pairwise) ranking loss.
# Only `interactions` is passed to fit(), so the rating values held in `weights`
# are not used for training here.
# random_state is fixed so the random initialisation is repeatable between runs
# (multi-threaded training may still introduce small run-to-run differences).
model = LightFM(loss='warp', random_state=42)

# Train the model on the interactions matrix
model.fit(interactions, epochs=30, num_threads=2)


<lightfm.lightfm.LightFM at 0x321324b90>

In [19]:
import joblib

# Serialise the fitted LightFM model to disk under ../streamlit_files.
# joblib.dump returns the list of written file names, which the cell displays.
model_path = '../streamlit_files/lightfm_model.pkl'
joblib.dump(model, model_path)


['../streamlit_files/lightfm_model.pkl']

In [12]:
def recommend_books(model, interactions, user_id, user_mapping, item_mapping, num_recommendations=10):
    """Return the ISBNs of the highest-scoring books for one user.

    Parameters
    ----------
    model : object exposing ``predict(user_idx, item_indices)``
        The fitted recommender; ``predict`` must return one score per item index.
    interactions : matrix-like with a ``shape`` attribute
        Only ``shape[1]`` (the number of items) is read.
    user_id : any
        External user id; it is converted with ``str()`` before the lookup.
    user_mapping : dict
        Maps external user id (str) -> internal user index.
    item_mapping : dict
        Maps ISBN -> internal item index. Indices are assumed to be unique.
    num_recommendations : int, default 10
        Maximum number of ISBNs to return.

    Returns
    -------
    list
        ISBNs ordered from highest to lowest predicted score.

    Raises
    ------
    ValueError
        If ``user_id`` is not a key of ``user_mapping``.
    """
    # Ensure the user_id is a string, matching the keys of user_mapping
    user_id = str(user_id)

    if user_id not in user_mapping:
        raise ValueError(f"User ID {user_id} is not found in the dataset.")

    # Internal index of the requested user
    user_idx = user_mapping[user_id]

    # Score every item for this user
    scores = model.predict(user_idx, np.arange(interactions.shape[1]))

    # Indices of the best-scoring items, highest score first
    top_items = np.argsort(-scores)[:num_recommendations]

    # Invert the ISBN -> index mapping once, instead of rebuilding the key/value
    # lists and scanning them linearly for every recommended item.
    index_to_isbn = {index: isbn for isbn, index in item_mapping.items()}

    # int() turns the numpy integer into a plain dict key
    return [index_to_isbn[int(item)] for item in top_items]


In [20]:
# Example user to generate recommendations for; replace with any user_id present
# in the dataset. The id is already a string literal, so no extra conversion is needed.
user_id_to_recommend = '16634'

recommended_books = recommend_books(
    model=model,
    interactions=interactions,
    user_id=user_id_to_recommend,
    user_mapping=user_mapping,
    item_mapping=item_mapping,
)

