In [1]:
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
import numpy as np

# Load the dataset
all_cleaned = pd.read_csv('data/all_cleaned.csv', usecols=['user_id', 'isbn', 'book_rating', 'mod_book_title', 'mod_book_author', 'genre'])

# Convert 'user_id' and 'isbn' to strings to ensure compatibility with LightFM
all_cleaned['user_id'] = all_cleaned['user_id'].astype(str)
all_cleaned['isbn'] = all_cleaned['isbn'].astype(str)




In [2]:
# categorize genre

# Define the genre mapping
genre_mapping = {
    'fiction': 'Fiction',
    'romance': 'Romance',
    'historical': 'Historical Fiction',
    'thrillers': 'Thriller',
    'suspense': 'Suspense',
    'juvenile fiction': 'Juvenile Fiction',
    'children\'s literature': 'Children\'s Literature',
    'guidebooks': 'Guidebooks',
    'poetry': 'Poetry',
    'songs': 'Music',
    'ballads': 'Music',
    'unknown': 'Unknown',
    'error fetching data': 'Unknown'
}

# Function to categorize genres
def categorize_genre(genre):
    genre_lower = genre.lower()
    for keyword, category in genre_mapping.items():
        if keyword in genre_lower:
            return category
    return 'Other'

# Apply the function to the genre column
all_cleaned['categorized_genre'] = all_cleaned['genre'].apply(categorize_genre)

print(all_cleaned[['genre', 'categorized_genre']])

                                                    genre categorized_genre
0                                                 unknown           Unknown
1                                                 unknown           Unknown
2                                                 unknown           Unknown
3                                                 unknown           Unknown
4                                                 unknown           Unknown
...                                                   ...               ...
183595  Accident victims, American fiction (fictional ...           Fiction
183596  Open Library Staff Picks, Satire, War stories,...           Fiction
183597                                Error fetching data           Unknown
183598  Bicycle touring, Description and travel, Journ...        Guidebooks
183599  Bicycle touring, Description and travel, Journ...        Guidebooks

[183600 rows x 2 columns]


In [3]:
# dropping the genre column
all_cleaned = all_cleaned.drop(columns=['genre'])
all_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183600 entries, 0 to 183599
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   isbn               183600 non-null  object 
 1   user_id            183600 non-null  object 
 2   book_rating        183600 non-null  float64
 3   mod_book_author    183600 non-null  object 
 4   mod_book_title     183600 non-null  object 
 5   categorized_genre  183600 non-null  object 
dtypes: float64(1), object(5)
memory usage: 8.4+ MB


In [4]:
# Create a LightFM dataset object
dataset = Dataset()

# Fit the dataset to include all unique users, items, and item features
dataset.fit(
    users=all_cleaned['user_id'].unique(),
    items=all_cleaned['isbn'].unique(),
    item_features=(all_cleaned['mod_book_author'].unique().tolist() + 
                   all_cleaned['categorized_genre'].unique().tolist())
)

# Build the user-item interaction matrix based on explicit feedback (book_rating)
(interactions, weights) = dataset.build_interactions(
    [(x[0], x[1], x[2]) for x in all_cleaned[['user_id', 'isbn', 'book_rating']].values]

)

# Retrieve user and item mappings

user_mapping = {user_id: idx for idx, user_id in enumerate(all_cleaned['user_id'].unique())}
item_mapping = {isbn: idx for idx, isbn in enumerate(all_cleaned['isbn'].unique())}
isbn_info = {row['isbn']: {'mod_book_title': row['mod_book_title'], 'mod_book_author': row['mod_book_author']} for _, row in all_cleaned.iterrows()}



In [6]:
#Create item features by combining author and genre
item_features = dataset.build_item_features(
    [(row['isbn'], [row['mod_book_author'], row['categorized_genre']]) for _, row in all_cleaned.iterrows()]
)


In [7]:
# Define the LightFM model using the logistic loss function for explicit feedback
model = LightFM(loss='logistic')

# Train the model on the interactions matrix with item features
model.fit(interactions, item_features=item_features, epochs=30, num_threads=2)


<lightfm.lightfm.LightFM at 0x108440790>

In [7]:
all_cleaned.user_id.head()

0     16634.0
1     87141.0
2    169736.0
3    208406.0
4    230496.0
Name: user_id, dtype: object

In [8]:
# Define the recommend_books function
def recommend_books(model, interactions, user_id, user_mapping, item_mapping, isbn_info, num_recommendations=5):
    # Ensure the user_id is a string
    user_id = str(user_id)

    # Check if the user_id exists in the user mapping
    if user_id not in user_mapping:
        raise ValueError(f"User ID {user_id} is not found in the dataset.")

    # Get the internal index for the user_id
    user_idx = user_mapping[user_id]

    # Predict scores for all items for the given user
    scores = model.predict(user_idx, np.arange(interactions.shape[1]))

    # Get the indices of the top scores
    top_items = np.argsort(-scores)[:num_recommendations]

    # Map the indices back to ISBNs and fetch their title and author
    recommended_books = []
    for item in top_items:
        isbn = list(item_mapping.keys())[list(item_mapping.values()).index(item)]
        book_info = isbn_info.get(isbn, {'mod_book_title': 'Unknown Title', 'mod_book_author': 'Unknown Author'})
        recommended_books.append({'isbn': isbn, 'title': book_info['mod_book_title'], 'author': book_info['mod_book_author']})

    return recommended_books

In [9]:
# Example usage
user_id_to_recommend = '16634.0'  # Replace with an actual user_id from your dataset

try:
    # Generate recommendations
    recommended_books = recommend_books(model, interactions, user_id_to_recommend, user_mapping, item_mapping, isbn_info)
    
    # Display the recommended books with titles and authors
    for book in recommended_books:
        print(f"ISBN: {book['isbn']}, Title: {book['title']}, Author: {book['author']}")
except ValueError as e:
    print(e)

ISBN: 1883938813, Title: the ghost dance insurrection a jazzman novel, Author: jack random
ISBN: 1878093401, Title: marvin composes a tea and other humorous stories, Author: judith hunt
ISBN: 1878093398, Title: the ghostly bell ringer and other mysteries, Author: highlights for children
ISBN: 679721886, Title: the woman warrior memoirs of a girlhood among ghosts, Author: maxine hong kingston
ISBN: 1863733671, Title: on writing books for children, Author: jenny wagner
