In [1]:
import pandas as pd

In [15]:
# Read the raw ratings and book-metadata CSVs from the working directory.
df_ratings, df_books = (
    pd.read_csv(csv_name) for csv_name in ('ratings.csv', 'books.csv')
)

In [17]:
# A notebook cell only renders its final expression, so the ratings preview has
# to be shown explicitly; otherwise it is computed and silently discarded.
# `display` is the built-in that IPython/Jupyter injects into every kernel.
display(df_ratings.head(5))
df_books.head(5)

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [21]:
# Clean and filter data
# Thresholds: a user must have rated at least MIN_USER_RATINGS books to be kept;
# MIN_BOOK_RATINGS is defined here as well and is applied by the book filter.
MIN_USER_RATINGS = 15
MIN_BOOK_RATINGS = 10

try:
    ratings_df = pd.read_csv('ratings.csv')
    books_df = pd.read_csv('books.csv')
except FileNotFoundError:
    print("Error: Ensure 'ratings.csv' and 'books.csv' are in the current directory.")
    # Re-raise: continuing would either hit a NameError a few lines below or,
    # worse, silently reuse stale frames left in the kernel by an earlier run.
    raise

# Downstream cells refer to the rated book as 'item_id'.
ratings_df = ratings_df.rename(columns={'book_id': 'item_id'})

# Number of ratings contributed by each user.
user_counts = ratings_df['user_id'].value_counts()

# Users that meet the minimum-activity threshold.
active_users = user_counts[user_counts >= MIN_USER_RATINGS].index

filtered_ratings_df = ratings_df[ratings_df['user_id'].isin(active_users)]

print(f"Total ratings before user filter: {len(ratings_df)}")
print(f"Total ratings after user filter: {len(filtered_ratings_df)}")
print(f"Active users remaining: {len(active_users)}")

Total ratings before user filter: 981756
Total ratings after user filter: 788206
Active users remaining: 18546


In [23]:
# Count how many ratings each book still has after the user filter
book_counts = filtered_ratings_df['item_id'].value_counts()
popular_books = book_counts.loc[lambda counts: counts >= MIN_BOOK_RATINGS].index

# Keep only the ratings whose book reaches the MIN_BOOK_RATINGS threshold
is_popular = filtered_ratings_df['item_id'].isin(popular_books)
final_ratings_df = filtered_ratings_df.loc[is_popular]

n_final_ratings = len(final_ratings_df)
print(f"\nTotal ratings after book filter: {n_final_ratings}")
print(f"Popular books remaining: {len(popular_books)}")
print(f"Final size of the dataset for training: {n_final_ratings}")



Total ratings after book filter: 787925
Popular books remaining: 9955
Final size of the dataset for training: 787925


In [25]:
# Get filtered books
# books.csv carries two identifier columns: `id` (1, 2, 3, ...) and `book_id`
# (2767052, 3, 41865, ...). The ids stored in ratings.csv can only follow one of
# those numbering schemes, so pick the column that actually covers the rated
# items instead of assuming it is `book_id`. `book_id` is listed first so that a
# tie keeps the previous behaviour.
# NOTE(review): confirm against ratings.csv which column is the intended join key.
rated_item_ids = final_ratings_df['item_id'].unique()
candidate_keys = [col for col in ('book_id', 'id') if col in books_df.columns]
book_key = max(candidate_keys, key=lambda col: books_df[col].isin(rated_item_ids).sum())

final_books_df = books_df[books_df[book_key].isin(rated_item_ids)]

# Surface an incomplete join here rather than shipping a lookup table with holes.
missing_titles = len(rated_item_ids) - final_books_df[book_key].nunique()
if missing_titles:
    print(f"Warning: {missing_titles} rated items have no match in books.csv (joined on '{book_key}').")

# Lookup table: rated item id -> book title.
book_id_to_title = pd.Series(
    final_books_df['title'].values, index=final_books_df[book_key]
).to_dict()

In [29]:
# Libraries for surprise
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy

In [31]:
# Tell Surprise the lower and upper bound of the rating scale
reader = Reader(rating_scale=(1, 5))

# Hand Surprise the three columns it needs: user id, item id, rating
rating_triples = final_ratings_df[['user_id', 'item_id', 'rating']]
data = Dataset.load_from_df(rating_triples, reader)

# Hold out 20% of the ratings for evaluation (seeded so the split is repeatable)
trainset, testset = train_test_split(data, test_size=0.20, random_state=42)

print(f"Training set size: {trainset.n_ratings} ratings")
print(f"Test set size: {len(testset)} ratings")

Training set size: 630340 ratings
Test set size: 157585 ratings


In [33]:
# Build the SVD model, fit it on the training split, then score the held-out split

svd_params = dict(n_factors=100, n_epochs=20, random_state=42, verbose=True)
model = SVD(**svd_params)

print("\nStarting Model Training...")
model.fit(trainset)
print("Model Training Complete.")

# Predict every (user, item) pair in the hold-out set
predictions = model.test(testset)

# accuracy.rmse prints the score itself (verbose=True) and also returns it
rmse = accuracy.rmse(predictions, verbose=True)

# A good RMSE for recommendation systems is generally around 0.8 to 1.0.
print(f"\nModel RMSE on Test Set: {rmse:.4f}")


Starting Model Training...
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Model Training Complete.
RMSE: 0.8329

Model RMSE on Test Set: 0.8329


In [35]:
# Prep model for fast API
import pickle
import json

MODEL_PATH = 'svd_goodreads_model.pkl'

# Re-fit the same model object on every available rating before exporting it.
full_trainset = data.build_full_trainset()
model.fit(full_trainset)
print("\nRe-trained model on the full dataset for deployment.")

with open(MODEL_PATH, 'wb') as file:
    pickle.dump(model, file)
print(f"Trained SVD model saved to: {MODEL_PATH}")

# Save book_id
MAPPING_PATH = 'book_id_to_title.json'

# Resolve the mapping BEFORE opening the output file: opening in 'w' mode
# truncates it, so a failure afterwards would leave an empty, invalid JSON file.
try:
    # JSON object keys are always strings; json.dump would coerce the ids anyway,
    # so do it explicitly. Consumers must look titles up with str(book_id).
    title_map = {str(book_id): title for book_id, title in book_id_to_title.items()}
except NameError:
    print("Error: 'book_id_to_title' dictionary not found. Ensure the Pandas script was run.")
    # Re-raise so the cell visibly fails instead of pretending the export worked.
    raise

with open(MAPPING_PATH, 'w') as file:
    json.dump(title_map, file)
print(f"Book ID to Title map saved to: {MAPPING_PATH}")

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19

Re-trained model on the full dataset for deployment.
Trained SVD model saved to: svd_goodreads_model.pkl
Book ID to Title map saved to: book_id_to_title.json
