In [1]:
import pandas as pd
import ast
import re

In [2]:
# Load the cleaned GoodReads book catalogue; the path is relative to this notebook's directory.
df = pd.read_csv('../Data/GoodReads_100k_books_cleaned.csv')

In [None]:
def isbn_list_to_book_details(isbn_list, df):
    """
    Look up display details for every ISBN in ``isbn_list``.

    Parameters:
    isbn_list (list): ISBN values to look up, compared with ``==`` against df['isbn'].
    df (DataFrame): The dataset containing book details.

    Returns:
    list: One entry per requested ISBN, in input order. A found ISBN yields a
          dict with the keys title, author, genre, rating, totalratings, isbn,
          pages and img (taken from the first matching row); a missing ISBN
          yields a "not found" message string instead of a dict.
    """
    detail_fields = (
        'title', 'author', 'genre', 'rating',
        'totalratings', 'isbn', 'pages', 'img',
    )

    def describe(isbn):
        # All rows whose ISBN equals the requested one; only the first is reported.
        matches = df[df['isbn'] == isbn]
        if matches.empty:
            return f"Book with ISBN {isbn} not found."
        return {field: matches[field].values[0] for field in detail_fields}

    return [describe(isbn) for isbn in isbn_list]

In [None]:
def preprocess_for_content_model(df):
    """
    Build the cleaned text columns used by the content-based model.

    Works on a copy of ``df``; the caller's frame is not modified.

    Parameters:
    df (DataFrame): Must contain 'title', 'author', 'desc' and 'genre_list'.

    Returns:
    DataFrame: The copy, with 'genre_list' parsed into Python lists and the
               added columns 'title_clean', 'author_clean', 'desc_clean',
               'genres_clean' and 'combined_features' (the four cleaned
               columns joined with single spaces).
    """
    def parse_genre_list(val):
        """Turn a genre cell (list, list-literal string, or comma string) into a list."""
        if isinstance(val, list):
            return val
        if not isinstance(val, str):
            return []
        try:
            parsed = ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Only the failures literal_eval raises for malformed input are
            # swallowed; KeyboardInterrupt/SystemExit now propagate.
            parsed = None
        if isinstance(parsed, list):
            return parsed
        # Fallback: treat the text as a comma-separated list, optionally bracketed.
        s = val.strip().lstrip('[').rstrip(']')
        items = []
        for part in s.split(','):
            item = part.strip().strip("'\" ")
            # Entries containing '...' are truncated labels and are dropped.
            if item and '...' not in item:
                items.append(item)
        return items

    def clean_text(x):
        """Lower-case, replace punctuation with spaces, collapse whitespace; non-strings become ''."""
        if not isinstance(x, str):
            return ''
        x = x.lower()
        x = re.sub(r'[^\w\s]', ' ', x)
        x = re.sub(r'\s+', ' ', x)
        return x.strip()

    df = df.copy()

    df['genre_list'] = df['genre_list'].apply(parse_genre_list)

    df['title_clean'] = df['title'].apply(clean_text)
    df['author_clean'] = df['author'].apply(clean_text)
    df['desc_clean'] = df['desc'].apply(clean_text)
    df['genres_clean'] = df['genre_list'].apply(
        lambda genres: ' '.join(clean_text(g) for g in genres)
    )
    df['combined_features'] = (
        df['title_clean'] + ' ' +
        df['author_clean'] + ' ' +
        df['desc_clean'] + ' ' +
        df['genres_clean']
    )

    return df

In [6]:
# Add the cleaned text columns and `combined_features`; `df` itself is left untouched (the function copies it).
df_clean = preprocess_for_content_model(df)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import numpy as np

def build_sparse_similarity(df_clean, top_k=20):
    """
    Compute, for every row, the row positions of its most similar books.

    The 'combined_features' text is vectorised with TF-IDF (English stop
    words removed, vocabulary capped at 5000 terms) and neighbours are found
    by cosine distance.

    Parameters:
    df_clean (DataFrame): Frame with a 'combined_features' text column.
    top_k (int): Maximum number of neighbours kept per row.

    Returns:
    dict: Maps each row position (0..n-1) to a list of up to ``top_k``
          neighbour row positions, nearest first, never containing the row
          itself.
    """
    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = tfidf.fit_transform(df_clean['combined_features'])

    # One extra neighbour is requested because a row is normally its own
    # nearest hit. Clamp to the row count so small frames do not raise.
    n_neighbors = min(top_k + 1, tfidf_matrix.shape[0])
    model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    model.fit(tfidf_matrix)

    _, indices = model.kneighbors(tfidf_matrix)

    # Drop the row itself by value rather than by position: with duplicate or
    # all-zero vectors the row is not guaranteed to come first, so slicing
    # with [1:] could discard a real neighbour and keep the row itself.
    similarity_dict = {
        i: [idx for idx in neighbor_list if idx != i][:top_k]
        for i, neighbor_list in enumerate(indices)
    }

    return similarity_dict

In [8]:
# Despite the name, this is a dict mapping each row position to a list of neighbour row positions, not a matrix.
similarity_matrix = build_sparse_similarity(df_clean)

In [None]:
def recommend_similar_isbns(isbn, df, similarity_dict):
    """
    Return the ISBNs of the books most similar to ``isbn``.

    Parameters:
    isbn: ISBN to look up, compared with ``==`` against df['isbn'].
    df (DataFrame): Frame with an 'isbn' column, in the same row order that
                    was used to build ``similarity_dict``.
    similarity_dict (dict): Maps a row position to a list of neighbour row
                            positions.

    Returns:
    list: ISBNs of the neighbours of the first matching row, or [] when the
          ISBN is not present or has no entry in ``similarity_dict``.
    """
    # similarity_dict is keyed by row *position*, so locate the book by
    # position. Using the index *label* (df[...].index[0]) only works when
    # the frame happens to carry a default 0..n-1 index.
    positions = (df['isbn'] == isbn).to_numpy().nonzero()[0]
    if positions.size == 0:
        return []

    row_pos = int(positions[0])
    similar_idxs = list(similarity_dict.get(row_pos, []))
    return df['isbn'].iloc[similar_idxs].tolist()

In [12]:
# ISBNs of the books most similar to '002914180X' (an empty list if that ISBN is not in df_clean).
recom = recommend_similar_isbns('002914180X', df_clean, similarity_matrix)

In [17]:
# Show the details of the query book itself, for comparison with its recommendations.
isbn_list_to_book_details(['002914180X'], df_clean)

[{'title': 'Between Two Fires: American Indians in the Civil War',
  'author': 'Laurence M. Hauptman',
  'genre': 'History,Military History,Civil War,American History,American Civil War,Nonfiction,North American Hi...,American History,Native Americans',
  'rating': 3.52,
  'totalratings': 33,
  'isbn': '002914180X',
  'pages': 0,
  'img': 'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1387738765l/1001053.jpg'}]

In [15]:
# Show the details of every recommended ISBN.
isbn_list_to_book_details(recom, df_clean)

[{'title': "Chancellorsville: Lee's Greatest Battle",
  'author': 'Edward J. Stackpole',
  'genre': 'Military History,Civil War,History,Nonfiction,American History,American Civil War,North American Hi...,American History,American Civil War,Civil War History,Military,Military History',
  'rating': 3.6,
  'totalratings': 45,
  'isbn': '811722384',
  'pages': 0,
  'img': 'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1347720555l/1644548.jpg'},
 {'title': 'Jeb Stuart: The Last Cavalier',
  'author': 'Burke Davis',
  'genre': 'Military History,Civil War,Biography,History,American History,American Civil War,North American Hi...,American History,American Civil War,Civil War History,War,Military Fiction,Nonfiction,War',
  'rating': 3.94,
  'totalratings': 170,
  'isbn': '517185970',
  'pages': 462,
  'img': 'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1387736934l/1083113.jpg'},
 {'title': 'The Blue and the Gray (2 Vols in 1)',
  'author': 'Henr

In [18]:
import ast
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [19]:
class ContentBasedRecommender:
    """
    Content-based book recommender over TF-IDF text features.

    Constructing an instance preprocesses the given frame and immediately
    computes, for every row, the positions of its nearest rows by cosine
    distance. ``recommend_by_isbn`` then answers lookups from that table.
    """

    def __init__(self, df, top_k=20):
        """
        Parameters:
        df (DataFrame): Must contain 'title', 'author', 'desc' and
                        'genre_list'; 'isbn' is needed by recommend_by_isbn.
        top_k (int): Maximum number of recommendations kept per book.
        """
        self.df_original = df.copy()  # untouched copy of the input frame
        self.top_k = top_k
        self.df_clean = self._preprocess(df)
        # row position -> list of neighbour row positions
        self.similarity_dict = self._build_sparse_similarity()

    def _parse_genre_list(self, val):
        """Turn a genre cell (list, list-literal string, or comma string) into a list."""
        if isinstance(val, list):
            return val
        if not isinstance(val, str):
            return []
        try:
            parsed = ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Only the failures literal_eval raises for malformed input are
            # swallowed; KeyboardInterrupt/SystemExit now propagate.
            parsed = None
        if isinstance(parsed, list):
            return parsed
        # Fallback: treat the text as a comma-separated list, optionally bracketed.
        s = val.strip().lstrip('[').rstrip(']')
        items = []
        for part in s.split(','):
            item = part.strip().strip("'\" ")
            # Entries containing '...' are truncated labels and are dropped.
            if item and '...' not in item:
                items.append(item)
        return items

    def _clean_text(self, x):
        """Lower-case, replace punctuation with spaces, collapse whitespace; non-strings become ''."""
        if not isinstance(x, str):
            return ''
        x = x.lower()
        x = re.sub(r'[^\w\s]', ' ', x)
        x = re.sub(r'\s+', ' ', x)
        return x.strip()

    def _preprocess(self, df):
        """
        Return a copy of ``df`` with parsed genres and cleaned text columns.

        Adds 'title_clean', 'author_clean', 'desc_clean', 'genres_clean' and
        'combined_features' (the four cleaned columns joined by spaces).
        """
        df = df.copy()

        # Clean and extract features
        df['genre_list'] = df['genre_list'].apply(self._parse_genre_list)
        df['title_clean'] = df['title'].apply(self._clean_text)
        df['author_clean'] = df['author'].apply(self._clean_text)
        df['desc_clean'] = df['desc'].apply(self._clean_text)
        df['genres_clean'] = df['genre_list'].apply(
            lambda genres: ' '.join(self._clean_text(g) for g in genres)
        )

        df['combined_features'] = (
            df['title_clean'] + ' ' +
            df['author_clean'] + ' ' +
            df['desc_clean'] + ' ' +
            df['genres_clean']
        )
        return df

    def _build_sparse_similarity(self):
        """
        Build the neighbour lookup from ``self.df_clean['combined_features']``.

        Returns:
        dict: row position -> list of up to ``top_k`` neighbour row
              positions, nearest first, never containing the row itself.
        """
        tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
        tfidf_matrix = tfidf.fit_transform(self.df_clean['combined_features'])

        # One extra neighbour is requested because a row is normally its own
        # nearest hit. Clamp to the row count so small frames do not raise.
        n_neighbors = min(self.top_k + 1, tfidf_matrix.shape[0])
        model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
        model.fit(tfidf_matrix)

        _, indices = model.kneighbors(tfidf_matrix)

        # Drop the row itself by value rather than by position: with
        # duplicate or all-zero vectors the row is not guaranteed to come
        # first, so [1:] could discard a real neighbour and keep the row itself.
        similarity_dict = {
            i: [idx for idx in neighbor_list if idx != i][:self.top_k]
            for i, neighbor_list in enumerate(indices)
        }
        return similarity_dict

    def recommend_by_isbn(self, isbn):
        """
        Return the ISBNs of the books most similar to ``isbn``.

        Returns [] when the ISBN is not present in ``self.df_clean``.
        """
        # similarity_dict is keyed by row *position*, so locate the book by
        # position instead of by index label; the two only coincide for a
        # default 0..n-1 index.
        positions = (self.df_clean['isbn'] == isbn).to_numpy().nonzero()[0]
        if positions.size == 0:
            return []

        row_pos = int(positions[0])
        similar_idxs = list(self.similarity_dict.get(row_pos, []))
        return self.df_clean['isbn'].iloc[similar_idxs].tolist()


In [8]:
import ast
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pickle
from scipy.sparse import save_npz, load_npz

class ContentBasedRecommender:
    """
    Content-based book recommender over TF-IDF text features, with persistence.

    Typical use: ``preprocess(df)`` -> ``build_similarity()`` ->
    ``recommend_by_isbn(isbn)``. ``save``/``load`` write and restore the
    fitted artefacts under a common path prefix.
    """

    def __init__(self, top_k=20):
        """
        Parameters:
        top_k (int): Maximum number of recommendations kept per book.
        """
        self.top_k = top_k
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
        # One extra neighbour because a row is normally its own nearest hit.
        self.model = NearestNeighbors(n_neighbors=top_k + 1, metric='cosine')
        self.similarity_dict = {}  # row position -> list of neighbour row positions
        self.df_clean = None       # set by preprocess() or load()
        self.tfidf_matrix = None   # set by build_similarity() or load()

    def _parse_genre_list(self, val):
        """Turn a genre cell (list, list-literal string, or comma string) into a list."""
        if isinstance(val, list):
            return val
        if not isinstance(val, str):
            return []
        try:
            parsed = ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Only the failures literal_eval raises for malformed input are
            # swallowed; KeyboardInterrupt/SystemExit now propagate.
            parsed = None
        if isinstance(parsed, list):
            return parsed
        # Fallback: treat the text as a comma-separated list, optionally bracketed.
        s = val.strip().lstrip('[').rstrip(']')
        items = []
        for part in s.split(','):
            item = part.strip().strip("'\" ")
            # Entries containing '...' are truncated labels and are dropped.
            if item and '...' not in item:
                items.append(item)
        return items

    def _clean_text(self, x):
        """Lower-case, replace punctuation with spaces, collapse whitespace; non-strings become ''."""
        if not isinstance(x, str):
            return ''
        x = x.lower()
        x = re.sub(r'[^\w\s]', ' ', x)
        x = re.sub(r'\s+', ' ', x)
        return x.strip()

    def preprocess(self, df):
        """
        Build the cleaned text columns on a copy of ``df`` and keep it as ``self.df_clean``.

        Parameters:
        df (DataFrame): Must contain 'title', 'author', 'desc' and 'genre_list'.

        Returns:
        DataFrame: The processed copy (the same object stored in ``self.df_clean``).
        """
        df = df.copy()
        df['genre_list'] = df['genre_list'].apply(self._parse_genre_list)
        df['title_clean'] = df['title'].apply(self._clean_text)
        df['author_clean'] = df['author'].apply(self._clean_text)
        df['desc_clean'] = df['desc'].apply(self._clean_text)
        df['genres_clean'] = df['genre_list'].apply(
            lambda genres: ' '.join(self._clean_text(g) for g in genres)
        )
        df['combined_features'] = (
            df['title_clean'] + ' ' +
            df['author_clean'] + ' ' +
            df['desc_clean'] + ' ' +
            df['genres_clean']
        )
        self.df_clean = df
        return df

    def build_similarity(self):
        """
        Fit the vectorizer and neighbour model, then fill ``self.similarity_dict``.

        Requires ``preprocess()`` to have been called. Each row position maps
        to up to ``top_k`` neighbour row positions, nearest first, never
        containing the row itself.
        """
        tfidf_matrix = self.vectorizer.fit_transform(self.df_clean['combined_features'])
        self.tfidf_matrix = tfidf_matrix
        self.model.fit(tfidf_matrix)

        # Clamp to the row count so frames smaller than top_k + 1 do not raise.
        n_neighbors = min(self.top_k + 1, tfidf_matrix.shape[0])
        _, indices = self.model.kneighbors(tfidf_matrix, n_neighbors=n_neighbors)

        # Drop the row itself by value rather than by position: with
        # duplicate or all-zero vectors the row is not guaranteed to come
        # first, so [1:] could discard a real neighbour and keep the row itself.
        self.similarity_dict = {
            i: [idx for idx in neighbors if idx != i][:self.top_k]
            for i, neighbors in enumerate(indices)
        }

    def recommend_by_isbn(self, isbn):
        """
        Return the ISBNs of the books most similar to ``isbn``.

        Returns [] when the ISBN is not present in ``self.df_clean``.
        """
        # similarity_dict is keyed by row *position*, so locate the book by
        # position instead of by index label; the two only coincide for a
        # default 0..n-1 index.
        positions = (self.df_clean['isbn'] == isbn).to_numpy().nonzero()[0]
        if positions.size == 0:
            return []
        row_pos = int(positions[0])
        similar_idxs = list(self.similarity_dict.get(row_pos, []))
        return self.df_clean['isbn'].iloc[similar_idxs].tolist()

    def save(self, path_prefix):
        """
        Write the fitted artefacts to files that share ``path_prefix``.

        Files written: ``_vectorizer.pkl``, ``_model.pkl``,
        ``_similarity_dict.pkl``, ``_df_clean.csv`` (ISBN plus cleaned text
        columns only) and ``_tfidf_matrix.npz``. The parent directory is
        created if missing. ``self.df_clean`` is left unchanged.
        """
        import os

        columns_to_keep = ['isbn', 'title_clean', 'author_clean', 'desc_clean', 'genres_clean', 'combined_features']
        # Slice into a local so saving does not silently strip columns from
        # the in-memory frame.
        df_to_save = self.df_clean[columns_to_keep]

        parent_dir = os.path.dirname(path_prefix)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(f'{path_prefix}_vectorizer.pkl', 'wb') as f:
            pickle.dump(self.vectorizer, f)
        with open(f'{path_prefix}_model.pkl', 'wb') as f:
            pickle.dump(self.model, f)
        with open(f'{path_prefix}_similarity_dict.pkl', 'wb') as f:
            pickle.dump(self.similarity_dict, f)
        df_to_save.to_csv(f'{path_prefix}_df_clean.csv', index=False)
        save_npz(f'{path_prefix}_tfidf_matrix.npz', self.tfidf_matrix)

    def load(self, path_prefix):
        """
        Restore the artefacts written by ``save`` from the same ``path_prefix``.

        After loading, ``self.df_clean`` holds only the columns that ``save``
        wrote to the CSV.
        """
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # produced by a trusted save().
        with open(f'{path_prefix}_vectorizer.pkl', 'rb') as f:
            self.vectorizer = pickle.load(f)
        with open(f'{path_prefix}_model.pkl', 'rb') as f:
            self.model = pickle.load(f)
        with open(f'{path_prefix}_similarity_dict.pkl', 'rb') as f:
            self.similarity_dict = pickle.load(f)
        self.df_clean = pd.read_csv(f'{path_prefix}_df_clean.csv')
        self.tfidf_matrix = load_npz(f'{path_prefix}_tfidf_matrix.npz')

In [9]:
import os

# Fit the recommender: preprocess the raw frame, then build the TF-IDF neighbour lookup.
recommender = ContentBasedRecommender(top_k=20)
df_clean = recommender.preprocess(df)  # also stored on the instance as recommender.df_clean
recommender.build_similarity()

In [10]:
# Persist the fitted artefacts as files sharing the "model_assets/book_recommender" prefix.
recommender.save("model_assets/book_recommender")

In [None]:
# Restore a recommender from the saved artefacts and query it.
new_recommender = ContentBasedRecommender()
new_recommender.load("model_assets/book_recommender")
# "123456789X" is presumably a placeholder ISBN; recommend_by_isbn returns [] for ISBNs not in the data.
recommendations = new_recommender.recommend_by_isbn("123456789X")

In [13]:
# Display the list returned by recommend_by_isbn.
recommendations

[]

In [None]:
import ast
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pickle
from scipy.sparse import save_npz, load_npz

class ContentBasedRecommender:
    """
    Content-based book recommender over TF-IDF text features, with persistence.

    Typical use: ``preprocess(df)`` -> ``build_similarity()`` ->
    ``recommend_by_isbn(isbn)``. ``save``/``load`` write and restore the
    fitted artefacts under a common path prefix.
    """

    def __init__(self, top_k=20):
        """
        Parameters:
        top_k (int): Maximum number of recommendations kept per book.
        """
        self.top_k = top_k
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
        # One extra neighbour because a row is normally its own nearest hit.
        self.model = NearestNeighbors(n_neighbors=top_k + 1, metric='cosine')
        self.similarity_dict = {}  # row position -> list of neighbour row positions
        self.df_clean = None       # set by preprocess() or load()
        self.tfidf_matrix = None   # set by build_similarity() or load()

    def _parse_genre_list(self, val):
        """Turn a genre cell (list, list-literal string, or comma string) into a list."""
        if isinstance(val, list):
            return val
        if not isinstance(val, str):
            return []
        try:
            parsed = ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Only the failures literal_eval raises for malformed input are
            # swallowed; KeyboardInterrupt/SystemExit now propagate.
            parsed = None
        if isinstance(parsed, list):
            return parsed
        # Fallback: treat the text as a comma-separated list, optionally bracketed.
        s = val.strip().lstrip('[').rstrip(']')
        items = []
        for part in s.split(','):
            item = part.strip().strip("'\" ")
            # Entries containing '...' are truncated labels and are dropped.
            if item and '...' not in item:
                items.append(item)
        return items

    def _clean_text(self, x):
        """Lower-case, replace punctuation with spaces, collapse whitespace; non-strings become ''."""
        if not isinstance(x, str):
            return ''
        x = x.lower()
        x = re.sub(r'[^\w\s]', ' ', x)
        x = re.sub(r'\s+', ' ', x)
        return x.strip()

    def preprocess(self, df):
        """
        Build the cleaned text columns on a copy of ``df`` and keep it as ``self.df_clean``.

        Parameters:
        df (DataFrame): Must contain 'title', 'author', 'desc' and 'genre_list'.

        Returns:
        DataFrame: The processed copy (the same object stored in ``self.df_clean``).

        Raises:
        ValueError: If one of the required columns is missing.
        """
        required_columns = ['title', 'author', 'desc', 'genre_list']
        for col in required_columns:
            if col not in df.columns:
                raise ValueError(f"Missing required column: {col}")

        df = df.copy()
        df['genre_list'] = df['genre_list'].apply(self._parse_genre_list)
        df['title_clean'] = df['title'].apply(self._clean_text)
        df['author_clean'] = df['author'].apply(self._clean_text)
        df['desc_clean'] = df['desc'].apply(self._clean_text)
        df['genres_clean'] = df['genre_list'].apply(
            lambda genres: ' '.join(self._clean_text(g) for g in genres)
        )
        df['combined_features'] = (
            df['title_clean'] + ' ' +
            df['author_clean'] + ' ' +
            df['desc_clean'] + ' ' +
            df['genres_clean']
        )
        self.df_clean = df
        return df

    def build_similarity(self):
        """
        Fit the vectorizer and neighbour model, then fill ``self.similarity_dict``.

        Each row position maps to up to ``top_k`` neighbour row positions,
        nearest first, never containing the row itself.

        Raises:
        ValueError: If ``preprocess()`` has not been called yet.
        """
        if self.df_clean is None:
            raise ValueError("Dataframe is not preprocessed. Call preprocess() first.")

        tfidf_matrix = self.vectorizer.fit_transform(self.df_clean['combined_features'])
        self.tfidf_matrix = tfidf_matrix
        self.model.fit(tfidf_matrix)

        # Clamp to the row count so frames smaller than top_k + 1 do not raise.
        n_neighbors = min(self.top_k + 1, tfidf_matrix.shape[0])
        _, indices = self.model.kneighbors(tfidf_matrix, n_neighbors=n_neighbors)

        # Drop the row itself by value rather than by position: with
        # duplicate or all-zero vectors the row is not guaranteed to come
        # first, so [1:] could discard a real neighbour and keep the row itself.
        self.similarity_dict = {
            i: [idx for idx in neighbors if idx != i][:self.top_k]
            for i, neighbors in enumerate(indices)
        }

    def recommend_by_isbn(self, isbn):
        """
        Return the ISBNs of the books most similar to ``isbn``.

        Returns [] when the ISBN is not present in ``self.df_clean``.
        """
        # similarity_dict is keyed by row *position*, so locate the book by
        # position instead of by index label; the two only coincide for a
        # default 0..n-1 index.
        positions = (self.df_clean['isbn'] == isbn).to_numpy().nonzero()[0]
        if positions.size == 0:
            return []
        row_pos = int(positions[0])
        similar_idxs = list(self.similarity_dict.get(row_pos, []))
        return self.df_clean['isbn'].iloc[similar_idxs].tolist()

    def save(self, path_prefix):
        """
        Write the fitted artefacts to files that share ``path_prefix``.

        Files written: ``_vectorizer.pkl``, ``_model.pkl``,
        ``_similarity_dict.pkl``, ``_df_clean.csv`` (ISBN plus cleaned text
        columns only) and ``_tfidf_matrix.npz``. The parent directory is
        created if missing. ``self.df_clean`` is left unchanged.
        """
        import os

        columns_to_keep = ['isbn', 'title_clean', 'author_clean', 'desc_clean', 'genres_clean', 'combined_features']
        # Slice into a local so saving does not silently strip columns from
        # the in-memory frame.
        df_to_save = self.df_clean[columns_to_keep]

        parent_dir = os.path.dirname(path_prefix)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(f'{path_prefix}_vectorizer.pkl', 'wb') as f:
            pickle.dump(self.vectorizer, f)
        with open(f'{path_prefix}_model.pkl', 'wb') as f:
            pickle.dump(self.model, f)
        with open(f'{path_prefix}_similarity_dict.pkl', 'wb') as f:
            pickle.dump(self.similarity_dict, f)
        df_to_save.to_csv(f'{path_prefix}_df_clean.csv', index=False)
        save_npz(f'{path_prefix}_tfidf_matrix.npz', self.tfidf_matrix)

    def load(self, path_prefix):
        """
        Restore the artefacts written by ``save`` from the same ``path_prefix``.

        After loading, ``self.df_clean`` holds only the columns that ``save``
        wrote to the CSV.
        """
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # produced by a trusted save().
        with open(f'{path_prefix}_vectorizer.pkl', 'rb') as f:
            self.vectorizer = pickle.load(f)
        with open(f'{path_prefix}_model.pkl', 'rb') as f:
            self.model = pickle.load(f)
        with open(f'{path_prefix}_similarity_dict.pkl', 'rb') as f:
            self.similarity_dict = pickle.load(f)
        self.df_clean = pd.read_csv(f'{path_prefix}_df_clean.csv')
        self.tfidf_matrix = load_npz(f'{path_prefix}_tfidf_matrix.npz')


In [19]:
# Re-read the raw catalogue so the cells below start from an unprocessed frame.
df = pd.read_csv("../Data/GoodReads_100k_books_cleaned.csv")

In [None]:
# End-to-end run: fit on the raw frame, query one ISBN, then persist the artefacts.
recommender = ContentBasedRecommender()

# Stores the processed frame on the instance; the returned copy is not needed here.
recommender.preprocess(df)

recommender.build_similarity()

isbn = '002914180X'
recommendations = recommender.recommend_by_isbn(isbn)
print(f"Recommendations for ISBN {isbn}: {recommendations}")

# Writes several files sharing the 'model_assets/book_recommender' prefix.
recommender.save('model_assets/book_recommender')

Recommendations for ISBN 002914180X: ['811722384', '517185970', '142620874X', '517060159', '684104261', '807104752', '252062108', '075667185X', '1580800971', '385411456', '300042477', '933031718', '609610236', '870494252', '038515626X', '809095114', '30600413', '809447401', '078581552X', '807835234']


In [None]:
# Round-trip check: restore the saved artefacts into a fresh instance and
# confirm it returns recommendations for the same ISBN.
new_recommender = ContentBasedRecommender()

new_recommender.load('model_assets/book_recommender')
isbn = '002914180X'
# Query the freshly loaded instance, not the in-memory `recommender`;
# otherwise this cell never exercises the files written by save().
recommendations = new_recommender.recommend_by_isbn(isbn)
print(f"Recommendations after loading for ISBN {isbn}: {recommendations}")

Recommendations after loading for ISBN 002914180X: ['811722384', '517185970', '142620874X', '517060159', '684104261', '807104752', '252062108', '075667185X', '1580800971', '385411456', '300042477', '933031718', '609610236', '870494252', '038515626X', '809095114', '30600413', '809447401', '078581552X', '807835234']


In [22]:
# The message says "after loading", so query the instance restored by load()
# rather than the original in-memory `recommender`.
recommendations = new_recommender.recommend_by_isbn(isbn)
print(f"Recommendations after loading for ISBN {isbn}: {recommendations}")

Recommendations after loading for ISBN 002914180X: ['811722384', '517185970', '142620874X', '517060159', '684104261', '807104752', '252062108', '075667185X', '1580800971', '385411456', '300042477', '933031718', '609610236', '870494252', '038515626X', '809095114', '30600413', '809447401', '078581552X', '807835234']


Recommendations after loading for ISBN 002914180X: ['811722384', '517185970', '517060159', '142620874X', '684104261', '807104752', '252062108', '075667185X', '1580800971', '385411456', '300042477', '933031718', '609610236', '870494252', '038515626X', '809095114', '30600413', '809447401', '807835234', '078581552X']
