In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import re

In [21]:
# Read the book catalogue CSV into a DataFrame (point this at your own dataset)
books = pd.read_csv(filepath_or_buffer='book1100k-1200k.csv')

In [22]:
# Discard the rating-distribution, review-count and language columns
books = books.drop(
    [
        'RatingDist5', 'RatingDist4', 'RatingDist3', 'RatingDist2', 'RatingDist1',
        'RatingDistTotal',
        'CountsOfReview',
        'Language',
        'Count of text reviews',
    ],
    axis='columns',
)

In [23]:
# .size is the total number of cells (rows x columns), not the number of rows
books.size

460812

In [24]:
def clean_html_tags(text):
    """Convert HTML markup in a piece of text to plain text.

    ``<br>`` tags (any letter case, with or without a closing slash) become
    newlines, every other HTML tag is removed, and leading/trailing
    whitespace is stripped.

    Parameters
    ----------
    text : object
        The raw value to clean. Anything that is not a ``str`` (for example
        ``None`` or a float NaN) is treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''  # Handle non-string values

    # <br>, <br/>, <BR /> ... all mark a line break
    with_line_breaks = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    # Remove any remaining tags such as <i>, </p> or <a href="...">.
    # A letter is required right after '<' or '</' so that plain comparisons
    # like "1 < 2 and 3 > 2" are left untouched.
    without_tags = re.sub(r'</?[a-zA-Z][^<>]*>', '', with_line_breaks)
    return without_tags.strip()  # Remove extra spaces and newlines

In [25]:
# Clean HTML tags in descriptions
books['Description'] = books['Description'].apply(clean_html_tags)
# Drop rows where 'Description' is empty or consists only of whitespace.
# .copy() makes the filtered frame independent of the one it was sliced from,
# so the column assignments in later cells do not raise SettingWithCopyWarning.
books = books[books['Description'].str.strip().astype(bool)].copy()


In [26]:
# Total cell count (rows x columns) after the empty-description rows were removed
books.size

399124

In [27]:
# Build a single free-text 'features' value per book: "<Name> <Description> <Publisher>"
books['features'] = (
    books['Name']
    .add(' ')
    .add(books['Description'])
    .add(' ')
    .add(books['Publisher'])
)

In [28]:
# Keep only rows that have a value in every column the search relies on
books = books.dropna(
    how='any',
    subset=['Name', 'Authors', 'PublishYear', 'Publisher', 'Description'],
)

In [29]:
# The TF-IDF vectorizer works on text, so store the publication year as a string
books['PublishYear'] = books['PublishYear'].astype('str')

In [30]:
# One independent TF-IDF vectorizer per searchable column, each configured
# with the built-in English stop-word list
(
    tfidf_vectorizer_name,
    tfidf_vectorizer_authors,
    tfidf_vectorizer_publish_year,
    tfidf_vectorizer_publisher,
) = (TfidfVectorizer(stop_words='english') for _ in range(4))

In [31]:
# Fit each vectorizer on its own column and keep the resulting TF-IDF matrix
(
    tfidf_matrix_name,
    tfidf_matrix_authors,
    tfidf_matrix_publish_year,
    tfidf_matrix_publisher,
) = (
    vectorizer.fit_transform(books[column])
    for vectorizer, column in (
        (tfidf_vectorizer_name, 'Name'),
        (tfidf_vectorizer_authors, 'Authors'),
        (tfidf_vectorizer_publish_year, 'PublishYear'),
        (tfidf_vectorizer_publisher, 'Publisher'),
    )
)


In [35]:
# Function to recommend books based on query parameters
def recommend_books(query_params, top_n=5):
    """Return the ``top_n`` books that best match the given search criteria.

    Every supplied criterion is transformed with the TF-IDF vectorizer that
    was fitted on the matching column and scored against that column's TF-IDF
    matrix using ``linear_kernel``. The per-criterion scores are summed and
    the rows with the highest totals are returned, best match first.
    (With ``TfidfVectorizer``'s default L2 normalisation this dot product
    should equal cosine similarity -- see the scikit-learn documentation.)

    Parameters
    ----------
    query_params : dict
        Search criteria. Recognised keys are ``'name'``, ``'authors'``,
        ``'publish_year'`` and ``'publisher'``; other keys are ignored.
        Values are converted with ``str()`` before vectorising, so e.g. an
        integer year is accepted. A key whose value is ``None`` is treated
        as not provided.
    top_n : int, default 5
        Maximum number of books to return. Zero or a negative number yields
        an empty result.

    Returns
    -------
    pandas.DataFrame
        Rows of ``books`` restricted to the columns ``Name``, ``Authors``,
        ``PublishYear``, ``Publisher``, ``Description`` and ``Rating``.

    Raises
    ------
    ValueError
        If none of the recognised criteria is provided.
    """
    # (query key, fitted vectorizer, TF-IDF matrix of the matching column)
    searchable_fields = (
        ('name', tfidf_vectorizer_name, tfidf_matrix_name),
        ('authors', tfidf_vectorizer_authors, tfidf_matrix_authors),
        ('publish_year', tfidf_vectorizer_publish_year, tfidf_matrix_publish_year),
        ('publisher', tfidf_vectorizer_publisher, tfidf_matrix_publisher),
    )
    result_columns = ['Name', 'Authors', 'PublishYear', 'Publisher', 'Description', 'Rating']

    cosine_similarities = None
    for key, vectorizer, matrix in searchable_fields:
        value = query_params.get(key)
        if value is None:
            continue  # criterion not supplied
        # str() lets callers pass non-string values such as an integer year
        query_vector = vectorizer.transform([str(value)])
        field_scores = linear_kernel(query_vector, matrix).flatten()
        if cosine_similarities is None:
            cosine_similarities = field_scores
        else:
            cosine_similarities = cosine_similarities + field_scores

    if cosine_similarities is None:
        raise ValueError("At least one search criterion must be provided.")

    # Guard: with top_n == 0 the slice [-0:] would select *every* row, and a
    # negative top_n would select an arbitrary tail of the ranking.
    if top_n <= 0:
        return books.iloc[0:0][result_columns]

    # Get top N indices (argsort is ascending, so take the tail and reverse it).
    # NOTE: rows are returned even when their score is 0, i.e. when nothing
    # in the catalogue matches the query.
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]

    # The TF-IDF matrices were built row-for-row from `books`, so positional
    # indexing (iloc) is the correct way to map scores back to rows.
    top_books = books.iloc[top_indices]

    return top_books[result_columns]


In [36]:
# Example usage: search by title only
query_params = dict(name='Harry Potter')
recommended_books = recommend_books(query_params)
recommended_books.head()

Unnamed: 0,Name,Authors,PublishYear,Publisher,Description,Rating
27907,Harry Potter and the Goblet of Fire (Harry Pot...,J.K. Rowling,2000,Listening Library,"Running time: 20 hrs., 30 mins. 12 cassettes\n...",4.56
10839,"Harry Potter i Zakon Feniksa (Harry Potter, #5)",J.K. Rowling,2004,Media Rodzina,Wobec śmiertelnego zagrożenia ze strony Lorda ...,4.5
11294,"Harry Potter i Czara Ognia (Harry Potter, #4)",J.K. Rowling,2001,Media Rodzina,W tym roku w Szkole Magii i Czarodziejstwa Hog...,4.56
16210,Harry Potter à l'école des sorciers (Harry Pot...,J.K. Rowling,1999,Folio Junior,"Le jour de ses 11 ans, Harry Potter, un orphel...",4.47
10057,Harry Potter et L'Ordre du Phenix (Harry Potte...,J.K. Rowling,2007,Gallimard Jeunesse,Harry is due to start his fifth year at Hogwar...,4.5


In [18]:
import pickle
# Persist the fitted TF-IDF vectorizers and the cleaned DataFrame into the
# Flask project's model directory.
# NOTE: only the vectorizers and the frame are written; the TF-IDF matrices are not.
artifacts_to_save = [
    ('../Web/flask_project/Books_sys/model/tfidf_vectorizer_name.pkl', tfidf_vectorizer_name),
    ('../Web/flask_project/Books_sys/model/tfidf_vectorizer_authors.pkl', tfidf_vectorizer_authors),
    ('../Web/flask_project/Books_sys/model/tfidf_vectorizer_publish_year.pkl', tfidf_vectorizer_publish_year),
    ('../Web/flask_project/Books_sys/model/tfidf_vectorizer_publisher.pkl', tfidf_vectorizer_publisher),
    ('../Web/flask_project/Books_sys/model/books_df.pkl', books),
]
for target_path, artifact in artifacts_to_save:
    with open(target_path, 'wb') as f:
        pickle.dump(artifact, f)