### Importing necessary libraries

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# for text cleaning that contain special characters or non-Latin characters
import re
from unidecode import unidecode

### Loading the datasets

In [2]:
final_data = pd.read_csv(r'C:\Users\DELL\Downloads\hybrid filtering dataset\FinalData.csv')
final_data.head()

Unnamed: 0,book_id,authors,title,Genres
0,1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",SciFi;Drama
1,2,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,Fantasy;Young-Age
2,3,Stephenie Meyer,"Twilight (Twilight, #1)",Fantasy
3,4,Harper Lee,To Kill a Mockingbird,Self-Help;Drama
4,5,F. Scott Fitzgerald,The Great Gatsby,Drama


In [3]:
ratings = pd.read_csv(r'C:\Users\DELL\Downloads\hybrid filtering dataset\ratings.csv')
ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [4]:
# Get the mean rating for each book_id
avg_ratings = ratings.groupby('book_id')['rating'].mean().reset_index()

# Display the result in a dataframe
avg_ratings = pd.DataFrame(avg_ratings)
avg_ratings

Unnamed: 0,book_id,rating
0,1,4.24
1,2,4.21
2,3,3.09
3,4,4.46
4,5,3.89
...,...,...
994,995,4.03
995,996,3.94
996,997,3.71
997,998,4.44


In [5]:
ratings_count = pd.read_csv(r'C:\Users\DELL\Downloads\hybrid filtering dataset\RatingsCount.csv')
ratings_count

Unnamed: 0,book_id,rating
0,1,22806
1,2,21850
2,3,16931
3,4,19088
4,5,16604
...,...,...
9995,9996,141
9996,9997,93
9997,9998,102
9998,9999,130


### Data Preprocessing

In [6]:
# Clean the 'author' column by transliterating non-Latin characters
final_data['authors'] = final_data['authors'].apply(lambda x: ', '.join([re.sub(r'[^A-Za-z0-9\s]+', '', unidecode(str(name))) for name in x.split(', ')]))

# Clean the 'title' column by removing content within parentheses and transliterating non-Latin characters
final_data['title'] = final_data['title'].apply(lambda x: re.sub(r'\([^)]*\)', '', unidecode(str(x))))
final_data

Unnamed: 0,book_id,authors,title,Genres
0,1,Suzanne Collins,The Hunger Games,SciFi;Drama
1,2,"JK Rowling, Mary GrandPre",Harry Potter and the Sorcerer's Stone,Fantasy;Young-Age
2,3,Stephenie Meyer,Twilight,Fantasy
3,4,Harper Lee,To Kill a Mockingbird,Self-Help;Drama
4,5,F Scott Fitzgerald,The Great Gatsby,Drama
...,...,...,...,...
994,995,Michelle Hodkin,The Unbecoming of Mara Dyer,Fiction
995,996,Janet Evanovich,Three to Get Deadly,Crime
996,997,Emmuska Orczy,The Scarlet Pimpernel,History;Fiction
997,998,"Jon Stone, Michael J Smollin",The Monster at the End of this Book,Fiction;Kids


In [7]:
# Merging the dataframes

# Merge with average ratings
merged_df = pd.merge(final_data, avg_ratings, on='book_id')

# Merge with ratings count
merged_df = pd.merge(merged_df, ratings_count, on='book_id')

In [8]:
merged_df

Unnamed: 0,book_id,authors,title,Genres,rating_x,rating_y
0,1,Suzanne Collins,The Hunger Games,SciFi;Drama,4.24,22806
1,2,"JK Rowling, Mary GrandPre",Harry Potter and the Sorcerer's Stone,Fantasy;Young-Age,4.21,21850
2,3,Stephenie Meyer,Twilight,Fantasy,3.09,16931
3,4,Harper Lee,To Kill a Mockingbird,Self-Help;Drama,4.46,19088
4,5,F Scott Fitzgerald,The Great Gatsby,Drama,3.89,16604
...,...,...,...,...,...,...
994,995,Michelle Hodkin,The Unbecoming of Mara Dyer,Fiction,4.03,1015
995,996,Janet Evanovich,Three to Get Deadly,Crime,3.94,1334
996,997,Emmuska Orczy,The Scarlet Pimpernel,History;Fiction,3.71,1315
997,998,"Jon Stone, Michael J Smollin",The Monster at the End of this Book,Fiction;Kids,4.44,1307


In [9]:
# Renaming the columns

merged_df.rename(columns={'rating_x': 'mean_rating', 'rating_y': 'total_ratings'}, inplace=True)
merged_df

Unnamed: 0,book_id,authors,title,Genres,mean_rating,total_ratings
0,1,Suzanne Collins,The Hunger Games,SciFi;Drama,4.24,22806
1,2,"JK Rowling, Mary GrandPre",Harry Potter and the Sorcerer's Stone,Fantasy;Young-Age,4.21,21850
2,3,Stephenie Meyer,Twilight,Fantasy,3.09,16931
3,4,Harper Lee,To Kill a Mockingbird,Self-Help;Drama,4.46,19088
4,5,F Scott Fitzgerald,The Great Gatsby,Drama,3.89,16604
...,...,...,...,...,...,...
994,995,Michelle Hodkin,The Unbecoming of Mara Dyer,Fiction,4.03,1015
995,996,Janet Evanovich,Three to Get Deadly,Crime,3.94,1334
996,997,Emmuska Orczy,The Scarlet Pimpernel,History;Fiction,3.71,1315
997,998,"Jon Stone, Michael J Smollin",The Monster at the End of this Book,Fiction;Kids,4.44,1307


### TF-IDF vectorization on genres 

In [10]:
# Content-Based Filtering (TF-IDF Vectorization on 'genres')

tfidf_vectorizer = TfidfVectorizer(stop_words='english', token_pattern=r'\b\w+\b')
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['Genres'].str.replace(';', ' '))

### Cosine Similarity

In [11]:
# Collaborative Filtering (User-based)

user_item_matrix = merged_df.pivot(index='book_id', columns='title', values='mean_rating').fillna(0)
collaborative_similarity = cosine_similarity(user_item_matrix)

### Building functions

In [12]:
# Function to get collaborative recommendations
def collaborative_recommendations(book_index, similarity_matrix):
    sim_scores = list(enumerate(similarity_matrix[book_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]             # Exclude the book itself
    book_indices = [i[0] for i in sim_scores]
    return book_indices

In [13]:
# Function to get content-based recommendations
def content_based_recommendations(book_index, tfidf_matrix):
    content_similarity = cosine_similarity(tfidf_matrix[book_index], tfidf_matrix).flatten()
    content_sim_scores = list(enumerate(content_similarity))
    content_sim_scores = sorted(content_sim_scores, key=lambda x: x[1], reverse=True)
    content_book_indices = [i[0] for i in content_sim_scores]
    return content_book_indices

In [14]:
# Function to generate weighted hybrid recommendations
def weighted_hybrid_recommendations(book_title, collaborative_weight=0.5, content_weight=0.5):
    book_index = merged_df.index[merged_df['title'] == book_title][0]

    # Get collaborative recommendations
    collaborative_indices = collaborative_recommendations(book_index, collaborative_similarity)

    # Get content-based recommendations
    content_indices = content_based_recommendations(book_index, tfidf_matrix)

    # Combine recommendations with weights
    combined_scores = {
        idx: collaborative_weight * collaborative_similarity[book_index][idx]
             + content_weight * cosine_similarity(tfidf_matrix[book_index], tfidf_matrix[idx])[0][0]
        for idx in set(collaborative_indices) | set(content_indices)
        if idx != book_index}  # Exclude the input book itself
   

    # Get top 5 book indices based on combined scores
    top_books = sorted(combined_scores, key=combined_scores.get, reverse=True)[:5]

    # Get book titles from the indices
    recommended_books = merged_df.loc[top_books, 'title'].tolist()

    return recommended_books

### Get recommendations

In [15]:
# Example 1: Get recommendations for a book
book_title = 'The Hobbit'
recommendations = weighted_hybrid_recommendations(book_title, collaborative_weight=0.6, content_weight=0.4)
print(f"Recommendations for '{book_title}': {recommendations}")

Recommendations for 'The Hobbit': ['Twilight ', 'The Fellowship of the Ring ', 'Mockingjay ', 'Harry Potter and the Order of the Phoenix ', 'The Giver ']


In [16]:
# Example 2: Get recommendations for a book
book_title = '1984'
recommendations = weighted_hybrid_recommendations(book_title, collaborative_weight=0.7, content_weight=0.3)
print(f"Recommendations for '{book_title}': {recommendations}")

Recommendations for '1984': ['Gone Girl', 'A Time to Kill', "The Husband's Secret", 'Kiss the Girls ', 'The Pelican Brief']


In [17]:
# Example 3: Get recommendations for a book
book_title = 'Romeo and Juliet'
recommendations = weighted_hybrid_recommendations(book_title, collaborative_weight=0.2, content_weight=0.8)
print(f"Recommendations for '{book_title}': {recommendations}")

Recommendations for 'Romeo and Juliet': ['Sense and Sensibility', 'An Abundance of Katherines', 'Delirium ', 'The Rosie Project ', 'The One ']


In [18]:
# Example 4: Get recommendations for a book
book_title = 'Pride and Prejudice'
recommendations = weighted_hybrid_recommendations(book_title, collaborative_weight=0.6, content_weight=0.4)
print(f"Recommendations for '{book_title}': {recommendations}")

Recommendations for 'Pride and Prejudice': ['The Fault in Our Stars', 'Eat, Pray, Love', 'The Notebook ', 'The Devil Wears Prada ', 'The Princess Bride']


In [19]:
# Example 5: Get recommendations for a book
book_title = 'Les Miserables'
recommendations = weighted_hybrid_recommendations(book_title, collaborative_weight=0.6, content_weight=0.4)
print(f"Recommendations for '{book_title}': {recommendations}")

Recommendations for 'Les Miserables': ['Breakfast of Champions', 'Julius Caesar', 'Robinson Crusoe', 'Macbeth', 'Don Quixote']
