In [121]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

books = pd.read_csv('/kaggle/input/goodbooks-10k/books.csv')
book_tags = pd.read_csv('/kaggle/input/goodbooks-10k/book_tags.csv')
rating = pd.read_csv('/kaggle/input/goodbooks-10k/ratings.csv')
reading_books = pd.read_csv('/kaggle/input/goodbooks-10k/to_read.csv')

In [122]:
books.isnull().sum()

id                              0
book_id                         0
best_book_id                    0
work_id                         0
books_count                     0
isbn                          700
isbn13                        585
authors                         0
original_publication_year      21
original_title                585
title                           0
language_code                1084
average_rating                  0
ratings_count                   0
work_ratings_count              0
work_text_reviews_count         0
ratings_1                       0
ratings_2                       0
ratings_3                       0
ratings_4                       0
ratings_5                       0
image_url                       0
small_image_url                 0
dtype: int64

In [123]:
# handling missing values
books['isbn'].fillna(books['isbn'].mode()[0], inplace=True)
books['isbn13'].fillna(books['isbn13'].mode()[0], inplace=True)
books['original_publication_year'].fillna(books['original_publication_year'].median(), inplace=True)
books['original_title'].fillna(books['title'], inplace=True)
books['language_code'].fillna(books['language_code'].mode()[0], inplace=True)
books.head(10)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  books['isbn'].fillna(books['isbn'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  books['isbn13'].fillna(books['isbn13'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...
5,6,11870085,11870085,16827462,226,525478817,9780525000000.0,John Green,2012.0,The Fault in Our Stars,...,2346404,2478609,140739,47994,92723,327550,698471,1311871,https://images.gr-assets.com/books/1360206420m...,https://images.gr-assets.com/books/1360206420s...
6,7,5907,5907,1540236,969,618260307,9780618000000.0,J.R.R. Tolkien,1937.0,The Hobbit or There and Back Again,...,2071616,2196809,37653,46023,76784,288649,665635,1119718,https://images.gr-assets.com/books/1372847500m...,https://images.gr-assets.com/books/1372847500s...
7,8,5107,5107,3036731,360,316769177,9780317000000.0,J.D. Salinger,1951.0,The Catcher in the Rye,...,2044241,2120637,44920,109383,185520,455042,661516,709176,https://images.gr-assets.com/books/1398034300m...,https://images.gr-assets.com/books/1398034300s...
8,9,960,960,3338963,311,1416524797,9781417000000.0,Dan Brown,2000.0,Angels & Demons,...,2001311,2078754,25112,77841,145740,458429,716569,680175,https://images.gr-assets.com/books/1303390735m...,https://images.gr-assets.com/books/1303390735s...
9,10,1885,1885,3060926,3455,679783261,9780680000000.0,Jane Austen,1813.0,Pride and Prejudice,...,2035490,2191465,49152,54700,86485,284852,609755,1155673,https://images.gr-assets.com/books/1320399351m...,https://images.gr-assets.com/books/1320399351s...


In [124]:
book_tags.isnull().sum()

goodreads_book_id    0
tag_id               0
count                0
dtype: int64

In [125]:
rating.isnull().sum()

book_id    0
user_id    0
rating     0
dtype: int64

In [126]:
reading_books.isnull().sum()

user_id    0
book_id    0
dtype: int64

In [127]:
#Tokenize and vectorize text-based features like book titles and descriptions (e.g., using TF-IDF or word embeddings).
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf.fit_transform(books['original_title'])
books['original_title']

0                                        The Hunger Games
1                Harry Potter and the Philosopher's Stone
2                                                Twilight
3                                   To Kill a Mockingbird
4                                        The Great Gatsby
                              ...                        
9995                                           Bayou Moon
9996                                     Means of Ascent 
9997                                The Mauritius Command
9998    Cinderella Ate My Daughter: Dispatches from th...
9999                                  The First World War
Name: original_title, Length: 10000, dtype: object

In [128]:
#Normalize numerical features (e.g., publication year, number of pages).
scaler = MinMaxScaler()
books[['original_publication_year', 'ratings_count','average_rating']] = scaler.fit_transform(books[['original_publication_year', 'ratings_count','average_rating']])
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,0.997611,The Hunger Games,...,1.0,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",0.994691,Harry Potter and the Philosopher's Stone,...,0.962709,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,0.996814,Twilight,...,0.808743,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,0.984869,To Kill a Mockingbird,...,0.668899,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,0.975577,The Great Gatsby,...,0.56111,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [129]:
#Implement a collaborative filtering model to identify similar users and generate recommendations.
rating = rating.groupby(['user_id', 'book_id']).rating.mean().reset_index()
user = rating.pivot(index='user_id', columns='book_id', values='rating').fillna(0)
interaction = csr_matrix(user.values)interaction

SyntaxError: invalid syntax (3210837380.py, line 4)

In [None]:
#Use models such as K-Nearest Neighbors or Matrix Factorization (e.g., SVD) based on the user-item interaction matrix.
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(interaction)

def get_collaborative_recommendations(user_id, k=5, num_recommendations=5):
    # Ensure that the user ID exists in the user-item matrix
    if user_id not in user.index:
        print("User ID not found in the data.")
        return []
    
    # Find the index of the user in the matrix
    user_idx = user.index.get_loc(user_id)
    
    # Find the k-nearest neighbors for the user
    distances, indices = knn_model.kneighbors(interaction[user_idx], n_neighbors=k+1)
    
    # Collect books rated by similar users
    similar_users = indices.flatten()[1:]  # Skip the first entry as it is the user itself
    recommended_books = []
    
    for similar_user_idx in similar_users:
        similar_user_id = user.index[similar_user_idx]
        
        # Get books rated by similar user with high ratings
        high_rated_books = user.loc[similar_user_id][user.loc[similar_user_id] > 4].index
        recommended_books.extend(high_rated_books)
    
    # Select unique recommendations, limited to the desired number
    unique_recommendations = list(set(recommended_books))[:num_recommendations]
    
    # Return the recommended books
    return unique_recommendations

user_id = 314 # replace with an actual user_id from the dataset
recommendations = get_collaborative_recommendations(user_id, k=5, num_recommendations=5)
print("Recommended Books for User:", recommendations)

In [None]:
#Implement a content-based filtering model to recommend books based on attributes like genre and description.
#Compute cosine similarity or use machine learning models to suggest books based on content features.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def get_content_recommendations(book_id, num_recommendations=5):
    if book_id not in books['book_id'].values:
        print("Book ID not found in the data.")
        return []

    book_idx = books.index[books['book_id'] == book_id][0]
    
    sim_scores = list(enumerate(cosine_sim[book_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    book_indices = [i[0] for i in sim_scores[1:num_recommendations+1]]
    return books['book_id'].iloc[book_indices].tolist()

#Recommending from content
book_id = 2657 
content_recommendations = get_content_recommendations(book_id, num_recommendations=5)
print("Content-Based Recommended Books for Book ID", book_id, ":", content_recommendations)


In [None]:
#Combine recommendations from both collaborative and content-based models.
#Experiment with different hybrid approaches like weighted averaging or switching strategies based on user characteristics.

def get_hybrid_recommendations(user_id, book_id, k=5, num_recommendations=5, weight_collab=0.5, weight_content=0.5):
    collab_recommendations = get_collaborative_recommendations(user_id, k=k, num_recommendations=num_recommendations)
    content_recommendations = get_content_recommendations(book_id, num_recommendations=num_recommendations)
    final_recommendations = {}
    
    for book in collab_recommendations:
        final_recommendations[book] = final_recommendations.get(book, 0) + weight_collab

    for book in content_recommendations:
        final_recommendations[book] = final_recommendations.get(book, 0) + weight_content
    
    # Sort recommendations by their combined weights in descending order
    sorted_recommendations = sorted(final_recommendations.items(), key=lambda x: x[1], reverse=True)
    
    # Get the top N recommendations based on combined scores
    top_recommendations = [book[0] for book in sorted_recommendations[:num_recommendations]]
    
    return top_recommendations

user_id = 314 
book_id = 2657  
hybrid_recommendations = get_hybrid_recommendations(user_id, book_id, k=5, num_recommendations=5)
print("Hybrid Recommended Books for User", user_id, "based on Book", book_id, ":", hybrid_recommendations)