In [None]:


import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-encoded download URL>" entries.
# The URL carries X-Goog-Date / X-Goog-Expires query parameters, so it is
# presumably only valid for a limited window after it was generated.
DATA_SOURCE_MAPPING = 'book-recommendation-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1004280%2F5624361%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240315%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240315T061302Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D4006151380e04123f0215b07b5fd35e5ba47dd5152ccb0c215af0893f1f5a0c1ba23ed9f0a657bbd45478662b61f364598f4c908c7a75c2eb8b288c879e2a7a43f37f67e04a2b6362c8b303f3c592c02288b0beaebf43eb7b35abbffedc35b2cc6a007a905983671c20b3ca35dad3ae283243eb3b2771492acec7ccf7c41d31c7dffbad989649949ef3bd3362a16b0be746c40e0d4a146273987ddf55df6d2af82d674ef8c7ac8bc87481268470d1c0fe6044c124d9b42f466164afc7b753d52b26930eb98a7ec7b5d063e45e2184581ffa2b39b4580a5678884b1f08d6ae72516a15afd76f3902857dfc3eab4133d8d3ca03dd4016f897abe2d6c8eb18fd9b0'

# Local directories that mirror the Kaggle layout.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible cells.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every entry of DATA_SOURCE_MAPPING into KAGGLE_INPUT_PATH/<directory>
# and unpack it there. A failed download is reported and skipped so the
# remaining entries are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only, so a stray ':' in the URL part cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length may be absent; in that case skip the progress bar
            # instead of failing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            size_label = total_length if total_length else 'unknown'
            print(f'Downloading {directory}, {size_label} bytes compressed')
            dl = 0
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                if total_length:
                    done = int(50 * dl / total_length)
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes out and rewind so the archive readers see the
            # complete file from its beginning.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # replace the module and break the next tar download.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is an OSError subclass, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
#importing needed libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the books table from the local Kaggle input directory and preview it.
books=pd.read_csv("/kaggle/input/book-recommendation-dataset/Books.csv")
books.head()

In [None]:
# Load the ratings table and preview its first rows.
ratings = pd.read_csv("/kaggle/input/book-recommendation-dataset/Ratings.csv")
ratings.head()

In [None]:
# Load the users table and preview its first rows.
users = pd.read_csv("/kaggle/input/book-recommendation-dataset/Users.csv")
users.head()

In [None]:
# Report the (rows, columns) shape of each loaded table.
for table_label, table in (("books:", books), ("ratings:", ratings), ("users:", users)):
    print(table_label, table.shape)

In [None]:
# Number of missing values in each column of the books table.
missing_bookdetails_count = books.isna().sum()
missing_bookdetails_count

In [None]:
# Number of missing values in each column of the users table.
missing_userdet_count = users.isna().sum()
missing_userdet_count

In [None]:
# Number of missing values in each column of the ratings table.
missing_ratings_count = ratings.isna().sum()
missing_ratings_count

In [None]:
# Count rows of the books table that exactly repeat an earlier row.
duplicate=books.duplicated().sum()
duplicate

In [None]:
# Shape of the ratings/books join on ISBN.
pd.merge(ratings, books, on='ISBN').shape


In [None]:
# Attach book metadata to every rating whose ISBN appears in the books table.
ratings_with_name = pd.merge(ratings, books, on='ISBN')
ratings_with_name

In [None]:
# Non-null count of every column, grouped by book title.
ratings_with_name.groupby('Book-Title').agg('count')

In [None]:
# Number of ratings recorded for each book title.
ratings_with_name.groupby('Book-Title')['Book-Rating'].count()

In [None]:
# Same per-title rating count, with the title moved back into a regular column.
ratings_with_name.groupby('Book-Title')['Book-Rating'].count().reset_index()

In [None]:
# Per-title rating count as a two-column frame: Book-Title, num_rating.
num_rating_df = (
    ratings_with_name.groupby('Book-Title')['Book-Rating']
    .count()
    .reset_index()
    .rename(columns={'Book-Rating': 'num_rating'})
)
num_rating_df

In [None]:
# Per-title mean rating as a two-column frame: Book-Title, avg_rating.
# The rating column is selected *before* aggregating: averaging the whole
# frame also tries to average the text columns, which pandas >= 2.0 rejects
# with a TypeError instead of silently dropping them.
avg_rating_df = (
    ratings_with_name.groupby('Book-Title')['Book-Rating']
    .mean()
    .reset_index()
    .rename(columns={'Book-Rating': 'avg_rating'})
)
avg_rating_df

In [None]:
# Combine the per-title rating counts and averages into one table.
popular_df = pd.merge(num_rating_df, avg_rating_df, on='Book-Title')
popular_df

In [None]:
# Titles that received at least 200 ratings.
popular_df.loc[popular_df['num_rating'].ge(200)]

In [None]:
# Keep the 50 best-rated titles among those with at least 200 ratings.
# This overwrites popular_df, so earlier cells show the reduced table if re-run.
popular_df = (
    popular_df.loc[popular_df['num_rating'].ge(200)]
    .sort_values('avg_rating', ascending=False)
    .head(50)
)

In [None]:
# Bring the book metadata back in for the popular titles.
pd.merge(popular_df, books, on='Book-Title')

In [None]:
# One row per popular title, restricted to the columns worth displaying.
(
    pd.merge(popular_df, books, on='Book-Title')
    .drop_duplicates(subset='Book-Title')
    .loc[:, ['Book-Title', 'Book-Author', 'Image-URL-M', 'num_rating', 'avg_rating']]
)

In [None]:
# Display the merged ratings/books table again.
ratings_with_name

In [None]:
# Number of ratings submitted by each user.
ratings_with_name.groupby('User-ID')['Book-Rating'].count()

In [None]:
# Flag users with more than 200 ratings and collect their IDs.
# ("padhe likhe" is Hindi, roughly "well-read".)
x = ratings_with_name.groupby('User-ID')['Book-Rating'].count().gt(200)
padhe_likhe_users = x.loc[x].index
ratings_with_name

In [None]:
# IDs of users who submitted more than 200 ratings (same computation as the previous cell).
x = ratings_with_name.groupby('User-ID')['Book-Rating'].count().gt(200)
padhe_likhe_users = x.loc[x].index

In [None]:
# Keep only the ratings submitted by the users in padhe_likhe_users.
filtered_rating = ratings_with_name.loc[ratings_with_name['User-ID'].isin(padhe_likhe_users)]

In [None]:
# Ratings per title, counting only the selected users.
filtered_rating.groupby('Book-Title')['Book-Rating'].count()

In [None]:
# Titles with at least 50 ratings from the selected users.
y = filtered_rating.groupby('Book-Title')['Book-Rating'].count().ge(50)
famous_books = y.loc[y].index
famous_books

In [None]:
# Boolean mask: True for ratings whose book title is in famous_books.
filtered_rating['Book-Title'].isin(famous_books)

In [None]:
# Ratings restricted to the selected users *and* the titles in famous_books.
final_ratings = filtered_rating.loc[filtered_rating['Book-Title'].isin(famous_books)]
final_ratings

In [None]:
# Title x user rating matrix: one row per book title, one column per user.
# pivot_table's default aggregation (mean) applies when a user has several
# ratings for the same title.
pt = pd.pivot_table(final_ratings, index='Book-Title', columns='User-ID', values='Book-Rating')
pt

In [None]:
# Title/user pairs without a rating become 0 so every row is a complete vector.
pt = pt.fillna(0)
pt

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between the rows of pt (one row per book title).
similarity_scores =cosine_similarity(pt)

In [None]:
# Expect a square matrix with one row and one column per row of pt.
similarity_scores.shape

In [None]:
def recommend(book_name, top_n=5):
    """Print the titles of the books most similar to ``book_name``.

    Looks ``book_name`` up in the index of the module-level ``pt`` table and
    ranks every other row by its entry in the matching row of the
    module-level ``similarity_scores`` matrix, highest first.

    Args:
        book_name: Title to look up; must equal an entry of ``pt.index``.
        top_n: Maximum number of titles to print. Defaults to 5.

    Raises:
        IndexError: If ``book_name`` is not present in ``pt.index``.
    """
    matches = np.where(pt.index == book_name)[0]
    if matches.size == 0:
        raise IndexError(f"{book_name!r} is not in the similarity table (pt.index)")
    index = matches[0]

    # Remove the query book by position rather than assuming it sorts first:
    # with tied scores, or when the book's own score is not the maximum,
    # slicing off the first result would drop a real match and could print
    # the query book itself.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity_scores[index])
        if position != index
    ]
    similar_items = sorted(candidates, key=lambda item: item[1], reverse=True)[:top_n]

    for position, _score in similar_items:
        print(pt.index[position])


In [None]:
# Example: print the titles most similar to 'The Da Vinci Code'.
recommend('The Da Vinci Code')