## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (15,7)
plt.rcParams['figure.dpi'] = 150
import warnings
warnings.filterwarnings('ignore')
sns.set_theme(style='darkgrid', palette='viridis')
%matplotlib inline
import statistics

## Load the data

In [None]:
# Read the book catalogue; the file is decoded as Latin-1.
Books = pd.read_csv('Books.csv',encoding='latin-1')
Books

In [None]:
# Drop the three cover-image URL columns.
Books =  Books.drop(['Image-URL-S','Image-URL-M','Image-URL-L'], axis=1)
Books

In [None]:
# Read the user table (same Latin-1 decoding).
Users =pd.read_csv('Users.csv',encoding='latin-1')
Users

In [None]:
# Read the ratings table (same Latin-1 decoding).
Ratings = pd.read_csv('Ratings.csv',encoding='latin-1')
Ratings

## Exploratory Data Analysis (EDA)

In [None]:
# Number of distinct book titles.
Books['Book-Title'].nunique()

In [None]:
# Number of distinct authors.
Books['Book-Author'].nunique()

In [None]:
# Number of distinct publishers.
Books['Publisher'].nunique()

In [None]:
# Column dtypes and non-null counts.
Books.info()

In [None]:
# Missing values per column.
Books.isnull().sum()

In [None]:
# Rows whose author is missing.
Books[Books['Book-Author'].isna()]

In [None]:
# Most frequent author (presumably the source of the fill value used for
# the missing authors -- confirm against this cell's output).
Books['Book-Author'].mode()

In [None]:
# Fill the missing authors with 'Agatha Christie'.
# The rows are selected by their NaN mask and written with a single
# .loc[rows, column] assignment, which always updates Books itself.  The
# previous chained form `Books.iloc[i]['Book-Author'] = ...` assigned into an
# intermediate row object that is not guaranteed to be a view of Books (and
# never is under pandas Copy-on-Write), so the NaNs could silently survive.
Books.loc[Books['Book-Author'].isna(), 'Book-Author'] = 'Agatha Christie'

In [None]:
# Rows still missing an author.
Books[Books['Book-Author'].isna()]

In [None]:
# Rows whose publisher is missing.
Books[Books['Publisher'].isna()]

In [None]:
# Most frequent publisher.
Books['Publisher'].mode()

In [None]:
# Fill the missing publishers with 'Harlequin'.
# Use one .loc[rows, column] assignment keyed on the NaN mask: the previous
# chained `Books.iloc[i]['Publisher'] = ...` wrote into an intermediate row
# object and is not guaranteed to modify Books (it never does under pandas
# Copy-on-Write).
Books.loc[Books['Publisher'].isna(), 'Publisher'] = 'Harlequin'

In [None]:
# Rows still missing a publisher.
Books[Books['Publisher'].isna()]

In [None]:
# Distinct values found in the publication-year column.
Books['Year-Of-Publication'].unique()

In [None]:
# Rows whose year is the string '0'.
Books[Books['Year-Of-Publication'] == '0']

In [None]:
# Rows whose year is the integer 0 (the column is checked for both forms).
Books[Books['Year-Of-Publication'] == 0]

In [None]:
# Most frequent publication year.
Books['Year-Of-Publication'].mode()

In [None]:
Books['Year-Of-Publication'].replace('0','2002',inplace = True)

In [None]:
Books[Books['Year-Of-Publication'] == '0']

In [None]:
Books['Year-Of-Publication'].replace(0,'2002',inplace = True)

In [None]:
Books[Books['Year-Of-Publication'] == 0]

In [None]:
Books['Year-Of-Publication'].replace(0,'2002',inplace = True)

In [None]:
# Column dtypes after the year substitutions.
Books.dtypes

In [None]:
# Rows where the year column holds the text 'DK Publishing Inc'.
Books[Books['Year-Of-Publication'] == 'DK Publishing Inc']

In [None]:
# Rows where the year column holds the text 'Gallimard'.
Books[Books['Year-Of-Publication'] == 'Gallimard']

In [None]:
# Repair the three rows (labels 209538, 221678, 220731) whose author, year
# and publisher are overwritten by hand.
# Each value is written with a single .loc[row, column] assignment.  The
# previous chained form `Books.loc[row]['col'] = value` assigned into an
# intermediate row object, which is not guaranteed to update Books; when it
# does not, the integer cast of 'Year-Of-Publication' later in the notebook
# fails on the leftover text values.
# NOTE(review): all three rows get 'Agatha Christie' as author, which looks
# like a placeholder rather than the real author -- confirm against the raw
# CSV rows.
Books.loc[209538, 'Book-Author'] = 'Agatha Christie'
Books.loc[209538, 'Year-Of-Publication'] = 2000
Books.loc[209538, 'Publisher'] = 'DK Publishing Inc'
Books.loc[221678, 'Book-Author'] = 'Agatha Christie'
Books.loc[221678, 'Year-Of-Publication'] = 2000
Books.loc[221678, 'Publisher'] = 'DK Publishing Inc'
Books.loc[220731, 'Book-Author'] = 'Agatha Christie'
Books.loc[220731, 'Year-Of-Publication'] = 2003
Books.loc[220731, 'Publisher'] = 'Gallimard'

In [None]:
# Rows where the year column still holds 'DK Publishing Inc'.
Books[Books['Year-Of-Publication'] == 'DK Publishing Inc']

In [None]:
# Rows where the year column still holds 'Gallimard'.
Books[Books['Year-Of-Publication'] == 'Gallimard']

In [None]:
# Cast the publication year to int; this raises if any non-numeric text is
# still present in the column.
convert_dict = {'Year-Of-Publication': int}
Books = Books.astype(convert_dict)

In [None]:
# Rows with a publication year later than 2024.
Books[Books['Year-Of-Publication'] > 2024]

In [None]:
# Overwrite those years with 2002 (the same value used for the 0 years).
Books.loc[Books['Year-Of-Publication'] > 2024, 'Year-Of-Publication'] = 2002

In [None]:
# Column dtypes after the cast.
Books.dtypes

In [None]:
# Summary statistics of the numeric columns.
Books.describe()

In [None]:
# Column dtypes and non-null counts of the user table.
Users.info()

In [None]:
# Missing values per column.
Users.isnull().sum()

In [None]:
# Distinct age values.
Users['Age'].unique()

In [None]:
# Users older than 90.
Users[(Users['Age']> 90)]

In [None]:
# Users younger than 10.
Users[(Users['Age']<10)]

In [None]:
# Replace ages above 90 with the column mean.  The mean is taken over the
# column as it stands, i.e. it still includes the >90 and <10 values.
Users.loc[Users['Age'] > 90, 'Age'] = Users['Age'].mean()

In [None]:
# Users still older than 90.
Users[(Users['Age']> 90)]

In [None]:
# Replace ages below 10 with the column mean.  This mean is recomputed after
# the >90 replacement above, so it differs from the value used there.
Users.loc[Users['Age'] <10, 'Age'] = Users['Age'].mean()

In [None]:
# Users still younger than 10.
Users[(Users['Age']< 10)]

In [None]:
# Distinct age values after both replacements.
Users['Age'].unique()

In [None]:
# Impute the remaining missing ages with the column mean.
# The filled Series is assigned back to the column: fillna(inplace=True) on
# `Users['Age']` is a chained in-place update on an intermediate Series,
# which pandas warns about and which has no effect under Copy-on-Write.
Users['Age'] = Users['Age'].fillna(Users['Age'].mean())

In [None]:
# Missing values per column after the age imputation.
Users.isnull().sum()

In [None]:
Users

In [None]:
# Summary statistics of the numeric user columns.
Users.describe()

In [None]:
# Column dtypes and non-null counts of the ratings table.
Ratings.info()

In [None]:
# Distinct rating values.
Ratings['Book-Rating'].unique()

In [None]:
# Rename the rating column so it can be reached with attribute access
# (Ratings.BookRating, used in the filter below).
Ratings.rename(columns={'Book-Rating':'BookRating'},inplace=True)

In [None]:
# Keep only rows with a non-zero rating.
# NOTE(review): presumably 0 marks an implicit / unscored interaction --
# confirm against the dataset description.
Ratings = Ratings[Ratings.BookRating != 0]

In [None]:
# Distinct rating values after dropping the zeros.
Ratings['BookRating'].unique()

In [None]:
# Missing values per column.
Ratings.isnull().sum()

In [None]:
# Summary statistics of the numeric rating columns.
Ratings.describe()

In [None]:
# Number of fully duplicated rows in each table.
Books.duplicated().sum()

In [None]:
Users.duplicated().sum()

In [None]:
Ratings.duplicated().sum()

In [None]:
# Attach user attributes to every rating.  No `how` is passed, so pandas'
# default join type applies.
Ratings_and_Users = Ratings.merge(Users, on="User-ID")

In [None]:
Ratings_and_Users

In [None]:
# Attach book attributes via ISBN (again with the default join type).
Data = Ratings_and_Users.merge(Books,on ='ISBN')

In [None]:
# First rows of the combined table.
Data.head()

In [None]:
# (rows, columns) of the combined table.
Data.shape

In [None]:
# Column dtypes and non-null counts.
Data.info()

In [None]:
# Summary statistics of the numeric columns.
Data.describe()

In [None]:
# Distinct values per column.
Data.nunique()

In [None]:
# Missing values per column.
Data.isnull().sum()

In [None]:
# Number of fully duplicated rows.
Data.duplicated().sum()

In [None]:
# Ratings per title, most-rated first.
Data['Book-Title'].value_counts()

In [None]:
# Ratings per author, most-rated first.
Data['Book-Author'].value_counts()

In [None]:
# How often each rating value occurs.
Data['BookRating'].value_counts()

## Visualization

In [None]:
# Box plot of Age before outlier removal.
Data['Age'].plot.box()
plt.show()

In [None]:
## Detect outliers with the z-score: rows whose |z| exceeds 3.
# `outliers` is only counted in the next cell; the rows are not removed here.
from scipy import stats
z = np.abs(stats.zscore(Data['Age']))
threshold =3
outliers= Data[z> threshold]

In [None]:
# Number of rows flagged by the z-score rule.
len(outliers)

In [None]:
## IQR method: quartiles and inter-quartile range of Age.
Q1= Data['Age'].quantile(0.25)
Q3= Data['Age'].quantile(0.75)
IQR= Q3 -Q1
print(Q3)
print(Q1)
IQR

In [None]:
# Keep only rows whose Age lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
Data= Data[~((Data['Age']<(Q1 -1.5*IQR))|(Data['Age']>(Q3+ 1.5*IQR)))]

In [None]:
# Box plot of Age after the IQR filter.
Data['Age'].plot.box()
plt.show()

In [None]:
sns.countplot(data=Data, x= 'BookRating')

In [None]:
sns.countplot(y='Book-Title',data=Data,order=pd.value_counts(Data['Book-Title']).iloc[:10].index,palette='rocket_r')
plt.title('Top 10 Popular Books')

In [None]:
sns.countplot(y='Book-Author',data=Data,order=pd.value_counts(Data['Book-Author']).iloc[:20].index,palette='viridis')
plt.title('Top 20 Authors')

In [None]:
sns.countplot(y='Publisher',data=Data,order=pd.value_counts(Data['Publisher']).iloc[:20].index,palette='mako_r')
plt.title('Top 20 Publishers')

In [None]:
sns.countplot(y='Location',data=Data,order=pd.value_counts(Data['Location']).iloc[:20].index,palette='viridis')
plt.title('Count of users Location wise')

In [None]:
sns.countplot(x='Year-Of-Publication',data=Data,order=pd.value_counts(Data['Year-Of-Publication']).iloc[:20].index,palette='mako_r')
plt.title('Books Published Yearwise')

In [None]:
Data

## Model Building

### Popularity-Based Recommendation System:

In [None]:
# Number of ratings each book title has received.
df=Data.groupby('Book-Title').BookRating.count().to_frame('BookRatingscount').reset_index()
df

In [None]:
# Spot check: rating count for one title.
df[df['Book-Title'] == 'Rites of Passage']

In [None]:
df['BookRatingscount']

In [None]:
# Attach the per-title rating count to every rating row.
data = Data.merge(df, on="Book-Title")

In [None]:
data

In [None]:
# Spot check of one title.
# NOTE(review): the title text looks mis-decoded, which may mean the CSVs
# are not actually Latin-1 -- TODO confirm the file encoding.
data[data['Book-Title'] == 'Ã?Â?rger mit Produkt X. Roman.']

In [None]:
# Average rating per title, broadcast to every row of that title.
data['AvgRatings'] = data.groupby(['Book-Title'])['BookRating'].transform('mean')

In [None]:
data

In [None]:
# Spot check: rows (with count and average) for one title.
data[data['Book-Title'] == 'Rites of Passage']

In [None]:
## Top 10 Popular Books
# `data` holds one row per rating, so sorting it and taking head(10) returned
# ten rows of the single most-rated title.  Keep one row per title first so
# the result lists ten different books.
(data.drop_duplicates('Book-Title')
     .sort_values("BookRatingscount", ascending=False)
     .head(10))

In [None]:
## Locationwise Popular Books
# Sorts all rating rows by the title's overall rating count, then keeps the
# first five rows of each Location.
# NOTE(review): the count is the global one, and a title can occupy several
# of a location's five rows -- confirm this is the intended ranking.
data.sort_values(["BookRatingscount"], ascending=False).groupby('Location').head()

In [None]:
# Builds a GroupBy object only; nothing is aggregated or stored here.
data.groupby('Year-Of-Publication')

In [None]:
# Number of rating rows per publication year.
data['Year-Of-Publication'].value_counts()

In [None]:
# Cast Age to int, dropping any fractional part.
convert_dict = {'Age': int}
data = data.astype(convert_dict)

In [None]:
data.dtypes

In [None]:
## Books popular Yearly
# Rating rows ordered from the most recent publication year to the oldest.
data.sort_values(["Year-Of-Publication"], ascending=False)

In [None]:
# Rating rows of titles that have received at least 100 ratings.
data_above_100 = data.loc[data['BookRatingscount'].ge(100)]

In [None]:
## Top Books with highest weighted rating

def calcWeightedRating(row, avgRating, numOfRatings, minThres, defRating):
    """Return the rating of ``row`` smoothed towards a default rating.

    The observed average is blended with ``defRating`` as if ``minThres``
    additional votes of that value had been cast:

        (avg * count + minThres * defRating) / (count + minThres)

    Args:
        row: Object indexable by ``avgRating`` and ``numOfRatings``.
        avgRating: Key of the average-rating entry in ``row``.
        numOfRatings: Key of the rating-count entry in ``row``.
        minThres: Weight (number of notional votes) given to ``defRating``.
        defRating: Default rating the result is pulled towards.

    Returns:
        The weighted rating.
    """
    mean_rating = row[avgRating]
    votes = row[numOfRatings]
    observed_total = mean_rating * votes
    prior_total = minThres * defRating
    return (observed_total + prior_total) / (votes + minThres)

# Titles with at least 100 ratings: add the weighted rating column.
# Work on a copy so the new column is not written into a slice of `data`.
data_above_100 = data_above_100.copy()
# calcWeightedRating only indexes its first argument by column name, so the
# whole frame can be passed and the formula is evaluated column-wise in one
# step instead of once per row through DataFrame.apply(axis=1).
data_above_100['Weighted-Rating'] = calcWeightedRating(
    data_above_100, 'AvgRatings', 'BookRatingscount', 100, 5)
# The frame has one row per rating, so show one row per title; otherwise the
# top 20 is filled with repeats of the best-rated book.  data_above_100
# itself keeps all rows.
(data_above_100
    .drop_duplicates('Book-Title')
    .sort_values('Weighted-Rating', ascending=False)
    .head(20))

### Collaborative Filtering-Based Recommendation System:

In [None]:
# Number of ratings given by each user (counted on `Data`).
df1=Data.groupby('User-ID').BookRating.count().to_frame('UserRatings').reset_index()
df1

In [None]:
# Spot check: rating rows of user 8.
data[data['User-ID']==8]

In [None]:
# Attach each user's rating count to every rating row.
filteredData = df1.merge(data, on="User-ID")

In [None]:
filteredData

In [None]:
### Keep users who have rated more than 200 books and books which have at least 50 ratings
filteredData = filteredData[(filteredData['UserRatings'] > 200) & (filteredData['BookRatingscount']>=50)]

In [None]:
filteredData

In [None]:
# Spot check: remaining rows for one title.
filteredData[filteredData['Book-Title']=='Dreamcatcher']

In [None]:
# Missing values per column after filtering.
filteredData.isnull().sum()

In [None]:
# Book x user matrix of the ratings themselves.
# The cells must hold 'BookRating': the previous values='UserRatings' filled
# each cell with the number of ratings the user has given overall, so the
# similarity computed from this matrix ignored how the books were rated.
pivot = pd.pivot_table(data=filteredData, index='Book-Title',
                       columns='User-ID', values='BookRating')
# A user who has not rated a book contributes 0 to that book's vector.
pivot.fillna(0, inplace=True)

In [None]:
pivot

In [None]:
# (number of books, number of users) in the pivot.
pivot.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows (books) of the pivot.
similarities = cosine_similarity(pivot)
similarities

In [None]:
similarities.shape

In [None]:
# Row position of the title '1984' in the pivot index.
np.where(pivot.index=='1984')

In [None]:
# Similarity scores of the book at row 0.  The cells below use row 0
# directly rather than the position returned by the lookup above.
display(similarities[0])

In [None]:
# (row position, score) pairs for row 0, best first; [1:6] skips the first
# entry and keeps the next five.
sorted(list(enumerate(similarities[0])), key=lambda x: x[1], 
  reverse=True)[1:6]

In [None]:
### Display the titles of those five books
for book in sorted(list(enumerate(similarities[0])), key=lambda x: x[1], reverse=True)[1:6]:
    print(pivot.index[book[0]])

In [None]:
### Final Recommendation for the selected Book
def recommend(book_name, top_n=10, *, pivot_table=None, similarity_matrix=None):
    """Print the titles most similar to ``book_name``.

    Args:
        book_name: Title to look up in the index of the book/user pivot.
        top_n: Maximum number of titles to print (10, as before, by default).
        pivot_table: Optional frame whose index holds the book titles.
            Defaults to the notebook-level ``pivot``.
        similarity_matrix: Optional square matrix of pairwise similarities
            whose rows/columns follow the order of that index.  Defaults to
            the notebook-level ``similarities``.

    Returns:
        None.  The recommendations, or 'Book Not Found', are printed.
    """
    titles = (pivot if pivot_table is None else pivot_table).index
    scores_by_book = similarities if similarity_matrix is None else similarity_matrix

    positions = np.flatnonzero(titles == book_name)
    if positions.size == 0:
        print('Book Not Found')
        print('\n')
        return

    index = positions[0]
    scores = np.asarray(scores_by_book[index], dtype=float)
    # Best score first; the stable sort keeps tied books in index order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried book by position.  Skipping "the first entry" is only
    # correct when the book sorts first, which fails when another book ties
    # with it -- the book would then recommend itself.
    ranked = ranked[ranked != index][:max(top_n, 0)]

    print(f'Recommendations for the book {book_name}:')
    print('-'*10)
    for position in ranked:
        print(titles[position])
    print('\n')

In [None]:
# Example lookups; a title absent from the pivot index prints
# 'Book Not Found'.
recommend('Harry Potter and the Chamber of Secrets (Book 2)')

In [None]:
recommend('The Runaway Jury')

In [None]:
recommend('From One to One Hundred')

In [None]:
recommend('Dreamcatcher')

### Deployment

In [None]:
import pickle

In [None]:
# Saving the popularity-based recommendation data into a pickle file
pickle.dump(data_above_100, open('PopularBookRecommendation.pkl', 'wb'))

In [None]:
# Saving collaborative filtering data into pickle files
pickle.dump(pivot, open('pivot.pkl', 'wb'))
pickle.dump(Books, open('Books.pkl', 'wb'))
pickle.dump(similarities, open('similarities.pkl', 'wb'))

In [None]:
# Load the recommendation models from pickle files
popular_recommendations = pickle.load(open('PopularBookRecommendation.pkl', 'rb'))
collaborative_filtering_pivot = pickle.load(open('pivot.pkl', 'rb'))
books_data = pickle.load(open('Books.pkl', 'rb'))
similarities = pickle.load(open('similarities.pkl', 'rb'))