In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt 
import difflib
#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
df = pd.read_csv('Book.csv')
df

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,0002005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,0002261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,0006163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,0006178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,0006280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6805,9788185300535,8185300534,I Am that,Talks with Sri Nisargadatta Maharaj,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0
6806,9788185944609,8185944601,Secrets Of The Heart,,Khalil Gibran,Mysticism,http://books.google.com/books/content?id=XcrVp...,,1993.0,4.08,74.0,324.0
6807,9788445074879,8445074873,Fahrenheit 451,,Ray Bradbury,Book burning,,,2004.0,3.98,186.0,5733.0
6808,9789027712059,9027712050,The Berlin Phenomenology,,Georg Wilhelm Friedrich Hegel,History,http://books.google.com/books/content?id=Vy7Sk...,Since the three volume edition ofHegel's Philo...,1981.0,0.00,210.0,0.0


In [4]:
df.shape

(6810, 12)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6810 entries, 0 to 6809
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn13          6810 non-null   int64  
 1   isbn10          6810 non-null   object 
 2   title           6810 non-null   object 
 3   subtitle        2381 non-null   object 
 4   authors         6738 non-null   object 
 5   categories      6711 non-null   object 
 6   thumbnail       6481 non-null   object 
 7   description     6548 non-null   object 
 8   published_year  6804 non-null   float64
 9   average_rating  6767 non-null   float64
 10  num_pages       6767 non-null   float64
 11  ratings_count   6767 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 638.6+ KB


In [6]:
df.isna().sum()

isbn13               0
isbn10               0
title                0
subtitle          4429
authors             72
categories          99
thumbnail          329
description        262
published_year       6
average_rating      43
num_pages           43
ratings_count       43
dtype: int64

In [7]:
selected_features = ['title','authors','categories','published_year']
print(selected_features)

['title', 'authors', 'categories', 'published_year']


In [8]:
for features in selected_features:
    df[features] = df[features].fillna('')

In [9]:
df[features].isna().sum()

0

# 1 Data Preprocessing

Before building the recommendation system, we need to preprocess the data. This may include text cleaning, handling missing values, and tokenization.

In [10]:
df['combine_features'] = df['title']+' '+df['authors']+' '+df['categories']+' '+ f"{df['published_year']}"

In [11]:
df['combine_features'].head(3)

0    Gilead Marilynne Robinson Fiction 0       2004...
1    Spider's Web Charles Osborne;Agatha Christie D...
2    The One Tree Stephen R. Donaldson American fic...
Name: combine_features, dtype: object

In [12]:
df

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,combine_features
0,9780002005883,0002005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead Marilynne Robinson Fiction 0 2004...
1,9780002261982,0002261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web Charles Osborne;Agatha Christie D...
2,9780006163831,0006163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0,The One Tree Stephen R. Donaldson American fic...
3,9780006178736,0006178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels Sidney Sheldon Fiction 0 ...
4,9780006280897,0006280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves Clive Staples Lewis Christian l...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6805,9788185300535,8185300534,I Am that,Talks with Sri Nisargadatta Maharaj,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that Sri Nisargadatta Maharaj;Sudhakar S....
6806,9788185944609,8185944601,Secrets Of The Heart,,Khalil Gibran,Mysticism,http://books.google.com/books/content?id=XcrVp...,,1993.0,4.08,74.0,324.0,Secrets Of The Heart Khalil Gibran Mysticism 0...
6807,9788445074879,8445074873,Fahrenheit 451,,Ray Bradbury,Book burning,,,2004.0,3.98,186.0,5733.0,Fahrenheit 451 Ray Bradbury Book burning 0 ...
6808,9789027712059,9027712050,The Berlin Phenomenology,,Georg Wilhelm Friedrich Hegel,History,http://books.google.com/books/content?id=Vy7Sk...,Since the three volume edition ofHegel's Philo...,1981.0,0.00,210.0,0.0,The Berlin Phenomenology Georg Wilhelm Friedri...


# 2 Making A Data Storage Variable Form Which We Search

In [13]:
# Here I MAke A Variable In Which I Store 5 Columns From Df DataFrame.
combine = df[['isbn10','title','authors','published_year','combine_features']]

In [14]:
combine

Unnamed: 0,isbn10,title,authors,published_year,combine_features
0,0002005883,Gilead,Marilynne Robinson,2004.0,Gilead Marilynne Robinson Fiction 0 2004...
1,0002261987,Spider's Web,Charles Osborne;Agatha Christie,2000.0,Spider's Web Charles Osborne;Agatha Christie D...
2,0006163831,The One Tree,Stephen R. Donaldson,1982.0,The One Tree Stephen R. Donaldson American fic...
3,0006178731,Rage of angels,Sidney Sheldon,1993.0,Rage of angels Sidney Sheldon Fiction 0 ...
4,0006280897,The Four Loves,Clive Staples Lewis,2002.0,The Four Loves Clive Staples Lewis Christian l...
...,...,...,...,...,...
6805,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,1999.0,I Am that Sri Nisargadatta Maharaj;Sudhakar S....
6806,8185944601,Secrets Of The Heart,Khalil Gibran,1993.0,Secrets Of The Heart Khalil Gibran Mysticism 0...
6807,8445074873,Fahrenheit 451,Ray Bradbury,2004.0,Fahrenheit 451 Ray Bradbury Book burning 0 ...
6808,9027712050,The Berlin Phenomenology,Georg Wilhelm Friedrich Hegel,1981.0,The Berlin Phenomenology Georg Wilhelm Friedri...


In [15]:
combine

Unnamed: 0,isbn10,title,authors,published_year,combine_features
0,0002005883,Gilead,Marilynne Robinson,2004.0,Gilead Marilynne Robinson Fiction 0 2004...
1,0002261987,Spider's Web,Charles Osborne;Agatha Christie,2000.0,Spider's Web Charles Osborne;Agatha Christie D...
2,0006163831,The One Tree,Stephen R. Donaldson,1982.0,The One Tree Stephen R. Donaldson American fic...
3,0006178731,Rage of angels,Sidney Sheldon,1993.0,Rage of angels Sidney Sheldon Fiction 0 ...
4,0006280897,The Four Loves,Clive Staples Lewis,2002.0,The Four Loves Clive Staples Lewis Christian l...
...,...,...,...,...,...
6805,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,1999.0,I Am that Sri Nisargadatta Maharaj;Sudhakar S....
6806,8185944601,Secrets Of The Heart,Khalil Gibran,1993.0,Secrets Of The Heart Khalil Gibran Mysticism 0...
6807,8445074873,Fahrenheit 451,Ray Bradbury,2004.0,Fahrenheit 451 Ray Bradbury Book burning 0 ...
6808,9027712050,The Berlin Phenomenology,Georg Wilhelm Friedrich Hegel,1981.0,The Berlin Phenomenology Georg Wilhelm Friedri...


# 3. Building the Content-Based Recommendation System

# Vectorization
- Make The Data In The Token Form :

In [16]:
# Bag of Words 
# Importing Library :
from sklearn.feature_extraction.text import CountVectorizer
# Here We Have Making A Variable Of cv In That We Store 5000 Words Which Is Relates To Each Others 
# Reomiving The Stop Words EX :(The , Puctuations , is , We etc......)
# Converting That 5000 Words Into Binary : (0,1) Form.
cv= CountVectorizer(max_features=5000, stop_words='english',binary=True)

In [17]:
vector = cv.fit_transform(combine['combine_features']).toarray()

In [18]:
vector.shape

(6810, 5000)

In [19]:
vector[0].sum()

20

In [20]:
len(cv.get_feature_names_out())

5000

In [21]:
combine['combine_features'][0]

'Gilead Marilynne Robinson Fiction 0       2004.0\n1       2000.0\n2       1982.0\n3       1993.0\n4       2002.0\n         ...  \n6805    1999.0\n6806    1993.0\n6807    2004.0\n6808    1981.0\n6809    1998.0\nName: published_year, Length: 6810, dtype: object'

# Stemming
- It Works 2 Things :
- One Is It Can Remove All The Stop Words Which Is Present In The Nltk Library For Making The Array Short.
- Two Is I Can Change Multiple Words Occurs Into One Word . 

In [22]:
# This Is A Library 
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [23]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [77]:
combine['combine_features'][3]

'rage of angel sidney sheldon fiction 0 2004.0 1 2000.0 2 1982.0 3 1993.0 4 2002.0 ... 6805 1999.0 6806 1993.0 6807 2004.0 6808 1981.0 6809 1998.0 name: published_year, length: 6810, dtype: object'

In [79]:
#Function Of Steming :
#Spit The Text Input And Append In Y A Stem Text .
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [81]:
stem('My Name Is Asad and i Work For Asadullah')

'my name is asad and i work for asadullah'

In [168]:
# Here I Apply Stem Function Combine Features Column .
combine['combine_features'] = combine['combine_features'].apply(stem)
combine['combine_features'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combine['combine_features'] = combine['combine_features'].apply(stem)


'gilead marilynn robinson fiction 0 2004.0 1 2000.0 2 1982.0 3 1993.0 4 2002.0 ... 6805 1999.0 6806 1993.0 6807 2004.0 6808 1981.0 6809 1998.0 name: published_year, length: 6810, dtype: object'

# Cousine Similarity Check 

In [86]:
# Here We Finding Cosine Similarity :
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)

In [87]:
vector[0].sum()

20

In [102]:
#for i in similarity[0]:
#    print(i)

In [104]:
vector[0].shape

(5000,)

In [106]:
#this will held the indexes also 
sorted(list(enumerate(similarity[0])),reverse=True, key=lambda x: x[1])[1:10]

[(843, 0.9746794344808962),
 (2014, 0.9746794344808962),
 (3827, 0.9746794344808962),
 (5728, 0.9746794344808962),
 (6420, 0.9746794344808962),
 (402, 0.9534625892455924),
 (416, 0.9534625892455924),
 (3512, 0.9534625892455924),
 (688, 0.9500000000000003)]

In [116]:
combine['title']

0                          Gilead
1                    Spider's Web
2                    The One Tree
3                  Rage of angels
4                  The Four Loves
                  ...            
6805                    I Am that
6806         Secrets Of The Heart
6807               Fahrenheit 451
6808     The Berlin Phenomenology
6809    'I'm Telling You Stories'
Name: title, Length: 6810, dtype: object

In [118]:
combine[combine['title']=='Gilead'].index[0]

0

In [120]:
combine.iloc[0]['title']

'Gilead'

In [202]:
def recommend(book):
   book_index = combine[(combine['title']==book)|(combine['published_year']==book)].index[0]
   distances = similarity[book_index]
   book_list = sorted(list(enumerate(distances)),reverse=True, key=lambda x: x[1])[1:6]
   for i in book_list:
       print(combine.iloc[i[0]]['title'])
       print(combine.iloc[i[0]]['published_year'])

In [218]:
recommend(2002.0)

The Problem of Pain
2002.0
Letters to Children
1985.0
C.S. Lewis
1996.0
THE GOON SHOW
2005.0
The voyage of the Dawn Treader
1970.0
