In [12]:
#Import Libraries
# Data handling and manipulation
import pandas as pd        # For working with tabular data (DataFrames)
import numpy as np         # For numerical operations, arrays, and matrices

# Data visualization
import seaborn as sns      # For advanced statistical plotting
import matplotlib.pyplot as plt  # For creating standard plots and visualizations

# Model persistence
import pickle              # For saving/loading Python objects (e.g., trained models)

# Machine learning algorithms
from sklearn.neighbors import NearestNeighbors  # For finding closest data points (KNN-style)
from sklearn.preprocessing import StandardScaler  # For normalizing features to a standard scale

# Text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text to numerical TF-IDF features

# Sparse matrix operations
from scipy.sparse import hstack, csr_matrix  # For efficiently storing and combining large sparse matrices

In [13]:
#Import the Dataset
data = pd.read_csv("Book_Recommend_Dataset.csv")

In [14]:
#Load the Dataset
data.head()

Unnamed: 0,asin,title,author,soldBy,imgUrl,productURL,stars,reviews,price,isKindleUnlimited,category_id,isBestSeller,isEditorsPick,isGoodReadsChoice,publishedDate,category_name
0,B00TZE87S4,Adult Children of Emotionally Immature Parents...,Lindsay C. Gibson,Amazon.com Services LLC,https://m.media-amazon.com/images/I/713KZTsaYp...,https://www.amazon.com/dp/B00TZE87S4,4.8,0,9.99,False,6,True,False,False,2015-06-01,Parenting & Relationships
1,B08WCKY8MB,"From Strength to Strength: Finding Success, Ha...",Arthur C. Brooks,Penguin Group (USA) LLC,https://m.media-amazon.com/images/I/A1LZcJFs9E...,https://www.amazon.com/dp/B08WCKY8MB,4.4,0,16.99,False,6,False,False,False,2022-02-15,Parenting & Relationships
2,B09KPS84CJ,Good Inside: A Guide to Becoming the Parent Yo...,Becky Kennedy,HarperCollins Publishers,https://m.media-amazon.com/images/I/71RIWM0sv6...,https://www.amazon.com/dp/B09KPS84CJ,4.8,0,16.99,False,6,False,True,False,2022-09-13,Parenting & Relationships
3,B07S7QPG6J,Everything I Know About Love: A Memoir,Dolly Alderton,HarperCollins Publishers,https://m.media-amazon.com/images/I/71QdQpTiKZ...,https://www.amazon.com/dp/B07S7QPG6J,4.2,0,9.95,True,6,False,True,False,2020-02-25,Parenting & Relationships
4,B00N6PEQV0,The Seven Principles for Making Marriage Work:...,John Gottman,Random House LLC,https://m.media-amazon.com/images/I/813o4WOs+w...,https://www.amazon.com/dp/B00N6PEQV0,4.7,0,13.99,False,6,False,False,False,2015-05-05,Parenting & Relationships


In [15]:
#Check the Info Of the Dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133102 entries, 0 to 133101
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   asin               133102 non-null  object 
 1   title              133102 non-null  object 
 2   author             132677 non-null  object 
 3   soldBy             123869 non-null  object 
 4   imgUrl             133102 non-null  object 
 5   productURL         133102 non-null  object 
 6   stars              133102 non-null  float64
 7   reviews            133102 non-null  int64  
 8   price              133102 non-null  float64
 9   isKindleUnlimited  133102 non-null  bool   
 10  category_id        133102 non-null  int64  
 11  isBestSeller       133102 non-null  bool   
 12  isEditorsPick      133102 non-null  bool   
 13  isGoodReadsChoice  133102 non-null  bool   
 14  publishedDate      84086 non-null   object 
 15  category_name      133102 non-null  object 
dtypes:

In [17]:
#Check the Shape of Dataset
data.shape

(133102, 16)

In [18]:
#Remove Unsed Columns
data = data[['title','author','soldBy','publishedDate','imgUrl']]

In [20]:
#Load new Dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133102 entries, 0 to 133101
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   title          133102 non-null  object
 1   author         132677 non-null  object
 2   soldBy         123869 non-null  object
 3   publishedDate  84086 non-null   object
 4   imgUrl         133102 non-null  object
dtypes: object(5)
memory usage: 5.1+ MB


In [21]:
#Check the Empty Values
data.isnull().sum()

title                0
author             425
soldBy            9233
publishedDate    49016
imgUrl               0
dtype: int64

In [24]:
#Remove the Empty values
data.dropna(subset=['author'],inplace=True)
#Remove the Empty values
data.dropna(subset=['soldBy'],inplace=True)
#Remove the Empty values
data.dropna(subset=['publishedDate'],inplace=True)

In [25]:
data.isnull().sum()

title            0
author           0
soldBy           0
publishedDate    0
imgUrl           0
dtype: int64

In [27]:
#Remove the Duplicate Values
data.drop_duplicates(subset=['author'],inplace=True)
#Remove the Duplicate Values
data.drop_duplicates(subset=['soldBy'],inplace=True)
#Remove the Duplicate Values
data.drop_duplicates(subset=['publishedDate'],inplace=True)

In [29]:
#Check the Status Dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35 entries, 0 to 100696
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title          35 non-null     object
 1   author         35 non-null     object
 2   soldBy         35 non-null     object
 3   publishedDate  35 non-null     object
 4   imgUrl         35 non-null     object
dtypes: object(5)
memory usage: 1.6+ KB


In [31]:
#Check the columns data status
data.author.unique()

array(['Lindsay C. Gibson', 'Arthur C. Brooks', 'Becky Kennedy',
       'John Gottman', 'Jeannette Walls', 'Oprah Winfrey',
       'Robert F. Kennedy Jr.', 'David Sedaris', 'Justin Whitmel Earley',
       'L. M. Montgomery', 'Maria Tatar', 'Amanda Montei',
       'Teresa M. McDevitt', 'Belinda Daughrity', 'Mitch Weiss',
       'Rachel Marks', 'Kasey Edwards', 'Michelle Mitchell',
       'Roberta M. Berns', 'Kris Bordessa', 'Fletcher McKenzie',
       'Mark Millhone', 'Gerald J. Alred', 'Travis Campbell',
       'Ali Amrabet', 'Inès de La Fressange', 'Professor Braam van Wyk',
       'Dan Abnett', 'Jill Leovy', 'Sarah Albee', 'Bokuto Uno',
       'Matt Perman', 'Nicolas Bourriaud',
       'Harry Potter Theatrical Productions', 'Matt Forbeck'],
      dtype=object)

In [32]:
#Check the columns data status
data.soldBy.unique()

array(['Amazon.com Services LLC', 'Penguin Group (USA) LLC',
       'HarperCollins Publishers', 'Random House LLC',
       'Simon and Schuster Digital Sales Inc', 'Macmillan',
       'Simon & Schuster Digital Sales Inc.', 'Hachette Book Group',
       'HarperCollins Publishing', 'De Marque', 'W. W. Norton & Company',
       'Penguin Random House Publisher Services',
       'Pearson Education,Inc.', 'JOHN WILEY AND SONS INC',
       'Harlequin Digital Sales Corp.', 'PRH UK', 'RH AU',
       'Penguin Random House Australia Pty Ltd', 'Cengage Learning',
       'Disney Book Group', 'Random House NZ', 'Amazon.com',
       'Macmillan Higher Education', 'EDIGITA', 'Gallimard Lt.',
       'Flammarion Lt.', 'Random House ZA', 'Games Workshop',
       'Amazon Digital Services LLC HN', 'Scholastic Trade Publisher',
       'Yen Press LLC', 'Amazon Digital Services LLC MK',
       'Immat&eacute;riel fr', 'Pottermore', 'Marvel Entertainment US'],
      dtype=object)

In [33]:
#Check the columns data status
data.title.unique()

array(['Adult Children of Emotionally Immature Parents: How to Heal from Distant, Rejecting, or Self-Involved Parents',
       'From Strength to Strength: Finding Success, Happiness, and Deep Purpose in the Second Half of Life',
       'Good Inside: A Guide to Becoming the Parent You Want to Be',
       "The Seven Principles for Making Marriage Work: A Practical Guide from the Country's Foremost Relationship Expert",
       'The Glass Castle: A Memoir',
       'What Happened to You?: Conversations on Trauma, Resilience, and Healing',
       'Vax-Unvax: Let the Science Speak (Children’s Health Defense)',
       'Happy-Go-Lucky',
       'Habits of the Household: Practicing the Story of God in Everyday Family Rhythms',
       'Anne Of Green Gables Complete 8 Book Set',
       'The Classic Fairy Tales (Second Edition) (Norton Critical Editions)',
       'Touched Out: Motherhood, Misogyny, Consent, and Control',
       'Child Development and Education',
       'Autism Spectrum Disorders fro

In [34]:
#Check the columns data status
data.publishedDate.unique()

array(['2015-06-01', '2022-02-15', '2022-09-13', '2015-05-05',
       '2005-03-01', '2021-04-27', '2023-08-29', '2022-05-31',
       '2021-11-09', '2023-09-25', '2017-01-20', '2023-09-12',
       '2019-03-01', '2022-11-22', '2020-02-18', '2020-02-06',
       '2021-02-02', '2023-02-28', '2015-01-01', '2020-04-28',
       '2023-08-01', '2008-07-07', '2017-02-16', '2022-02-10',
       '2023-05-26', '2017-04-05', '2013-08-06', '2019-07-20',
       '2015-01-27', '2020-11-10', '2023-09-19', '2014-03-04',
       '2022-09-09', '2019-10-31', '2023-08-02'], dtype=object)

In [41]:
# Assuming your data is in a DataFrame called 'data'
# Extract year from 'publishedDate' column
data['year'] = pd.to_datetime(data['publishedDate']).dt.year

# Display the unique years
unique_years = data['year'].unique()
print(unique_years)

[2015 2022 2005 2021 2023 2017 2019 2020 2008 2013 2014]


In [42]:
data.head()

Unnamed: 0,title,author,soldBy,publishedDate,imgUrl,year
0,Adult Children of Emotionally Immature Parents...,Lindsay C. Gibson,Amazon.com Services LLC,2015-06-01,https://m.media-amazon.com/images/I/713KZTsaYp...,2015
1,"From Strength to Strength: Finding Success, Ha...",Arthur C. Brooks,Penguin Group (USA) LLC,2022-02-15,https://m.media-amazon.com/images/I/A1LZcJFs9E...,2022
2,Good Inside: A Guide to Becoming the Parent Yo...,Becky Kennedy,HarperCollins Publishers,2022-09-13,https://m.media-amazon.com/images/I/71RIWM0sv6...,2022
4,The Seven Principles for Making Marriage Work:...,John Gottman,Random House LLC,2015-05-05,https://m.media-amazon.com/images/I/813o4WOs+w...,2015
5,The Glass Castle: A Memoir,Jeannette Walls,Simon and Schuster Digital Sales Inc,2005-03-01,https://m.media-amazon.com/images/I/71td5GDUZM...,2005
