# Book recommendation using sentiment analysis

Dataset is collected from Kaggle https://www.kaggle.com/datasets/anshtanwar/top-200-trending-books-with-reviews/data

In [1]:
import pandas as pd
import numpy as np

In [2]:
#reading the csv files
books = pd.read_csv("Top-100 Trending Books.csv")
books_reviews = pd.read_csv("customer reviews.csv")

In [3]:
books_reviews.head()

Unnamed: 0,Sno,book name,review title,reviewer,reviewer rating,review description,is_verified,date,timestamp,ASIN
0,0,The Woman in Me,Unbelievably impressive. Her torn life on paper.,Murderess Marbie,4,I'm only a third way in. Shipped lightening fa...,True,26-10-2023,"Reviewed in the United States October 26, 2023",1668009048
1,1,The Woman in Me,What a heartbreaking story,L J,5,"""There have been so many times when I was scar...",True,06-11-2023,"Reviewed in the United States November 6, 2023",1668009048
2,2,The Woman in Me,Britney you are so invincible! You are an insp...,Jamie,5,The media could not be loaded. I personally ha...,True,01-11-2023,"Reviewed in the United States November 1, 2023",1668009048
3,3,The Woman in Me,"Fast Read, Sad Story",KMG,5,I have been a fan of Britney's music since the...,True,25-10-2023,"Reviewed in the United States October 25, 2023",1668009048
4,4,The Woman in Me,"Buy it, it’s worth the read!",Stephanie Brown,5,"Whether or not you’re a fan, it’s a great read...",True,01-11-2023,"Reviewed in the United States November 1, 2023",1668009048


In [4]:
#checking for null values in the dataframe
missing_values = books_reviews.isnull().sum()
print(missing_values)

Sno                   0
book name             0
review title          0
reviewer              0
reviewer rating       0
review description    0
is_verified           0
date                  0
timestamp             0
ASIN                  0
dtype: int64


In [5]:
books_reviews['text_combined'] = books_reviews['review title'] + ' ' + books_reviews['review description']

In [6]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

def clean_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers using regex
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming using Porter Stemmer
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # Join the tokens back into a cleaned text
    cleaned_text = ' '.join(tokens)

    return cleaned_text

books_reviews['review_text_clean'] = books_reviews['text_combined'].apply(clean_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\1000070213\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\1000070213\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
#keywords to look for in the reviews
keywords_dict = {
    'Emotional': ['sentimental', 'touching', 'passionate', 'overwhelming', 'stirring'],
    'Unexpected': ['surprising', 'unanticipated', 'shocking', 'astonishing', 'unforeseen'],
    'Intriguing': ['captivating', 'fascinating', 'compelling', 'mysterious', 'provocative']
}

In [8]:
#filter the data set for the keywords
def filter_by_keywords(df,keywords):
    
    keyword_pattern = '|'.join(keywords)
    
    mask = df['review_text_clean'].str.contains(keyword_pattern,flags=re.IGNORECASE)
    
    return df[mask]

In [9]:
relevant_df = filter_by_keywords(books_reviews,keywords_dict)

In [10]:
relevant_df

Unnamed: 0,Sno,book name,review title,reviewer,reviewer rating,review description,is_verified,date,timestamp,ASIN,text_combined,review_text_clean
30,30,"Fourth Wing (The Empyrean, 1)",4.5 Stars,Sandy Farmer,4,"One Sunday, I was visiting blogs, which I don'...",True,23-10-2023,"Reviewed in the United States October 23, 2023",1649374046,"4.5 Stars One Sunday, I was visiting blogs, wh...",star one sunday visit blog dont get often stum...
87,87,Night (Night),Matter of Fact Horrors,Gary F. Taylor,5,NIGHT was among the first widely read accounts...,True,07-08-2014,"Reviewed in the United States August 7, 2014",374500010,Matter of Fact Horrors NIGHT was among the fir...,matter fact horror night among first wide read...
352,352,Outlive: The Science and Art of Longevity,"A Clear, Walkable Path to Longevity",Lucas,5,"8 Dimensions of Wellness: Physical, EmotionalT...",True,31-10-2023,"Reviewed in the United States October 31, 2023",593236599,"A Clear, Walkable Path to Longevity 8 Dimensio...",clear walkabl path longev dimens well physic e...
905,905,"I'm Dead, Now What?: Important Information Abo...",Great product!,Dee,5,I seldom leave reviews but felt this one worth...,True,10-09-2023,"Reviewed in the United States September 10, 2023",1441317996,Great product! I seldom leave reviews but felt...,great product seldom leav review felt one wort...


In [11]:
from textblob import TextBlob

In [12]:
def analyze_sentences(df):
    # Apply TextBlob to each text in the 'review_text_clean' column
    df['sentiment'] = df['review_text_clean'].apply(lambda x: TextBlob(x))
    
    # Extract sentiment polarity and subjectivity
    df['sentiment_polarity'] = df['sentiment'].apply(lambda x: x.sentiment.polarity)
    df['sentiment_subjectivity'] = df['sentiment'].apply(lambda x: x.sentiment.subjectivity)
    
    # Drop the 'sentiment' column
    df = df.drop(columns=['sentiment'])
    
    return df

In [13]:
relevant_df = analyze_sentences(relevant_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = df['review_text_clean'].apply(lambda x: TextBlob(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment_polarity'] = df['sentiment'].apply(lambda x: x.sentiment.polarity)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment_subjectivity'] = df['sentiment'].apply(

In [14]:
relevant_df

Unnamed: 0,Sno,book name,review title,reviewer,reviewer rating,review description,is_verified,date,timestamp,ASIN,text_combined,review_text_clean,sentiment_polarity,sentiment_subjectivity
30,30,"Fourth Wing (The Empyrean, 1)",4.5 Stars,Sandy Farmer,4,"One Sunday, I was visiting blogs, which I don'...",True,23-10-2023,"Reviewed in the United States October 23, 2023",1649374046,"4.5 Stars One Sunday, I was visiting blogs, wh...",star one sunday visit blog dont get often stum...,0.066653,0.38544
87,87,Night (Night),Matter of Fact Horrors,Gary F. Taylor,5,NIGHT was among the first widely read accounts...,True,07-08-2014,"Reviewed in the United States August 7, 2014",374500010,Matter of Fact Horrors NIGHT was among the fir...,matter fact horror night among first wide read...,0.036486,0.400538
352,352,Outlive: The Science and Art of Longevity,"A Clear, Walkable Path to Longevity",Lucas,5,"8 Dimensions of Wellness: Physical, EmotionalT...",True,31-10-2023,"Reviewed in the United States October 31, 2023",593236599,"A Clear, Walkable Path to Longevity 8 Dimensio...",clear walkabl path longev dimens well physic e...,0.124756,0.490642
905,905,"I'm Dead, Now What?: Important Information Abo...",Great product!,Dee,5,I seldom leave reviews but felt this one worth...,True,10-09-2023,"Reviewed in the United States September 10, 2023",1441317996,Great product! I seldom leave reviews but felt...,great product seldom leav review felt one wort...,0.519481,0.635714


In [15]:
#top relevant books based on polarity
relevant_df.sort_values(by='sentiment_polarity',ascending=False)

Unnamed: 0,Sno,book name,review title,reviewer,reviewer rating,review description,is_verified,date,timestamp,ASIN,text_combined,review_text_clean,sentiment_polarity,sentiment_subjectivity
905,905,"I'm Dead, Now What?: Important Information Abo...",Great product!,Dee,5,I seldom leave reviews but felt this one worth...,True,10-09-2023,"Reviewed in the United States September 10, 2023",1441317996,Great product! I seldom leave reviews but felt...,great product seldom leav review felt one wort...,0.519481,0.635714
352,352,Outlive: The Science and Art of Longevity,"A Clear, Walkable Path to Longevity",Lucas,5,"8 Dimensions of Wellness: Physical, EmotionalT...",True,31-10-2023,"Reviewed in the United States October 31, 2023",593236599,"A Clear, Walkable Path to Longevity 8 Dimensio...",clear walkabl path longev dimens well physic e...,0.124756,0.490642
30,30,"Fourth Wing (The Empyrean, 1)",4.5 Stars,Sandy Farmer,4,"One Sunday, I was visiting blogs, which I don'...",True,23-10-2023,"Reviewed in the United States October 23, 2023",1649374046,"4.5 Stars One Sunday, I was visiting blogs, wh...",star one sunday visit blog dont get often stum...,0.066653,0.38544
87,87,Night (Night),Matter of Fact Horrors,Gary F. Taylor,5,NIGHT was among the first widely read accounts...,True,07-08-2014,"Reviewed in the United States August 7, 2014",374500010,Matter of Fact Horrors NIGHT was among the fir...,matter fact horror night among first wide read...,0.036486,0.400538
