In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Sample user input
user_description = "I'm looking    for a mystery novel with a sSrng female lead."

# Preprocessing
# Lowercase and remove punctuation
cleaned_description = re.sub(r'[^\w\s]', '', user_description.lower())

# Tokenization and stopword removal can be handled by TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
vectorized_description = vectorizer.fit_transform([cleaned_description])

print(cleaned_description)

im looking    for a mystery novel with a ssrng female lead


In [2]:
import pandas as pd
df = pd.read_csv("books_summary.csv")
df

Unnamed: 0.1,Unnamed: 0,book_name,summaries,categories
0,0,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,science
1,1,Why Has Nobody Told Me This Before?,is a collection of a clinical psychologist’s ...,science
2,2,The Midnight Library,"tells the story of Nora, a depressed woman in...",science
3,3,Brave New World,presents a futuristic society engineered perf...,science
4,4,1984,is the story of a man questioning the system ...,science
...,...,...,...,...
5196,5240,Essentialism,"will show you a new, better way of looking at...",mindfulness
5197,5241,The Art Of Happiness,is the result of a psychiatrist interviewing ...,mindfulness
5198,5242,The Paradox Of Choice,shows you how today’s vast amount of choice m...,mindfulness
5199,5243,Stumbling On Happiness,examines the capacity of our brains to fill i...,mindfulness


In [3]:
df['categories'].unique()

array(['science', 'biography', 'politics', 'economics', 'environment',
       'relationships', 'happiness', 'money', 'productivity',
       'psychology', 'motivation', 'marketing', 'management', 'health',
       'business', 'creativity', 'education', 'fiction', 'communication',
       'religion', 'technology', 'work', 'mindfulness'], dtype=object)

In [4]:
print(df['summaries'][342])


 explains why the old positions of power aren’t as powerful as they used to be due to recent changes in society and technology and how this shift has put more influence in the hands of everyday citizens like you and what it might mean for the future of our governments and world.


In [5]:
df = df.drop('Unnamed: 0',axis = 1)

In [6]:
df.isnull().any()

book_name     False
summaries      True
categories    False
dtype: bool

In [7]:
df.isnull().sum()

book_name     0
summaries     7
categories    0
dtype: int64

In [8]:
df.dropna(inplace=True)

In [9]:
df.shape

(5194, 3)

In [10]:
df.duplicated().any()

True

In [11]:
duplicate = df[df.duplicated(keep=False)]
duplicate

Unnamed: 0,book_name,summaries,categories
0,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,science
1,Why Has Nobody Told Me This Before?,is a collection of a clinical psychologist’s ...,science
2,The Midnight Library,"tells the story of Nora, a depressed woman in...",science
3,Brave New World,presents a futuristic society engineered perf...,science
4,1984,is the story of a man questioning the system ...,science
...,...,...,...
4894,Brave New World,presents a futuristic society engineered perf...,mindfulness
4895,Stolen Focus,explains why our attention spans have been dw...,mindfulness
4896,The Daily Laws,"is a page-a-day, calendar-style book covering...",mindfulness
4897,Dopamine Nation,talks about the importance of living a balance...,mindfulness


In [12]:
(df['book_name'] == 'The Highly Sensitive Person').sum()

12

In [13]:
filtered_df = df[df['book_name'] == 'The Highly Sensitive Person']

filtered_df

Unnamed: 0,book_name,summaries,categories
0,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,science
10,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,science
566,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,relationships
576,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,relationships
841,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,happiness
851,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,happiness
1905,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,psychology
1915,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,psychology
4128,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,communication
4137,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,communication


In [14]:
filtered_df = df.groupby(['book_name', 'summaries'], as_index=False).agg({'categories': lambda x: ', '.join(sorted(set(x)))})

In [15]:
filtered_df

Unnamed: 0,book_name,summaries,categories
0,"Outer Order, Inner Calm",gives you advice to declutter your space and ...,"happiness, health, mindfulness, productivity, ..."
1,The Book,is a spiritual exploration of true human natur...,mindfulness
2,#GIRLBOSS,shows that even an unconventional life can le...,"business, creativity, motivation, work"
3,10 Days To Faster Reading,helps you bring your reading skills to the cu...,"education, productivity, psychology"
4,10% Happier,"gives skeptics an easy “in” to meditation, by...","happiness, mindfulness, psychology"
...,...,...,...
1225,Your Move: The Underdog’s Guide to Building Yo...,is Ramit Sethi’s no-BS guide to starting your...,"business, marketing, money, work"
1226,You’ll See It When You Believe It,"shows you how to discover your true, best sel...","motivation, productivity, psychology, work"
1227,You’re Not Listening,is a book that will improve your communicatio...,"business, communication, happiness, management..."
1228,Zero To One,is an inside look at Peter Thiel’s philosophy...,"business, management, marketing, motivation, p..."


In [16]:
(filtered_df['book_name'] == 'The Highly Sensitive Person').sum()

1

In [17]:
filtered_df[filtered_df['book_name'] == 'The Highly Sensitive Person']


Unnamed: 0,book_name,summaries,categories
915,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,"communication, happiness, psychology, relation..."


In [18]:
data = []
import csv
from tqdm import tqdm
with open("booksummaries.txt", 'r', encoding='utf-8') as f:
    reader = csv.reader(f, dialect='excel-tab')
    for row in tqdm(reader):
        data.append(row)

16559it [00:00, 67480.42it/s]


In [19]:
book_id = []
book_name = []
summary = []
genre = []

for i in tqdm(data):
    book_id.append(i[0])
    book_name.append(i[2])
    genre.append(i[5])
    summary.append(i[6])

books = pd.DataFrame({'book_id': book_id, 'book_name': book_name,
                       'genre': genre, 'summary': summary})
books

100%|███████████████████████████████████████████████████████████████████████| 16559/16559 [00:00<00:00, 2369536.35it/s]


Unnamed: 0,book_id,book_name,genre,summary
0,620,Animal Farm,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca..."
1,843,A Clockwork Orange,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,986,The Plague,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...
3,1756,An Enquiry Concerning Human Understanding,,The argument of the Enquiry proceeds by a ser...
4,2080,A Fire Upon the Deep,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...
...,...,...,...,...
16554,36934824,Under Wildwood,,"Prue McKeel, having rescued her brother from ..."
16555,37054020,Transfer of Power,"{""/m/01jfsb"": ""Thriller"", ""/m/02xlf"": ""Fiction""}",The reader first meets Rapp while he is doing...
16556,37122323,Decoded,"{""/m/0xdf"": ""Autobiography""}",The book follows very rough chronological ord...
16557,37132319,America Again: Re-becoming The Greatness We Ne...,,Colbert addresses topics including Wall Stree...


In [20]:
books.isnull().sum()

book_id      0
book_name    0
genre        0
summary      0
dtype: int64

In [21]:
import numpy as np

mask = books['genre'].apply(lambda x: x == '' or pd.isna(x) or not isinstance(x, str) or (isinstance(x, str) and not x.startswith('{')))

# Replace identified values with NaN
books.loc[mask, 'genre'] = np.nan

# Now check for null values again
null_values = books.isnull().sum()

print(null_values)

book_id         0
book_name       0
genre        3718
summary         0
dtype: int64


In [22]:
books.drop("book_id", axis = 1, inplace = True)

In [23]:
books

Unnamed: 0,book_name,genre,summary
0,Animal Farm,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,The Plague,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...
3,An Enquiry Concerning Human Understanding,,The argument of the Enquiry proceeds by a ser...
4,A Fire Upon the Deep,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...
...,...,...,...
16554,Under Wildwood,,"Prue McKeel, having rescued her brother from ..."
16555,Transfer of Power,"{""/m/01jfsb"": ""Thriller"", ""/m/02xlf"": ""Fiction""}",The reader first meets Rapp while he is doing...
16556,Decoded,"{""/m/0xdf"": ""Autobiography""}",The book follows very rough chronological ord...
16557,America Again: Re-becoming The Greatness We Ne...,,Colbert addresses topics including Wall Stree...


In [24]:
import json

In [25]:
def extract_genres(genre):
    if pd.isna(genre) or genre == '':
        return np.nan
    try:
        genre_dict = json.loads(genre)
        genre_values = list(genre_dict.values())
        return ', '.join(genre_values)
    except (json.JSONDecodeError, TypeError):
        return np.nan

In [26]:
books['genre'] = books['genre'].apply(extract_genres)


In [27]:
books

Unnamed: 0,book_name,genre,summary
0,Animal Farm,"Roman à clef, Satire, Children's literature, S...","Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,"Science Fiction, Novella, Speculative fiction,...","Alex, a teenager living in near-future Englan..."
2,The Plague,"Existentialism, Fiction, Absurdist fiction, Novel",The text of The Plague is divided into five p...
3,An Enquiry Concerning Human Understanding,,The argument of the Enquiry proceeds by a ser...
4,A Fire Upon the Deep,"Hard science fiction, Science Fiction, Specula...",The novel posits that space around the Milky ...
...,...,...,...
16554,Under Wildwood,,"Prue McKeel, having rescued her brother from ..."
16555,Transfer of Power,"Thriller, Fiction",The reader first meets Rapp while he is doing...
16556,Decoded,Autobiography,The book follows very rough chronological ord...
16557,America Again: Re-becoming The Greatness We Ne...,,Colbert addresses topics including Wall Stree...


In [28]:
filtered_df.rename(columns={'summaries': 'summary', 'categories': 'genre'}, inplace=True)


In [29]:
books

Unnamed: 0,book_name,genre,summary
0,Animal Farm,"Roman à clef, Satire, Children's literature, S...","Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,"Science Fiction, Novella, Speculative fiction,...","Alex, a teenager living in near-future Englan..."
2,The Plague,"Existentialism, Fiction, Absurdist fiction, Novel",The text of The Plague is divided into five p...
3,An Enquiry Concerning Human Understanding,,The argument of the Enquiry proceeds by a ser...
4,A Fire Upon the Deep,"Hard science fiction, Science Fiction, Specula...",The novel posits that space around the Milky ...
...,...,...,...
16554,Under Wildwood,,"Prue McKeel, having rescued her brother from ..."
16555,Transfer of Power,"Thriller, Fiction",The reader first meets Rapp while he is doing...
16556,Decoded,Autobiography,The book follows very rough chronological ord...
16557,America Again: Re-becoming The Greatness We Ne...,,Colbert addresses topics including Wall Stree...


In [30]:
filtered_df

Unnamed: 0,book_name,summary,genre
0,"Outer Order, Inner Calm",gives you advice to declutter your space and ...,"happiness, health, mindfulness, productivity, ..."
1,The Book,is a spiritual exploration of true human natur...,mindfulness
2,#GIRLBOSS,shows that even an unconventional life can le...,"business, creativity, motivation, work"
3,10 Days To Faster Reading,helps you bring your reading skills to the cu...,"education, productivity, psychology"
4,10% Happier,"gives skeptics an easy “in” to meditation, by...","happiness, mindfulness, psychology"
...,...,...,...
1225,Your Move: The Underdog’s Guide to Building Yo...,is Ramit Sethi’s no-BS guide to starting your...,"business, marketing, money, work"
1226,You’ll See It When You Believe It,"shows you how to discover your true, best sel...","motivation, productivity, psychology, work"
1227,You’re Not Listening,is a book that will improve your communicatio...,"business, communication, happiness, management..."
1228,Zero To One,is an inside look at Peter Thiel’s philosophy...,"business, management, marketing, motivation, p..."


In [31]:
combined_df = pd.concat([books, filtered_df])

In [32]:
df = combined_df

In [33]:
df['summary'] = df['summary'].str.lower()

In [35]:
df['genre'] = df['genre'].str.lower()

In [34]:
df

Unnamed: 0,book_name,genre,summary
0,Animal Farm,"Roman à clef, Satire, Children's literature, S...","old major, the old boar on the manor farm, ca..."
1,A Clockwork Orange,"Science Fiction, Novella, Speculative fiction,...","alex, a teenager living in near-future englan..."
2,The Plague,"Existentialism, Fiction, Absurdist fiction, Novel",the text of the plague is divided into five p...
3,An Enquiry Concerning Human Understanding,,the argument of the enquiry proceeds by a ser...
4,A Fire Upon the Deep,"Hard science fiction, Science Fiction, Specula...",the novel posits that space around the milky ...
...,...,...,...
1225,Your Move: The Underdog’s Guide to Building Yo...,"business, marketing, money, work",is ramit sethi’s no-bs guide to starting your...
1226,You’ll See It When You Believe It,"motivation, productivity, psychology, work","shows you how to discover your true, best sel..."
1227,You’re Not Listening,"business, communication, happiness, management...",is a book that will improve your communicatio...
1228,Zero To One,"business, management, marketing, motivation, p...",is an inside look at peter thiel’s philosophy...


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17789 entries, 0 to 1229
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   book_name  17789 non-null  object
 1   genre      14071 non-null  object
 2   summary    17789 non-null  object
dtypes: object(3)
memory usage: 555.9+ KB


In [36]:
import string
df['summary'] = df['summary'].apply(lambda x: ''.join([char for char in x if char not in string.punctuation]))

In [37]:
df

Unnamed: 0,book_name,genre,summary
0,Animal Farm,"roman à clef, satire, children's literature, s...",old major the old boar on the manor farm call...
1,A Clockwork Orange,"science fiction, novella, speculative fiction,...",alex a teenager living in nearfuture england ...
2,The Plague,"existentialism, fiction, absurdist fiction, novel",the text of the plague is divided into five p...
3,An Enquiry Concerning Human Understanding,,the argument of the enquiry proceeds by a ser...
4,A Fire Upon the Deep,"hard science fiction, science fiction, specula...",the novel posits that space around the milky ...
...,...,...,...
1225,Your Move: The Underdog’s Guide to Building Yo...,"business, marketing, money, work",is ramit sethi’s nobs guide to starting your ...
1226,You’ll See It When You Believe It,"motivation, productivity, psychology, work",shows you how to discover your true best self...
1227,You’re Not Listening,"business, communication, happiness, management...",is a book that will improve your communicatio...
1228,Zero To One,"business, management, marketing, motivation, p...",is an inside look at peter thiel’s philosophy...


In [38]:
df

Unnamed: 0,book_name,genre,summary
0,Animal Farm,"roman à clef, satire, children's literature, s...",old major the old boar on the manor farm call...
1,A Clockwork Orange,"science fiction, novella, speculative fiction,...",alex a teenager living in nearfuture england ...
2,The Plague,"existentialism, fiction, absurdist fiction, novel",the text of the plague is divided into five p...
3,An Enquiry Concerning Human Understanding,,the argument of the enquiry proceeds by a ser...
4,A Fire Upon the Deep,"hard science fiction, science fiction, specula...",the novel posits that space around the milky ...
...,...,...,...
1225,Your Move: The Underdog’s Guide to Building Yo...,"business, marketing, money, work",is ramit sethi’s nobs guide to starting your ...
1226,You’ll See It When You Believe It,"motivation, productivity, psychology, work",shows you how to discover your true best self...
1227,You’re Not Listening,"business, communication, happiness, management...",is a book that will improve your communicatio...
1228,Zero To One,"business, management, marketing, motivation, p...",is an inside look at peter thiel’s philosophy...


In [39]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
df['summary'] = df['summary'].apply(word_tokenize)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\devam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\devam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [40]:
df

Unnamed: 0,book_name,genre,summary
0,Animal Farm,"roman à clef, satire, children's literature, s...","[old, major, the, old, boar, on, the, manor, f..."
1,A Clockwork Orange,"science fiction, novella, speculative fiction,...","[alex, a, teenager, living, in, nearfuture, en..."
2,The Plague,"existentialism, fiction, absurdist fiction, novel","[the, text, of, the, plague, is, divided, into..."
3,An Enquiry Concerning Human Understanding,,"[the, argument, of, the, enquiry, proceeds, by..."
4,A Fire Upon the Deep,"hard science fiction, science fiction, specula...","[the, novel, posits, that, space, around, the,..."
...,...,...,...
1225,Your Move: The Underdog’s Guide to Building Yo...,"business, marketing, money, work","[is, ramit, sethi, ’, s, nobs, guide, to, star..."
1226,You’ll See It When You Believe It,"motivation, productivity, psychology, work","[shows, you, how, to, discover, your, true, be..."
1227,You’re Not Listening,"business, communication, happiness, management...","[is, a, book, that, will, improve, your, commu..."
1228,Zero To One,"business, management, marketing, motivation, p...","[is, an, inside, look, at, peter, thiel, ’, s,..."


In [41]:
df

Unnamed: 0,book_name,genre,summary
0,Animal Farm,"roman à clef, satire, children's literature, s...","[old, major, the, old, boar, on, the, manor, f..."
1,A Clockwork Orange,"science fiction, novella, speculative fiction,...","[alex, a, teenager, living, in, nearfuture, en..."
2,The Plague,"existentialism, fiction, absurdist fiction, novel","[the, text, of, the, plague, is, divided, into..."
3,An Enquiry Concerning Human Understanding,,"[the, argument, of, the, enquiry, proceeds, by..."
4,A Fire Upon the Deep,"hard science fiction, science fiction, specula...","[the, novel, posits, that, space, around, the,..."
...,...,...,...
1225,Your Move: The Underdog’s Guide to Building Yo...,"business, marketing, money, work","[is, ramit, sethi, ’, s, nobs, guide, to, star..."
1226,You’ll See It When You Believe It,"motivation, productivity, psychology, work","[shows, you, how, to, discover, your, true, be..."
1227,You’re Not Listening,"business, communication, happiness, management...","[is, a, book, that, will, improve, your, commu..."
1228,Zero To One,"business, management, marketing, motivation, p...","[is, an, inside, look, at, peter, thiel, ’, s,..."


In [42]:
from nltk.corpus import stopwords 

stop_words = set(stopwords.words('english'))
df['summary'] = df['summary'].apply(lambda x: [word for word in x if word not in stop_words])

In [43]:
df

Unnamed: 0,book_name,genre,summary
0,Animal Farm,"roman à clef, satire, children's literature, s...","[old, major, old, boar, manor, farm, calls, an..."
1,A Clockwork Orange,"science fiction, novella, speculative fiction,...","[alex, teenager, living, nearfuture, england, ..."
2,The Plague,"existentialism, fiction, absurdist fiction, novel","[text, plague, divided, five, parts, town, ora..."
3,An Enquiry Concerning Human Understanding,,"[argument, enquiry, proceeds, series, incremen..."
4,A Fire Upon the Deep,"hard science fiction, science fiction, specula...","[novel, posits, space, around, milky, way, div..."
...,...,...,...
1225,Your Move: The Underdog’s Guide to Building Yo...,"business, marketing, money, work","[ramit, sethi, ’, nobs, guide, starting, busin..."
1226,You’ll See It When You Believe It,"motivation, productivity, psychology, work","[shows, discover, true, best, self, revealing,..."
1227,You’re Not Listening,"business, communication, happiness, management...","[book, improve, communication, skills, reveali..."
1228,Zero To One,"business, management, marketing, motivation, p...","[inside, look, peter, thiel, ’, philosophy, st..."


In [44]:
df

Unnamed: 0,book_name,genre,summary
0,Animal Farm,"roman à clef, satire, children's literature, s...","[old, major, old, boar, manor, farm, calls, an..."
1,A Clockwork Orange,"science fiction, novella, speculative fiction,...","[alex, teenager, living, nearfuture, england, ..."
2,The Plague,"existentialism, fiction, absurdist fiction, novel","[text, plague, divided, five, parts, town, ora..."
3,An Enquiry Concerning Human Understanding,,"[argument, enquiry, proceeds, series, incremen..."
4,A Fire Upon the Deep,"hard science fiction, science fiction, specula...","[novel, posits, space, around, milky, way, div..."
...,...,...,...
1225,Your Move: The Underdog’s Guide to Building Yo...,"business, marketing, money, work","[ramit, sethi, ’, nobs, guide, starting, busin..."
1226,You’ll See It When You Believe It,"motivation, productivity, psychology, work","[shows, discover, true, best, self, revealing,..."
1227,You’re Not Listening,"business, communication, happiness, management...","[book, improve, communication, skills, reveali..."
1228,Zero To One,"business, management, marketing, motivation, p...","[inside, look, peter, thiel, ’, philosophy, st..."


In [45]:
from nltk.stem import WordNetLemmatizer

def lemmatize_list(word_list):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in word_list]
    return lemmatized_words

df['lemmatized_summary'] = df['summary'].apply(lemmatize_list)

In [46]:
df

Unnamed: 0,book_name,genre,summary,lemmatized_summary
0,Animal Farm,"roman à clef, satire, children's literature, s...","[old, major, old, boar, manor, farm, calls, an...","[old, major, old, boar, manor, farm, call, ani..."
1,A Clockwork Orange,"science fiction, novella, speculative fiction,...","[alex, teenager, living, nearfuture, england, ...","[alex, teenager, living, nearfuture, england, ..."
2,The Plague,"existentialism, fiction, absurdist fiction, novel","[text, plague, divided, five, parts, town, ora...","[text, plague, divided, five, part, town, oran..."
3,An Enquiry Concerning Human Understanding,,"[argument, enquiry, proceeds, series, incremen...","[argument, enquiry, proceeds, series, incremen..."
4,A Fire Upon the Deep,"hard science fiction, science fiction, specula...","[novel, posits, space, around, milky, way, div...","[novel, posit, space, around, milky, way, divi..."
...,...,...,...,...
1225,Your Move: The Underdog’s Guide to Building Yo...,"business, marketing, money, work","[ramit, sethi, ’, nobs, guide, starting, busin...","[ramit, sethi, ’, nob, guide, starting, busine..."
1226,You’ll See It When You Believe It,"motivation, productivity, psychology, work","[shows, discover, true, best, self, revealing,...","[show, discover, true, best, self, revealing, ..."
1227,You’re Not Listening,"business, communication, happiness, management...","[book, improve, communication, skills, reveali...","[book, improve, communication, skill, revealin..."
1228,Zero To One,"business, management, marketing, motivation, p...","[inside, look, peter, thiel, ’, philosophy, st...","[inside, look, peter, thiel, ’, philosophy, st..."


In [47]:
df['lemmatized_summary'] == df['summary']

0       False
1       False
2       False
3       False
4       False
        ...  
1225    False
1226    False
1227    False
1228    False
1229    False
Length: 17789, dtype: bool

In [48]:
df['summary'] = df['lemmatized_summary']

In [49]:
df.drop('lemmatized_summary',axis = 1)

Unnamed: 0,book_name,genre,summary
0,Animal Farm,"roman à clef, satire, children's literature, s...","[old, major, old, boar, manor, farm, call, ani..."
1,A Clockwork Orange,"science fiction, novella, speculative fiction,...","[alex, teenager, living, nearfuture, england, ..."
2,The Plague,"existentialism, fiction, absurdist fiction, novel","[text, plague, divided, five, part, town, oran..."
3,An Enquiry Concerning Human Understanding,,"[argument, enquiry, proceeds, series, incremen..."
4,A Fire Upon the Deep,"hard science fiction, science fiction, specula...","[novel, posit, space, around, milky, way, divi..."
...,...,...,...
1225,Your Move: The Underdog’s Guide to Building Yo...,"business, marketing, money, work","[ramit, sethi, ’, nob, guide, starting, busine..."
1226,You’ll See It When You Believe It,"motivation, productivity, psychology, work","[show, discover, true, best, self, revealing, ..."
1227,You’re Not Listening,"business, communication, happiness, management...","[book, improve, communication, skill, revealin..."
1228,Zero To One,"business, management, marketing, motivation, p...","[inside, look, peter, thiel, ’, philosophy, st..."


In [50]:
df['summary'] = df['summary'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
df
df.drop('lemmatized_summary',axis = 1)

Unnamed: 0,book_name,genre,summary
0,Animal Farm,"roman à clef, satire, children's literature, s...",old major old boar manor farm call animal farm...
1,A Clockwork Orange,"science fiction, novella, speculative fiction,...",alex teenager living nearfuture england lead g...
2,The Plague,"existentialism, fiction, absurdist fiction, novel",text plague divided five part town oran thousa...
3,An Enquiry Concerning Human Understanding,,argument enquiry proceeds series incremental s...
4,A Fire Upon the Deep,"hard science fiction, science fiction, specula...",novel posit space around milky way divided con...
...,...,...,...
1225,Your Move: The Underdog’s Guide to Building Yo...,"business, marketing, money, work",ramit sethi ’ nob guide starting business ’ he...
1226,You’ll See It When You Believe It,"motivation, productivity, psychology, work",show discover true best self revealing use pow...
1227,You’re Not Listening,"business, communication, happiness, management...",book improve communication skill revealing unc...
1228,Zero To One,"business, management, marketing, motivation, p...",inside look peter thiel ’ philosophy strategy ...


In [51]:
df.loc[1555:1666]

Unnamed: 0,book_name,genre,summary,lemmatized_summary
1555,"Akhenaten, Dweller in Truth","history, novel",way thebe father scribe amunhoben point ruin a...,"[way, thebe, father, scribe, amunhoben, point,..."
1556,Faerie Tale,"speculative fiction, fantasy",phil hastings family moved back hometown much ...,"[phil, hastings, family, moved, back, hometown..."
1557,Lirael,"science fiction, children's literature, fantas...",lirael see outcast within world clayr ravenbla...,"[lirael, see, outcast, within, world, clayr, r..."
1558,Abhorsen,"children's literature, fantasy, speculative fi...",main novel begin abhorsens house besieged dead...,"[main, novel, begin, abhorsens, house, besiege..."
1559,The Girl Who Owned a City,"science fiction, apocalyptic and post-apocalyp...",deadly virus swept world killing everyone age ...,"[deadly, virus, swept, world, killing, everyon..."
...,...,...,...,...
1662,Les amitiés particulières,,plot revolves around george de sarre fourteeny...,"[plot, revolves, around, george, de, sarre, fo..."
1663,The Worm Ouroboros,"science fiction, speculative fiction, fantasy,...",framing story first two chapter describes worl...,"[framing, story, first, two, chapter, describe..."
1664,Rising Sun,"crime fiction, mystery, science fiction, ficti...",nakamoto corporation celebrating grand opening...,"[nakamoto, corporation, celebrating, grand, op..."
1665,"Norby, the Mixed-Up Robot","science fiction, speculative fiction",book start jeff need teaching robot buy norby ...,"[book, start, jeff, need, teaching, robot, buy..."


In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Convert summaries to vectors
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['summary'])

# Step 2: Function to recommend books based on user description
def recommend_books(user_description, df, X, vectorizer, top_n=5):
    # Transform user input
    user_vec = vectorizer.transform([user_description])
    
    # Calculate cosine similarity
    similarities = cosine_similarity(user_vec, X)
    
    # Find top N matches
    top_n_indices = similarities.argsort()[0][-top_n:][::-1]
    
    # Return the top N recommended books
    return df.iloc[top_n_indices][['book_name', 'genre', 'summary']]

# Example usage
user_description = "a book based on a college campus"
recommended_books = recommend_books(user_description, df, X, vectorizer)
recommended_books 

ModuleNotFoundError: No module named 'sklearn'