In [38]:
import numpy as np
import pandas as pd

In [39]:
books = pd.read_csv("dataset/categorical/books_clean.csv")
books.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0


In [40]:
irrelevant_cols = [0, 6]
books.drop(books.columns[irrelevant_cols], axis=1, inplace=True)
books.head()

Unnamed: 0,isbn10,title,subtitle,authors,categories,description,published_year,average_rating,num_pages,ratings_count
0,2005883,Gilead,,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,6163831,The One Tree,,Stephen R. Donaldson,American fiction,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,6178731,Rage of angels,,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,6280897,The Four Loves,,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0


In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents="unicode",
    analyzer="word",
    token_pattern=r"\w{1,}",
    ngram_range=(1, 3),
    stop_words="english",
)

books['description'] = books['description'].fillna('')

In [98]:
content = books['title'] + ": " + books['description']

tfv_matrix = tfv.fit_transform(content)

In [99]:
tfv_matrix.shape

(6810, 19736)

In [100]:
from sklearn.metrics.pairwise import sigmoid_kernel

sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

In [101]:
sig[0]

array([0.76161543, 0.76159434, 0.7615944 , ..., 0.76159416, 0.76159471,
       0.76159416])

In [102]:
indices = pd.Series(books.index, index=books['title']).drop_duplicates()
indices

title
Gilead                          0
Spider's Web                    1
The One Tree                    2
Rage of angels                  3
The Four Loves                  4
                             ... 
I Am that                    6805
Secrets Of The Heart         6806
Fahrenheit 451               6807
The Berlin Phenomenology     6808
'I'm Telling You Stories'    6809
Length: 6810, dtype: int64

In [103]:
indices['The Fellowship of the Ring']


title
The Fellowship of the Ring      65
The Fellowship of the Ring    1779
The Fellowship of the Ring    1780
The Fellowship of the Ring    3745
The Fellowship of the Ring    6414
dtype: int64

In [104]:
def give_rec(title, sig=sig):
    idx = indices[title]

    if isinstance(idx, pd.Series) or isinstance(idx, np.ndarray):
        idx = idx.iloc[0]

    sig_scores = list(enumerate(sig[idx]))

        
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
    
    sig_scores = sig_scores[1:11]
    
    book_indices = [i[0] for i in sig_scores]
    
    return books.iloc[book_indices]['title']

give_rec("Dracula")


5865               A Dracula Handbook
5892       Happy Hour at Casa Dracula
4933            Bram Stoker's Dracula
2501             In Search of Dracula
1087    Frankenstein Makes a Sandwich
1635            The Fantastic Vampire
2447                          Dracula
3184    Best Ghost and Horror Stories
4443                          Dracula
4850                       Bad Dreams
Name: title, dtype: object

In [107]:
give_rec("Animal Farm and 1984")

3099                                   1984
679                             The Assault
2660    The Complete Works of George Orwell
1156                      The Orwell Reader
913                         Orwell in Spain
1680      Essentials of American Government
516                                 Hexwood
1514                               Dominion
2061                                 Essays
684                         Of Mice and Men
Name: title, dtype: object

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


def recommend_from_description(user_description, tfv, tfv_matrix, books_df, top_n=10):
    user_vector = tfv.transform([user_description])

    similarities = cosine_similarity(user_vector, tfv_matrix).flatten()

    top_indices = similarities.argsort()[-top_n:][::-1]

    return books_df.iloc[top_indices][["title"]]


recommend_from_description(
    "Boring life", tfv=tfv, tfv_matrix=tfv_matrix, books_df=books
)

Unnamed: 0,title
4751,In the Land of the Lawn Weenies
6737,Theory of Fun for Game Design
4408,March Upcountry
3472,Stories
597,Lady of Quality
4484,The Judy Moody Mood Journal
5933,Bleach
6105,The Life You Were Born to Live
215,Love Overboard
1643,Twilight
