# Content Based Filtering Recommendation using TF-IDF

### Author: Xuezhang (Brickhoff) Wu

##### 1. Load the dataset

In [3]:
import pandas as pd
from pandas import Series, DataFrame
import json
import numpy as np

In [5]:
# Read the poetry dump as JSON Lines (lines=True: one JSON record per line).
# Point the path below at your own copy of goodreads_books_poetry.json.
poetryFile = pd.read_json(
    'C:/Users/90312/Desktop/670/goodreads_books_poetry.json',
    lines=True,
)

In [7]:
# Peek at the first rows to confirm the load worked and to see which columns exist.
poetryFile.head()

Unnamed: 0,asin,authors,average_rating,book_id,country_code,description,edition_information,format,image_url,is_ebook,...,publication_day,publication_month,publication_year,publisher,ratings_count,series,similar_books,text_reviews_count,url,work_id
0,,"[{'author_id': '15585', 'role': ''}]",3.83,16037549,US,Number 30 in a series of literary pamphlets pu...,,Paperback,https://images.gr-assets.com/books/1348176637m...,False,...,1,11,1887,"Houghton, Mifflin and Company",3,[],[],1,https://www.goodreads.com/book/show/16037549-v...,5212748
1,,"[{'author_id': '16073', 'role': ''}, {'author_...",3.83,22466716,US,Fairy Tales gathers the unconventional verse d...,,Paperback,https://images.gr-assets.com/books/1404958407m...,False,...,20,4,2015,New Directions,37,[],[],2,https://www.goodreads.com/book/show/22466716-f...,41905435
2,,"[{'author_id': '18540', 'role': ''}, {'author_...",4.38,926662,US,Three poems describe the nighttime adventures ...,,Paperback,https://s.gr-assets.com/assets/nophoto/book/11...,False,...,12,7,2008,Farrar Straus Giroux,45,[],[],7,https://www.goodreads.com/book/show/926662.Gro...,911665
3,,"[{'author_id': '18540', 'role': ''}]",3.71,926667,US,A modern verse play about the search for meani...,,Paperback,https://images.gr-assets.com/books/1382939971m...,False,...,18,3,1964,Mariner Books,115,[],"[1230072, 315167, 676169, 18522, 124335, 88263...",12,https://www.goodreads.com/book/show/926667.The...,995066
4,,"[{'author_id': '14308759', 'role': ''}]",5.0,29065952,US,Louder Than Everything You Love is about trans...,First,Paperback,https://images.gr-assets.com/books/1455198396m...,False,...,23,12,2015,ELJ Publications,9,[],"[25869488, 23630890, 25448131, 25464039, 42166...",4,https://www.goodreads.com/book/show/29065952-l...,49294781


##### 2. Preprocess the descriptions (strip HTML markup and punctuation) to improve the quality of the results

In [12]:
import re

# Raw strings keep regex escapes such as \s, \[ and \w intact. In a normal string
# literal these are invalid escape sequences, which current Python versions warn about.
reg = re.compile(r'<[^>]*>')  # any tag-like span; not used by the cleaning below
re_cdata = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)  # match CDATA
re_script = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)  # Script
re_style = re.compile(r'<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)  # style
re_br = re.compile(r'<br\s*?/?>')  # <br>
re_h = re.compile(r'</?\w+[^>]*>')  # HTML tag
re_comment = re.compile(r'<!--[^>]*-->')  # HTML comment
re_symbol = re.compile(r'[\s+\.\!\/_,$%^*(+\"\-)]')  # basic punctuation and symbols
blank_line = re.compile(r'\n+')


def clean_description(text):
    """Strip markup and basic punctuation from one book description.

    Removes CDATA sections, <script>/<style> elements, <br> tags, remaining HTML
    tags and HTML comments (in that order), collapses runs of newlines, then
    replaces every whitespace character and basic punctuation symbol with a
    single space.

    Parameters
    ----------
    text : the raw description. Anything that is not a ``str`` (e.g. None or
        NaN for a missing description) is treated as an empty description.

    Returns
    -------
    str
        The cleaned description.
    """
    if not isinstance(text, str):
        return ''
    for pattern in (re_cdata, re_script, re_style, re_br, re_h, re_comment):
        text = pattern.sub('', text)
    text = blank_line.sub('\n', text)  # remove extra blank lines
    return re_symbol.sub(' ', text)  # remove other kinds of symbols


# One cleaned string per row, in the same order as poetryFile.
desc = [clean_description(raw) for raw in poetryFile['description']]

In [13]:
# Spot-check the first entry of desc.
print(desc[0])

Number 30 in a series of literary pamphlets published monthly and available at the price of 15 cents per copy  or a yearly subscription  19 numbers  for  1 25


In [14]:
# Overwrite the 'description' column with the values in desc. This mutates
# poetryFile in place, so the previous column contents are no longer available
# from this frame afterwards.
poetryFile['description'] = desc

In [15]:
# Show a bounded preview rather than the whole frame.
poetryFile.head(10)

Unnamed: 0,asin,authors,average_rating,book_id,country_code,description,edition_information,format,image_url,is_ebook,...,publication_day,publication_month,publication_year,publisher,ratings_count,series,similar_books,text_reviews_count,url,work_id
0,,"[{'author_id': '15585', 'role': ''}]",3.83,16037549,US,Number 30 in a series of literary pamphlets pu...,,Paperback,https://images.gr-assets.com/books/1348176637m...,false,...,1,11,1887,"Houghton, Mifflin and Company",3,[],[],1,https://www.goodreads.com/book/show/16037549-v...,5212748
1,,"[{'author_id': '16073', 'role': ''}, {'author_...",3.83,22466716,US,Fairy Tales gathers the unconventional verse d...,,Paperback,https://images.gr-assets.com/books/1404958407m...,false,...,20,4,2015,New Directions,37,[],[],2,https://www.goodreads.com/book/show/22466716-f...,41905435
2,,"[{'author_id': '18540', 'role': ''}, {'author_...",4.38,926662,US,Three poems describe the nighttime adventures ...,,Paperback,https://s.gr-assets.com/assets/nophoto/book/11...,false,...,12,7,2008,Farrar Straus Giroux,45,[],[],7,https://www.goodreads.com/book/show/926662.Gro...,911665
3,,"[{'author_id': '18540', 'role': ''}]",3.71,926667,US,A modern verse play about the search for meani...,,Paperback,https://images.gr-assets.com/books/1382939971m...,false,...,18,3,1964,Mariner Books,115,[],"[1230072, 315167, 676169, 18522, 124335, 88263...",12,https://www.goodreads.com/book/show/926667.The...,995066
4,,"[{'author_id': '14308759', 'role': ''}]",5.00,29065952,US,Louder Than Everything You Love is about trans...,First,Paperback,https://images.gr-assets.com/books/1455198396m...,false,...,23,12,2015,ELJ Publications,9,[],"[25869488, 23630890, 25448131, 25464039, 42166...",4,https://www.goodreads.com/book/show/29065952-l...,49294781
5,,"[{'author_id': '11563', 'role': ''}, {'author_...",4.09,35235890,US,Cunku gocup gideriz guzellik ise kalicid...,,Paperback,https://images.gr-assets.com/books/1495907022m...,false,...,26,5,2017,Everest Yayinlari,4,[],"[15796750, 1504664, 138241, 254039, 92852, 419...",1,https://www.goodreads.com/book/show/35235890-s...,1306728
6,,"[{'author_id': '2988946', 'role': ''}]",4.75,15861988,US,Into Temptation is the debut collection of poe...,,Paperback,https://images.gr-assets.com/books/1346225281m...,false,...,,,2009,Tollington Press,8,[],[],3,https://www.goodreads.com/book/show/15861988-i...,21611807
7,B003XIIVBG,"[{'author_id': '25492', 'role': ''}]",3.83,9495428,US,,,,https://s.gr-assets.com/assets/nophoto/book/11...,true,...,,,,,4,[],"[2078239, 178478, 709979, 824499, 4417990, 387...",1,https://www.goodreads.com/book/show/9495428-da...,2360399
8,,"[{'author_id': '619932', 'role': ''}]",3.47,346381,US,,,Paperback,https://s.gr-assets.com/assets/nophoto/book/11...,false,...,,,2005,frzn,405,[],"[265786, 163339, 650197, 383428, 346507, 81605...",16,https://www.goodreads.com/book/show/346381._,336680
9,,"[{'author_id': '516921', 'role': ''}]",4.25,598195,US,First published in 1987 Borderlands has bec...,,Paperback,https://s.gr-assets.com/assets/nophoto/book/11...,false,...,12,12,1987,Aunt Lute Books,98,[],"[143367, 342834, 61442, 476775, 379013, 568869...",8,https://www.goodreads.com/book/show/598195.Bor...,45046


##### 3. Calculate TF-IDF scores and Cosine Similarity Matrix.

In [52]:
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over unigrams and bigrams, with English stop words removed.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
# Vectorise poetryFile's own description column so that row k of the matrix is
# row k of poetryFile; the recommender indexes the similarity matrix by that
# row position.
tfidf_matrix = tf.fit_transform(poetryFile['description'])
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel is the cosine similarity.
# NOTE(review): the result is a dense n x n array, so memory grows quadratically
# with the number of books.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [53]:
# Second name for the same matrix object (no copy is made).
similarity = cosine_sim

##### 4. Construct the recommendation system

In [54]:
# smd: the book table with a fresh 0..n-1 row index (the old index is kept as a column).
smd = poetryFile.reset_index()
# indices: lookup from book_id (the Series index) to row position in smd (the values).
indices = pd.Series(data=smd.index, index=smd['book_id'])

######  -- Load the poetryTitle file, which maps each book ID to its title. The poetryTitle file is generated by poetryTitleGeneration.ipynb.

In [19]:
# You need to change the file directory to your own directory where you generate the poetryTitle file.
# The encoding is given explicitly so decoding does not depend on the platform's
# default (JSON text is conventionally UTF-8).
# NOTE: the name `dict` shadows the built-in dict type for the rest of the
# notebook; it is kept because other cells look titles up through this name.
with open('C:/Users/90312/Desktop/670/poetryTitle.json', 'r', encoding='utf-8') as f:
    dict = json.load(f)

###### -- Recommendation system

In [60]:
def improved_recommendations(bookId, top_n=50):
    """Return the books whose descriptions are most similar to ``bookId``.

    Relies on these notebook-level names:
    ``indices`` (book_id -> row position in ``smd``), ``similarity`` (pairwise
    similarity matrix indexed by row position), ``smd`` (the book table) and
    ``dict`` (str(book_id) -> title).

    Parameters
    ----------
    bookId : the book_id of the query book; must be a label of ``indices``.
    top_n : int, default 50
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns isbn, book_id, average_rating, title, description, similarity,
        ordered by decreasing similarity. The query book itself and books
        whose similarity is >= 0.999999999 are excluded.

    Raises
    ------
    KeyError
        If ``bookId`` is not in ``indices`` or a recommended book has no entry
        in the title mapping.
    """
    idx = indices[bookId]
    ranked = sorted(enumerate(similarity[idx]), key=lambda pair: pair[1], reverse=True)

    # We remove the books which have extremely high similarity (near-identical
    # descriptions). The query book is excluded by position as well: a book
    # whose similarity to itself is below the threshold would otherwise be
    # able to show up in its own recommendations.
    near_duplicate = 0.999999999
    kept = [
        (pos, score)
        for pos, score in ranked
        if pos != idx and score < near_duplicate
    ][:top_n]

    book_indices = [pos for pos, _ in kept]
    scores = [score for _, score in kept]

    picked = smd.iloc[book_indices]
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of smd (avoids SettingWithCopyWarning).
    books = picked[['isbn', 'book_id', 'average_rating']].copy()
    # Titles are keyed by the book id as a string.
    books['title'] = [dict[str(book_id)] for book_id in books['book_id']]
    books['description'] = picked['description']
    books['similarity'] = scores
    # If you want to use this TF-IDF recommendation system alone, you can add the following line in order to
    # make the recommendation more practical
    #books = books.head(10).sort_values(by='average_rating',ascending=False)
    return books

##### 5. Example

In [32]:
# Select the target book's description with a boolean mask instead of looping
# over every row in Python. .iloc[-1] takes the last matching row if the id
# occurs more than once, and raises IndexError if the id is not present.
targetDesc = poetryFile.loc[poetryFile['book_id'] == 1293847, 'description'].iloc[-1]

In [34]:
# Show the target book's title (looked up by its id as a string key) and its description.
print('The target book is: ' +dict['1293847'])
print('The description is: ' + targetDesc)

The target book is: Season Songs
The description is: A collection of twenty eight poems grouped to represent the four seasons 


In [61]:
# Recommendations for book_id 1293847; the bare expression renders the returned DataFrame.
improved_recommendations(1293847)

Unnamed: 0,isbn,book_id,average_rating,title,description,similarity
27057,,7075534,4.38,The Collected Poems of Sara Teasdale,collection poems,0.262049
9451,0711223580,135221,4.5,The Garden,From the author of The Land this poem is a mu...,0.233685
18867,0385062494,211513,4.22,Favorite Poems Old and New,This is a collection of over seven hundred cla...,0.180776
27738,,32057973,5.0,A Million-Dollar Bill,This volume is a collection of forty six poems,0.137014
22354,000712032X,225833,4.14,Rumi: Hidden Music,A breathtaking never before translated collect...,0.132202
29661,0198312407,1233827,3.87,The New Dragon Book of Verse,This anthology is reissued with a new cover an...,0.128023
707,,11801718,4.39,Kuchh aur Nazmein,A collection of poems by Gulzar,0.103561
29477,,25187534,4.39,Kuchh aur Nazmein,A collection of poems by Gulzar,0.103561
23325,0876141432,853867,3.81,Swing Around the Sun,A collection of poems that celebrates the seas...,0.100505
6824,,3245678,4.48,That Tiny Insane Voluptuousness,Poems,0.09878


###### -- Print out the descriptions of the recommended books

In [62]:
# Keep the recommendation table for book_id 1293847 so its rows can be inspected below.
res = improved_recommendations(1293847)

In [63]:
# Descriptions of the recommended books, in the same order as the rows of res.
des = list(res['description'])

In [71]:
# Number the descriptions from 1. enumerate replaces a manual counter named
# `id`, which would rebind the built-in id() for the rest of the notebook.
print('Description for recommended book')
for rank, d in enumerate(des, start=1):
    print(str(rank) + ": " + d)

Description for recommended book
1: collection poems
2: From the author of The Land  this poem is a much more personal and symbolic offering  Set against the backdrop of war  the seasons in the garden represent the seasons of life 
3: This is a collection of over seven hundred classic and modern poems grouped by children's interests  such as pets  playtime  family  nature  and others 
4: This volume is a collection of forty six poems 
5: A breathtaking never before translated collection of poems by Rumi  one of the world's most mystical teachers  These beautiful  contemporary new translations combined with gorgeous full color images speak directly to us now  The poems are grouped thematically  and explore the intricacies of love  longing  and the quest for truth and joy with mystical splendor 
6: This anthology is reissued with a new cover and should continue to appeal to secondary students  The poems are grouped thematically in eight sections: childhood  creatures  landscapes  seascap