In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Step 1: Importing Data

### Import poetry book and review JSON files

In [2]:
# Both inputs are read with lines=True, i.e. as JSON Lines (one record per line).
books = pd.read_json(
    "poetry_data/goodreads_books_poetry.json",
    lines=True,
)
reviews = pd.read_json(
    "poetry_data/goodreads_reviews_poetry.json",
    lines=True,
)

### Displaying first 5 books  (show some sample data 😊)

In [3]:
# Preview the first five rows of the books frame as a quick sanity check of the load.
books.head(n=5)

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,,1,[],US,eng,"[{'count': '8', 'name': 'to-read'}, {'count': ...",,False,3.83,,...,11,,1887,https://www.goodreads.com/book/show/16037549-v...,https://images.gr-assets.com/books/1348176637m...,16037549,3,5212748,Vision of Sir Launfal and Other Poems,Vision of Sir Launfal and Other Poems
1,811223981.0,2,[],US,,"[{'count': '100', 'name': 'to-read'}, {'count'...",,False,3.83,B00U2WY9U8,...,4,,2015,https://www.goodreads.com/book/show/22466716-f...,https://images.gr-assets.com/books/1404958407m...,22466716,37,41905435,Fairy Tales: Dramolettes,Fairy Tales: Dramolettes
2,374428115.0,7,[],US,,"[{'count': '32', 'name': 'to-read'}, {'count':...",,False,4.38,,...,7,,2008,https://www.goodreads.com/book/show/926662.Gro...,https://s.gr-assets.com/assets/nophoto/book/11...,926662,45,911665,Growltiger's Last Stand and Other Poems,Growltiger's Last Stand and Other Poems
3,156182890.0,12,[],US,,"[{'count': '554', 'name': 'to-read'}, {'count'...",,False,3.71,B00IWTRB1W,...,3,,1964,https://www.goodreads.com/book/show/926667.The...,https://images.gr-assets.com/books/1382939971m...,926667,115,995066,The Cocktail Party,The Cocktail Party
4,1942004192.0,4,[],US,eng,"[{'count': '228', 'name': 'to-read'}, {'count'...",,False,5.0,,...,12,First,2015,https://www.goodreads.com/book/show/29065952-l...,https://images.gr-assets.com/books/1455198396m...,29065952,9,49294781,Louder Than Everything You Love,Louder Than Everything You Love


### Displaying first 5 reviews (show some sample data 😊)

In [4]:
# Preview the first five rows of the reviews frame as a quick sanity check of the load.
reviews.head(n=5)

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,3ca7375dba942a760e53b726c472a7dd,402128,28423ff309bc896c071a8d9df4a10e8a,5,I have three younger siblings and we grew up w...,Tue Jun 12 08:59:04 -0700 2012,Fri Jun 15 11:41:12 -0700 2012,,,0,0
1,0ef32090550901ead25cb0ea21c4d36b,92270,2db1180992e2b0b1631a3ac5644bde84,5,This is my favorite collection of poetry.,Mon Apr 14 18:42:40 -0700 2014,Mon Apr 14 18:43:05 -0700 2014,Wed Jan 01 00:00:00 -0800 1997,,0,0
2,0ef32090550901ead25cb0ea21c4d36b,908708,bca57fa40e92c9261b00b03dbebd96fe,4,"He's so disturbing. So very, very disturbing.",Tue Apr 22 13:58:10 -0700 2008,Tue Apr 22 13:58:33 -0700 2008,,,0,0
3,d37b46b2190ed7c518259f29b47a9b36,253264,cb1ebc02d8b2aff15735d513877463ce,5,I just reread this play for a class I am takin...,Wed Sep 27 19:08:08 -0700 2017,Sat Sep 30 06:39:45 -0700 2017,Wed Sep 27 00:00:00 -0700 2017,Tue Sep 26 00:00:00 -0700 2017,1,0
4,af157d0205b8a901dee6d4a2aed7e6ad,70885,8dca128b8e869048a7442c18659dbece,5,"Cuanto mas leo, mas me gusta. Su poesia es env...",Thu Jun 18 20:00:03 -0700 2015,Thu Jun 18 20:01:29 -0700 2015,Thu Jun 18 00:00:00 -0700 2015,Tue Jun 16 00:00:00 -0700 2015,0,0


### Total number of poetry books in the dataset

In [5]:
# Number of rows in the books frame.
books.shape[0]

36514

### Total number of reviews in the dataset

In [6]:
# Number of rows in the reviews frame.
reviews.shape[0]

154555

### Merge the book and review datasets on `book_id`

In [7]:
# Attach book metadata to every review by joining on the shared book_id key.
# how="inner" is the pandas default, spelled out so the join type is explicit:
# only reviews whose book_id also appears in `books` are kept.
# validate="many_to_one" makes pandas raise MergeError if book_id is duplicated
# in `books`; without it, such duplicates would silently multiply review rows.
books_and_reviews = pd.merge(
    reviews,
    books,
    on="book_id",
    how="inner",
    validate="many_to_one",
)

### Displaying first 5 rows of data

In [8]:
# Preview the first five rows of the merged reviews-plus-books frame.
books_and_reviews.head(n=5)

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,...,isbn13,publication_month,edition_information,publication_year,url,image_url,ratings_count,work_id,title,title_without_series
0,3ca7375dba942a760e53b726c472a7dd,402128,28423ff309bc896c071a8d9df4a10e8a,5,I have three younger siblings and we grew up w...,Tue Jun 12 08:59:04 -0700 2012,Fri Jun 15 11:41:12 -0700 2012,,,0,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats
1,408ad43536dd5340cf6f955fc44f8014,402128,b4499b5b99b4462b2ebdb883561cf6b3,3,Although T.S. Eliot displays his complete mast...,Sat Dec 28 07:30:50 -0800 2013,Sat Dec 28 07:35:21 -0800 2013,Fri Dec 27 00:00:00 -0800 2013,,0,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats
2,a93cf495a6ec60a24ed11c1ae6f45aa9,402128,a87527ad1fe429dc41cc332723fb91de,3,"Eh, as an avid cat lover I was a little disapp...",Thu Jun 21 18:01:08 -0700 2012,Wed Jun 12 14:32:42 -0700 2013,Wed Jun 12 14:32:42 -0700 2013,Wed Jun 12 00:00:00 -0700 2013,0,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats
3,8d742d4c7a925de61e1574884bd2985a,402128,5e77a73030136445dcc359de17bd3e6a,5,This one was given to me for my graduation fro...,Sun May 18 21:10:26 -0700 2008,Sun May 18 21:15:35 -0700 2008,,,0,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats
4,d4f1cec123612b1bfc04752fa0c190ab,402128,8521d4c827321716b81d1d36bf9a6f6e,5,I love cats and T.S Elliot. Why I have never r...,Sat Jul 16 11:33:49 -0700 2016,Thu Aug 18 20:26:59 -0700 2016,Thu Aug 18 00:00:00 -0700 2016,Tue Aug 16 00:00:00 -0700 2016,3,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats


### Total Number of Rows in Dataframe

In [9]:
# Number of rows in the merged frame.
books_and_reviews.shape[0]

154555

## Step 2: Clean the data

Inspect the language codes present in the dataset. Note that one of them is an empty string.

In [10]:
# List the distinct values of the language_code column.
books_and_reviews["language_code"].unique()

array(['en-US', 'eng', '', 'ara', 'fin', 'ind', 'swe', 'en-GB', 'en-CA',
       'spa', 'msa', 'por', 'nl', 'mul', 'fre', 'ger', 'cze', 'per',
       'fil', 'srp', 'ita', 'bul', 'tur', 'lit', 'rum', 'hin', 'pol',
       'ben', 'nor', 'nno', 'nob', 'vie', 'rus', 'ukr', 'est', 'scr',
       'tlh', 'nep', 'enm', 'dan', 'lav', 'slo', 'gre', 'urd', 'afr',
       'mon', 'arw', 'heb', 'lat', 'sco', 'grc', 'dum', 'hye', '--',
       'slv', 'kat', 'tel', 'pes', 'tam', 'mkd', 'peo', 'sqi', 'hun',
       'jpn', 'tha', 'ota', 'glg', 'ang', 'sin', 'aus', 'bel', 'snd',
       'tgl', 'ira', 'mal', 'cat', 'isl', 'aze', 'amh', 'zho', 'dgr',
       'pan', 'fro', 'bos', 'gmh', 'kur', 'san', 'frm', 'mar', 'fao',
       'dut', 'gla', 'kor'], dtype=object)

We will only look at English books for the purpose of this assignment. The corresponding language codes are `en-US`, `eng`, `en-GB`, and `en-CA`.

In [11]:
# Keep only rows whose language_code is one of the four codes this notebook
# treats as English; rows with an empty or any other code are dropped.
ENGLISH_LANGUAGE_CODES = ['en-US', 'eng', 'en-GB', 'en-CA']

# .copy() gives the filtered frame its own data, so adding or modifying columns
# in later cells does not trigger pandas' SettingWithCopyWarning.
books_and_reviews = books_and_reviews.loc[
    books_and_reviews['language_code'].isin(ENGLISH_LANGUAGE_CODES)
].copy()

### New number of rows after removing books not in English

In [12]:
# Number of rows remaining in books_and_reviews at this point.
books_and_reviews.shape[0]

66394

## Step 3: Initialize module

### “Gets the system started based only on a very limited text description, or very few examples, from the user.”

## Step 4: Decision module

### “Given a text document and a profile description of the user, decide whether the document should be delivered or not”

In [58]:
# TODO(arely): scratch cell, to be deleted later.
# Lists the column labels available on the merged frame.
books_and_reviews.columns

Index(['user_id', 'book_id', 'review_id', 'rating', 'review_text',
       'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
       'n_comments', 'isbn', 'text_reviews_count', 'series', 'country_code',
       'language_code', 'popular_shelves', 'asin', 'is_ebook',
       'average_rating', 'kindle_asin', 'similar_books', 'description',
       'format', 'link', 'authors', 'publisher', 'num_pages',
       'publication_day', 'isbn13', 'publication_month', 'edition_information',
       'publication_year', 'url', 'image_url', 'ratings_count', 'work_id',
       'title', 'title_without_series'],
      dtype='object')

In [60]:
# TODO(arely): kept as a sample to test on later.
# Select every row whose title is exactly "Milk and Honey".
is_milk_and_honey = books_and_reviews["title"] == "Milk and Honey"
books_and_reviews[is_milk_and_honey]

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,...,isbn13,publication_month,edition_information,publication_year,url,image_url,ratings_count,work_id,title,title_without_series
9470,20fd7b31a6445c3959594b61d3659274,23513349,9e632a8de83863ad255886b0e493a51f,3,shitty poetry \n for people with \n no sense o...,Sat Jul 22 11:05:24 -0700 2017,Sat Jul 22 11:17:14 -0700 2017,Sat Jul 22 13:06:01 -0700 2017,Sat Jul 22 11:05:59 -0700 2017,2,...,9781502784278,11,,2014,https://www.goodreads.com/book/show/23513349-m...,https://images.gr-assets.com/books/1491595510m...,92450,43116473,Milk and Honey,Milk and Honey
9471,8b53476f67d6863bb422ee5a2b9403ab,23513349,fde700923731037e58f323aafe34d454,2,I was disappointed. I had heard so many positi...,Tue Feb 14 13:26:51 -0800 2017,Tue Feb 14 18:19:15 -0800 2017,Tue Feb 14 00:00:00 -0800 2017,Tue Feb 14 00:00:00 -0800 2017,0,...,9781502784278,11,,2014,https://www.goodreads.com/book/show/23513349-m...,https://images.gr-assets.com/books/1491595510m...,92450,43116473,Milk and Honey,Milk and Honey
9472,47e063740482b4217e3a84c0f3c06d08,23513349,1804a934cba532144f82349b649cd08a,4,Real Rating: 3.5 \n It was touch and go for me...,Sun Apr 03 23:42:57 -0700 2016,Mon Dec 19 18:01:41 -0800 2016,Mon Dec 19 00:00:00 -0800 2016,Mon Dec 19 00:00:00 -0800 2016,2,...,9781502784278,11,,2014,https://www.goodreads.com/book/show/23513349-m...,https://images.gr-assets.com/books/1491595510m...,92450,43116473,Milk and Honey,Milk and Honey
9473,c5a480631b5ed8072dcfaa0f8d2534da,23513349,ead7c67fde89da5e612084c517fd069f,4,"Beautiful poems that were simple, honest and c...",Sun Jun 04 12:39:58 -0700 2017,Sun Jun 04 14:37:50 -0700 2017,,Sun Jun 04 00:00:00 -0700 2017,3,...,9781502784278,11,,2014,https://www.goodreads.com/book/show/23513349-m...,https://images.gr-assets.com/books/1491595510m...,92450,43116473,Milk and Honey,Milk and Honey
9474,70fd7b91953f86c2c27a15da63efb130,23513349,eb1b7be5b83a4c8a25d4ff234af5ede8,4,"Honestly, I feel dissappointed while reading t...",Mon Jan 30 01:52:55 -0800 2017,Sat Apr 01 04:55:13 -0700 2017,Sat Apr 01 04:55:13 -0700 2017,Wed Mar 15 00:00:00 -0700 2017,0,...,9781502784278,11,,2014,https://www.goodreads.com/book/show/23513349-m...,https://images.gr-assets.com/books/1491595510m...,92450,43116473,Milk and Honey,Milk and Honey
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106176,9006a3cc15e4213fe1b70cd3edec6387,33246653,87e244d31cf86ea86f80f3b0590d0165,4,"For my fist poetry book, this was really beaut...",Tue Aug 29 19:39:30 -0700 2017,Tue Oct 17 19:22:30 -0700 2017,Thu Oct 12 00:00:00 -0700 2017,Tue Oct 10 00:00:00 -0700 2017,1,...,9781449483135,10,B&amp;N Exclusive Edition,2016,https://www.goodreads.com/book/show/33246653-m...,https://images.gr-assets.com/books/1480919673m...,344,43116473,Milk and Honey,Milk and Honey
106177,ba3638124f51eb4b0e2552450c9a75fb,33246653,6184ce88c3e246a9af2f31c4a5993d87,5,Magnificent!,Thu Jan 05 17:23:30 -0800 2017,Fri Jan 20 12:53:27 -0800 2017,Fri Jan 20 12:53:27 -0800 2017,Thu Jan 05 00:00:00 -0800 2017,0,...,9781449483135,10,B&amp;N Exclusive Edition,2016,https://www.goodreads.com/book/show/33246653-m...,https://images.gr-assets.com/books/1480919673m...,344,43116473,Milk and Honey,Milk and Honey
106178,4275a6aba8a2ad598886af3d22b3d79e,33246653,a73dfee0963d3c12c3a5cc4655c02eac,5,"""Healing"" is obviously my favorite section of ...",Sat Jan 14 09:30:35 -0800 2017,Wed Jan 18 20:28:28 -0800 2017,Wed Jan 18 00:00:00 -0800 2017,Sat Jan 14 00:00:00 -0800 2017,0,...,9781449483135,10,B&amp;N Exclusive Edition,2016,https://www.goodreads.com/book/show/33246653-m...,https://images.gr-assets.com/books/1480919673m...,344,43116473,Milk and Honey,Milk and Honey
106179,cb3ce60110ca3c49f477dd6bc62a3aa6,33246653,769c6dd68187c13aa1b478d66e74ba52,5,Phenomenal. A must read for all women. All I h...,Mon Dec 05 21:09:57 -0800 2016,Mon Dec 05 21:12:12 -0800 2016,Mon Dec 05 00:00:00 -0800 2016,Mon Dec 05 00:00:00 -0800 2016,0,...,9781449483135,10,B&amp;N Exclusive Edition,2016,https://www.goodreads.com/book/show/33246653-m...,https://images.gr-assets.com/books/1480919673m...,344,43116473,Milk and Honey,Milk and Honey
