# Goodbooks-10k Book Recommender

#### Imports

In [16]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
from ast import literal_eval

#### Load Data

In [41]:
# Load variables from a .env file (if present) into the process environment.
load_dotenv()


def _require_env(name):
    """Return the value of the environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here names
            the missing variable, instead of letting a ``None`` path reach
            ``pd.read_csv`` and fail with a less specific error.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; "
            "add it to your .env file or export it before running this notebook."
        )
    return value


# Locations of the input CSV files, all taken from the environment.
BOOK_DATA_PATH = _require_env("BOOK_DATA_PATH")
RATINGS_DATA_PATH = _require_env("RATINGS_DATA_PATH")
BOOK_TAGS_DATA_PATH = _require_env("BOOK_TAGS_DATA_PATH")
TAGS_DATA_PATH = _require_env("TAGS_DATA_PATH")
TO_READ_DATA_PATH = _require_env("TO_READ_DATA_PATH")

# The first CSV column becomes the index; 'genres' and 'authors' are parsed
# with literal_eval so each cell holds a Python object rather than its
# string representation.
books_df = pd.read_csv(BOOK_DATA_PATH, index_col=[0], converters={'genres':literal_eval, 'authors':literal_eval})
ratings_df = pd.read_csv(RATINGS_DATA_PATH)
book_tags_df = pd.read_csv(BOOK_TAGS_DATA_PATH)
tags_df = pd.read_csv(TAGS_DATA_PATH)
to_read_df = pd.read_csv(TO_READ_DATA_PATH)

## Data Exploration

In [18]:
# List the column labels of the books table to see which fields are available.
books_df.columns

Index(['index', 'authors', 'average_rating', 'best_book_id', 'book_id',
       'books_count', 'description', 'genres', 'goodreads_book_id',
       'image_url', 'isbn', 'isbn13', 'language_code',
       'original_publication_year', 'original_title', 'pages', 'publishDate',
       'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
       'ratings_count', 'small_image_url', 'title', 'work_id',
       'work_ratings_count', 'work_text_reviews_count', 'authors_2'],
      dtype='object')

##### Book Data

In [19]:
# Preview the first rows of the books table (head() defaults to five rows).
books_df.head()

Unnamed: 0,index,authors,average_rating,best_book_id,book_id,books_count,description,genres,goodreads_book_id,image_url,...,ratings_3,ratings_4,ratings_5,ratings_count,small_image_url,title,work_id,work_ratings_count,work_text_reviews_count,authors_2
0,0,[Suzanne Collins],4.34,2767052,1,272,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,"[young-adult, fiction, fantasy, science-fictio...",2767052,https://images.gr-assets.com/books/1447303603m...,...,560092,1481305,2706317,4780653,https://images.gr-assets.com/books/1447303603s...,"The Hunger Games (The Hunger Games, #1)",2792775,4942365,155254,['Suzanne Collins']
1,1,"[J.K. Rowling, Mary GrandPré]",4.44,3,2,491,Harry Potter's life is miserable. His parents ...,"[fantasy, fiction, young-adult, classics]",3,https://images.gr-assets.com/books/1474154022m...,...,455024,1156318,3011543,4602479,https://images.gr-assets.com/books/1474154022s...,Harry Potter and the Sorcerer's Stone (Harry P...,4640799,4800065,75867,"['J.K. Rowling', 'Mary GrandPré']"
2,2,[Stephenie Meyer],3.57,41865,3,226,About three things I was absolutely positive.\...,"[young-adult, fantasy, romance, fiction, paran...",41865,https://images.gr-assets.com/books/1361039443m...,...,793319,875073,1355439,3866839,https://images.gr-assets.com/books/1361039443s...,"Twilight (Twilight, #1)",3212258,3916824,95009,['Stephenie Meyer']
3,3,[Harper Lee],4.25,2657,4,487,The unforgettable novel of a childhood in a sl...,"[classics, fiction, historical-fiction, young-...",2657,https://images.gr-assets.com/books/1361975680m...,...,446835,1001952,1714267,3198671,https://images.gr-assets.com/books/1361975680s...,To Kill a Mockingbird,3275794,3340896,72586,['Harper Lee']
4,4,[F. Scott Fitzgerald],3.89,4671,5,1356,Alternate Cover Edition ISBN: 0743273567 (ISBN...,"[classics, fiction, historical-fiction, romance]",4671,https://images.gr-assets.com/books/1490528560m...,...,606158,936012,947718,2683664,https://images.gr-assets.com/books/1490528560s...,The Great Gatsby,245494,2773745,51992,['F. Scott Fitzgerald']


In [20]:
# Summary statistics for the rating-count, average-rating and
# publication-year columns of the books table.
books_df[
    ["ratings_count", "average_rating", "original_publication_year"]
].describe()

Unnamed: 0,ratings_count,average_rating,original_publication_year
count,10000.0,10000.0,9979.0
mean,54001.24,4.002191,1981.987674
std,157370.0,0.254427,152.576665
min,2716.0,2.47,-1750.0
25%,13568.75,3.85,1990.0
50%,21155.5,4.02,2004.0
75%,41053.5,4.18,2011.0
max,4780653.0,4.82,2017.0


In [21]:
# Preview the first rows of the ratings table (head() defaults to five rows).
ratings_df.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [22]:
# Preview the first rows of the to-read table (head() defaults to five rows).
to_read_df.head()

Unnamed: 0,user_id,book_id
0,9,8
1,15,398
2,15,275
3,37,7173
4,34,380


In [23]:
# Preview the first rows of the book-tags table (head() defaults to five rows).
book_tags_df.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


Merge the book tags with their tag names.

In [24]:
# Join the tag table onto the book-tag table via the shared tag_id column
# (default inner join), then preview the result.
merged_book_tags = pd.merge(book_tags_df, tags_df, on="tag_id")
merged_book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,2,30574,24549,to-read
2,3,30574,496107,to-read
3,5,30574,11909,to-read
4,6,30574,298,to-read


In [25]:
# Count the rows of the merged table per tag name and show the 20 most
# frequent tag names.
(
    merged_book_tags
    .groupby("tag_name")["tag_name"]
    .count()
    .sort_values(ascending=False)
    .head(20)
)

tag_name
to-read              9983
favorites            9881
owned                9858
books-i-own          9799
currently-reading    9776
library              9415
owned-books          9221
fiction              9097
to-buy               8692
kindle               8316
default              8239
ebook                8054
my-books             7561
audiobook            7242
ebooks               7203
wish-list            7192
my-library           7000
audiobooks           6862
i-own                6670
adult                6604
Name: tag_name, dtype: int64

In [38]:
# Expand the list-valued 'genres' column so each genre sits on its own row,
# then count the rows per genre, most common first.
books_df_explode = books_df.explode(column="genres")
(
    books_df_explode
    .groupby(by="genres")
    .size()
    .sort_values(ascending=False)
)

genres
fiction               8272
fantasy               3746
romance               3307
contemporary          2918
young-adult           2756
mystery               2481
classics              2110
thriller              1822
historical-fiction    1793
nonfiction            1641
crime                 1488
suspense              1446
science-fiction       1428
paranormal            1308
chick-lit             1266
horror                 876
history                860
biography              823
books                  685
memoir                 664
philosophy             617
psychology             499
religion               488
graphic-novels         472
comics                 455
science                452
self-help              435
spirituality           327
christian              295
business               277
poetry                 277
art                    221
travel                 203
music                  146
manga                  144
sports                 134
cookbooks            

In [42]:
# Number of books per distinct author list, most prolific first.
# 'authors' is parsed into Python lists when the CSV is loaded, and lists are
# unhashable, so grouping on the raw column raises TypeError. Group on the
# string form of each list instead; the Series keeps the name 'authors', so
# the result's index is still labelled 'authors'.
# NOTE(review): this counts each author combination separately; exploding
# 'authors' first would count per individual author — confirm which is wanted.
(
    books_df
    .groupby(books_df['authors'].astype(str))
    .size()
    .sort_values(ascending=False)
)

authors
['Stephen King']       58
['Nora Roberts']       46
['Terry Pratchett']    40
['Agatha Christie']    38
['Dean Koontz']        34
                       ..
['Melinda Haynes']      1
['Melissa Bank']        1
['Melissa Landers']     1
['Dav Pilkey']          1
['Matt Haig']           1
Length: 5057, dtype: int64

## Data Prep