In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# pandas' own diagnostics — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Read the four source tables.
# NOTE(review): books_data.csv is read from the working directory while the other
# three files come from ../data/ — confirm which location is intended.
books_data = pd.read_csv('books_data.csv')
ratings_data, book_tags_data, tags_data = map(
    pd.read_csv,
    (
        '../data/books_ratings_data.csv',
        '../data/book_tags_data.csv',
        '../data/tags_data.csv',
    ),
)

In [4]:
# Store the publication year as an integer; missing years become the marker -1.
books_data['original_publication_year'] = (
    books_data['original_publication_year']
    .fillna(-1)
    .astype('int64')
)

In [5]:
# Drop exact duplicate rating rows first.
unique_ratings = ratings_data.drop_duplicates()

# Remove users with fewer than 4 ratings.
ratings_per_user = unique_ratings.groupby('user_id')['user_id'].count()
unwanted_users = ratings_per_user[ratings_per_user < 4]
from_unwanted_user = unique_ratings['user_id'].isin(unwanted_users.index)
unwanted_ratings = unique_ratings[from_unwanted_user]
new_ratings = unique_ratings.drop(index=unwanted_ratings.index)

In [6]:
# Preview the first two rows of the books table.
books_data.head(2)

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...


In [7]:
# Preview the ratings that remain after the user filter.
new_ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [8]:
# Attach each rating's book title. The ratings' book_id is looked up against the
# books table's `id` column (not its `book_id` column).
title_by_id = books_data.set_index('id')['title']
new_ratings['title'] = title_by_id.loc[new_ratings['book_id']].values

In [9]:
# Preview the ratings again, now with the title column attached.
new_ratings.head()

Unnamed: 0,book_id,user_id,rating,title
0,1,314,5,"The Hunger Games (The Hunger Games, #1)"
1,1,439,3,"The Hunger Games (The Hunger Games, #1)"
2,1,588,5,"The Hunger Games (The Hunger Games, #1)"
3,1,1169,4,"The Hunger Games (The Hunger Games, #1)"
4,1,1185,4,"The Hunger Games (The Hunger Games, #1)"


In [10]:
# Weighted rating: blend each book's own average (R) with the mean over all books (C),
# weighting by its number of ratings (v) against a threshold (m).
v = books_data['ratings_count']     # number of ratings per book
m = v.quantile(0.95)                # threshold: 95th percentile of ratings_count
R = books_data['average_rating']    # per-book average rating
C = R.mean()                        # mean of average_rating over all books
W = (R*v + C*m) / (v + m)

In [11]:
# Store W on the books table as the weighted_rating column.
books_data['weighted_rating'] = W

In [12]:
# Overall chart: the 250 books with the highest weighted rating, best first.
qualified = (
    books_data
    .sort_values('weighted_rating', ascending=False)
    .head(250)
)

In [13]:
# Show the first 15 rows of the overall chart, restricted to the ranking columns.
qualified[['title', 'authors', 'average_rating', 'weighted_rating']].head(15)

Unnamed: 0,title,authors,average_rating,weighted_rating
24,Harry Potter and the Deathly Hallows (Harry Po...,"J.K. Rowling, Mary GrandPré",4.61,4.555956
26,Harry Potter and the Half-Blood Prince (Harry ...,"J.K. Rowling, Mary GrandPré",4.54,4.490428
17,Harry Potter and the Prisoner of Azkaban (Harr...,"J.K. Rowling, Mary GrandPré, Rufus Beck",4.53,4.48509
23,Harry Potter and the Goblet of Fire (Harry Pot...,"J.K. Rowling, Mary GrandPré",4.53,4.483227
1,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré",4.44,4.424365
20,Harry Potter and the Order of the Phoenix (Har...,"J.K. Rowling, Mary GrandPré",4.46,4.419054
30,The Help,Kathryn Stockett,4.45,4.405158
38,"A Game of Thrones (A Song of Ice and Fire, #1)",George R.R. Martin,4.45,4.398759
134,"A Storm of Swords (A Song of Ice and Fire, #3)",George R.R. Martin,4.54,4.396645
421,"Harry Potter Boxset (Harry Potter, #1-7)",J.K. Rowling,4.74,4.391147


In [14]:
# Preview the book-to-tag table (goodreads_book_id, tag_id, count).
book_tags_data.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [15]:
# Preview the tag lookup table (tag_id, tag_name).
tags_data.head()

Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-
2,2,--10-
3,3,--12-
4,4,--122-


In [16]:
# Genre names used to pick genre tags out of the full tag list.
genres = [
    "Art", "Biography", "Business", "Chick Lit", "Children's",
    "Christian", "Classics", "Comics", "Contemporary", "Cookbooks",
    "Crime", "Ebooks", "Fantasy", "Fiction", "Gay and Lesbian",
    "Graphic Novels", "Historical Fiction", "History", "Horror", "Humor and Comedy",
    "Manga", "Memoir", "Music", "Mystery", "Nonfiction",
    "Paranormal", "Philosophy", "Poetry", "Psychology", "Religion",
    "Romance", "Science", "Science Fiction", "Self Help", "Suspense",
    "Spirituality", "Sports", "Thriller", "Travel", "Young Adult",
]


In [17]:
# Lower-case every genre name so it can be compared with lower-cased tag names.
genres = [name.lower() for name in genres]
genres[:10]

['art',
 'biography',
 'business',
 'chick lit',
 "children's",
 'christian',
 'classics',
 'comics',
 'contemporary',
 'cookbooks']

In [18]:
# Keep only the tags whose lower-cased name equals one of the genre names.
# NOTE(review): the match is exact, and the multi-word genres (e.g. 'science fiction',
# 'young adult') are absent from the matched list printed further down — presumably
# the tag names spell them differently; confirm and normalise if they should match.
is_genre_tag = tags_data['tag_name'].str.lower().isin(genres)
available_genres = tags_data.loc[is_genre_tag]

In [19]:
# Keep the (book, tag) rows whose tag is one of the matched genre tags. The explicit
# .copy() makes this an independent frame, so adding columns to it later does not
# write into a slice of book_tags_data.
is_genre_row = book_tags_data['tag_id'].isin(available_genres['tag_id'])
available_genres_books = book_tags_data[is_genre_row].copy()

In [20]:
# NOTE(review): this is the number of (book, tag) rows, so a book carrying several of
# the genres is counted once per tag — it is not a count of distinct books.
n_tagged_rows = len(available_genres_books)
print('There are {} books that are tagged with above genres'.format(n_tagged_rows))

There are 60573 books that are tagged with above genres


In [21]:
# Preview the (book, tag) rows restricted to genre tags.
available_genres_books.head()

Unnamed: 0,goodreads_book_id,tag_id,count
1,1,11305,37174
5,1,11743,9954
25,1,7457,958
38,1,22973,673
52,1,20939,465


In [22]:
# Label every (book, tag) row with its genre name.
# The name is looked up through a Series keyed by tag_id, so the result does not
# depend on tags_data's row labels happening to equal its tag_id values. Names are
# lower-cased to agree with how tags were matched and how build_chart compares them.
# .assign() returns a new frame instead of writing a column into a filtered slice.
genre_by_tag_id = available_genres.set_index('tag_id')['tag_name'].str.lower()
available_genres_books = available_genres_books.assign(
    genre=available_genres_books['tag_id'].map(genre_by_tag_id)
)
available_genres_books.head()

Unnamed: 0,goodreads_book_id,tag_id,count,genre
1,1,11305,37174,fantasy
5,1,11743,9954,fiction
25,1,7457,958,classics
38,1,22973,673,paranormal
52,1,20939,465,mystery


In [26]:


def build_chart(genre, percentile=0.85, books=None, genre_tags=None):
    """Rank the books tagged with ``genre`` by a weighted rating.

    The weighted rating is ``(R*v + C*m) / (v + m)`` where, over the books of the
    genre, ``v`` is ``ratings_count``, ``R`` is ``average_rating``, ``C`` is the
    mean of ``R`` and ``m`` is the ``percentile`` quantile of ``v``.

    Parameters
    ----------
    genre : str
        Genre name; it is lower-cased before being compared with the ``genre``
        column of ``genre_tags``.
    percentile : float, default 0.85
        Quantile of ``ratings_count`` used as the threshold ``m``.
    books : pandas.DataFrame, optional
        Book table with ``book_id``, ``ratings_count`` and ``average_rating``
        columns. Defaults to the notebook-level ``books_data``.
    genre_tags : pandas.DataFrame, optional
        Table with ``goodreads_book_id`` and ``genre`` columns. Defaults to the
        notebook-level ``available_genres_books``.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``book_id`` with an added ``weighted_rating``
        column, sorted from highest to lowest weighted rating. The input frames
        are not modified.
    """
    if books is None:
        books = books_data
    if genre_tags is None:
        genre_tags = available_genres_books

    in_genre = genre_tags['genre'] == genre.lower()
    # unique(): a book tagged more than once with the genre must appear only once.
    tagged_ids = genre_tags.loc[in_genre, 'goodreads_book_id'].unique()

    by_book = books.set_index('book_id')
    # Membership filter instead of .loc[list]: tagged ids that are missing from the
    # book table are skipped rather than raising KeyError.
    qualified = by_book[by_book.index.isin(tagged_ids)]

    v = qualified['ratings_count']
    m = v.quantile(percentile)
    R = qualified['average_rating']
    C = R.mean()

    return (
        qualified
        .assign(weighted_rating=(R*v + C*m) / (v + m))
        .sort_values('weighted_rating', ascending=False)
    )



In [27]:
# Columns shown when a genre chart is displayed.
cols = [
    'title',
    'authors',
    'original_publication_year',
    'average_rating',
    'ratings_count',
    'work_text_reviews_count',
    'weighted_rating',
]

In [28]:
# Build the chart for one genre and show its first 15 rows.
genre = 'Fiction'
genre_chart = build_chart(genre)
genre_chart[cols].head(15)

Unnamed: 0_level_0,title,authors,original_publication_year,average_rating,ratings_count,work_text_reviews_count,weighted_rating
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
136251,Harry Potter and the Deathly Hallows (Harry Po...,"J.K. Rowling, Mary GrandPré",2007,4.61,1746574,51942,4.587098
862041,"Harry Potter Boxset (Harry Potter, #1-7)",J.K. Rowling,1998,4.74,190050,6508,4.544691
1,Harry Potter and the Half-Blood Prince (Harry ...,"J.K. Rowling, Mary GrandPré",2005,4.54,1678823,27520,4.518933
5,Harry Potter and the Prisoner of Azkaban (Harr...,"J.K. Rowling, Mary GrandPré, Rufus Beck",1999,4.53,1832823,36099,4.510997
6,Harry Potter and the Goblet of Fire (Harry Pot...,"J.K. Rowling, Mary GrandPré",2000,4.53,1753043,31084,4.510164
62291,"A Storm of Swords (A Song of Ice and Fire, #3)",George R.R. Martin,2000,4.54,469022,19497,4.471466
186074,The Name of the Wind (The Kingkiller Chronicle...,Patrick Rothfuss,2007,4.55,400101,28631,4.469922
1215032,"The Wise Man's Fear (The Kingkiller Chronicle,...",Patrick Rothfuss,2011,4.57,245686,15503,4.446163
18512,"The Return of the King (The Lord of the Rings,...",J.R.R. Tolkien,1955,4.51,463959,6644,4.444645
2,Harry Potter and the Order of the Phoenix (Har...,"J.K. Rowling, Mary GrandPré",2003,4.46,1735368,28685,4.442607


In [29]:
# List the matched genre tag names together with their position in available_genres.
list(enumerate(available_genres.tag_name))

[(0, 'art'),
 (1, 'biography'),
 (2, 'business'),
 (3, 'christian'),
 (4, 'classics'),
 (5, 'comics'),
 (6, 'contemporary'),
 (7, 'cookbooks'),
 (8, 'crime'),
 (9, 'ebooks'),
 (10, 'fantasy'),
 (11, 'fiction'),
 (12, 'history'),
 (13, 'horror'),
 (14, 'manga'),
 (15, 'memoir'),
 (16, 'music'),
 (17, 'mystery'),
 (18, 'nonfiction'),
 (19, 'paranormal'),
 (20, 'philosophy'),
 (21, 'poetry'),
 (22, 'psychology'),
 (23, 'religion'),
 (24, 'romance'),
 (25, 'science'),
 (26, 'spirituality'),
 (27, 'sports'),
 (28, 'suspense'),
 (29, 'thriller'),
 (30, 'travel')]

In [30]:


# Find the genre's position by name instead of hard-coding it, so this cell keeps
# charting fiction even if the set or order of matched tags changes.
genre_names = list(available_genres.tag_name)
idx = genre_names.index('fiction')
build_chart(genre_names[idx])[cols].head(15)



Unnamed: 0_level_0,title,authors,original_publication_year,average_rating,ratings_count,work_text_reviews_count,weighted_rating
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
136251,Harry Potter and the Deathly Hallows (Harry Po...,"J.K. Rowling, Mary GrandPré",2007,4.61,1746574,51942,4.587098
862041,"Harry Potter Boxset (Harry Potter, #1-7)",J.K. Rowling,1998,4.74,190050,6508,4.544691
1,Harry Potter and the Half-Blood Prince (Harry ...,"J.K. Rowling, Mary GrandPré",2005,4.54,1678823,27520,4.518933
5,Harry Potter and the Prisoner of Azkaban (Harr...,"J.K. Rowling, Mary GrandPré, Rufus Beck",1999,4.53,1832823,36099,4.510997
6,Harry Potter and the Goblet of Fire (Harry Pot...,"J.K. Rowling, Mary GrandPré",2000,4.53,1753043,31084,4.510164
62291,"A Storm of Swords (A Song of Ice and Fire, #3)",George R.R. Martin,2000,4.54,469022,19497,4.471466
186074,The Name of the Wind (The Kingkiller Chronicle...,Patrick Rothfuss,2007,4.55,400101,28631,4.469922
1215032,"The Wise Man's Fear (The Kingkiller Chronicle,...",Patrick Rothfuss,2011,4.57,245686,15503,4.446163
18512,"The Return of the King (The Lord of the Rings,...",J.R.R. Tolkien,1955,4.51,463959,6644,4.444645
2,Harry Potter and the Order of the Phoenix (Har...,"J.K. Rowling, Mary GrandPré",2003,4.46,1735368,28685,4.442607
