In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

import warnings
warnings.filterwarnings('ignore')

In [13]:
books = pd.read_csv('/Users/vamsidharreddy/CMPE-255-Final-Project/data/books_data.csv')
ratings  = pd.read_csv('/Users/vamsidharreddy/CMPE-255-Final-Project/data/books_ratings_data.csv')
book_tags = pd.read_csv('/Users/vamsidharreddy/CMPE-255-Final-Project/data/book_tags_data.csv')
tags = pd.read_csv('/Users/vamsidharreddy/CMPE-255-Final-Project/data/tags_data.csv')

In [5]:
!pwd

/Users/vamsidharreddy/CMPE-255-Final-Project/model/Hybrid


/Users/vamsidharreddy/CMPE-255-Final-Project/model/Hybrid


In [3]:
books['original_publication_year'] = books['original_publication_year'].fillna(-1).apply(lambda x: int(x) if x != -1 else -1)


In [4]:
ratings_rmv_duplicates = ratings.drop_duplicates()
unwanted_users = ratings_rmv_duplicates.groupby('user_id')['user_id'].count()
unwanted_users = unwanted_users[unwanted_users < 3]
unwanted_ratings = ratings_rmv_duplicates[ratings_rmv_duplicates.user_id.isin(unwanted_users.index)]
new_ratings = ratings_rmv_duplicates.drop(unwanted_ratings.index)

In [5]:
new_ratings['title'] = books.set_index('id').title.loc[new_ratings.book_id].values

In [6]:
v = books['ratings_count']
m = books['ratings_count'].quantile(0.95)
R = books['average_rating']
C = books['average_rating'].mean()
W = (R*v + C*m) / (v + m)

In [7]:
books['weighted_rating'] = W


In [8]:
qualified  = books.sort_values('weighted_rating', ascending=False).head(250)

In [9]:
genres = ["Art", "Biography", "Business", "Chick Lit", "Children's", "Christian", "Classics",
          "Comics", "Contemporary", "Cookbooks", "Crime", "Ebooks", "Fantasy", "Fiction",
          "Gay and Lesbian", "Graphic Novels", "Historical Fiction", "History", "Horror",
          "Humor and Comedy", "Manga", "Memoir", "Music", "Mystery", "Nonfiction", "Paranormal",
          "Philosophy", "Poetry", "Psychology", "Religion", "Romance", "Science", "Science Fiction", 
          "Self Help", "Suspense", "Spirituality", "Sports", "Thriller", "Travel", "Young Adult"]

In [10]:
genres = list(map(str.lower, genres))
genres[:4]

['art', 'biography', 'business', 'chick lit']

In [11]:
available_genres = tags.loc[tags.tag_name.str.lower().isin(genres)]

In [12]:
available_genres_books = book_tags[book_tags.tag_id.isin(available_genres.tag_id)]

In [13]:
print('There are {} books that are tagged with above genres'.format(available_genres_books.shape[0]))

There are 60573 books that are tagged with above genres


In [14]:
available_genres_books['genre'] = available_genres.tag_name.loc[available_genres_books.tag_id].values


In [15]:
def build_chart(genre, percentile=0.85):
    df = available_genres_books[available_genres_books['genre'] == genre.lower()]
    qualified = books.set_index('book_id').loc[df.goodreads_book_id]

    v = qualified['ratings_count']
    m = qualified['ratings_count'].quantile(percentile)
    R = qualified['average_rating']
    C = qualified['average_rating'].mean()
    qualified['weighted_rating'] = (R*v + C*m) / (v + m)

    qualified.sort_values('weighted_rating', ascending=False, inplace=True)
    return qualified


In [16]:
cols = ['title','authors','original_publication_year','average_rating','ratings_count','work_text_reviews_count','weighted_rating']

In [17]:
genre = 'Fiction'


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [21]:
books['authors'] = books['authors'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x.split(', ')])

In [22]:
def get_genres(x):
    t = book_tags[book_tags.goodreads_book_id==x]
    return [i.lower().replace(" ", "") for i in tags.tag_name.loc[t.tag_id].values]


In [23]:
books['genres'] = books.book_id.apply(get_genres)

In [24]:
books['soup'] = books.apply(lambda x: ' '.join([x['title']] + x['authors'] + x['genres']), axis=1)



In [25]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(books['soup'])

In [26]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [27]:
indices = pd.Series(books.index, index=books['title'])
titles = books['title']

In [33]:
def get_name_from_partial(title):
    return list(books.title[books.title.str.lower().str.contains(title) == True].values)

In [34]:
title = "business"
l = get_name_from_partial(title)
list(enumerate(l))

[(0, 'The Power of Habit: Why We Do What We Do in Life and Business'),
 (1,
  "The Lean Startup: How Today's Entrepreneurs Use Continuous Innovation to Create Radically Successful Businesses"),
 (2,
  'Caps for Sale: A Tale of a Peddler, Some Monkeys and Their Monkey Business'),
 (3,
  "The E-Myth Revisited: Why Most Small Businesses Don't Work and What to Do About It"),
 (4, 'The Snowball: Warren Buffett and the Business of Life'),
 (5,
  "The Innovator's Dilemma: The Revolutionary Book that Will Change the Way You Do Business (Collins Business Essentials)"),
 (6, 'The Intelligent Investor (Collins Business Essentials)'),
 (7, 'Purple Cow: Transform Your Business by Being Remarkable'),
 (8, 'Business Model Generation'),
 (9, 'The Long Tail: Why the Future of Business is Selling Less of More'),
 (10,
  "Losing My Virginity: How I've Survived, Had Fun, and Made a Fortune Doing Business My Way"),
 (11,
  'The Hard Thing About Hard Things: Building a Business When There Are No Easy Answer

In [39]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [40]:
reader = Reader()
data = Dataset.load_from_df(new_ratings[['user_id', 'book_id', 'rating']], reader)

In [41]:
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'])

{'test_rmse': array([0.84227395, 0.84265814, 0.84103162, 0.84304537, 0.84158009]),
 'test_mae': array([0.65863303, 0.65874983, 0.65817277, 0.65929872, 0.65775841]),
 'fit_time': (53.663198947906494,
  54.344008922576904,
  54.35425090789795,
  54.315794944763184,
  54.32724404335022),
 'test_time': (1.3036479949951172,
  1.4018669128417969,
  1.5516672134399414,
  1.588667869567871,
  1.3276221752166748)}

In [42]:
trainset = data.build_full_trainset()
svd.fit(trainset);

In [43]:
new_ratings[new_ratings['user_id'] == 10]

Unnamed: 0,book_id,user_id,rating,title
150478,1506,10,4,The Zahir
282986,2833,10,4,The Prisoner of Heaven (The Cemetery of Forgot...
340448,3409,10,5,The Winner Stands Alone
393966,3946,10,5,Matterhorn
452158,4531,10,4,The Joke
506878,5084,10,2,The Sheltering Sky
588312,5907,10,4,Our Mutual Friend
590191,5926,10,2,The Night Watch
610487,6131,10,2,The Longest Day
696035,7002,10,5,A Mercy


In [44]:
svd.predict(10, 1506)

Prediction(uid=10, iid=1506, r_ui=None, est=3.380445865479927, details={'was_impossible': False})

In [45]:
# bookmat = new_ratings.groupby(['user_id', 'title'])['rating'].mean().unstack()
bookmat = new_ratings.pivot_table(index='user_id', columns='title', values='rating')


In [46]:
def get_similar(title, mat):
    title_user_ratings = mat[title]
    similar_to_title = mat.corrwith(title_user_ratings)
    corr_title = pd.DataFrame(similar_to_title, columns=['correlation'])
    corr_title.dropna(inplace=True)
    corr_title.sort_values('correlation', ascending=False, inplace=True)
    return corr_title

In [47]:
title = "Twilight (Twilight, #1)"
smlr = get_similar(title, bookmat)

In [48]:
smlr.head(10)

Unnamed: 0_level_0,correlation
title,Unnamed: 1_level_1
god is Not Great: How Religion Poisons Everything,1.0
The Day of the Triffids,1.0
Sh*t My Dad Says,1.0
"Shadow and Bone (Shadow and Bone, #1)",1.0
Skipping Christmas,1.0
"Splintered (Splintered, #1)",1.0
"Stolen Songbird (The Malediction Trilogy, #1)",1.0
"Bared to You (Crossfire, #1)",1.0
The Autobiography of Malcolm X,1.0
Balzac and the Little Chinese Seamstress,1.0


In [49]:
smlr = smlr.join(books.set_index('title')['ratings_count'])
smlr.head()

Unnamed: 0_level_0,correlation,ratings_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Salem's Lot,0.275938,228680
'Salem's Lot,0.275938,72797
11/22/63,0.431331,258464
"13 Little Blue Envelopes (Little Blue Envelope, #1)",-0.5,66950
1776,0.301511,130293


In [50]:
smlr[smlr.ratings_count > 5e5].sort_values('correlation', ascending=False).head(10)

Unnamed: 0_level_0,correlation,ratings_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Twilight (Twilight, #1)",1.0,3866839
"New Moon (Twilight, #2)",0.8854,1149630
"The Selection (The Selection, #1)",0.866025,505340
"Eclipse (Twilight, #3)",0.857845,1134511
"Me Before You (Me Before You, #1)",0.771845,587647
"Matched (Matched, #1)",0.707029,511815
"Breaking Dawn (Twilight, #4)",0.689029,1070245
Bossypants,0.669954,506250
"City of Bones (The Mortal Instruments, #1)",0.654081,1154031
The Perks of Being a Wallflower,0.574701,888806


In [51]:
def hybrid(user_id, title, n=10):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    book_indices = [i[0] for i in sim_scores]
    
    df = books.iloc[book_indices][['book_id', 'title', 'original_publication_year', 'ratings_count', 'average_rating']]
    df['est'] = df['book_id'].apply(lambda x: svd.predict(user_id, x).est)
    df = df.sort_values('est', ascending=False)
    return df.head(n)

In [52]:
hybrid(4, 'Eat, Pray, Love')

Unnamed: 0,book_id,title,original_publication_year,ratings_count,average_rating,est
382,1241,A Million Little Pieces,2003,184241,3.62,4.026722
3118,3465,Three Weeks With My Brother,2004,32208,4.01,3.887518
4038,6365221,Mennonite in a Little Black Dress: A Memoir of...,2009,23096,3.17,3.883971
4079,1918305,The Geography of Bliss: One Grump's Search for...,2008,23023,3.83,3.883971
3984,46190,Love Is a Mix Tape,2007,21971,3.83,3.883971
744,12868761,Let's Pretend This Never Happened: A Mostly Tr...,2012,118475,3.9,3.883971
4724,13642929,My Beloved World,2013,17742,4.03,3.883971
5702,316558,Kabul Beauty School: An American Woman Goes Be...,2007,17002,3.63,3.883971
2803,18039963,A House in the Sky,2013,29369,4.2,3.883971
2701,6114607,"The Midwife: A Memoir of Birth, Joy, and Hard ...",2002,19176,4.17,3.883971


In [53]:
hybrid(10, 'Eat, Pray, Love')

Unnamed: 0,book_id,title,original_publication_year,ratings_count,average_rating,est
80,7445,The Glass Castle,2005,621099,4.24,3.936469
3118,3465,Three Weeks With My Brother,2004,32208,4.01,3.920686
4038,6365221,Mennonite in a Little Black Dress: A Memoir of...,2009,23096,3.17,3.810366
4079,1918305,The Geography of Bliss: One Grump's Search for...,2008,23023,3.83,3.810366
3984,46190,Love Is a Mix Tape,2007,21971,3.83,3.810366
744,12868761,Let's Pretend This Never Happened: A Mostly Tr...,2012,118475,3.9,3.810366
4724,13642929,My Beloved World,2013,17742,4.03,3.810366
5702,316558,Kabul Beauty School: An American Woman Goes Be...,2007,17002,3.63,3.810366
2803,18039963,A House in the Sky,2013,29369,4.2,3.810366
2701,6114607,"The Midwife: A Memoir of Birth, Joy, and Hard ...",2002,19176,4.17,3.810366


In [54]:
def improved_hybrid(user_id, title, n=10):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    book_indices = [i[0] for i in sim_scores]
    
    df = books.iloc[book_indices][['book_id', 'title', 'ratings_count', 'average_rating', 'original_publication_year']]
    v = df['ratings_count']
    m = df['ratings_count'].quantile(0.60)
    R = df['average_rating']
    C = df['average_rating'].mean()
    df['weighted_rating'] = (R*v + C*m) / (v + m)
    
    df['est'] = df['book_id'].apply(lambda x: svd.predict(user_id, x).est)
    
    df['score'] = (df['est'] + df['weighted_rating']) / 2
    df = df.sort_values('score', ascending=False)
    return df[['book_id', 'title', 'original_publication_year', 'ratings_count', 'average_rating', 'score']].head(n)

In [55]:
improved_hybrid(4, 'Eat, Pray, Love')

Unnamed: 0,book_id,title,original_publication_year,ratings_count,average_rating,score
80,7445,The Glass Castle,2005,621099,4.24,4.051204
328,2318271,The Last Lecture,2008,241869,4.25,4.041568
198,12691,Marley and Me: Life and Love With the World's ...,2005,367304,4.12,3.990549
1669,104189,Same Kind of Different as Me,2005,52964,4.21,3.977255
2803,18039963,A House in the Sky,2013,29369,4.2,3.950718
753,6366437,Half Broke Horses,2008,110597,4.05,3.944794
1067,29209,The Color of Water: A Black Man's Tribute to H...,1996,80906,4.06,3.942658
6286,8564644,Little Princes: One Man's Promise to Bring Hom...,2010,14765,4.25,3.93251
2701,6114607,"The Midwife: A Memoir of Birth, Joy, and Hard ...",2002,19176,4.17,3.929478
4593,31845516,Love Warrior,2016,20094,4.1,3.918943


In [56]:
improved_hybrid(10, 'Eat, Pray, Love')

Unnamed: 0,book_id,title,original_publication_year,ratings_count,average_rating,score
80,7445,The Glass Castle,2005,621099,4.24,4.077728
328,2318271,The Last Lecture,2008,241869,4.25,4.004766
198,12691,Marley and Me: Life and Love With the World's ...,2005,367304,4.12,3.953747
1669,104189,Same Kind of Different as Me,2005,52964,4.21,3.940452
3118,3465,Three Weeks With My Brother,2004,32208,4.01,3.929278
2803,18039963,A House in the Sky,2013,29369,4.2,3.913916
753,6366437,Half Broke Horses,2008,110597,4.05,3.907991
1067,29209,The Color of Water: A Black Man's Tribute to H...,1996,80906,4.06,3.905855
6286,8564644,Little Princes: One Man's Promise to Bring Hom...,2010,14765,4.25,3.895708
2701,6114607,"The Midwife: A Memoir of Birth, Joy, and Hard ...",2002,19176,4.17,3.892676


/Users/vamsidharreddy/Downloads/SJSU/CMPE 255
