In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
books = pd.read_csv('../data/cleaned/books.csv')
users = pd.read_csv('../data/cleaned/users.csv')
ratings = pd.read_csv('../data/cleaned/ratings.csv')

## Popularity Based Recommendation System - TOP 50 Books
Displaying top 50 most rated books.  
Criteria - The book must have at least 250 rating count and highest avg rating

In [4]:
ratings_books = ratings.merge(books, on='isbn')

In [5]:
ratings_books.head()

Unnamed: 0,user_id,isbn,rating,title,author,year,publisher,Image-URL-S,Image-URL-M,image_url
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
3,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...


In [6]:
ratings_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1031128 entries, 0 to 1031127
Data columns (total 10 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   user_id      1031128 non-null  int64 
 1   isbn         1031128 non-null  object
 2   rating       1031128 non-null  int64 
 3   title        1031128 non-null  object
 4   author       1031128 non-null  object
 5   year         1031128 non-null  int64 
 6   publisher    1031128 non-null  object
 7   Image-URL-S  1031128 non-null  object
 8   Image-URL-M  1031128 non-null  object
 9   image_url    1031128 non-null  object
dtypes: int64(3), object(7)
memory usage: 78.7+ MB


In [7]:
num_ratings_df = ratings_books.groupby('title').count()['rating'].reset_index()
num_ratings_df.rename(columns={'rating':'num_ratings'}, inplace=True)
num_ratings_df

Unnamed: 0,title,num_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241060,Ã?Â?lpiraten.,2
241061,Ã?Â?rger mit Produkt X. Roman.,4
241062,Ã?Â?sterlich leben.,1
241063,Ã?Â?stlich der Berge.,3


In [8]:
avg_ratings_df = ratings_books.groupby('title')['rating'].mean().reset_index()
avg_ratings_df.rename(columns={'rating':'avg_rating'}, inplace=True)
avg_ratings_df

Unnamed: 0,title,avg_rating
0,A Light in the Storm: The Civil War Diary of ...,2.250000
1,Always Have Popsicles,0.000000
2,Apple Magic (The Collector's series),0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,0.000000
...,...,...
241060,Ã?Â?lpiraten.,0.000000
241061,Ã?Â?rger mit Produkt X. Roman.,5.250000
241062,Ã?Â?sterlich leben.,7.000000
241063,Ã?Â?stlich der Berge.,2.666667


In [9]:
popular_df = num_ratings_df.merge(avg_ratings_df, on='title')
popular_df

Unnamed: 0,title,num_ratings,avg_rating
0,A Light in the Storm: The Civil War Diary of ...,4,2.250000
1,Always Have Popsicles,1,0.000000
2,Apple Magic (The Collector's series),1,0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.000000
...,...,...,...
241060,Ã?Â?lpiraten.,2,0.000000
241061,Ã?Â?rger mit Produkt X. Roman.,4,5.250000
241062,Ã?Â?sterlich leben.,1,7.000000
241063,Ã?Â?stlich der Berge.,3,2.666667


In [10]:
popular_df = popular_df[popular_df['num_ratings'] >= 250].sort_values(by='avg_rating', ascending=False)
popular_df.head()

Unnamed: 0,title,num_ratings,avg_rating
80431,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
80419,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
80438,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741
80423,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441
80411,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453


In [11]:
popular_df = popular_df.merge(books, on='title').drop_duplicates('title')[['title','author','Image-URL-M','num_ratings','avg_rating']].head(50)

In [12]:
popular_df

Unnamed: 0,title,author,Image-URL-M,num_ratings,avg_rating
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,http://images.amazon.com/images/P/0439136350.0...,428,5.852804
3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,http://images.amazon.com/images/P/0439139597.0...,387,5.824289
5,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,http://images.amazon.com/images/P/0590353403.0...,278,5.73741
9,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,http://images.amazon.com/images/P/043935806X.0...,347,5.501441
13,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,http://images.amazon.com/images/P/0439064872.0...,556,5.183453
16,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339681.0...,281,5.007117
17,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339703.0...,368,4.94837
26,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,http://images.amazon.com/images/P/059035342X.0...,575,4.895652
28,"The Two Towers (The Lord of the Rings, Part 2)",J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339711.0...,260,4.880769
39,To Kill a Mockingbird,Harper Lee,http://images.amazon.com/images/P/0446310786.0...,510,4.7


## Collaborative Based Recommendation System
- Creating a df with columns as users and rows as books, there will be each user rating in each column with respect to each book in a row.
- The criteria will be, will only include the users who have rated more than 200 ratings, and the books with only more than 50 ratings.

In [13]:
x = ratings_books.groupby('user_id').count()['rating'] > 200
top_users = x[x].index

In [14]:
filtered_ratings = ratings_books[ratings_books['user_id'].isin(top_users)]

In [15]:
y = filtered_ratings.groupby('title').count()['rating'] >= 50
famous_books = y[y].index

In [16]:
final_df = filtered_ratings[filtered_ratings['title'].isin(famous_books)]

In [17]:
final_df.head()

Unnamed: 0,user_id,isbn,rating,title,author,year,publisher,Image-URL-S,Image-URL-M,image_url
1150,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...
1163,277427,0060930535,0,The Poisonwood Bible: A Novel,Barbara Kingsolver,1999,Perennial,http://images.amazon.com/images/P/0060930535.0...,http://images.amazon.com/images/P/0060930535.0...,http://images.amazon.com/images/P/0060930535.0...
1165,277427,0060934417,0,Bel Canto: A Novel,Ann Patchett,2002,Perennial,http://images.amazon.com/images/P/0060934417.0...,http://images.amazon.com/images/P/0060934417.0...,http://images.amazon.com/images/P/0060934417.0...
1168,277427,0061009059,9,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich,1995,HarperTorch,http://images.amazon.com/images/P/0061009059.0...,http://images.amazon.com/images/P/0061009059.0...,http://images.amazon.com/images/P/0061009059.0...
1174,277427,006440188X,0,The Secret Garden,Frances Hodgson Burnett,1998,HarperTrophy,http://images.amazon.com/images/P/006440188X.0...,http://images.amazon.com/images/P/006440188X.0...,http://images.amazon.com/images/P/006440188X.0...


In [18]:
pt = final_df.pivot_table(index='title',columns='user_id', values='rating')
pt.fillna(0, inplace=True)

In [19]:
pt.shape

(706, 810)

In [20]:
similarity_scores = cosine_similarity(pt)

In [21]:
similarity_scores.shape

(706, 706)

In [22]:
def recommend(book_name):
    index  = np.where(pt.index == book_name)[0][0]
    suggestions = sorted(list(enumerate(similarity_scores[index])), key=lambda x: x[1], reverse=True)[1:6]
    recommended_books = []
    for i in suggestions:
        book_title = pt.index[i[0]]
        book_data = final_df[final_df['title'] == book_title].iloc[0]
        
        recommended_books.append({
            'title': book_title,
            'author': book_data['author'],
            'image_url': book_data['Image-URL-M'],
            'year':book_data['year']
        })

    return recommended_books

In [23]:
recommend('Message in a Bottle')

[{'title': 'Nights in Rodanthe',
  'author': 'Nicholas Sparks',
  'image_url': 'http://images.amazon.com/images/P/0446531332.01.MZZZZZZZ.jpg',
  'year': np.int64(2002)},
 {'title': 'The Mulberry Tree',
  'author': 'Jude Deveraux',
  'image_url': 'http://images.amazon.com/images/P/0671014218.01.MZZZZZZZ.jpg',
  'year': np.int64(2002)},
 {'title': 'A Walk to Remember',
  'author': 'Nicholas Sparks',
  'image_url': 'http://images.amazon.com/images/P/0446608955.01.MZZZZZZZ.jpg',
  'year': np.int64(2000)},
 {'title': "River's End",
  'author': 'Nora Roberts',
  'image_url': 'http://images.amazon.com/images/P/0515127833.01.MZZZZZZZ.jpg',
  'year': np.int64(2003)},
 {'title': 'Nightmares &amp; Dreamscapes',
  'author': 'Stephen King',
  'image_url': 'http://images.amazon.com/images/P/0451180232.01.MZZZZZZZ.jpg',
  'year': np.int64(2004)}]

In [27]:
pickle.dump(pt,open('../artifacts/pt.pkl', 'wb'))
pickle.dump(similarity_scores,open('../artifacts/similarity_scores.pkl', 'wb'))
final_df.to_csv('../artifacts/final_df.csv', index=False)
popular_df.to_csv('../artifacts/popular.csv', index=False)