In [1]:
import pandas as pd 
import numpy as np 
import pickle 

In [2]:
books = pd.read_csv('../../data/processed/current_books.csv')
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Description,Categories,Image
0,0316109215,Arthur's New Puppy (Arthur Adventure Series),Marc Brown,1995.0,"Little, Brown Young Readers",Arthur is overjoyed when he brings home his ne...,Puppies,https://covers.openlibrary.org/b/id/188015-M.jpg
1,7513305943,Ge zi xiang yao xiao gou gou!,Mo Willems,2012.0,Xin xing chu ban she =,"He really, really, REALLY wants one. He'll tak...",Puppies,https:https://archive.org/download/gezixiangya...
2,0525463631,Puppy,Discovery Kids,2000.0,Dutton Children's Books,"Watch a tiny, newborn puppy grow into a playfu...",Puppies,https://covers.openlibrary.org/b/id/6554907-M.jpg
3,1471144607,Dork Diaries Puppy Love,Rachel Renée Russell,2016.0,Simon & Schuster,Nikki meets adorable puppies with her crush Br...,Puppies,https://covers.openlibrary.org/b/id/15053179-M...
4,141694737X,Dear Zoo,Rod Campbell,2007.0,Little Simon,Each animal arriving from the zoo as a possibl...,Puppies,https://covers.openlibrary.org/b/id/10710152-M...


In [3]:
ratings = pd.read_csv('../../data/processed/current_reviews.csv')
ratings.head()

Unnamed: 0,ISBN,User_id,rating,review
0,015201294X,A2L8RR2B6HO24F,4.0,Hello! In am reading this story for a 6th grad...
1,015201294X,A3U3RS1HYT8BK,5.0,"Okay, we can forgive Mrs.Rinaldi for giving Em..."
2,015201294X,AJ9RZKCNVA7DK,5.0,This is one of my favorite books by Anne Rinal...
3,015201294X,AAYH3NGH1TCUT,5.0,You've heard about the body snatching. Gross r...
4,015201294X,A20A1RL7J10Y1Y,5.0,"It's the end of Civil War, 1865 in Washington ..."


In [4]:
ratings["rating"].value_counts()

rating
5.0    51618
4.0    16427
3.0     6816
1.0     5507
2.0     4196
Name: count, dtype: int64

In [5]:
df = books.merge(ratings , on = "ISBN" , how = "left")
df_for_CF  = df[['ISBN' , 'User_id' , 'rating']]
df = df[['ISBN','rating']]
df['rating'] = df['rating'].fillna(0)
df

Unnamed: 0,ISBN,rating
0,0316109215,0.0
1,7513305943,0.0
2,0525463631,0.0
3,1471144607,0.0
4,141694737X,0.0
...,...,...
87605,0385680910,0.0
87606,067168339X,0.0
87607,0545504929,0.0
87608,0689116152,0.0


### Popularity Based Recommender System


In [6]:
df['rating'].value_counts()

rating
5.0    52268
4.0    16571
3.0     6872
1.0     5541
2.0     4218
0.0     2140
Name: count, dtype: int64

In [7]:
named_aggs = {
    'avg_rating'  : ('rating', 'mean'),
    'num_ratings' : ('rating', 'count')
}
df = df.groupby('ISBN').agg(**named_aggs).reset_index()
df 

Unnamed: 0,ISBN,avg_rating,num_ratings
0,0001052888,4.555556,9
1,0001515195,5.000000,2
2,0001604112,0.000000,2
3,0002155400,4.493450,916
4,0002174960,3.818182,11
...,...,...,...
4995,B000U2H6Q4,4.708333,24
4996,B000U2H7SG,4.680672,119
4997,B000U2I21C,3.500000,4
4998,B000U2M5HO,4.015385,130


In [8]:
df ['num_ratings'].value_counts()

num_ratings
2      1587
1      1158
3       382
4       256
5       186
       ... 
110       1
240       1
88        1
644       1
946       1
Name: count, Length: 194, dtype: int64

In [9]:
most_popular_books = df [df['num_ratings'] >= 50]
most_popular_books.reset_index(drop=True , inplace=True)
most_popular_books

Unnamed: 0,ISBN,avg_rating,num_ratings
0,0002155400,4.493450,916
1,0007140029,4.773300,397
2,0028642643,4.096386,83
3,0060012234,4.382022,356
4,0060513659,4.324074,108
...,...,...,...
261,B000QEARDU,3.782946,129
262,B000TNGU5M,4.642706,946
263,B000U2H682,4.185185,54
264,B000U2H7SG,4.680672,119


In [10]:
most_popular_books = most_popular_books.sort_values(by="avg_rating" , ascending=False).reset_index(drop=True)
most_popular_books

Unnamed: 0,ISBN,avg_rating,num_ratings
0,0944092691,4.916667,240
1,0072936533,4.825000,80
2,B0006A011M,4.803571,56
3,0007140029,4.773300,397
4,0764229893,4.749206,315
...,...,...,...
261,B000MV8HLQ,2.557143,70
262,B000PKPEWY,2.148649,74
263,B000O2VAPI,2.148649,74
264,B000KJPK10,2.148649,74


In [30]:
pickle.dump(most_popular_books, open('../../models/artifacts/most_popular', 'wb'))

## Collaborative Filtering Based Recommender System

In [14]:
df =  books.merge(ratings , on = "ISBN" , how = "inner")

In [16]:
df = df[['ISBN' , 'User_id' , 'rating']]

In [17]:
df.head()

Unnamed: 0,ISBN,User_id,rating
0,140449701,AN2MKV62BREXE,4.0
1,140449701,AN2MKV62BREXE,4.0
2,140449701,A1O6XH64AWLR6I,5.0
3,140449701,A1O6XH64AWLR6I,5.0
4,140449701,A3I8FT707XKQAJ,3.0


In [19]:
pt = df.pivot_table(index='User_id', columns='ISBN' , values='rating' , fill_value=0)
pt

ISBN,0001052888,0001515195,0002155400,0002174960,0002261359,0005060982,0006514413,0007101929,0007122993,0007123469,...,B000TZ74ZK,B000TZ8E36,B000U2H682,B000U2H696,B000U2H6D2,B000U2H6Q4,B000U2H7SG,B000U2I21C,B000U2M5HO,B000U2M5JM
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00540411RKGTDNU543WS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00878773S2MNB00COHKV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00940571GAOITYS675AR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A01038432MVI9JXYTTK5T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0147018299H7B1MXYFW1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZZK8T7V8NQ9N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZZMLMIWDT2H0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZZMO52V8WZ68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZZVZL4QEHEHO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
from scipy.sparse import csr_matrix
pt_sparse = csr_matrix(pt.values)
pt_sparse.shape

(50262, 3884)

In [21]:
pt.columns.get_loc("0001052888")

0

### A) User based similarity

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
user_similarity = cosine_similarity(pt_sparse, dense_output=False)
import numpy as np

def user_based_recommend(
    user_id,
    pt,
    pt_sparse,
    user_similarity,
    top_n_items=10
):

    user_idx = pt.index.get_loc(user_id)

    sim_vector = user_similarity[user_idx].toarray().flatten()

    sim_vector[user_idx] = 0

    scores = sim_vector @ pt_sparse.toarray()

    user_ratings = pt_sparse[user_idx].toarray().flatten()
    scores[user_ratings > 0] = 0

    top_items_idx = np.argsort(scores)[-top_n_items:][::-1]

    return pt.columns[top_items_idx]


In [24]:
print(user_similarity[pt.index.get_loc("A00540411RKGTDNU543WS")])

  (0, 50257)	1.0
  (0, 50241)	0.8944271909999159
  (0, 50239)	1.0
  (0, 50232)	1.0
  (0, 50220)	1.0
  (0, 50211)	1.0
  (0, 50203)	1.0
  (0, 50191)	1.0
  (0, 50187)	1.0
  (0, 50178)	1.0
  (0, 50165)	1.0
  (0, 50160)	1.0
  (0, 50138)	1.0
  (0, 50133)	1.0
  (0, 50090)	1.0
  (0, 50007)	1.0
  (0, 50002)	1.0
  (0, 49996)	1.0
  (0, 49995)	1.0
  (0, 49994)	1.0
  (0, 49981)	1.0
  (0, 49943)	1.0
  (0, 49942)	1.0
  (0, 49936)	1.0
  (0, 49925)	1.0
  :	:
  (0, 192)	1.0
  (0, 180)	0.5144957554275265
  (0, 176)	1.0
  (0, 136)	1.0
  (0, 133)	1.0
  (0, 121)	1.0
  (0, 109)	0.5521576303742327
  (0, 101)	1.0
  (0, 90)	1.0
  (0, 78)	1.0
  (0, 71)	1.0
  (0, 56)	1.0
  (0, 41)	1.0
  (0, 23)	1.0
  (0, 22)	1.0
  (0, 20)	1.0
  (0, 18)	1.0
  (0, 15)	1.0
  (0, 11)	1.0
  (0, 10)	1.0
  (0, 9)	1.0
  (0, 8)	1.0
  (0, 5)	1.0
  (0, 1)	1.0
  (0, 0)	1.0


In [25]:
user_based_recommend(
    user_id='A00540411RKGTDNU543WS',
    pt=pt,
    pt_sparse=pt_sparse,
    user_similarity=user_similarity
)

Index(['1556909330', 'B000J521DU', 'B0007F0LD4', 'B0007GZPJI', 'B0008CVEW8',
       '0786197005', '0140860096', 'B00005BC14', 'B000MZWXNA', '0743528557'],
      dtype='object', name='ISBN')

### B) item based similarity 

In [26]:
item_similarity = cosine_similarity(pt_sparse.T, dense_output=False)

def item_based_recommend(
    ISBN,
    pt_sparse,
    pt,
    item_similarity,
    top_n_items=10
):
    # index of the book
    item_idx = pt.columns.get_loc(ISBN)

    # book vector (1 × n_items)
    item_vector = item_similarity[item_idx]
         
    # similarity scores
    scores = item_vector.toarray().flatten()

    # remove the book itself
    scores[item_idx] = 0

    # top-N similar books
    top_items_idx = np.argsort(scores)[-top_n_items:][::-1]

    return pt.columns[top_items_idx]


In [27]:
item_based_recommend("0001052888" , pt_sparse , pt , item_similarity)

Index(['0874831431', 'B000PDESPU', 'B000NP2BO0', 'B000NVRSAG', '0679762280',
       '0754005801', '0754054055', '0896214869', '0786192178', '158926018X'],
      dtype='object', name='ISBN')

In [28]:
ISBNS = item_based_recommend("0001052888" , pt_sparse , pt , item_similarity)
books[books["ISBN"].isin(ISBNS)].drop_duplicates(subset=["ISBN"])

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Description,Categories,Image
1714,B000PDESPU,The Listening Eye,['Patricia Wentworth'],2011.0,Open Road Media,A deaf woman learns some dangerous information...,['Fiction'],http://books.google.com/books/content?id=lv7UM...
1742,0679762280,Rainbow's End,['Lauren St John'],2008.0,Simon and Schuster,Traces the author's coming-of-age in civil-war...,['Biography & Autobiography'],http://books.google.com/books/content?id=nFM8v...
3323,B000NP2BO0,The Portable Thoreau,['Henry David Thoreau'],2012.0,Penguin,An updated edition of Thoreau's most widely re...,['Literary Collections'],http://books.google.com/books/content?id=JpJPE...
3324,B000NVRSAG,The Portable Thoreau,['Henry David Thoreau'],2012.0,Penguin,An updated edition of Thoreau's most widely re...,['Literary Collections'],http://books.google.com/books/content?id=JpJPE...
3404,0896214869,Anodyne Necklace,['Martha Grimes'],2013.0,Simon and Schuster,The third in the bestselling Richard Jury myst...,['Fiction'],http://books.google.com/books/content?id=9--1t...
3701,0874831431,Walking Trees (American Storytelling),['Roberta Simpson Brown'],1991.0,august house,"Twenty-one short contemporary scary stories, i...",['Juvenile Fiction'],http://books.google.com/books/content?id=Fncrx...
4103,0786192178,"Jerusalem Interlude (Zion Covenant, Book 4)","['Bodie Thoene', 'Brock Thoene']",2000.0,Bethany House Pub,As Central Europe is served up on a platter to...,['Fiction'],http://books.google.com/books/content?id=yjPOJ...
4104,158926018X,"Jerusalem Interlude (Zion Covenant, Book 4)","['Bodie Thoene', 'Brock Thoene']",2000.0,Bethany House Pub,As Central Europe is served up on a platter to...,['Fiction'],http://books.google.com/books/content?id=yjPOJ...
4692,0754054055,With All Despatch,['Alexander Kent'],2014.0,Random House,More action from the master storyteller of the...,['Admirals'],http://books.google.com/books/content?id=VQDqA...
4693,0754005801,With All Despatch,['Alexander Kent'],2014.0,Random House,More action from the master storyteller of the...,['Admirals'],http://books.google.com/books/content?id=VQDqA...


In [31]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm='brute' , n_jobs=-1)

In [32]:
model.fit(pt_sparse.T)

In [33]:
pt.T

User_id,A00540411RKGTDNU543WS,A00878773S2MNB00COHKV,A00940571GAOITYS675AR,A01038432MVI9JXYTTK5T,A0147018299H7B1MXYFW1,A01487226AT9QN55VGVV,A02355172J55BBI6MHY9G,A02475631NYWMHYV2A7M,A02660181QI9HHAVFK06O,A03816223LL3Q1P48HRU,...,AZZ14AIF8UYOM,AZZ1E6SG4RAT7,AZZB9LP44Z1MX,AZZCSTUZ5EO4I,AZZJLZHL0VQI6,AZZK8T7V8NQ9N,AZZMLMIWDT2H0,AZZMO52V8WZ68,AZZVZL4QEHEHO,AZZWUSPT2FJYE
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001052888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0001515195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0002155400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0002174960,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0002261359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B000U2H6Q4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B000U2H7SG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B000U2I21C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B000U2M5HO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
pt.T.index[0]

'0006499155'

In [34]:
distance , suggestions = model.kneighbors(pt.T.values[0].reshape(1, -1) , n_neighbors = 7)

In [35]:
distance

array([[ 0.        , 13.78404875, 13.78404875, 13.78404875, 13.78404875,
        13.78404875, 13.78404875]])

In [36]:
pt.T.index[121]

'0132366134'

In [37]:
suggestions

array([[  0, 368, 671, 643, 607, 252, 744]], dtype=int64)

In [38]:
pt.T.index[suggestions.flatten()]

Index(['0001052888', '0373692102', '052138334X', '0517118319', '0471998826',
       '0275958760', '0590922424'],
      dtype='object', name='ISBN')

In [39]:
def recommend_CF_model_based(ISBN , pt_sparse , pt , top_k = 5) :
    book_index = pt.columns.get_loc(ISBN)
    distance , suggestions = model.kneighbors(pt.T.values[book_index].reshape(1, -1) , n_neighbors= top_k + 2)
    return pt.T.index[suggestions.flatten()]

In [40]:
pickle.dump(item_similarity, open('../../models/artifacts/item_similarity', 'wb'))
pickle.dump(user_similarity, open('../../models/artifacts/user_similarity', 'wb'))
pickle.dump(pt , open('../../models/artifacts/pt', 'wb'))
pickle.dump(pt_sparse , open('../../models/artifacts/pt_sparse', 'wb'))