# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [1]:
import os
for dirname, _, filenames in os.walk('models/archive'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Handle warning messages
import warnings
warnings.filterwarnings('ignore')

models/archive\Books.csv
models/archive\classicRec.png
models/archive\DeepRec.png
models/archive\Ratings.csv
models/archive\recsys_taxonomy2.png
models/archive\Users.csv


In [4]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture

In [6]:
books = pd.read_csv("models/archive/Books.csv")
users = pd.read_csv("models/archive/Users.csv")
ratings = pd.read_csv("models/archive/Ratings.csv")
print('Books:',books.shape)
print('Users:',users.shape)
print('Ratinsg:',ratings.shape)

Books: (271360, 8)
Users: (278858, 3)
Ratinsg: (1149780, 3)


In [7]:
print('Books:',books.shape)

Books: (271360, 8)


In [8]:
books.sample(5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
28069,006018518X,"Cemetery Stories : Haunted Graveyards, Embalmi...",Katherine Ramsland,2001,Perennial Currents,http://images.amazon.com/images/P/006018518X.0...,http://images.amazon.com/images/P/006018518X.0...,http://images.amazon.com/images/P/006018518X.0...
164276,2742731059,Le pÃ?Â¨re NoÃ?Â«l est une ordure,Josiane Balasko,2000,Actes Sud,http://images.amazon.com/images/P/2742731059.0...,http://images.amazon.com/images/P/2742731059.0...,http://images.amazon.com/images/P/2742731059.0...
63915,0312275056,Creative Stars : Using Astrology to Tap Your Muse,Trish MacGregor,2002,St. Martin's Griffin,http://images.amazon.com/images/P/0312275056.0...,http://images.amazon.com/images/P/0312275056.0...,http://images.amazon.com/images/P/0312275056.0...
215636,188438904X,The Sewing Connection Series V,Shirley Adams,1992,Shirley Adams Pubns,http://images.amazon.com/images/P/188438904X.0...,http://images.amazon.com/images/P/188438904X.0...,http://images.amazon.com/images/P/188438904X.0...
194706,0898798574,Writing for Young Adults,Sherry Garland,1998,Writer's Digest Books,http://images.amazon.com/images/P/0898798574.0...,http://images.amazon.com/images/P/0898798574.0...,http://images.amazon.com/images/P/0898798574.0...


In [9]:
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [10]:
books.rename(columns={'Book-Title':'Title','Book-Author':'Author','Year-Of-Publication':'Publication_Year'},inplace=True)

In [11]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   ISBN              271360 non-null  object
 1   Title             271360 non-null  object
 2   Author            271358 non-null  object
 3   Publication_Year  271360 non-null  object
 4   Publisher         271358 non-null  object
 5   Image-URL-S       271360 non-null  object
 6   Image-URL-M       271360 non-null  object
 7   Image-URL-L       271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [12]:
books.isnull().sum()

ISBN                0
Title               0
Author              2
Publication_Year    0
Publisher           2
Image-URL-S         0
Image-URL-M         0
Image-URL-L         3
dtype: int64

In [13]:
books.duplicated().sum()

0

In [14]:
books.shape

(271360, 8)

In [15]:
print('Users:',users.shape)

Users: (278858, 3)


In [16]:
users.rename(columns={'User-ID':'User_ID'},inplace=True)

In [17]:
ratings.rename(columns={'User-ID':'User_ID','Book-Rating':'Rating'},inplace=True)

In [18]:
ratings_with_books = ratings.merge(books,on='ISBN')
ratings_with_books.shape

(1031136, 10)

In [19]:
num = ratings_with_books.groupby('Title')['Rating'].count().reset_index().rename(columns={'Rating':'Number_of_Ratings'})
num

Unnamed: 0,Title,Number_of_Ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


In [20]:
avg = ratings_with_books.groupby('Title')['Rating'].mean().reset_index().rename(columns={'Rating':'Average_Rating'})
avg

Unnamed: 0,Title,Average_Rating
0,A Light in the Storm: The Civil War Diary of ...,2.250000
1,Always Have Popsicles,0.000000
2,Apple Magic (The Collector's series),0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,0.000000
...,...,...
241066,Ã?Â?lpiraten.,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,5.250000
241068,Ã?Â?sterlich leben.,7.000000
241069,Ã?Â?stlich der Berge.,2.666667


In [21]:
Popularity_df = num.merge(avg,on='Title')
Popularity_df.sample(5)

Unnamed: 0,Title,Number_of_Ratings,Average_Rating
194726,The Last Sunday,1,0.0
212696,The Waltz of Hearts (Magna Library Series),1,5.0
116758,"Marriage Campaign (Presents , No 1960)",7,0.0
145329,Qpb Book of Irish Literature,1,0.0
97871,Just Enough Greek (Just Enough),1,5.0


In [22]:
Popularity_df = Popularity_df[Popularity_df['Number_of_Ratings']>250].sort_values('Average_Rating',ascending=False).head(50)
Popularity_df.head(5)

Unnamed: 0,Title,Number_of_Ratings,Average_Rating
80434,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
80422,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
80441,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741
80426,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441
80414,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453


In [23]:
books1 = books.drop_duplicates('Title')

In [24]:
df = Popularity_df.merge(books1,on='Title')

In [25]:
Popularity_Final = df[['ISBN','Title','Author','Publication_Year','Publisher','Image-URL-L','Number_of_Ratings', 'Average_Rating']]

In [26]:
aGoodBook = 50
aGoodUser = 200

In [27]:
x = ratings_with_books.groupby('User_ID')['Rating'].count()>aGoodUser
good_users_index = x[x].index
filtered_ratings = ratings_with_books[ratings_with_books['User_ID'].isin(good_users_index)]
filtered_ratings.shape

(474007, 10)

In [28]:
y = filtered_ratings.groupby('Title')['Rating'].count()>aGoodBook
good_book_index = y[y].index
filtered_ratings_books = filtered_ratings[filtered_ratings['Title'].isin(good_book_index)]
filtered_ratings_books.shape

(57236, 10)

In [29]:
pt = filtered_ratings_books.pivot_table(index='Title',columns='User_ID',values='Rating')

In [30]:
pt.fillna(0,inplace=True)

In [31]:
final_dataset = pt.copy()

In [32]:
final_dataset_sparse = csr_matrix(final_dataset)

In [33]:
nn_model = NearestNeighbors(metric='cosine',algorithm='brute')
nn_model.fit(final_dataset_sparse)

In [34]:
kmeans_model = KMeans(n_clusters=3,init="k-means++",random_state=12)
km = kmeans_model.fit(final_dataset)
km_pred = km.predict(final_dataset)

silhouette_score(final_dataset,km_pred)

0.0402502781169519

In [35]:
gm_model = GaussianMixture(n_components=3,random_state=12)
gm = gm_model.fit(final_dataset)
gm_pred = km.predict(final_dataset)

silhouette_score(final_dataset,gm_pred)

0.0402502781169519

In [36]:
def recommend_nn(book_name):
    dist , sugg = nn_model.kneighbors(final_dataset[final_dataset.index == book_name],n_neighbors=6)
    print('Book Recommendation for',book_name,'are:')
    for i in range(len(sugg[0])):
        if i!=0:
            print(i,final_dataset.index[sugg[0][i]])

In [37]:
recommend_nn('Message in a Bottle')

Book Recommendation for Message in a Bottle are:
1 Nights in Rodanthe
2 The Mulberry Tree
3 A Walk to Remember
4 River's End
5 Nightmares &amp; Dreamscapes


In [38]:
import joblib

# Save the NearestNeighbors model
joblib.dump(nn_model, 'nn_model.pkl')

# Save the pivot table (final dataset)
final_dataset.to_pickle('final_dataset.pkl')
