In [6]:
import pandas as pd
import numpy as np
import math
import warnings

pd.options.display.max_rows = 999
warnings.filterwarnings("ignore")

## Books Data set

In [7]:
books = pd.read_csv('books_data.csv')
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212404 entries, 0 to 212403
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Title          212403 non-null  object 
 1   description    143962 non-null  object 
 2   authors        180991 non-null  object 
 3   image          160329 non-null  object 
 4   previewLink    188568 non-null  object 
 5   publisher      136518 non-null  object 
 6   publishedDate  187099 non-null  object 
 7   infoLink       188568 non-null  object 
 8   categories     171205 non-null  object 
 9   ratingsCount   49752 non-null   float64
dtypes: float64(1), object(9)
memory usage: 16.2+ MB


#### Features image, previewLink, publisher, publishedDate and infoLink won't be needed

In [8]:
books.drop('image', axis=1, inplace=True)
books.drop('previewLink', axis=1, inplace=True)
books.drop('infoLink', axis=1, inplace=True)
books.drop('publisher', axis=1, inplace=True)
books.drop('publishedDate', axis=1, inplace=True)

In [9]:
books.head()

Unnamed: 0,Title,description,authors,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],['Comics & Graphic Novels'],
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],['Biography & Autobiography'],
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],['Religion'],
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],['Fiction'],
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,


#### The percentage of missing data by book feature is given below.
Conciderable amount of ratings count is missing - 76.58%

In [10]:
books.isna().sum()/books.shape[0]*100

Title            0.000471
description     32.222557
authors         14.789270
categories      19.396527
ratingsCount    76.576712
dtype: float64

In [11]:
books.isna().sum()/books.shape[0]*100

Title            0.000471
description     32.222557
authors         14.789270
categories      19.396527
ratingsCount    76.576712
dtype: float64

In [12]:
books = books.drop(books[books.Title.isna()].index)

#### Transforming Authors and Categories features
For example\
\['Julie Strain'\] - juliestrain\
\['Allen Gersho', 'Robert M. Gray'\] - allengersho robertm.gray\
\['Comics & Graphic Novels'\] - comics&graphicnovels

In [13]:
def transform(column):
    try:
        if isinstance(column, float):
            raise TypeError
        column = [x.strip('[\'\"]') for x in column.split(', ')]
        def clean_text(author):
            names = author.split(' ')
            cleaned_names = [name.lower() for name in names]
            return ''.join(cleaned_names)
        concatenated_text = ' '.join(clean_text(obj) for obj in column)
        return concatenated_text
    except TypeError:
        pass

In [14]:
books['authors'].head()

0       ['Julie Strain']
1         ['Philip Nel']
2       ['David R. Ray']
3    ['Veronica Haddon']
4        ['Edward Long']
Name: authors, dtype: object

In [15]:
books['authors'] = books['authors'].apply(transform)

In [16]:
books.authors.head()

0       juliestrain
1         philipnel
2        davidr.ray
3    veronicahaddon
4        edwardlong
Name: authors, dtype: object

In [17]:
books['categories'].head()

0      ['Comics & Graphic Novels']
1    ['Biography & Autobiography']
2                     ['Religion']
3                      ['Fiction']
4                              NaN
Name: categories, dtype: object

In [18]:
books['categories'] = books['categories'].apply(transform)

In [19]:
books['categories'].head()

0       comics&graphicnovels
1    biography&autobiography
2                   religion
3                    fiction
4                       None
Name: categories, dtype: object

In [20]:
books.head()

Unnamed: 0,Title,description,authors,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,juliestrain,comics&graphicnovels,
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,philipnel,biography&autobiography,
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,davidr.ray,religion,
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,veronicahaddon,fiction,
4,"Nation Dance: Religion, Identity and Cultural ...",,edwardlong,,


#### Taking into account only books with high ratings count (100+)

In [21]:
popular_books = books[books['ratingsCount'] >= 100]
popular_books.reset_index(drop=True, inplace = True)

#### Renaming columns

In [22]:
popular_books.rename(columns=
                    {'Title':'title',
                    'ratingsCount' : 'ratings_count' },inplace=True)

## Ratings Data set

In [23]:
ratings = pd.read_csv('Books_rating.csv')
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 10 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Id                  object 
 1   Title               object 
 2   Price               float64
 3   User_id             object 
 4   profileName         object 
 5   review/helpfulness  object 
 6   review/score        float64
 7   review/time         int64  
 8   review/summary      object 
 9   review/text         object 
dtypes: float64(2), int64(1), object(7)
memory usage: 228.9+ MB


#### Features Price, profileName, review/time and review/summary won't be needed

In [24]:
ratings.drop(columns=['Price', 'profileName', 'review/time', 'review/summary'], inplace=True)

#### Records without Title won't be useful

In [25]:
ratings.drop(ratings[ratings['Title'].isna()].index, inplace=True)

#### Renaming columns

In [26]:
ratings.rename(columns=
               {'Id':'book_id', 
                'Title':'title',
                'User_id':'user_id',
                'review/helpfulness' :'helpfulness',
                'review/summary' :'summary',
                'review/score':'rating',
                'review/text':'review' },inplace=True)

#### Taking into account only ratings from cosiderable users (250+ reviews)

In [27]:
x = ratings.groupby('user_id').count()['rating'] > 250

In [28]:
considerable_users = x[x].index

In [29]:
filtered_ratings = ratings[ratings['user_id'].isin(considerable_users)]

#### Transforming helpfulness

In [30]:
def transform_helpfulness(x):
    try:
        x = x.split('/')
        return round(eval(x[0]) / eval(x[1]), 2) if eval(x[1]) != 0 else None
    except:
        return None

In [31]:
filtered_ratings['helpfulness'] = filtered_ratings['helpfulness'].apply(lambda x: transform_helpfulness(x))

#### Taking into account only reviews of popular books (25+ ratings)

In [32]:
y = filtered_ratings.groupby('title').count()['rating']>=25
famous_books = y[y].index

In [33]:
final_ratings = filtered_ratings[filtered_ratings['title'].isin(famous_books)]

# Final Books and Ratings Data sets

In [34]:
popular_books.sample(5)

Unnamed: 0,title,description,authors,categories,ratings_count
519,Matthew Henry's Commentary In One Volume,The Most Popular Commentary Ever Written in a ...,matthewhenry,religion,121.0
58,Mr. Beddle had a lamb.(short story): An articl...,A serial murderer known only by a grotesquely ...,thomasharris,fiction,3135.0
194,A Walk to Remember,NOVEL LEARNING SERIES(TM) A WALK TO REMEMBER b...,nicholassparks,studyaids,3251.0
706,Collector's Encyclopedia of Electric Christmas...,Journalist Walls grew up with parents whose id...,jeannettewalls,biography&autobiography,153.0
571,Roughdrafts: The Process of Writing,From the acclaimed New York Times bestselling ...,annelamott,languagearts&disciplines,133.0


In [35]:
final_ratings.sample(5)

Unnamed: 0,book_id,title,user_id,helpfulness,rating,review
1870480,B000MWC3FQ,Atlas Shrugged,A55MRYPUAX4QU,0.69,5.0,Pundits have stated that ATLAS SHRUGGED is the...
2157076,0613371488,The Hound of the Baskervilles (Signet Classics),AHXAPVSHPJ6OJ,0.5,4.0,Reviewing a Sherlock Holmes story is like revi...
407465,B000L4056E,The Fellowship of the Ring,AQQLWCMRNDFGI,1.0,5.0,I read this way back when I was an undergradua...
29777,0140860096,Of Mice and Men (Penguin Audiobooks),A1S3C5OFU508P3,,5.0,This is the first book by Steinbeck that I eve...
859709,0394556380,The Postman Always Rings Twice,A1XTWXIMUCDGQE,1.0,4.0,"I'm usually not a big reader of crime novels, ..."


### Dimensions

In [36]:
print(f'Books dimensions:\t{popular_books.shape[0]} rows\t{popular_books.shape[1]} features')
print(f'Ratings dimensions:\t{final_ratings.shape[0]} rows\t{final_ratings.shape[1]} features')

Books dimensions:	789 rows	5 features
Ratings dimensions:	20933 rows	6 features


# Recommender System - Content-based

In [149]:
from surprise import Dataset
from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise import accuracy

from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [40]:
data = popular_books[['title','description','ratings_count']]

In [43]:
print(data.isnull().sum())

title            0
description      0
ratings_count    0
dtype: int64


In [42]:
data = data.dropna()

In [72]:
print(f'There are {data.shape[0]} books in the dataset.')

There are 756 books in the dataset.


In [104]:
tfidf = text.TfidfVectorizer(input='content', stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['description'])


In [105]:
print(f'The shape of term-frequency inverse-document-frequency matrix is {tfidf_matrix.shape}.')
print(f'There are {tfidf_matrix.shape[0]} books, described by {tfidf_matrix.shape[1]} terms.')

The shape of term-frequency inverse-document-frequency matrix is (756, 10704).
There are 756 books, described by 10704 terms.


In [106]:
similarity = linear_kernel(tfidf_matrix, tfidf_matrix)

In [107]:
indices = pd.Series(data.index, index=data['title'])

In [112]:
def get_book_recommendations(title, similarity = similarity):
    index = indices[title]
    similarity_scores = list(enumerate(similarity[index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[0:5]
    bookindices = [i[0] for i in similarity_scores]
    return data['title'].iloc[bookindices]

print(get_book_recommendations("The Hobbit"))

67                                      Esperanza Rising
100    Where Am I Hiding? / Donde Me Escondo? (Un Bue...
428    CliffsNotes on Cisnero's The House on Mango St...
277                                   The Book of Wonder
338    Si te dicen que cai (Biblioteca breve ; 398 : ...
Name: title, dtype: object


# User-based Collaborative Filtering

In [114]:
data = final_ratings[['user_id','book_id','rating']]

In [116]:
data.shape

(20933, 3)

In [118]:
unique_users = data['user_id'].unique()
unique_books = data['book_id'].unique()

In [120]:
print(f'Unique users: {unique_users.size}')
print(f'Unique books: {unique_books.size}')

Unique users: 216
Unique books: 842


In [125]:
matrix = pd.DataFrame(unique_users)
matrix.columns = ['user_id']

In [126]:
for book in unique_books:
    matrix[book] = None

In [132]:
for j, user in enumerate(matrix['user_id'].values):
    for i in range(data[data['user_id'] == user].shape[0]):
        product_id = data[data['user_id'] == user]['book_id'].iloc[i]
        rating = data[data['user_id'] == user]['rating'].iloc[i]
        matrix[product_id][j] = rating

In [133]:
matrix.shape

(216, 843)

In [155]:
user = pd.DataFrame(matrix.iloc[104]) # random user
user_id = user.loc['user_id'].values
user = user.drop(['user_id'])
user[user.notnull().values].size

30

In [146]:
similar_users = matrix[matrix[user[user.notnull().values].index[0]].notnull()]

In [147]:
similar_users.head()

Unnamed: 0,user_id,B000N6DDJQ,B000J5KSK8,B0000CKD7E,B000PBZH5M,B0006AONEI,1588550311,B000PBZH6Q,158855032X,9562910334,...,B000L9Z1WW,B000PJWRH0,B000NW46MI,B000FML2C8,B000MTRTTE,B000Q032UY,B000JJKRKK,140004006X,B000OVMUX0,B000P91JYW
2,AWLFVCT9128JV,4.0,4.0,,4.0,,,4.0,,5.0,...,,,,,,5.0,,,3.0,5.0
4,A3IKBHODOTYYHM,4.0,,,,,,,,4.0,...,,,,4.0,,,,,,4.0
5,A3KF4IP2MUS8QQ,5.0,,,,,,,,,...,,,,,,,,,,
12,A20EEWWSFMZ1PN,3.0,5.0,,,,,,,5.0,...,,5.0,,5.0,,5.0,,,,
13,A96K1ZGW56S2I,5.0,,,,,,,,,...,,,,3.0,,,,,,


In [148]:
similar_users = similar_users.replace([None],0)
similar_users = similar_users.set_index('user_id')
similar_users.head()

Unnamed: 0_level_0,B000N6DDJQ,B000J5KSK8,B0000CKD7E,B000PBZH5M,B0006AONEI,1588550311,B000PBZH6Q,158855032X,9562910334,B000TZ19TC,...,B000L9Z1WW,B000PJWRH0,B000NW46MI,B000FML2C8,B000MTRTTE,B000Q032UY,B000JJKRKK,140004006X,B000OVMUX0,B000P91JYW
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AWLFVCT9128JV,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,3.0,5.0
A3IKBHODOTYYHM,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0
A3KF4IP2MUS8QQ,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A20EEWWSFMZ1PN,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0,...,0.0,5.0,0.0,5.0,0.0,5.0,0.0,0.0,0.0,0.0
A96K1ZGW56S2I,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [156]:
similar_users['cos_similarity'] = 0.0
for user in similar_users.index:
    cos = cosine_similarity(similar_users.loc[user_id].values.reshape(1, -1), similar_users.loc[user].values.reshape(1, -1))
    similar_users['cos_similarity'][user] = cos 

In [159]:
similar_users['cos_similarity'].sort_values(ascending=False)

user_id
A2RPIGO3G4M6GN    1.000000
A3NIQK6ZLYEP1L    0.316150
A2F3M93RRLFQNJ    0.278061
A2CR57GAJKNWVV    0.276686
A1XTWXIMUCDGQE    0.274022
A3QVAKVRAH657N    0.236385
A30KEXFT9SILL6    0.215502
A2B9Y0WXNSN17U    0.203007
A32ZKBXJJ45BRY    0.188052
A17FLA8HQOFVIG    0.187384
A3R19YKNL641X3    0.186993
A1LVZOK9F7K4CN    0.185526
A96K1ZGW56S2I     0.169151
AWLFVCT9128JV     0.161931
AU6DIIDZK2OQM     0.161303
A3OH101U0CPUC7    0.160715
A370Z6I5GBWU44    0.156761
A3KF4IP2MUS8QQ    0.149478
A1X8VZWTOG8IS6    0.144562
A2E3GFHUDNPYDH    0.144086
A2S166WSCFIFP5    0.139463
A3OJFPKMCXKOM0    0.134318
A29NUB3P6YIWZG    0.128015
A1EKTLUL24HDG8    0.123467
A20EEWWSFMZ1PN    0.113986
A12A08OL0TZY0W    0.106900
AFYYHRPSFBLWS     0.105773
A1DYXCF4148PJT    0.092632
A1CHM200OEN65X    0.092435
A2EENLV6OQ3DYM    0.091516
AUTBHG6070SL4     0.090149
A1IU7S4HCK1XK0    0.089583
A2NJO6YE954DBH    0.088918
A2NTJUI2DLZF4R    0.088656
A32VWSQ0FPZKZ3    0.075567
A36K2N527TXXJN    0.070635
A1S3C5OFU508P3    0.