In [1]:
# Import all the necessary dependencies
import os
import numpy as np
import scipy as sp
import pandas as pd
import scipy.sparse
from mlxtend.frequent_patterns import apriori
import hashlib # for grading purposes

# You will be working with data from an Online Book Store. 

### Every time a customer buys a book, the customer can rate the book and the Book Store uses that data to create recommendations to future customers.

### In this exercise you will have the opportunity to help the Book Store team to choose which books to display in different areas of the website.

## Task 1: Understanding the data

Data is available in the `./data/` folder. In this folder you will find 2 files:

* `BookRatings.csv` has the historical ratings given by the customers and represents all the books sold. 
* `BooksInfo.csv`: has the information about the main genre of the book. 

In [2]:
ratings = pd.read_csv('data/BookRatings.csv')
books_info = pd.read_csv('data/BooksInfo.csv')

Look at the raw files and print out the first rows of each file.

In [3]:
#BookRatings
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,1155,451402103,8
1,1155,671689231,7
2,1155,671032658,9
3,1155,671701053,5
4,1155,451409256,9


In [4]:
#BooksInfo
books_info.head()

Unnamed: 0,ISBN,Genre
0,195153448,Social Science
1,2005018,Actresses
2,60973129,1940-1949
3,374157065,Medical
4,393045218,Design


### Task 1.1 EDA (ungraded)
- check for Ratings with incomplete data, 
- check for the duplicate records in ratings 
- check for books without Genre

In [5]:
### Your answer
pd.unique(ratings['User-ID'])

array([  1155,   1211,   1248, ..., 278633, 278843, 278851])

In [6]:
#duplicate records in ratings
ratings[ratings.duplicated()]

Unnamed: 0,User-ID,ISBN,Book-Rating


In [7]:
#books without Genre
books_info[books_info['Genre'].isna()]

Unnamed: 0,ISBN,Genre
27,3404921038,
28,3442353866,
46,0961769947,
117,8420639133,
129,3257212054,
...,...,...
112326,3423200944,
112328,3548740146,
112330,3257217323,
112337,1845170423,


## Task 2: Rating Matrix

### Task 2.1: Create the ratings matrix

In [8]:
def make_ratings(data):
    """
    Parameters
        data - the ratings dataframe with ratings per ISBN and User-ID
        
    Returns:
        R - (numpy.ndarray) Ratings matrix with the User-ID, ISBN and Book-Rating
        hint: don't forget to put zeros in places where you do not have ratings
    
    Extra Hint: Your input is a pandas DataFrame but you want to output an array (use .to_numpy)!
    """

    # YOUR CODE HERE

    matrix_dataframe = pd.DataFrame(data).pivot(index='User-ID', columns='ISBN', 
    values='Book-Rating').fillna(0)
    data_matrix = matrix_dataframe.to_numpy()
    return data_matrix

R = make_ratings(ratings)

f"We have {R.shape[0]} users and {R.shape[1]} items."

'We have 5693 users and 47711 items.'

In [9]:
R

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
expected_hash = '226ef8abe773e3aceec1c057383c1628959c25882846e686412ef7e1ff96873d'
assert hashlib.sha256(str(R.shape).encode()).hexdigest() == expected_hash

expected_hash_1 = '0729c13ebd725201c1445a00c825237d305ff650cd72f50e45259bd942a75ef4'
assert hashlib.sha256(str(R[0].sum()).encode()).hexdigest() == expected_hash_1

expected_hash_2 = 'f1e42019aecc858ffbcca7fddec511b761b474916fde37b1a6ff321a9b459330'
assert hashlib.sha256(str(R[:,0].sum()).encode()).hexdigest() == expected_hash_2

### Task 2.2: Convert the Ratings Matrix to a Sparse Representation

In [11]:
from scipy.sparse import csr_matrix

def get_csr(matrix):
    """
    Parameters
        matrix - The Ratings Matrix.
    
    Returns
        H - The Compressed Sparse Row Matrix
        
    """
    H = csr_matrix(matrix)
    return H
    
sparse_mat = get_csr(R)

In [12]:
expected_hash = '3068469d4140f3f5fd47d88d14718db567a2ed03bf28240202061d61ea56147c'
assert hashlib.sha256(str(sparse_mat).encode()).hexdigest() == expected_hash

### Task 2.3: Calculate the density score of the matrix

In [13]:
def get_density_score(matrix):
    """
    Parameters
        matrix - Ratings Matrix
        
    Returns:
        dense_score - (float) Density Score of Orig Matrix. 
    """
    # YOUR CODE HERE
    R = matrix
    dense_score = R[R.nonzero()].size / R.size
    return dense_score
    
dense_score = get_density_score(R)
f"The Density Score is {dense_score}."

'The Density Score is 0.0004009664679853458.'

In [14]:
np.testing.assert_almost_equal(dense_score,0.0004,4)

## Task 3: Non-personalized Recommendations

### Task 3.0: Merge the 2 datasets (rating and books_info)

Merge the dataframes `ratings` and `books_info` in order to have information about the genre of each book. Include only the books that have a rating.

Hint | You might need to use the function <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html">merge()</a>  and explore the parameter 'how'.

In [15]:
def get_book_ratings_df(ratings_, books_info_):
    """
    Parameters
        ratings_ - DataFrame
        books_info_ - DataFrame
        
    Returns:
        book_ratings - DataFrame
    """
    
    # YOUR CODE HERE
    book_ratings = ratings_.merge(books_info_, how='inner', on='ISBN')
    
    return book_ratings

book_ratings = get_book_ratings_df(ratings, books_info)
book_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Genre
0,1155,451402103,8,True Crime
1,1155,671689231,7,Biography & Autobiography
2,70052,671689231,7,Biography & Autobiography
3,98904,671689231,6,Biography & Autobiography
4,110029,671689231,8,Biography & Autobiography


In [16]:
book_ratings.isnull().sum()

User-ID           0
ISBN              0
Book-Rating       0
Genre          6529
dtype: int64

In [17]:
expected_hash = 'c1d3dbf9ef7fb86036e5c933ff8de7a66d67b7dd25508764451e3ac8c300f110'
assert hashlib.sha256(str(book_ratings.shape).encode()).hexdigest() == expected_hash

expected_hash_1 = '3c4340f3a5aa8a40da4f7a2dc2f3ef4645ba099b58e986d12bd5f65b709efb20'
assert hashlib.sha256(str(book_ratings['Book-Rating'].sum()).encode()).hexdigest() == expected_hash_1

expected_hash_2 = '581cd6bccf7862e391ce07768616c8427d6cf9ddec881f6984e3cbd835379997'
assert hashlib.sha256((book_ratings[(book_ratings['ISBN']=='1558744150')&(book_ratings['User-ID']==48579)].reset_index()['Genre'][0]).encode()).hexdigest() == expected_hash_2

### Task 3.1: The most popular books in the store

The Book Store wants to display on the website a collection of the most popular books in the store. Since we don't have information on purchases we are going to use the ratings.

Create a function that returns a list with the ISBNs of the top N most popular books in the store - the N books that received the most ratings. The values in the list should be ordered from the most popular to the least popular book.

Hint | You might find it useful to use the following functions (other similar functions are also available):

- <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html ">groupby()</a> - to group the data 
- <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.size.html">size()</a> -  to get the number of lines


In [18]:
pd.DataFrame(ratings['ISBN'].value_counts())

Unnamed: 0,ISBN
0316666343,238
0312195516,166
0971880107,151
059035342X,144
0142001740,142
...,...
1575663988,1
0804113629,1
0553096036,1
0812553276,1


In [19]:
def get_popular_books(df, n):
    """
    Parameters
        df - DataFrame
        n - Integer
        
    Returns:
        top_n_popular_books - list of the top n popular books
    """
    #sorted_df = df.groupby(by='ISBN').sum().sort_values(by=['Book-Rating'], ascending=False)
    sorted_df = df['ISBN'].value_counts()
    books_list = sorted_df.index.tolist()
    return books_list[:n]

top_5_popular_books = get_popular_books(ratings, 5)
top_5_popular_books

['0316666343', '0312195516', '0971880107', '059035342X', '0142001740']

In [20]:
expected_hash = 'ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d'
assert hashlib.sha256(str(len(top_5_popular_books)).encode()).hexdigest() == expected_hash

expected_hash_1 = 'ecf0bb677736450811308765d0a80c698603dae939c42388f4f19880fa7dc704'
assert hashlib.sha256(str(top_5_popular_books[1]).encode()).hexdigest() == expected_hash_1

expected_hash_2 = '6cf1c4943f89becc6f4a3d7013d542d14082edcb7038bc38792f2045419a556e'
assert hashlib.sha256(str(top_5_popular_books[4]).encode()).hexdigest() == expected_hash_2


### Task 3.2: The best rated books

The Book Store also wants to display on the website a collection of the books with the best ratings in the store. 

Create a function that returns the top N best rated books with more than k ratings. Use the mean rating of each book for comparison. The list should be ordered from the best to the worst rated book.

In [21]:
def get_topn_rates(data, n, k):
    """
    Parameters
        data - DataFrame with ratings
        n - Top-n books
        k - Minimum number of ratings
        
    Returns
        top_books - List of ISBNs of top-n best mean rated books.
        Only consider books with more than k ratings.
    """
    # YOUR CODE HERE
    df_rating = data[['ISBN', 'Book-Rating']].groupby(by='ISBN').mean()
    df_count = pd.DataFrame(data['ISBN'].value_counts(), index=None).reset_index()
    df_count.rename(columns={'index': 'ISBN', 'ISBN': 'Count'}, inplace=True)
    merge = df_rating.merge(df_count, on = 'ISBN')
    
    select_k = merge[merge['Count'] > k]
    sort = select_k.sort_values(by=['Book-Rating'], ascending=False)
    sort_list = sort['ISBN'].values.tolist()
    return sort_list[:n]
    
top5_rates = get_topn_rates(ratings, 5, 10)
top5_rates

['0060256656', '0439425220', '0836204387', '0836218221', '0064409422']

In [22]:
expected_hash = 'ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d'
assert hashlib.sha256(str(len(top5_rates)).encode()).hexdigest() == expected_hash

expected_hash_1 = '176e1ad48051114c46de83e1b5b55bf6bc21dbfce49a62ff352cfdef48ff6357'
assert hashlib.sha256(str(top5_rates[1]).encode()).hexdigest() == expected_hash_1

expected_hash_2 = '20865e898050bb593da47f242377658bf3653fe9931bb645e6b1bbf29440d9f0'
assert hashlib.sha256(str(top5_rates[4]).encode()).hexdigest() == expected_hash_2

### Task 3.3: Loyal customers

The Book Store wants to reward the customers that gave the most ratings on the website. 

Create a function that returns a list of the top N users that gave the most ratings. Order the list by the number of given ratings in descending order.

In [23]:
def get_loyal_customers(df, n):
    """
    Parameters
        df - DataFrame
        n - Integer
        
    Returns:
        top_n_loyal_customers - The top n loyal customers
    """
    users_df = pd.DataFrame(df['User-ID'].value_counts())

    user_df = df['User-ID'].value_counts()
    user_list = user_df.index.tolist()
    return user_list[:n]
    

top_10_loyal_customers = get_loyal_customers(ratings, 10)

In [24]:
expected_hash = '4a44dc15364204a80fe80e9039455cc1608281820fe2b24f1e5233ade6af1dd5'
assert hashlib.sha256(str(len(top_10_loyal_customers)).encode()).hexdigest() == expected_hash

expected_hash_1 = 'c182d826ceb2b42f749faf0dd41929c88dff7a57a6000e2e7d16e5229ca6640b'
assert hashlib.sha256(str(top_10_loyal_customers[1]).encode()).hexdigest() == expected_hash_1

expected_hash_2 = '4df81fd140c781b33e9adde8d4bc1dbf520c4a2748f31f5abbe04182176580c6'
assert hashlib.sha256(str(top_10_loyal_customers[7]).encode()).hexdigest() == expected_hash_2

### Task 3.4: For which genre did the users give the most ratings?

Hint | You may find the following function useful (other similar functions are also available):

- <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html">sort_values()</a> -  to sort the data by the number of ratings

In [25]:
book_ratings['Genre'].value_counts()

Fiction                           60712
Juvenile Fiction                   6878
Biography & Autobiography          3850
Humor                              1898
History                            1438
                                  ...  
Rodeo                                 1
Rock music fans                       1
Book clubs (Discussion groups)        1
Fireworks                             1
Alternative histories                 1
Name: Genre, Length: 2543, dtype: int64

In [26]:

genre_user_top_rating =  'Fiction'

In [27]:
expected_hash = 'efa9a3729d47c5c47c0c763107f82dbeb8ba63e479274b2661edf418850791fb'
assert hashlib.sha256(str(genre_user_top_rating).encode()).hexdigest() == expected_hash

### Task 3.5: The most popular books by genre

The Book Store wants to display the most popular book in each genre when the customer navigates to the genre tab.

Create a function that returns a dataframe with the most popular book of each genre, judging by the number of ratings each book received. The columns of the dataset should be `Genre`,`count` for the number of ratings of the most popular book and `ISBN`.

Hint | You might find the following function useful (other similar functions are also available):

- <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.head.html">head()</a> - to get a retricted number of lines per group

In [28]:
book_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Genre
0,1155,451402103,8,True Crime
1,1155,671689231,7,Biography & Autobiography
2,70052,671689231,7,Biography & Autobiography
3,98904,671689231,6,Biography & Autobiography
4,110029,671689231,8,Biography & Autobiography


In [29]:
def get_top1_popularity_genre(df):
    """
    Parameters
        df - merged DataFrame with book ratings and genres
        
    Returns:
        top_books_genre - DataFrame with three columns: Genre, ISBN, count
    """
    
    # YOUR CODE HERE
    top_books_genre = pd.DataFrame()
    count_dict = df['ISBN'].value_counts().to_dict()

    top_books_genre['Genre'] = df['Genre']
    top_books_genre['count'] = df['ISBN']
    top_books_genre['ISBN'] = df['ISBN']
    top_books_genre['count'] = top_books_genre['count'].map(count_dict)
    top_books_genre = top_books_genre.sort_values(by=['count'], ascending=False)

    return top_books_genre.groupby('Genre').head(1)


top_books_genre = get_top1_popularity_genre(book_ratings)
top_books_genre

Unnamed: 0,Genre,count,ISBN
5539,Fiction,238,0316666343
1571,Juvenile Fiction,144,059035342X
14674,Biography & Autobiography,90,0385484518
17492,Humor,85,0316776963
12843,FICTION,80,0842329129
...,...,...,...
77609,Compost,1,0962976806
77542,Funny animal comics,1,0448405024
77664,Ability,1,1850919186
78537,HEALTH & FITNESS,1,0874779588


In [30]:
expected_hash = '9a85c6d41062f7ba7fd7c7130eb5975156f0fd04f93d74fc27778a6726d7c1f3'
assert hashlib.sha256(str(top_books_genre[top_books_genre['Genre']=='Fiction'].reset_index().loc[:,'ISBN'][0]).encode()).hexdigest() == expected_hash

expected_hash_1 = '481b11af7b7f0cab7895d47507e7d85310dc49d4fc951117abecfbf7e23a28f2'
assert hashlib.sha256(str(top_books_genre[top_books_genre['Genre']=='poems'].reset_index().loc[:,'ISBN'][0]).encode()).hexdigest() == expected_hash_1

expected_hash_2 = '3d89b8a0dd59309c672f7a1af89ba217cf9cba6213adecf1906d4f3992a85cc9'
assert hashlib.sha256(str(top_books_genre[top_books_genre['Genre']=='Biography & Autobiography'].reset_index().loc[:,'ISBN'][0]).encode()).hexdigest() == expected_hash_2

### Task 3.6: Top 3 best average rated books by genre

The Book Store also wants to display in the genre tab the "Best books to read". 

Create a function that returns a Dataframe with the top 3 books with the highest average rating in each genre. Don't forget to display the `Genre`, `ISBN` and average `Book-Rating`. Sort the dataframe by `Genre`.

In [78]:
book_ratings.isnull().sum()

User-ID           0
ISBN              0
Book-Rating       0
Genre          6529
dtype: int64

In [98]:
def get_top3_rates_genre(df):
    
    """
    Parameters
        df - merged DataFrame with ratings and genre
              
    Returns
        books - DataFrame sorted by genre with top 3 books with the highest average rating per genre
              - columns: Genre, ISBN, Book-Rating
    """
    
    # YOUR CODE HERE
    books = pd.DataFrame()

    #books = df[['ISBN', 'Book-Rating']].groupby(by='ISBN').mean().reset_index()
    #books['Genre'] = books['ISBN']
    #genre_dict = df[['ISBN', 'Genre']].set_index('ISBN').groupby(by='Genre').head(1).to_dict()
    #genre_dict = genre_dict['Genre']
    #books['Genre'] = books['Genre'].map(genre_dict)
    rating_dict = df[['ISBN', 'Book-Rating']].groupby(by='ISBN').mean().to_dict()
    rating_dict = rating_dict['Book-Rating']

    books['ISBN'] = df['ISBN']
    books['Genre'] = df['Genre']
    books['Book-Rating'] = df['ISBN']
    books['Book-Rating'] = books['Book-Rating'].map(rating_dict)
    books = books.groupby('ISBN').head(1)
    books = books.sort_values(by=['Book-Rating'], ascending=False)
    return books.groupby('Genre').head(3)

top3_rates_genre = get_top3_rates_genre(book_ratings)
top3_rates_genre

Unnamed: 0,ISBN,Genre,Book-Rating
91129,0500282420,Architecture,10.0
54779,0696013258,Cooking,10.0
54761,0448433400,Juvenile Fiction,10.0
54770,0609802348,House & Home,10.0
54771,0671537660,"Erotic stories, American",10.0
...,...,...,...
56389,1929976011,Inside Passage,1.0
78537,0874779588,HEALTH & FITNESS,1.0
84970,067150469X,Real estate investment,1.0
67429,067187120x,Social problems,1.0


In [99]:
top3_rates_genre[top3_rates_genre['Genre']=='Fiction']

Unnamed: 0,ISBN,Genre,Book-Rating
54788,684834456,Fiction,10.0
54744,373226160,Fiction,10.0
95743,896211231,Fiction,10.0


In [100]:
expected_hash = '250302a44bedd984034e258ba47827a340db357e8553b4d85ff573d894329123'
assert hashlib.sha256(str(top3_rates_genre.shape).encode()).hexdigest() == expected_hash

expected_hash_1 = '1e56c660887ba75c099588c47bf90b565fe315821214b14f1255a73cab988ed5'
assert hashlib.sha256(str(round(top3_rates_genre['Book-Rating'].sum(),0)).encode()).hexdigest() == expected_hash_1

### Task 4: Most common groups of books

The Book Store wants to display groups of books that the users usually rate together.

Create a function that returns the N most frequent sets of M books that the users rate together for a given minimum support, ordered by support. The function should return a dataframe with two columns `support` and `itemsets`. The input of the function is the rating matrix that you created in Task 2.

In [110]:
def get_apriori_booksets(R, min_support=0.003, n=3, m=3):
    
    """
    Parameters
        R - rating matrix
        min_support - minimal support for the itemsets
        n - number of top n itemsets to return
        m - number of items in itemsets
              
    Returns
        booksets - dataframe with the top n itemsets, 
                   with columns support and itemsets,
                   ordered by support in descending order
    """

    
    # YOUR CODE HERE
    R_ = pd.DataFrame(R > 0)
    booksets = apriori(R_, min_support).sort_values(by=['support'], ascending=False)
    booksets['length'] = booksets['itemsets'].apply(lambda x: len(x))
    booksets = booksets[(booksets['length'] >= m)]
    
    return booksets[['support', 'itemsets']][:n]

get_3_booksets = get_apriori_booksets(R, min_support=0.003, n=3, m=3)

get_3_booksets

Unnamed: 0,support,itemsets
664,0.005094,"(23929, 16018, 15979)"
663,0.004216,"(16018, 15979, 16130)"
662,0.00404,"(6738, 6754, 6791)"


In [111]:
assert get_3_booksets.shape[0]==3, 'The returned dataframe does not have the correct shape.'
assert get_3_booksets.shape[1]==2, 'The returned dataframe does not have the correct shape.'

assert 16018 in get_3_booksets.reset_index()['itemsets'][0]
assert 15979 in get_3_booksets.reset_index()['itemsets'][0]
assert 16130 in get_3_booksets.reset_index()['itemsets'][1]