# Import libraries, initialise variables

In [59]:
import os
import re
import pickle
from collections import Counter
import zipfile

import requests
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors


In [60]:
def get_stats(df):
    """Print the number of rows in ``df`` and its per-column missing-value counts.

    A 50-character dashed rule is printed last as a visual separator.
    Nothing is returned.
    """
    row_count = len(df)
    missing_per_column = df.isna().sum()
    separator = '-' * 50
    print(f"Data points: {row_count}")
    print(f"Missing values: \n{missing_per_column}")
    print(separator)

# Read data

In [61]:
# NOTE(review): placeholder value; DATE is not referenced anywhere in the
# visible part of the notebook — confirm whether it is still needed.
DATE = "asdf"

def create_dir_if_not_exists(path):
    """Create directory ``path`` (including missing parents) if it is absent.

    Calling this for a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True removes the check-then-create race of
    # `if not os.path.exists(...): os.makedirs(...)`.
    os.makedirs(path, exist_ok=True)

# Working directories, created up front so later cells can rely on them.
DATADIR, MODELDIR = 'data', 'models'
for _directory in (DATADIR, MODELDIR):
    create_dir_if_not_exists(_directory)


In [62]:
# Remote archives to fetch; each one is stored in DATADIR under the basename
# of its URL.
urls = [
    'https://cdn.freecodecamp.org/project-data/books/book-crossings.zip'
]

for url in urls:
    filename = os.path.basename(url)
    filename_dir = os.path.join(DATADIR, filename)
    if os.path.exists(filename_dir):
        print(f'[INFO] Path {filename_dir} already exists! Skipping download.')
        continue
    # A timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(url, timeout=60)
    # Fail loudly on HTTP errors: otherwise an error page would be saved under
    # the archive name and every later run would skip the download because the
    # file "already exists".
    r.raise_for_status()
    with open(filename_dir, 'wb') as f:
        f.write(r.content)
    print(f'[INFO] Downloaded from web to path {filename_dir}')

# CSV files expected inside DATADIR once the archive has been extracted.
books_filename = os.path.join(DATADIR, 'BX-Books.csv')
ratings_filename = os.path.join(DATADIR, 'BX-Book-Ratings.csv')

[INFO] Path data\book-crossings.zip already exists! Skipping download.


In [63]:
### Unzip
# The handle is named `archive`, not `zip`: at notebook (module) scope
# `as zip` would rebind the builtin `zip` to a closed ZipFile for every cell
# that runs afterwards.
with zipfile.ZipFile(os.path.join(DATADIR, 'book-crossings.zip'), 'r') as archive:
    archive.extractall(path=DATADIR)

In [64]:
# Load the books CSV, keeping only the isbn, title and author columns
# (all read as strings). header=0 together with `names` replaces the file's
# own header row with these names.
book_columns = ['isbn', 'title', 'author']
df_books = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=book_columns,
    usecols=book_columns,
    dtype=dict.fromkeys(book_columns, 'str'),
)

get_stats(df_books)
df_books.head()

Data points: 271379
Missing values: 
isbn      0
title     0
author    2
dtype: int64
--------------------------------------------------


Unnamed: 0,isbn,title,author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [65]:
# Load the ratings CSV; the dtype mapping doubles as the ordered list of
# column names (user, isbn, rating).
ratings_dtypes = {'user': 'int32', 'isbn': 'str', 'rating': 'float32'}
ratings_columns = list(ratings_dtypes)
df_ratings = pd.read_csv(
    ratings_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=ratings_columns,
    usecols=ratings_columns,
    dtype=ratings_dtypes,
)

get_stats(df_ratings)
df_ratings.head()

Data points: 1149780
Missing values: 
user      0
isbn      0
rating    0
dtype: int64
--------------------------------------------------


Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


# EDA

In [66]:
def eda(df: pd.DataFrame, column_name: str) -> None:
    """Print summary statistics of how often each value of a column occurs.

    The statistics (mean, median, mode, min/max and selected quantiles) are
    computed over the per-value occurrence counts of ``df[column_name]``,
    not over the column's values themselves.

    Args:
        df: Frame containing ``column_name``.
        column_name: Column whose value frequencies are summarised.
    """
    occurrence_counts = df[column_name].value_counts()
    print(f"Stats for variable '{column_name}':")
    print(f" - Number of data points: {len(df)}")
    print(f" - Mean: {occurrence_counts.mean():.0f}") 
    print(f" - Median: {occurrence_counts.median()}")
    print(f" - Mode: {occurrence_counts.mode()}")
    print(f" - Min: {occurrence_counts.min()} | Max: {occurrence_counts.max()}")
    # The label says "Quartiles", but the reported quantiles are the 1st, 25th,
    # 50th and 99th percentiles.
    print(f" - Quartiles: \n{occurrence_counts.quantile([0.01, 0.25, 0.5, 0.99])}")
    print('-'*50)

# Summarise how many rating rows each user has in the full ratings table.
eda(df_ratings, 'user')

Stats for variable 'user':
 - Number of data points: 1149780
 - Mean: 11
 - Median: 1.0
 - Mode: 0    1
Name: count, dtype: int64
 - Min: 1 | Max: 13602
 - Quartiles: 
0.01      1.0
0.25      1.0
0.50      1.0
0.99    174.0
Name: count, dtype: float64
--------------------------------------------------


In [67]:
# Summarise how many rating rows each ISBN has in the full ratings table.
eda(df_ratings, 'isbn')

Stats for variable 'isbn':
 - Number of data points: 1149780
 - Mean: 3
 - Median: 1.0
 - Mode: 0    1
Name: count, dtype: int64
 - Min: 1 | Max: 2502
 - Quartiles: 
0.01     1.0
0.25     1.0
0.50     1.0
0.99    36.0
Name: count, dtype: float64
--------------------------------------------------


In [68]:
# Activity thresholds: a user / book is kept only when it has strictly more
# rating rows than the corresponding threshold.
USER_COUNT_THRESHOLD = 200
BOOK_COUNT_THRESHOLD = 100

### create DF with value_counts of each user
# rename_axis + reset_index(name=...) set the column names explicitly, so the
# 'count' column exists regardless of how the installed pandas version names
# the result of value_counts().
df_filter_users = df_ratings['user'].value_counts().rename_axis('user').reset_index(name='count')
### Filter in only those users whose count is higher than the threshold
df_filter_users_2 = df_filter_users[df_filter_users['count'] > USER_COUNT_THRESHOLD]
### Create a list of users whose count is higher than the threshold
filter_users = list(df_filter_users_2['user'])

### same with books
df_filter_books = df_ratings['isbn'].value_counts().rename_axis('isbn').reset_index(name='count')
df_filter_books_2 = df_filter_books[df_filter_books['count'] > BOOK_COUNT_THRESHOLD]
filter_books = list(df_filter_books_2['isbn'])

# Keep only ratings given by a retained user to a retained book.
df_ratings_2 = df_ratings[df_ratings['user'].isin(filter_users) & df_ratings['isbn'].isin(filter_books)]

eda(df_ratings_2, 'user')
eda(df_ratings_2, 'isbn')


Stats for variable 'user':
 - Number of data points: 49254
 - Mean: 56
 - Median: 44.0
 - Mode: 0    26
1    32
2    46
3    48
Name: count, dtype: int64
 - Min: 1 | Max: 651
 - Quartiles: 
0.01      1.81
0.25     23.00
0.50     44.00
0.99    242.00
Name: count, dtype: float64
--------------------------------------------------
Stats for variable 'isbn':
 - Number of data points: 49254
 - Mean: 68
 - Median: 60.0
 - Mode: 0    63
Name: count, dtype: int64
 - Min: 16 | Max: 363
 - Quartiles: 
0.01     23.2
0.25     46.0
0.50     60.0
0.99    191.0
Name: count, dtype: float64
--------------------------------------------------


In [69]:
# Alias only: no copy is made, so df_ratings_3 and df_ratings_2 refer to the
# same DataFrame object.
df_ratings_3 = df_ratings_2

In [70]:
# Preview the first rows of the filtered ratings.
df_ratings_3.head()

Unnamed: 0,user,isbn,rating
1456,277427,002542730X,10.0
1469,277427,0060930535,0.0
1471,277427,0060934417,0.0
1474,277427,0061009059,9.0
1484,277427,0140067477,0.0


# Preprocessing

In [71]:
# Reduce the books table to the isbn and title columns.
df_books_preproc = df_books.loc[:, ['isbn', 'title']]
get_stats(df_books_preproc)
df_books_preproc.head()

Data points: 271379
Missing values: 
isbn     0
title    0
dtype: int64
--------------------------------------------------


Unnamed: 0,isbn,title
0,195153448,Classical Mythology
1,2005018,Clara Callan
2,60973129,Decision in Normandy
3,374157065,Flu: The Story of the Great Influenza Pandemic...
4,393045218,The Mummies of Urumchi


In [72]:
# Attach titles to the filtered ratings; the default inner join keeps only
# ratings whose ISBN is present in the books table.
df_combined = df_ratings_3.merge(df_books_preproc, on='isbn')
get_stats(df_combined)
df_combined.head()

Data points: 48990
Missing values: 
user      0
isbn      0
rating    0
title     0
dtype: int64
--------------------------------------------------


Unnamed: 0,user,isbn,rating,title
0,277427,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...
1,3363,002542730X,0.0,Politically Correct Bedtime Stories: Modern Ta...
2,11676,002542730X,6.0,Politically Correct Bedtime Stories: Modern Ta...
3,12538,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...
4,13552,002542730X,0.0,Politically Correct Bedtime Stories: Modern Ta...


In [73]:
# Keep only the first rating row for each (user, title) pair.
df_combined = df_combined.drop_duplicates(subset=['user', 'title'], keep='first')
df_combined.head()

Unnamed: 0,user,isbn,rating,title
0,277427,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...
1,3363,002542730X,0.0,Politically Correct Bedtime Stories: Modern Ta...
2,11676,002542730X,6.0,Politically Correct Bedtime Stories: Modern Ta...
3,12538,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...
4,13552,002542730X,0.0,Politically Correct Bedtime Stories: Modern Ta...


In [74]:
# Reorder the columns and renumber the rows from 0; drop=True discards the
# old index instead of materialising it as a column.
df_combined = df_combined[['isbn', 'title', 'user', 'rating']].reset_index(drop=True)
df_combined.head()


Unnamed: 0,isbn,title,user,rating
0,002542730X,Politically Correct Bedtime Stories: Modern Ta...,277427,10.0
1,002542730X,Politically Correct Bedtime Stories: Modern Ta...,3363,0.0
2,002542730X,Politically Correct Bedtime Stories: Modern Ta...,11676,6.0
3,002542730X,Politically Correct Bedtime Stories: Modern Ta...,12538,10.0
4,002542730X,Politically Correct Bedtime Stories: Modern Ta...,13552,0.0


In [75]:
# Row count and missing-value check of the combined ratings/titles table.
get_stats(df_combined)

Data points: 48615
Missing values: 
isbn      0
title     0
user      0
rating    0
dtype: int64
--------------------------------------------------


In [76]:
# Book-by-user rating matrix: one row per ISBN, one column per user, each cell
# the mean rating for that pair (NaN where the pair has no rating row).
df_combined2 = (
    pd.pivot_table(
        df_combined,
        values='rating',
        index='isbn',
        columns='user',
        aggfunc='mean',
    )
    .reset_index()
    .rename_axis(None, axis=1)
)
df_combined2.head()


Unnamed: 0,isbn,254,2276,2766,2977,3363,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
0,002542730X,,,,,0.0,,,,,...,,,,,,,10.0,,,
1,0060008032,,,,,,,,,,...,,,,,,,,,,
2,0060096195,,,,,0.0,,,,,...,,,,,,,,,,
3,006016848X,,,,,,,,,,...,,,0.0,,,,,,,0.0
4,0060173289,,,,,,,,,,...,,,,,,,,,,


In [77]:
# Replace the NaN cells (user/book pairs without a rating row) with 0.
df_combined2 = df_combined2.fillna(value=0)
df_combined2

Unnamed: 0,isbn,254,2276,2766,2977,3363,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
0,002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
1,0060008032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0060096195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,006016848X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0060173289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
712,1573227331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
713,1573229326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
714,1573229571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
715,1592400876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
# Index the matrix by ISBN so that only the per-user rating columns remain.
# Guarded so that re-running this cell is a no-op: once 'isbn' has become the
# index it is no longer a column, and an unguarded set_index would raise
# KeyError.
if 'isbn' in df_combined2.columns:
    df_combined2 = df_combined2.set_index('isbn')
df_combined2

Unnamed: 0_level_0,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
0060008032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060096195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
006016848X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060173289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1573227331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1573229326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1573229571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1592400876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train

In [79]:
# Number of neighbours returned per query.
N_predicted_neighbours = 5
KNN = NearestNeighbors(
    n_neighbors=N_predicted_neighbours,
    metric='cosine',
    n_jobs=-1,
)

In [80]:
# Fit the neighbour search on the rating matrix (rows are the samples).
KNN.fit(df_combined2)

In [81]:
# idd = '0001047213'
# df_books[df_books['isbn'] == idd]



In [82]:
# KNN.kneighbors([list(df_combined2.loc['B0001I1KOG'])])

In [83]:
# df_books[df_books['title'].str.contains('Potter')]

In [99]:
# Display the books table.
df_books

Unnamed: 0,isbn,title,author
0,0195153448,Classical Mythology,Mark P. O. Morford
1,0002005018,Clara Callan,Richard Bruce Wright
2,0060973129,Decision in Normandy,Carlo D'Este
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,0393045218,The Mummies of Urumchi,E. J. W. Barber
...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger
271375,0525447644,From One to One Hundred,Teri Sloat
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker
271377,0192126040,Republic (World's Classics),Plato


In [143]:
def predict_from_title(title:str):
    """Print the nearest-neighbour book recommendations for a book title.

    The title is first looked up exactly in ``df_books``. If that fails, the
    first book whose title contains ``title`` (case-insensitive, literal
    substring) is used instead. When several editions match, one whose ISBN
    is a row of ``df_combined2`` is preferred, so a book is not rejected
    merely because its first listed edition was filtered out before fitting.

    Reads the notebook-level ``df_books``, ``df_combined2`` and ``KNN``;
    none of them is modified.

    Args:
        title: Full title, or a fragment of one, to search for.

    Returns:
        None. All results and diagnostics are printed.
    """
    print(f"You entered: '{title}'")
    titles = df_books['title']
    ### Try to find the title in the books list
    exact_mask = titles == title
    if exact_mask.any():
        print('Found the title!')
        candidates = df_books.loc[exact_mask, 'isbn']
    else:
        ### Else - get the first string containing substring of "title"
        print("Title was not found exactly - getting the closest match...")
        # regex=False: user input such as "(Book 1)" must be matched literally,
        # not interpreted as a regular expression. na=False: a missing title
        # never matches. The lower-cased titles are computed on the fly, so
        # df_books is not given an extra column as a side effect.
        contains_mask = titles.str.lower().str.contains(title.lower(), regex=False, na=False)
        candidates = df_books.loc[contains_mask, 'isbn']
        if candidates.empty:
            print(f"No book title contains '{title}'. Please try again with another book :(")
            print('-'*50)
            return None
    # Prefer an edition that is part of the fitted matrix; otherwise fall back
    # to the first match (which is then reported as not used during training).
    trained_candidates = candidates[candidates.isin(df_combined2.index)]
    chosen = trained_candidates if not trained_candidates.empty else candidates
    isbn_to_predict = chosen.values[0]
    ###
    title_to_predict = df_books.loc[df_books['isbn'] == isbn_to_predict, 'title'].values[0]
    # Report the configured neighbour count rather than a hard-coded 5.
    print(f"Get {KNN.n_neighbors} closest recommendations to '{title_to_predict}':")
    try:
        # Selecting with a list keeps the 2-D (1, n_users) shape kneighbors expects.
        query = df_combined2.loc[[isbn_to_predict]].to_numpy()
    except KeyError:
        # Only a missing ISBN is handled here; any other error is a real bug
        # and is allowed to propagate.
        print('It seems that this book was not used during training. Please try again with another book :(')
        print('-'*50)
        return None
    _, neighbour_rows = KNN.kneighbors(query)
    for rank, row_position in enumerate(neighbour_rows[0], start=1):
        # kneighbors returns row positions in the fitted matrix; map them back
        # to ISBNs and then to titles.
        neighbour_isbn = df_combined2.index[row_position]
        predicted_title = df_books.loc[df_books['isbn'] == neighbour_isbn, 'title'].values[0]
        print(f"{rank}) {predicted_title}")
    print('-'*50)

# Demo queries: two exact titles and two lower-case fragments.
for _demo_title in (
    "Harry Potter and the Sorcerer's Stone (Book 1)",
    'harry potter and the goblet',
    "Where the Heart Is (Oprah's Book Club (Paperback))",
    "harry potter",
):
    predict_from_title(_demo_title)

You entered: 'Harry Potter and the Sorcerer's Stone (Book 1)'
Found the title!
Get 5 closest recommendations to 'Harry Potter and the Sorcerer's Stone (Book 1)':
1) Harry Potter and the Sorcerer's Stone (Book 1)
2) Harry Potter and the Prisoner of Azkaban (Book 3)
3) Harry Potter and the Chamber of Secrets (Book 2)
4) Harry Potter and the Goblet of Fire (Book 4)
5) Harry Potter and the Order of the Phoenix (Book 5)
--------------------------------------------------
You entered: 'harry potter and the goblet'
Title was not found exactly - getting the closest match...
Get 5 closest recommendations to 'Harry Potter and the Goblet of Fire (Book 4)':
1) Harry Potter and the Goblet of Fire (Book 4)
2) Harry Potter and the Prisoner of Azkaban (Book 3)
3) Harry Potter and the Chamber of Secrets (Book 2)
4) Harry Potter and the Order of the Phoenix (Book 5)
5) Harry Potter and the Sorcerer's Stone (Book 1)
--------------------------------------------------
You entered: 'Where the Heart Is (Oprah

In [85]:

# Manual walk-through of a single lookup: title -> ISBN -> neighbours -> titles.
target_title = "Where the Heart Is (Oprah's Book Club (Paperback))"
isbn_to_predict = df_books.loc[df_books['title'] == target_title, 'isbn'].values[0]
print(isbn_to_predict)

query_vector = list(df_combined2.loc[isbn_to_predict])
predictions = KNN.kneighbors([query_vector])
print(predictions)
# predictions[1][0] holds the row positions of the neighbours in df_combined2.
for row_position in predictions[1][0]:
    print(row_position)
    neighbour_isbn = df_combined2.index[row_position]
    print(df_books.loc[df_books['isbn'] == neighbour_isbn, 'title'].values[0])

0446672211
(array([[2.22044605e-16, 7.23018438e-01, 7.67707509e-01, 7.69941097e-01,
        7.70858357e-01]]), array([[427, 104,  26, 178, 116]], dtype=int64))
427
Where the Heart Is (Oprah's Book Club (Paperback))
104
The Lovely Bones: A Novel
26
I Know This Much Is True
178
The Surgeon
116
The Weight of Water
