In [2]:
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [4]:
#Users
u_cols = ['user_id', 'location', 'age']
users = pd.read_csv('BX-Users.csv', sep=';', names=u_cols, encoding='latin-1', low_memory=False)

#Books
i_cols = ['isbn', 'book_title', 'book_author', 'year_of_publication', 'publisher', 'img_s', 'img_m', 'img_l']
items = pd.read_csv('BX_Books.csv', sep=';', names=i_cols, encoding='latin-1', low_memory=False)

#Ratings
r_cols = ['user_id', 'isbn', 'rating']
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', names=r_cols, encoding='latin-1', low_memory=False)

In [5]:
users = users.drop(users.index[0])
items = items.drop(items.index[0])
ratings = ratings.drop(ratings.index[0])

In [6]:
users.head()

Unnamed: 0,user_id,location,age
1,1,"nyc, new york, usa",
2,2,"stockton, california, usa",18.0
3,3,"moscow, yukon territory, russia",
4,4,"porto, v.n.gaia, portugal",17.0
5,5,"farnborough, hants, united kingdom",


In [7]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 1 to 278858
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   user_id   278858 non-null  object
 1   location  278858 non-null  object
 2   age       168096 non-null  object
dtypes: object(3)
memory usage: 6.4+ MB


In [8]:
users['age'] = users['age'].astype(float)
users['user_id'] = users['user_id'].astype(int)

In [9]:
users.loc[(users.age > 99) | (users.age < 5), 'age'] = np.nan
users.age = users.age.fillna(users.age.mean())

In [10]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 1 to 278858
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   user_id   278858 non-null  int64  
 1   location  278858 non-null  object 
 2   age       278858 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [11]:
users.describe()

Unnamed: 0,user_id,age
count,278858.0,278858.0
mean,139429.5,34.7439
std,80499.51502,10.540292
min,1.0,5.0
25%,69715.25,29.0
50%,139429.5,34.7439
75%,209143.75,35.0
max,278858.0,99.0


In [12]:
items.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_s,img_m,img_l
1,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
3,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
4,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
5,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [13]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 1 to 271379
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   isbn                 271379 non-null  object
 1   book_title           271379 non-null  object
 2   book_author          271377 non-null  object
 3   year_of_publication  271379 non-null  object
 4   publisher            271377 non-null  object
 5   img_s                271379 non-null  object
 6   img_m                271379 non-null  object
 7   img_l                271379 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [14]:
items.loc[items.publisher.isnull(), :]

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_s,img_m,img_l
128897,193169656X,Tyrant Moon,Elaine Corvidae,2002,,http://images.amazon.com/images/P/193169656X.0...,http://images.amazon.com/images/P/193169656X.0...,http://images.amazon.com/images/P/193169656X.0...
129044,1931696993,Finders Keepers,Linnea Sinclair,2001,,http://images.amazon.com/images/P/1931696993.0...,http://images.amazon.com/images/P/1931696993.0...,http://images.amazon.com/images/P/1931696993.0...


In [15]:
items.loc[items.isbn == '193169656X', 'publisher'] = 'Mundania Press LLC'
items.loc[items.isbn == '1931696993', 'publisher'] = 'Novelbooks Incorporated'

In [16]:
items.loc[items.book_author.isnull(), :]

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_s,img_m,img_l
118039,751352497,A+ Quiz Masters:01 Earth,,1999,Dorling Kindersley,http://images.amazon.com/images/P/0751352497.0...,http://images.amazon.com/images/P/0751352497.0...,http://images.amazon.com/images/P/0751352497.0...
187701,9627982032,The Credit Suisse Guide to Managing Your Perso...,,1995,Edinburgh Financial Publishing,http://images.amazon.com/images/P/9627982032.0...,http://images.amazon.com/images/P/9627982032.0...,http://images.amazon.com/images/P/9627982032.0...


In [17]:
items.loc[items.isbn == '9627982032', 'book_author'] = 'Larissa Anne Downe'
items.loc[items.isbn == '0751352497', 'book_author'] = 'None'

In [18]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 1 to 271379
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   isbn                 271379 non-null  object
 1   book_title           271379 non-null  object
 2   book_author          271379 non-null  object
 3   year_of_publication  271379 non-null  object
 4   publisher            271379 non-null  object
 5   img_s                271379 non-null  object
 6   img_m                271379 non-null  object
 7   img_l                271379 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [19]:
items['year_of_publication'] = items['year_of_publication'].astype(int)

In [20]:
items['year_of_publication'].describe()

count    271379.000000
mean       1959.756050
std         258.011363
min           0.000000
25%        1989.000000
50%        1995.000000
75%        2000.000000
max        2050.000000
Name: year_of_publication, dtype: float64

In [21]:
print(sorted(items['year_of_publication'].unique()))

[np.int64(0), np.int64(1376), np.int64(1378), np.int64(1806), np.int64(1897), np.int64(1900), np.int64(1901), np.int64(1902), np.int64(1904), np.int64(1906), np.int64(1908), np.int64(1909), np.int64(1910), np.int64(1911), np.int64(1914), np.int64(1917), np.int64(1919), np.int64(1920), np.int64(1921), np.int64(1922), np.int64(1923), np.int64(1924), np.int64(1925), np.int64(1926), np.int64(1927), np.int64(1928), np.int64(1929), np.int64(1930), np.int64(1931), np.int64(1932), np.int64(1933), np.int64(1934), np.int64(1935), np.int64(1936), np.int64(1937), np.int64(1938), np.int64(1939), np.int64(1940), np.int64(1941), np.int64(1942), np.int64(1943), np.int64(1944), np.int64(1945), np.int64(1946), np.int64(1947), np.int64(1948), np.int64(1949), np.int64(1950), np.int64(1951), np.int64(1952), np.int64(1953), np.int64(1954), np.int64(1955), np.int64(1956), np.int64(1957), np.int64(1958), np.int64(1959), np.int64(1960), np.int64(1961), np.int64(1962), np.int64(1963), np.int64(1964), np.int64(1

In [22]:
items.loc[(items.year_of_publication == 0) |(items.year_of_publication > 2008) , 'year_of_publication'] = np.nan
items.year_of_publication = items.year_of_publication.fillna(round(items.year_of_publication.mean()))

In [23]:
ratings.head()

Unnamed: 0,user_id,isbn,rating
1,276725,034545104X,0
2,276726,0155061224,5
3,276727,0446520802,0
4,276729,052165615X,3
5,276729,0521795028,6


In [24]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 1 to 1149780
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   user_id  1149780 non-null  object
 1   isbn     1149780 non-null  object
 2   rating   1149780 non-null  object
dtypes: object(3)
memory usage: 26.3+ MB


In [25]:
ratings['user_id'] = ratings['user_id'].astype(int)
ratings['rating'] = ratings['rating'].astype(int)

In [26]:
df = pd.merge(users, ratings, on='user_id')
df = pd.merge(df, items, on='isbn')
df.head()

Unnamed: 0,user_id,location,age,isbn,rating,book_title,book_author,year_of_publication,publisher,img_s,img_m,img_l
0,2,"stockton, california, usa",18.0,195153448,0,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,8,"timmins, ontario, canada",34.7439,2005018,5,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,8,"timmins, ontario, canada",34.7439,60973129,0,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,8,"timmins, ontario, canada",34.7439,374157065,0,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,8,"timmins, ontario, canada",34.7439,393045218,0,The Mummies of Urumchi,E. J. W. Barber,1999.0,W. W. Norton & Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [27]:
df = df.drop(['location','img_s','img_m','img_l'], axis=1)

In [28]:
df.head()

Unnamed: 0,user_id,age,isbn,rating,book_title,book_author,year_of_publication,publisher
0,2,18.0,195153448,0,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press
1,8,34.7439,2005018,5,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada
2,8,34.7439,60973129,0,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial
3,8,34.7439,374157065,0,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux
4,8,34.7439,393045218,0,The Mummies of Urumchi,E. J. W. Barber,1999.0,W. W. Norton & Company


In [29]:
df.shape

(1031175, 8)

In [30]:
df = df.drop(['age', 'book_author', 'publisher', 'year_of_publication'], axis=1)

In [31]:
top_books = df['book_title'].value_counts().index[:10000]
top_users = df['user_id'].value_counts().index[:10000]

df_limited = df[df['book_title'].isin(top_books) & df['user_id'].isin(top_users)]

user_book_table = df_limited.pivot_table(index="book_title", columns="user_id", values="rating").fillna(0)

In [32]:
random_book_index = np.random.choice(user_book_table.shape[0])
print(f'Chosen book: {user_book_table.index[random_book_index]}')

Chosen book: A Man in Full


In [33]:
user_book_table_matrix = csr_matrix(user_book_table)

In [34]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(user_book_table_matrix)

In [35]:
distances, indices = model_knn.kneighbors(user_book_table.iloc[random_book_index,:].values.reshape(1,-1), n_neighbors = 11)

In [36]:
book = []
distance = []

for i in range(0, len(distances.flatten())):
    if i != 0:
        book.append(user_book_table.index[indices.flatten()[i]])
        distance.append(distances.flatten()[i])

book_df = pd.Series(book, name='book')
distance_df = pd.Series(distance, name='distance')
recommendation = pd.concat([book_df, distance_df], axis=1)
recommendation = recommendation.sort_values('distance', ascending=False)

print('Recommendations for {0}:\n'.format(user_book_table.index[random_book_index]))
for i in range(1, recommendation.shape[0] + 1):
    print('{0}: {1}, with distance of {2}'.format(i, recommendation["book"].iloc[i - 1], recommendation["distance"].iloc[i - 1]))

Recommendations for A Man in Full:

1: Bandits, with distance of 0.8378565238592491
2: Duane's Depressed, with distance of 0.8376730689906806
3: Comanche Moon (Lonesome Dove), with distance of 0.8187453779482057
4: All the Pretty Horses (Border Trilogy, Vol 1), with distance of 0.8161607449576647
5: The Covenant of the Flame, with distance of 0.8143046618229481
6: Bright Orange for the Shroud, with distance of 0.8131476703290851
7: Coyote Waits, with distance of 0.7930487071861343
8: My Antonia (Dover Thrift Editions), with distance of 0.7910927445508167
9: The right stuff, with distance of 0.7889476826892948
10: HEAVEN'S PRISONERS (Dave Robicheaux Mysteries (Paperback)), with distance of 0.7853644847485304
