In [2]:
import pandas as pd
# Absolute, machine-specific location of the ratings CSV; the notebook only
# runs unchanged on a machine where this exact path exists.
dataFile='C:\\Users\\Carolina\\Documents\\GitHub\\calderon-mena-diana-carolina\\06-recommending-algorithm\\data\\BX-Book-Ratings.csv'

In [4]:
# Load the ratings file: ';'-separated, ISO-8859-1 encoded. Row 0 of the file
# is consumed as the header and its labels are replaced by the names below.
data = pd.read_csv(
    dataFile,
    sep=";",
    header=0,
    names=["user", "isbn", "rating"],
    encoding="ISO-8859-1",
)


In [5]:
# Display the loaded ratings table (columns: user, isbn, rating).
data

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
5,276733,2080674722,0
6,276736,3257224281,8
7,276737,0600570967,6
8,276744,038550120X,7
9,276745,342310538,10


In [6]:
# Preview the first rows of the ratings table.
data.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [7]:
# Absolute, machine-specific location of the book catalogue CSV; the notebook
# only runs unchanged on a machine where this exact path exists.
bookFile='C:\\Users\\Carolina\\Documents\\GitHub\\calderon-mena-diana-carolina\\06-recommending-algorithm\\data\\BX-Books.csv'

In [8]:
# Load the book catalogue, keeping only the first three columns (renamed to
# isbn, title, author) and using the ISBN column as the row index so that
# bookMeta() can look books up by label.
#
# `on_bad_lines="warn"` (pandas >= 1.3) replaces `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in 2.0. Like the old flag
# combined with its default `warn_bad_lines=True`, malformed rows are dropped
# with a warning instead of aborting the load.
books = pd.read_csv(
    bookFile,
    sep=";",
    header=0,
    on_bad_lines="warn",
    usecols=[0, 1, 2],
    index_col=0,
    names=["isbn", "title", "author"],
    encoding="ISO-8859-1",
)

In [9]:
# Preview the first rows of the catalogue (indexed by ISBN).
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


In [10]:
def bookMeta(isbn):
    """Return the ``(title, author)`` pair stored in ``books`` for one ISBN.

    ``books`` is the notebook-level catalogue frame indexed by ISBN; both
    fields are read with label-based scalar access (``.at``).

    Raises
    ------
    KeyError
        If ``isbn`` is not a label of the ``books`` index.
    """
    return books.at[isbn, "title"], books.at[isbn, "author"]

In [11]:
# Spot-check the lookup for a single ISBN; the result is a (title, author) tuple.
bookMeta("0393045218")


('The Mummies of Urumchi', 'E. J. W. Barber')

In [12]:
def faveBooks(user,N):
    """Return the ``N`` highest-rated books of ``user`` with their metadata.

    Parameters
    ----------
    user
        User id, compared against the ``user`` column of the notebook-level
        ``data`` frame.
    N
        Number of rows to keep after sorting by rating.

    Returns
    -------
    pandas.DataFrame
        The user's rows of ``data`` sorted by ``rating`` in descending order
        and truncated to the first ``N`` rows, with an extra ``title`` column
        holding the ``(title, author)`` tuple that ``bookMeta`` returns for
        each ISBN.

    Raises
    ------
    KeyError
        Propagated from ``bookMeta`` when one of the selected ISBNs is not in
        the ``books`` catalogue.
    """
    userRatings = data[data["user"] == user]  # only this user's ratings
    topRatings = userRatings.sort_values("rating", ascending=False).iloc[:N]
    # assign() builds a new frame, so no column is written into the sliced
    # intermediate result.
    return topRatings.assign(title=topRatings["isbn"].apply(bookMeta))

In [13]:
# Example call: the five highest-rated books of user 204622, each with its
# (title, author) tuple attached in the `title` column.
faveBooks(204622,5)  # Top 5 books of that user


Unnamed: 0,user,isbn,rating,title
844935,204622,0671027360,10,"(Angels &amp; Demons, Dan Brown)"
844926,204622,0385504209,10,"(The Da Vinci Code, Dan Brown)"
844955,204622,0967560500,10,"(Natural Hormonal Enhancement, Rob Faigin)"
844958,204622,097173660X,9,"(Life After School Explained, Cap &amp; Compass)"
844920,204622,0060935464,9,"(To Kill a Mockingbird, Harper Lee)"


In [14]:
# Number of rating rows recorded for each ISBN, most frequent first.
usersPerISBN = data["isbn"].value_counts()
usersPerISBN.head(10)


0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
044023722X     647
0679781587     639
0142001740     615
067976402X     614
0671027360     586
Name: isbn, dtype: int64

In [15]:
# Number of rating rows recorded for each user.
ISBNsPerUser = data["user"].value_counts()

# One entry per distinct user.
ISBNsPerUser.shape

(105283,)

In [16]:
# Keep only ISBNs with more than 10 rating rows, then only users with more
# than 10 rating rows. Both counts come from the value_counts computed on the
# ratings before this filtering step.
data = (
    data
    .loc[lambda frame: frame["isbn"].isin(usersPerISBN[usersPerISBN > 10].index)]
    .loc[lambda frame: frame["user"].isin(ISBNsPerUser[ISBNsPerUser > 10].index)]
)

In [17]:
# User x ISBN rating matrix: one row per user, one column per ISBN, cell value
# is the rating (pivot_table's default mean aggregation applies if a pair
# occurs more than once). Pairs without a rating are NaN.
userItemRatingMatrix = data.pivot_table(
    values="rating",
    index="user",
    columns="isbn",
)

userItemRatingMatrix.head()


isbn,000000000,0002005018,0002251760,0002259001,0002259834,0002558122,0006172768,0006374921,0006475973,0006479286,...,9722015184,9722020609,9722319345,9724113361,9726106141,9726116902,9727591965,9727722458,9770390107900,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,5.0,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,,,,
242,,,,,,,,,,,...,,,,,,,,,,
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,


In [18]:
userItemRatingMatrix.shape  # not echoed: only a cell's last expression is displayed

# Two example users to compare.
user1 = 204622
user2 = 255489

# Each user's ratings are one row of the matrix, so the row is selected by
# label directly instead of transposing the whole matrix for every lookup.
user1Ratings = userItemRatingMatrix.loc[user1]  # rating vector of user 1
user1Ratings.head()

user2Ratings = userItemRatingMatrix.loc[user2]  # rating vector of user 2

In [19]:
from scipy.spatial.distance import hamming 
# Hamming distance between the two users' rating vectors.
# NOTE(review): NaN != NaN, so ISBNs that neither user rated presumably count
# as mismatches, which would explain a value just below 1 - confirm intended.
hamming(user1Ratings,user2Ratings) 

0.9999381264694963

In [20]:
import numpy as np
# Distance by any pair of users
def distance(user1,user2):
    """Hamming distance between the rating vectors of two users.

    Both users are looked up by label as rows of the notebook-level
    ``userItemRatingMatrix``.

    Parameters
    ----------
    user1, user2
        User ids (row labels of ``userItemRatingMatrix``).

    Returns
    -------
    float
        The value returned by ``scipy.spatial.distance.hamming`` for the two
        rows, or NaN when the lookup or the computation fails (for example
        when a user id is not in the matrix).

    NOTE(review): entries that are NaN in both rows presumably compare as
    unequal (NaN != NaN), so unrated ISBNs inflate the distance - confirm
    this is intended.
    """
    try:
        # Row lookup by label; avoids transposing the full matrix twice per call.
        user1Ratings = userItemRatingMatrix.loc[user1]
        user2Ratings = userItemRatingMatrix.loc[user2]
        return hamming(user1Ratings, user2Ratings)
    except Exception:
        # Best-effort fallback kept from the original, but without trapping
        # KeyboardInterrupt/SystemExit, so a long apply() loop can still be
        # interrupted. np.nan is used because the np.NaN alias was removed
        # in NumPy 2.0.
        return np.nan

In [21]:
# Example call: distance between users 204622 and 10118.
distance(204622,10118)


0.9998762529389927

In [22]:
# Target user for whom neighbours are searched.
user = 204622
# One row per user id present in the rating matrix...
allUsers = pd.DataFrame({"user": userItemRatingMatrix.index})
# ...excluding the target user itself.
allUsers = allUsers.loc[allUsers["user"] != user]
allUsers.head()

Unnamed: 0,user
0,8
1,99
2,242
3,243
4,254


In [23]:
# Add a `distance` column: distance from the target user to every other user.
allUsers = allUsers.assign(
    distance=allUsers["user"].apply(lambda other: distance(user, other))
)


In [24]:
# Neighbourhood size.
K = 10
# Ids of the K users with the smallest distance to the target user.
KnearestUsers = allUsers.sort_values("distance")["user"].head(K)


In [25]:
# Show the ids of the K nearest users (the index holds row labels from allUsers).
KnearestUsers

3509      82893
2872      68555
3693      87555
1991      48046
10582    251422
273        7346
620       16795
8287     198711
9689     232131
5901     140036
Name: user, dtype: int64