In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [2]:
user_data = pd.read_csv('data/BX-CSV/BX-Users.csv', delimiter=';', encoding='latin-1')
book_data = pd.read_csv('data/BX-CSV/BX-Books.csv', delimiter=';', encoding='latin-1', on_bad_lines='skip',low_memory=False)
rating_data = pd.read_csv('data/BX-CSV/BX-Book-Ratings.csv', delimiter=';', encoding='latin-1')

In [3]:
user_data['Country'] = user_data['Location'].map(lambda x: x.split(', '))
user_data['Country'] = user_data['Country'].map(lambda x: x[-1])
user_data = user_data.drop(['Location', 'Age'], axis=1)
user_data.head()

Unnamed: 0,User-ID,Country
0,1,usa
1,2,usa
2,3,russia
3,4,portugal
4,5,united kingdom


In [4]:
regional_data = user_data[(user_data['Country'] == 'spain')|(user_data['Country'] == 'usa')]

In [5]:
book_data = book_data.drop(['Year-Of-Publication','Publisher','Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1)

In [6]:
book_data.head()

Unnamed: 0,ISBN,Book-Title,Book-Author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [7]:
rating_data.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [8]:
rating_data.shape

(1149780, 3)

In [9]:
ratings = rating_data.merge(book_data, on='ISBN').merge(regional_data, on='User-ID')

In [10]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Country
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,usa
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,usa
2,2313,0812533550,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,usa
3,2313,0679745580,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,usa
4,2313,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,usa


In [11]:
ratings.shape

(761434, 6)

In [12]:
count_ratings = ratings['Book-Title'].value_counts() > 50
col = count_ratings[count_ratings == True]
rated_data = ratings[ratings['Book-Title'].isin(col.index)]

In [13]:
rated_data.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Country
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,usa
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,usa
2,2313,0812533550,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,usa
4,2313,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,usa
5,2313,0385482388,5,The Mistress of Spices,Chitra Banerjee Divakaruni,usa


In [14]:
rated_data.shape

(197792, 6)

## content based filtering

In [15]:
tfidf = TfidfVectorizer(stop_words='english')

In [16]:
tfidf_matrix = tfidf.fit_transform(rated_data['Book-Title'])

In [17]:
tfidf_matrix.shape

(197792, 2228)

In [18]:
#cos_sim = cosine_similarity(tfidf_matrix)

In [19]:
def pred(title, top_n=5):
    index = df[df['x'] == title].index
    similarity = cos_sim[index].T

    sim_df = pd.DataFrame(similarity, columns=['similar'])
    
    return None

## collaborative filtering

In [20]:
ratings = rated_data[['Book-Title', 'User-ID', 'Book-Rating']]

In [21]:
ratings.head()

Unnamed: 0,Book-Title,User-ID,Book-Rating
0,Flesh Tones: A Novel,276725,0
1,Flesh Tones: A Novel,2313,5
2,Ender's Game (Ender Wiggins Saga (Paperback)),2313,9
4,Divine Secrets of the Ya-Ya Sisterhood : A Novel,2313,9
5,The Mistress of Spices,2313,5


In [22]:
ratings['Book-Title'].value_counts()

Wild Animus                                          1273
The Lovely Bones: A Novel                             946
The Da Vinci Code                                     729
A Painted House                                       709
The Secret Life of Bees                               675
                                                     ... 
This Year It Will Be Different: And Other Stories      51
The Drowning People                                    51
Ragtime                                                51
Original Sin                                           51
Dracula                                                51
Name: Book-Title, Length: 1756, dtype: int64

In [23]:
ratings['Book-Title'] = ratings['Book-Title'].astype(str)
ratings['User-ID'] = ratings['User-ID'].astype(str)
ratings['Book-Rating'] = ratings['Book-Rating'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['Book-Title'] = ratings['Book-Title'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['User-ID'] = ratings['User-ID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['Book-Rating'] = ratings['Book-Rating'].astype(int)


In [24]:
ratings = ratings.drop_duplicates(['Book-Title', 'User-ID'])

In [25]:
ratings.shape

(195467, 3)

In [26]:
df_pivot = ratings.pivot(index='Book-Title', columns='User-ID', values='Book-Rating')

In [27]:
df_pivot.shape

(1756, 33141)

In [28]:
df_pivot = df_pivot.fillna(0)

In [29]:
df_pivot.head()

User-ID,100002,100004,10001,100010,100025,100030,100043,100044,100046,10005,...,999,9991,99919,9992,99946,99955,99963,99970,99973,99980
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
nam = csr_matrix(df_pivot.values)

In [31]:
nam.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [32]:
count = df_pivot['2313']>0
count = count[count==True]
count

Book-Title
Divine Secrets of the Ya-Ya Sisterhood : A Novel                                    True
Ender's Game (Ender Wiggins Saga (Paperback))                                       True
Flesh Tones: A Novel                                                                True
Song of Solomon (Oprah's Book Club (Paperback))                                     True
The Bonesetter's Daughter                                                           True
The Hundred Secret Senses                                                           True
The Martian Chronicles                                                              True
The Mistress of Spices                                                              True
The Princess Bride: S Morgenstern's Classic Tale of True Love and High Adventure    True
Name: 2313, dtype: bool

## weighted mean