In [1]:
import os
import sys
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.decomposition import TruncatedSVD
from surprise import Reader, Dataset, SVD, evaluate, dump, accuracy
from collections import defaultdict

# Custom libraries
sys.path.append('../Util')
from loader import get_books, get_book_dataframe, get_book_features
from joiner import get_ratings, get_joint
from reduction import reduce_matrix, get_sparse

In [2]:
def get_top_n_recs(result, books, n, q):
    recs = []
    for i in range(len(result)):
        if q[i] == 0: # book user hasn't already rated
            recs.append((i, result[i]))
        else:
            recs.append((i, float('-inf')))
    recs = sorted(recs, key=lambda tup: tup[1], reverse=True)

    top_titles = []
    for i in range(n):
        book_id = recs[i][0]
        title = books.iloc[book_id]['title']
        top_titles.append(title)
    return top_titles

In [3]:
def map_user(q, V):
    # map new user to concept space by q*V
    user_to_concept = np.matmul(q, V)
    # map user back to itme space with user_to_concept * VT
    result = np.matmul(user_to_concept, V.T)
    return result

In [4]:
def map_user_sparse(q, V):
    q_sparse = scipy.sparse.csr_matrix(q)
    # map new user to concept space by q*V
    user_to_concept = q_sparse.dot(V)
    # map user back to itme space with user_to_concept * VT
    result = user_to_concept.dot(V.T).todense()
    return result.T

In [5]:
# Set this to where you save and load all data
data_path = '../../goodbooks-10k/'

In [6]:
# Get dataframe from books
books = get_book_dataframe(data_path)

found books_dataframe in file...


In [34]:
# only readin in goodreads ratings here, join amazon to this
ratings = pd.read_csv(data_path + 'ratings.csv')

In [38]:
# prepare to be read in to Reader
ratings = ratings.sort_values(by=['user_id','book_id'])
ratings = ratings.reset_index(drop=True)
ratings = ratings.rename(index=str, columns={"user_id": "userID", "book_id": "itemID", "rating": "rating"})

In [40]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userID', 'itemID', 'rating']], reader)

In [51]:
svd = SVD(n_factors=100, n_epochs=20, random_state=1984)

In [None]:
data.split(n_folds=2)
evaluate(svd, data, measures=['RMSE', 'MAE'])

In [52]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1eefc51b978>

In [57]:
# save the trained SVD model
dump.dump(data_path + 'svd', algo=svd)

In [31]:
# load the saved SVD model - (predictions, algo)
svd = dump.load(data_path + 'svd')[1]

In [7]:
'''
Users Ratings need to be in a -2 - 3 scale. Bad ratings should count 'against' recs
'''

"\nUsers Ratings need to be in a -2 - 3 scale. Bad ratings should count 'against' recs\n"

In [93]:
# new user - likes fantasy
q = np.zeros((10000), dtype = np.int)
q[19-1] = 2.0 # LOTR 1
q[155-1] = 2.0 # LOTR 2
q[161-1] = 2.0 # LOTR 3
q[7-1] = 2.0 # Hobbit
q[611-1] = 2.0 #Silmarillion
q[189-1] = 2.0 #LOTR boxed set
q[135-1] = 2.0 #GOT
q[188-1] = 2.0 #GOT
q[330-1] = 2.0 #WOT
q[510-1] = 2.0 #WOT

In [95]:
# new user - likes sci-fi and mystery, hates fantasy
q = np.zeros((10000), dtype = np.int)
q[126-1] = 2.0 # Dune
q[70-1] = 2.0 # Enders Game
q[503-1] = 2.0 # Space Odyssey

q[514-1] = 2.0 # Sherlock Holmens
q[672-1] = 2.0 # Orient Express
q[200-1] = 2.0 # And then there were none

q[19-1] = -2.0 # LOTR 1
q[155-1] = -2.0 # LOTR 2
q[161-1] = -2.0 # LOTR 3
q[7-1] = -2.0 # Hobbit

In [8]:
# user from goodreads
q = np.load('../.tmp/user_vector.npy')

# Turn 1-5 rating scale into negative - positive scale
ratings_mapper = {0:0, 1:-2, 2:-1, 3:1, 4:2, 5:3}
for i in range(len(q)):
    q[i] = ratings_mapper[q[i]]

# for i in range(len(q)):
#     if q[i] != 0:
#         title = books.iloc[i]['title']
#         print("%s --> %s" % (q[i], title))

In [99]:
# r^ui = μ + bu + bi + qTipu
V = svd.qi
V.shape

(10000, 100)

In [100]:
recs = get_top_n_recs(map_user(q, svd.qi), books, 25, q)
for r in recs:
    print(r)

Cryptonomicon
Eat, Pray, Love
Anne of Green Gables (Anne of Green Gables, #1)
The Sisterhood of the Traveling Pants (Sisterhood, #1)
Twilight (Twilight, #1)
The Left Hand of Darkness
Yes Please
Before I Go to Sleep
Fahrenheit 451
Divine Secrets of the Ya-Ya Sisterhood
The One (The Selection, #3)
Safe Haven
The Plague
The Da Vinci Code (Robert Langdon, #2)
The Remains of the Day
It
Three Cups of Tea: One Man's Mission to Promote Peace ... One School at a Time
Eclipse (Twilight, #3)
The Tommyknockers
The Shadow of the Wind (The Cemetery of Forgotten Books,  #1)
The Nightingale
Bag of Bones
Olivia
The White Queen (The Plantagenet and Tudor Novels, #2)
Treasure Island


In [9]:
'''

Use Item Matrix to get recs for new user

'''

'\n\nUse Item Matrix to get recs for new user\n\n'

In [10]:
# Load in item_matrix (concepts and features) and test recs
filename = '../.tmp/item_matrix.npy'
item_matrix = np.load(filename)
item_matrix.shape

(10000, 120)

In [11]:
recs = get_top_n_recs(map_user(q, item_matrix), books, 25, q)
for r in recs:
    print(r)

A Wrinkle in Time (A Wrinkle in Time Quintet, #1)
Slaughterhouse-Five
Fahrenheit 451
A Wizard of Earthsea (Earthsea Cycle, #1)
Alice's Adventures in Wonderland & Through the Looking-Glass
Flowers for Algernon
A Wind in the Door (A Wrinkle in Time Quintet, #2)
Little Women (Little Women, #1)
Alice in Wonderland
Grendel
Dandelion Wine (Green Town, #1)
The Lovely Bones
Many Waters (A Wrinkle in Time Quintet, #4)
Tuck Everlasting
A Swiftly Tilting Planet (A Wrinkle in Time Quintet, #3)
The Horse and His Boy (Chronicles of Narnia, #5)
The Grapes of Wrath
The Neverending Story
The Handmaid's Tale
The Silver Chair (Chronicles of Narnia, #4)
An Acceptable Time (A Wrinkle in Time Quintet, #5)
Howl's Moving Castle (Howl's Moving Castle, #1)
The Bone Clocks
The Secret Garden
The Earthsea Trilogy


In [12]:
'''

Use Just part of Item Matrix for Recs

'''

'\n\nUse Just part of Item Matrix for Recs\n\n'

In [17]:
recs = get_top_n_recs(map_user(q, item_matrix[:,0:20]), books, 10, q)
for r in recs:
    print(r)

The Da Vinci Code (Robert Langdon, #2)
The Help
Twilight (Twilight, #1)
Angels & Demons  (Robert Langdon, #1)
Fahrenheit 451
Memoirs of a Geisha
Little Women (Little Women, #1)
Life of Pi
The Lovely Bones
Water for Elephants


In [14]:
'''

Use Feature Matrix for Recs

'''

'\n\nUse Feature Matrix for Recs\n\n'

In [15]:
# produce feature matrix
feature_matrix = get_book_features(books)
feature_matrix.shape

feature_matrix exists in file...


(10000, 82203)

In [16]:
recs = get_top_n_recs(map_user_sparse(q, feature_matrix), books, 25, q)
for r in recs:
    print(r)

A Wrinkle in Time (A Wrinkle in Time Quintet, #1)
Grendel
A Wizard of Earthsea (Earthsea Cycle, #1)
Dandelion Wine (Green Town, #1)
A Wind in the Door (A Wrinkle in Time Quintet, #2)
Many Waters (A Wrinkle in Time Quintet, #4)
An Acceptable Time (A Wrinkle in Time Quintet, #5)
The Earthsea Trilogy
A Swiftly Tilting Planet (A Wrinkle in Time Quintet, #3)
Slaughterhouse-Five
Flowers for Algernon
Tuck Everlasting
The Bone Clocks
Alice in Wonderland
The Neverending Story
Alice's Adventures in Wonderland & Through the Looking-Glass
Howl's Moving Castle (Howl's Moving Castle, #1)
The Lost World (Professor Challenger, #1)
Brave New World Revisited 
Alice's Adventures in Wonderland
Through the Looking-Glass, and What Alice Found There
Brave New World / Brave New World Revisited
The Horse and His Boy (Chronicles of Narnia, #5)
The Buried Giant
Journey to the Center of the Earth (Extraordinary Voyages, #3)
