### Notes

**Surprise Lib:**
- [link explaining how to use Surprise and SVD](https://surprise.readthedocs.io/en/stable/getting_started.html)
- [surprise predictors parent class](https://surprise.readthedocs.io/en/stable/algobase.html#surprise.prediction_algorithms.algo_base.AlgoBase.predict)

[link explaining SVD and prediction conceptually](https://www.youtube.com/watch?v=8wLKuscyO9I)

[link for walk-through using SVD from surprise](https://towardsdatascience.com/svd-where-model-tuning-goes-wrong-61c269402919)

[link to video of creator of Surprise discussing it](https://www.youtube.com/watch?v=z0dx-YckFko)

- may be able to use SVD++ from Surprise! library
- for testing, could determine recommendations for users already in system and compare them to the users top rated beers

### About

this notebook follows the API docs for surprise found [here](https://surprise.readthedocs.io/en/stable/getting_started.html)

In [60]:
import collab_filtering as cf
from collections import defaultdict
import os
import time

import math
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise import dump
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

import numpy as np
import pandas as pd

from fuzzywuzzy import fuzz
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
df,beer_ids = cf.import_data()
id2beer, beer2id = cf.get_maps(beer_ids)

In [4]:
beer_mat = cf.get_beer_matrix(df)

In [5]:
df

Unnamed: 0,beer_id,user_score,user_id
0,18580,3.75,1
1,18570,4.25,1
2,18581,4.25,1
3,4200,4.25,1
4,1,4.50,1
...,...,...,...
4837387,3583,4.25,101906
4837388,14654,4.00,101906
4837389,1106,3.40,101906
4837390,11819,4.00,101906


In [6]:
# returns a dataframe of a users top beers by both name and beer id
def get_user_top_beers(user_id, df, num_beers=10):
    if df[df['user_id'] == user_id]['user_id'].count() < 1:
        print('user not found')
        return
    
    # grab all users ratings and sort them from highest to lowest
    user = df[df['user_id'] == user_id].sort_values(by='user_score', ascending=False)
    # save only the first num_beers and add their names to the df
    user_top_beers = user.head(num_beers)
    user_top_beers['beer_name'] = [id2beer[beer_id] for beer_id in user_top_beers['beer_id'].values]
    
    return user_top_beers
    

In [7]:
# allows surprise to read df
reader = Reader(rating_scale=(1, 5))
# must load in particular column order
data = Dataset.load_from_df(df[['user_id','beer_id', 'user_score']], reader)

In [8]:
trainset, testset = train_test_split(data, test_size=0.25)
algo = SVD()

In [9]:
# Train the algorithm on the trainset     
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x15ce4886708>

In [10]:
# and predict ratings for the testset. test() returns a list of prediction objects
# which have several attributes such as est (the prediction) and r_ui (the true rating)
predictions = algo.test(testset)

In [67]:
# rmse below 1 is considered low
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 0.3981
MAE:  0.2818


In [144]:
# saves trained algorithm in file
dump.dump("svd_dump.txt",algo=algo)

In [12]:
# put predictions into a dataframe
pred_df = pd.DataFrame(predictions, columns=['user_id', 'beer_id', 'user_score', 'est', 'details'])
pred_df

Unnamed: 0,user_id,beer_id,user_score,est,details
0,18084,22616,4.66,4.796965,{'was_impossible': False}
1,82498,13461,3.15,3.559246,{'was_impossible': False}
2,51396,8449,4.18,4.137552,{'was_impossible': False}
3,74,24317,3.88,4.009833,{'was_impossible': False}
4,86213,12471,4.25,4.267443,{'was_impossible': False}
...,...,...,...,...,...
1209343,2317,892,4.45,4.064481,{'was_impossible': False}
1209344,85576,7968,4.50,4.277146,{'was_impossible': False}
1209345,28118,2577,4.50,4.240233,{'was_impossible': False}
1209346,77152,4595,3.75,3.699484,{'was_impossible': False}


In [14]:
user_1 = pred_df[pred_df.user_id == 1].sort_values(by='est', ascending=False)
user_1

Unnamed: 0,user_id,beer_id,user_score,est,details
919037,1,4200,4.25,4.460573,{'was_impossible': False}
821195,1,7,4.75,4.443422,{'was_impossible': False}
59495,1,4,4.25,4.432576,{'was_impossible': False}
1103042,1,18567,4.25,4.160399,{'was_impossible': False}
1027193,1,18570,4.25,4.141934,{'was_impossible': False}
690288,1,18581,4.25,3.9969,{'was_impossible': False}
670367,1,18560,4.0,3.892783,{'was_impossible': False}
928563,1,18579,3.5,3.89032,{'was_impossible': False}


In [15]:
user_1_top_10 = get_user_top_beers(1, df, num_beers=10)
user_1_top_10

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,beer_id,user_score,user_id,beer_name
11,18568,5.0,1,Saint Arnold Brewing Company Endeavour IPA
25,10,4.75,1,(512) Brewing Company (512) Pecan Porter
19,18575,4.75,1,Saint Arnold Brewing Company Pumpkinator
8,13,4.75,1,(512) Brewing Company (512) Whiskey Barrel Age...
9,7,4.75,1,(512) Brewing Company (512) IPA
4,1,4.5,1,(512) Brewing Company (512) Black IPA
15,19337,4.5,1,Sierra Nevada Brewing Co. Sierra Nevada Pale Ale
1,18570,4.25,1,Saint Arnold Brewing Company Homefront IPA
23,9,4.25,1,(512) Brewing Company (512) Pale
21,4,4.25,1,(512) Brewing Company (512) Cascabel Cream Stout


In [111]:
# returns the average difference between a user score and an estimated score for each beer the user reviewed
def get_avg_diff(user):
    diff = user['user_score'].sub(user['est']).abs().values
    return diff.sum() / diff.size

def get_beers_not_tried(user_id, df):
    """
    returns a list of beers in the dataset that a user has not tried
    :param user_id: int id of a user
    :param df: dataframe containing user ratings
    :return: a numpy array of beer ID's
    """
    user_df = df.groupby('user_id')
    beers_tried = user_df.get_group(1)['beer_id'].unique()
    not_tried = df[~df['beer_id'].isin(beers_tried)]['beer_id'].unique()
    return not_tried

    

In [58]:
user = 1
algo.predict(user,4)

Prediction(uid=1, iid=4, r_ui=None, est=4.432575521030691, details={'was_impossible': False})

In [145]:
def make_svd_recommendation(user, algo, df, num_beers=10):
    # build a list of beers from the dataset that a user has not tried
    user_df = df.groupby('user_id')
    beers_tried = user_df.get_group(1)['beer_id'].unique()
    not_tried = df[~df['beer_id'].isin(beers_tried)]['beer_id'].unique()
    # get all predictions for the beers a user has not tried
    user_preds = sorted([algo.predict(user, beer) for beer in not_tried], key=lambda pred: pred.est, reverse=True)
    top_beers = [(id2beer[pred.iid], pred.est) for pred in user_preds]
    
    for rec in top_beers[:num_beers]:
        print('beer: %s \nestimated_rec: %2f\n' %(rec[0], rec[1]))
    
    return top_beers[:num_beers]

In [None]:
# get the top 10 recommendations for user number 9
user_9_recommendations = make_svd_recommendation(9,algo,df)