# Recommender Systems with Surprise

## Import Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datapane as dp
import chart_studio.plotly as py
import chart_studio
import os
username = os.environ['PLOTLY_USERNAME']
api_key = os.environ['PLOTLY_API_KEY']
chart_studio.tools.set_credentials_file(username=username, api_key=api_key)
%matplotlib inline

KeyError: 'PLOTLY_USERNAME'

## Reading the Data

In [3]:
ratings_data = pd.read_csv('./data/ratings.csv.zip')
books_metadata = pd.read_csv('./data/books.csv.zip')
ratings_data.head(10)

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4
5,1,2077,4
6,1,2487,4
7,1,2900,5
8,1,3662,4
9,1,3922,5


In [4]:
ratings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981756 entries, 0 to 981755
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   book_id  981756 non-null  int64
 1   user_id  981756 non-null  int64
 2   rating   981756 non-null  int64
dtypes: int64(3)
memory usage: 22.5 MB


In [5]:
books_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         10000 non-null  int64  
 1   book_id                    10000 non-null  int64  
 2   best_book_id               10000 non-null  int64  
 3   work_id                    10000 non-null  int64  
 4   books_count                10000 non-null  int64  
 5   isbn                       9300 non-null   object 
 6   isbn13                     9415 non-null   float64
 7   authors                    10000 non-null  object 
 8   original_publication_year  9979 non-null   float64
 9   original_title             9415 non-null   object 
 10  title                      10000 non-null  object 
 11  language_code              8916 non-null   object 
 12  average_rating             10000 non-null  float64
 13  ratings_count              10000 non-null  int6

## Creating a Dataset for Surprise

In [9]:
from surprise import Dataset
from surprise import Reader

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_data[['user_id', 'book_id', 'rating']], reader)

## Cross-Validating a Simple SVD Model

In [10]:
from surprise import SVD
from surprise.model_selection import cross_validate

svd = SVD(verbose=True, n_epochs=10)
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8570  0.8551  0.8565  0.8562  0.0008  
MAE (testset)     0.6754  0.6748  0.6760  0.6754  0.0005  
Fit time          22.87   22.97   22.87   22.90   0.05    
Test time         3.64    3.89    3.84    3.79    0.11    


{'test_rmse': array([0.85695388, 0.85506625, 0.85652513]),
 'test_mae': array([0.67544767, 0.67481916, 0.67600046]),
 'fit_time': (22.8698091506958, 22.969661712646484, 22.866719484329224),
 'test_time': (3.6385087966918945, 3.889108896255493, 3.8368778228759766)}

In [11]:
trainset = data.build_full_trainset()
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ed3a8ea550>

## Generating Rating Predictions

In [12]:
svd.predict(uid=10, iid=100)

Prediction(uid=10, iid=100, r_ui=None, est=3.9574235717432624, details={'was_impossible': False})

## Generating Recommendations

In [13]:
import difflib
import random

def get_book_id(book_title, metadata):
    
    existing_titles = list(metadata['title'].values)
    closest_titles = difflib.get_close_matches(book_title, existing_titles)
    book_id = metadata[metadata['title'] == closest_titles[0]]['id'].values[0]
    return book_id

def get_book_info(book_id, metadata):
    
    book_info = metadata[metadata['id'] == book_id][['id', 'isbn', 
                                                    'authors', 'title', 'original_title']]
    return book_info.to_dict(orient='records')

def predict_review(user_id, book_title, model, metadata):
    
    book_id = get_book_id(book_title, metadata)
    review_prediction = model.predict(uid=user_id, iid=book_id)
    return review_prediction.est

def generate_recommendation(user_id, model, metadata, thresh=4):
    
    book_titles = list(metadata['title'].values)
    random.shuffle(book_titles)
    
    for book_title in book_titles:
        rating = predict_review(user_id, book_title, model, metadata)
        if rating >= thresh:
            book_id = get_book_id(book_title, metadata)
            return get_book_info(book_id, metadata)


In [10]:
generate_recommendation(10, svd, books_metadata)

[{'id': 6365,
  'isbn': nan,
  'authors': 'Linda Kage',
  'title': 'Price of a Kiss (Forbidden Men, #1)',
  'original_title': nan}]

## Visualizing the Similarity Between Books Using t-SNE

In [14]:
svd.qi.shape

(10000, 100)

In [15]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, n_iter=500, verbose=3, random_state=1)
books_embedding = tsne.fit_transform(svd.qi)
projection = pd.DataFrame(columns=['x', 'y'], data=books_embedding)
projection['title'] = books_metadata['original_title']



[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.001s...
[t-SNE] Computed neighbors for 10000 samples in 3.459s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 0.233726
[t-SNE] Computed conditional probabilities in 0.472s
[t-SNE] Iteration 50: error = 96.4169006, gradient norm = 0.1003555 (50 iterations in 5.026s)
[t-SNE] It

In [22]:
import plotly.express as px

fig = px.scatter(
    projection, x='x', y='y'
)
fig.show()

report = dp.Report(dp.Plot(fig)) #Create a report
report.publish(name='books_scatter_plot', open=True, visibility='PUBLIC') #Publish the report

AttributeError: 'Report' object has no attribute 'publish'

### Visualizing Specific Book Vectors

In [19]:
import datapane as dp

def plot_books(titles, plot_name):
    
    book_indices = []
    for book in titles:
        book_indices.append(get_book_id(book, books_metadata)-1)
        
    book_vector_df = projection.iloc[book_indices]
    
    fig = px.scatter(
    book_vector_df, x='x', y='y', text='title',
    )
    fig.show()
    
    report = dp.Report(dp.Plot(fig)) #Create a report
    report.publish(name=plot_name, open=True, visibility='PUBLIC') #Publish the report

In [15]:
books = list(books_metadata['title'][:30])
plot_books(books, plot_name='books_embedding')

Publishing report and associated data - please wait..



Passing visibility as a string is deprecated, use dp.Visibility enum instead.



Report successfully published at https://datapane.com/u/amolmavuduru/reports/books-embedding/
